# Modeling

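- Baseline turnover model: loads the training data, derives calendar and French public-holiday features, one-hot encodes the categorical columns, holds out the rows dated after 2017-08-31 as a test set, then fits an `ExtraTreesRegressor` and reports its score, MAE, MAPE and MSE on the held-out rows.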

## Table of contents:

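* [1. Data loading and preprocessing](#first-enumeration)
* [2. Train a simple regressor](#2.-Train-a-simple-regressor)
* [3. Evaluate the model](#3.-Evaluate-the-model)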

In [14]:
import os

import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
from datetime import datetime

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

import holidays

In [15]:
# Absolute path of the input data directory: "../data/inputs" is resolved
# against the working directory at the time this cell runs.
DATA_PATH = os.path.abspath("../data/inputs")

## 1. Data loading and preprocessing

### 1.1 Data loading

In [23]:
# Read the training records and the business-unit features from DATA_PATH
# (same files, same order: train.csv first, then bu_feat.csv).
data, bu_feat = (
    pd.read_csv(os.path.join(DATA_PATH, filename))
    for filename in ("train.csv", "bu_feat.csv")
)

### 1.2 Data preprocessing

In [24]:
def is_holiday_week(data):
    """Flag the rows of ``data`` whose week contains a French public holiday.

    A week is identified by the pair (ISO year, ISO week number). The ISO year
    matters around New Year: 2017-01-01 belongs to ISO week 52 of 2016, so
    pairing the ISO week with the calendar year would file that holiday under
    (2017, 52) and miss the week that ends on 2016-12-31.

    Parameters
    ----------
    data : pandas.DataFrame
        When ``data.index`` is a ``DatetimeIndex``, the ISO year and week of
        each row are derived from the index. Otherwise the ``year`` and
        ``week`` columns are used as given and matched against
        (calendar year, ISO week) holiday pairs.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like ``data``; True where the row's week
        contains at least one holiday from the 2012-2017 French calendar.
    """
    # Holiday dates in France from 2012 to 2017
    holiday_dates = pd.DatetimeIndex(
        pd.to_datetime(list(holidays.France(years=range(2012, 2018)).keys()))
    )
    holiday_iso = holiday_dates.isocalendar()

    if isinstance(data.index, pd.DatetimeIndex):
        # Preferred path: compare ISO year/week on both sides so weeks that
        # straddle two calendar years are keyed consistently.
        row_iso = data.index.isocalendar()
        row_years, row_weeks = row_iso["year"], row_iso["week"]
        holiday_years = holiday_iso["year"]
    else:
        # Fallback for frames without a datetime index: rely on the
        # precomputed columns and pair holidays with their calendar year.
        row_years, row_weeks = data["year"], data["week"]
        holiday_years = holiday_dates.year

    # Plain Python ints on both sides keep tuple comparison independent of
    # the integer dtypes pandas returns for year and week.
    holiday_weeks = list(zip(holiday_years.astype(int), holiday_iso["week"].astype(int)))
    row_keys = pd.Series(
        list(zip(row_years.astype(int), row_weeks.astype(int))),
        index=data.index,
        dtype=object,
    )

    # True for each row whose (year, week) pair matches a holiday week
    return row_keys.isin(holiday_weeks)

In [25]:
def process_data(data):
    """Build the model-ready feature frame from the raw training records.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records with at least the columns ``day_id`` (parseable as dates),
        ``dpt_num_department`` and ``but_num_business_unit``. Any other column
        is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``day_id`` (as datetimes) with an ``is_holiday``
        0/1 column and one-hot encoded department, business unit, year,
        month, week, season and quarter columns.
    """
    # Parse day_id and use it as the index. `assign` / `set_index` return new
    # frames, so the caller's DataFrame is left untouched.
    data = data.assign(day_id=pd.to_datetime(data["day_id"])).set_index("day_id")

    # Calendar features derived from the date index
    data["year"] = data.index.year
    data["month"] = data.index.month
    data["week"] = data.index.isocalendar().week
    data["quarter"] = data.index.quarter

    # Season per month: 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov
    seasons = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]
    month_to_season = dict(zip(range(1, 13), seasons))
    data["season"] = data.index.month.map(month_to_season)

    # Whether the row's week contains a public holiday (1) or not (0)
    data["is_holiday"] = is_holiday_week(data).astype(int)

    # One-hot encode the categorical features
    data = pd.get_dummies(
        data,
        columns=["dpt_num_department", "but_num_business_unit", "year", "month", "week", "season", "quarter"],
        prefix=["department", "business_unit", "year", "month", "week", "season", "quarter"],
    )
    return data


In [26]:
# Replace the raw frame with its processed version. `data` is rebound here, so
# re-running this cell without first re-running the loading cell fails:
# `day_id` is then the index and no longer a column that process_data can read.
data = process_data(data)

In [28]:
# Preview the first rows of the processed frame
data.head()

Unnamed: 0_level_0,turnover,is_holiday,department_73,department_88,department_117,department_127,business_unit_1,business_unit_2,business_unit_4,business_unit_5,...,week_52,week_53,season_1,season_2,season_3,season_4,quarter_1,quarter_2,quarter_3,quarter_4
day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-09-30,580.308443,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2017-09-30,1512.995918,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2017-09-30,668.593556,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
2017-09-30,0.0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2017-09-30,0.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


### 1.3 Split train/test set


In [9]:
# Boolean mask selecting the training rows: every row dated on or before
# 2017-08-31. Rows after that date are held out (the split below uses the
# negated mask for the test set).
train_idx = data.index.date <= datetime(year=2017, month=8, day=31).date()

In [10]:
# Target is the turnover column; every remaining column is a feature
y = data["turnover"]
X = data.drop(columns=["turnover"])

In [11]:
# Apply the date mask to features and target: masked rows train, the rest test
X_train, X_test = X.loc[train_idx], X.loc[~train_idx]
y_train, y_test = y.loc[train_idx], y.loc[~train_idx]

## 2. Train a simple regressor


In [99]:
# Extra-trees ensemble of 300 estimators; fixed seed for reproducible fits
reg = ExtraTreesRegressor(
    n_estimators=300,
    random_state=0,
)

In [None]:
# `verbose` is an estimator parameter, not a `fit` argument: passing it to
# `fit` raises TypeError. Set it on the estimator, then train.
reg.set_params(verbose=2)
reg.fit(X_train, y_train)

## 3. Evaluate the model

In [None]:
# Score the fitted model on the held-out rows (for scikit-learn regressors,
# `score` reports the R^2 coefficient of determination)
reg.score(X_test, y_test)

In [None]:
# Predicted turnover for the held-out rows, reused by the metrics below
y_pred = reg.predict(X_test)

In [None]:
# Mean absolute error on the held-out rows, in the same unit as turnover
mean_absolute_error(y_test, y_pred)

In [None]:
# Mean absolute percentage error on the held-out rows.
# NOTE(review): the data.head() preview shows turnover values of 0.0 on
# 2017-09-30, i.e. inside the held-out period. Percentage errors on zero
# targets are not meaningful, so this figure may be dominated by those rows.
# Confirm before reporting it.
mean_absolute_percentage_error(y_test, y_pred)

In [None]:
# Mean squared error on the held-out rows (squared turnover units)
mean_squared_error(y_test, y_pred)