# Modeling

- TODO: add notebook description

## Table of contents:

* [1. Data loading and preprocessing](#first-enumeration)

In [85]:
import os

import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
from datetime import datetime

In [86]:
# Global variable for data relative path
DATA_PATH = os.path.abspath("../data/inputs")

## 1. Data loading and preprocessing

### 1.1 Data loading

In [90]:
data = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
bu_feat = pd.read_csv(os.path.join(DATA_PATH, "bu_feat.csv"))

In [91]:
data["day_id"] = pd.to_datetime(data["day_id"], infer_datetime_format=True)
# set day_id as index
data.set_index("day_id", inplace=True)
data.insert(0, "week_num", data.index.isocalendar().week)

In [92]:
data.columns

Index(['week_num', 'but_num_business_unit', 'dpt_num_department', 'turnover'], dtype='object')

In [94]:
data = pd.get_dummies(data, columns=["week_num", "but_num_business_unit", "dpt_num_department"], prefix=["week", "business_unit", "department"])

### 1.2 Data preprocessing

In [4]:
def preprocess_data(data):
    #format day_id as datetime type
    data["day_id"] = pd.to_datetime(data["day_id"], infer_datetime_format=True)
    # set day_id as index
    data.set_index("day_id", inplace=True)
    data.insert(0, "week_num", data.index.isocalendar().week)

    data_dummies = pd.get_dummies()
    # create one hot encoding of business units
    #business_units_one_hot_encoded = pd.get_dummies(train_data["but_num_business_unit"], prefix="business_unit")
    # create one hot encoding of departments
    #department_one_hot_encoded = pd.get_dummies(train_data["dpt_num_department"], prefix="department")
    # concat the one hot encoded features
    #data = pd.concat([department_one_hot_encoded, business_units_one_hot_encoded, data["turnover"]], axis=1)
    # add week number to data

    # sort data
    data.sort_index(inplace=True)
    return data

In [5]:
data = preprocess_data(data)

In [6]:
data

Unnamed: 0_level_0,week_num,but_num_business_unit,dpt_num_department,turnover
day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-12-29,52,54,73,53.337413
2012-12-29,52,255,73,0.000000
2012-12-29,52,812,73,43.619478
2012-12-29,52,24,73,5.237134
2012-12-29,52,201,127,825.383999
...,...,...,...,...
2017-09-30,39,64,73,8.415006
2017-09-30,39,242,117,1.522201
2017-09-30,39,363,88,242.183759
2017-09-30,39,15,73,21.409645


In [95]:
train_idx = data.index.date <= datetime(year=2017, month=8, day=31).date()

In [96]:
X = data.drop(labels=['turnover'], axis=1)
y = data['turnover']

In [97]:
X_train, y_train = X.loc[train_idx], y.loc[train_idx]
X_test, y_test = X.loc[~train_idx], y.loc[~train_idx]

In [98]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, mean_squared_log_error

In [99]:
reg = ExtraTreesRegressor(n_estimators=300, random_state=0)

In [None]:
reg.fit(X_train, y_train)

In [None]:
reg.score(X_test, y_test)

In [None]:
y_pred = reg.predict(X_test)

In [None]:
mean_absolute_error(y_test, y_pred)

In [None]:
mean_absolute_percentage_error(y_test, y_pred)

In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
y_pred.shape

In [None]:
y_test.to_numpy()

In [None]:
y_pred

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rf_reg = RandomForestRegressor(n_estimators=100)

In [None]:
rf_reg.fit(X_train, y_train)

In [None]:
y_pred = rf_reg.predict(X_test)

In [None]:
mean_absolute_error(y_test, y_pred)