In [22]:
from jours_feries_france import JoursFeries
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

from vacances_scolaires_france import SchoolHolidayDates

In [5]:
if os.getcwd().split("/")[-1] == "modeling":
    os.chdir(os.path.join(os.getcwd(), '..'))

import problem

## Load data

In [6]:
X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()

## Functions for adding more variables

In [7]:
def _encode_dates(X, last_step: bool=True):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    if last_step: 
        # Finally we can drop the original columns from the dataframe
        return X.drop(columns=["date"])
    else:
        return X

In [8]:
def _additional_date_variables(X, last_step: bool=True):
    X = X.copy()  # modify a copy of X

    # add seasons
    seasons = {1: "winter", 2: "winter", 3: "spring", 4: "spring", 
               5: "spring", 6: "summer", 7: "summer", 8: "summer", 
               9: "autumn", 10: "autumn", 11: "autumn", 12: "winter"}
    X.loc[:, "season"] = X["date"].dt.month.map(seasons)

    public_holidays = []
    school_holidays = {}
    for year in X["date"].dt.year.unique():
        public_holidays.extend(JoursFeries.for_year(year).values())
        school_holidays.update(SchoolHolidayDates().holidays_for_year_and_zone(year, 'C'))
    
    # add public holidays
    X.loc[:, "public_holiday"] = X["date"].isin(public_holidays)

    # add school holidays names
    # school_holidays_name = {k: re.sub("\s+|'", '_', 
    #                                   re.sub('[éë]', 'e', v['nom_vacances'].lower())) 
    #                         for k, v in school_holidays.items() if v['vacances_zone_c']}
    # X.loc[:, "school_holiday_name"] = X["date"].map(school_holidays_name)

    # add school holidays
    school_holidays_bool = [k for k,v in school_holidays.items() if v['vacances_zone_c']]
    X.loc[:, "school_holiday"] = X["date"].isin(school_holidays_bool)

    if last_step: 
        # Finally we can drop the original columns from the dataframe
        return X.drop(columns=["date"])
    else:
        return X

## Linear Model

In [11]:
date_encoder = FunctionTransformer(_encode_dates, kw_args={"last_step": True})
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

regressor = Ridge()

pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)

In [12]:
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipe.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.80
Test set, RMSE=0.73


In [9]:
date_encoder = FunctionTransformer(_encode_dates, kw_args={"last_step": False})
date_cols = ["year", "month", "day", "weekday", "hour"]

add_date_encoder = FunctionTransformer(_additional_date_variables, kw_args={"last_step": True})
add_date_cols = ["season"]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("add_date", OneHotEncoder(handle_unknown="ignore"), add_date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

regressor = Ridge()

pipe = make_pipeline(date_encoder, add_date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)

In [10]:
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipe.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.80
Test set, RMSE=0.72


## Tree Models

In [26]:
date_encoder = FunctionTransformer(_encode_dates, kw_args={"last_step": False})
date_cols = ["year", "month", "day", "weekday", "hour"]

add_date_encoder = FunctionTransformer(_additional_date_variables, kw_args={"last_step": True})
add_date_cols = ["season"]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("add_date", OneHotEncoder(handle_unknown="ignore"), add_date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

regressor = RandomForestRegressor(max_features="sqrt", verbose=100, n_jobs=4)

pipe = make_pipeline(date_encoder, add_date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   26.5s

building tree 6 of 100[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:   28.6s

building tree 7 of 100[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:   29.0s

building tree 8 of 100[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:   29.5s

building tree 9 of 100[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   50.2s

building tree 10 of 100[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:   53.1s

building tree 11 of 100[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:   53.3s

building tree 12 of 100[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:   54.6s

building tree 13 of 100[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:  1.3min

building tree 14 of 100[Parallel(n_jobs=4)]: Done  10 tasks      | e

In [27]:
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipe.predict(X_test), squared=False):.2f}"
)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:    0.5s
[Para

In [28]:
date_encoder = FunctionTransformer(_encode_dates, kw_args={"last_step": False})
date_cols = ["year", "month", "day", "weekday", "hour"]

add_date_encoder = FunctionTransformer(_additional_date_variables, kw_args={"last_step": True})
add_date_cols = ["season"]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("add_date", OneHotEncoder(handle_unknown="ignore"), add_date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

regressor = RandomForestRegressor(max_features="sqrt", verbose=100, n_jobs=4, max_samples=0.5)

pipe = make_pipeline(date_encoder, add_date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   15.4s

building tree 6 of 100
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:   15.5s
building tree 7 of 100[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:   16.0s

building tree 8 of 100
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:   16.0s
building tree 9 of 100[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   31.0s

building tree 10 of 100[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:   31.6s

building tree 11 of 100
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:   31.8s
building tree 12 of 100
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:   31.8s
building tree 13 of 100[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:   46.5s

building tree 14 of 100[Parallel(n_jobs=4)]: Done  10 tasks      | e

In [29]:
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipe.predict(X_test), squared=False):.2f}"
)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:    0.4s
[Para