In [1]:
from pathlib import Path
import pandas as pd
import re

from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates

In [49]:
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X # .drop(columns=["date"])

In [56]:
def _additional_date_variables(X):
    X = X.copy()  # modify a copy of X

    # add seasons
    seasons = {1: "winter", 2: "winter", 3: "spring", 4: "spring", 
               5: "spring", 6: "summer", 7: "summer", 8: "summer", 
               9: "autumn", 10: "autumn", 11: "autumn", 12: "winter"}
    X.loc[:, "season"] = X["date"].dt.month.map(seasons)

    public_holidays = []
    school_holidays = {}
    for year in X["date"].dt.year.unique():
        public_holidays.extend(JoursFeries.for_year(year).values())
        school_holidays.update(SchoolHolidayDates().holidays_for_year_and_zone(year, 'C'))
    
    # add public holidays
    X.loc[:, "public_holiday"] = X["date"].isin(public_holidays)

    # add school holidays names
    # school_holidays_name = {k: re.sub("\s+|'", '_', 
    #                                   re.sub('[éë]', 'e', v['nom_vacances'].lower())) 
    #                         for k, v in school_holidays.items() if v['vacances_zone_c']}
    # X.loc[:, "school_holiday_name"] = X["date"].map(school_holidays_name)

    # add school holidays
    school_holidays_bool = [k for k,v in school_holidays.items() if v['vacances_zone_c']]
    X.loc[:, "school_holiday"] = X["date"].isin(school_holidays_bool)

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

## Linear Model

In [32]:
import os

In [36]:
# Switch the working directory to the parent of the directory the kernel
# started in. The starting directory is remembered on first execution so that
# re-running this cell does not climb one level higher each time.
NOTEBOOK_DIR = globals().get("NOTEBOOK_DIR", os.getcwd())
os.chdir(os.path.join(NOTEBOOK_DIR, '..'))

In [38]:
# Load the train and test splits through the project's `problem` module.
# NOTE(review): presumably `problem` is only importable (and its data only
# reachable) after the working-directory change in the previous cell — confirm.
import problem

# Each loader returns a (features, target) pair.
X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()

In [57]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

# Column groups handed to the one-hot encoders below. Of the columns created
# by `_additional_date_variables`, only "season" is selected; the holiday
# flags are not listed.
date_cols = ["year", "month", "day", "weekday", "hour"]
add_date_cols = ["season"]
categorical_cols = ["counter_name", "site_name"]

# Feature-engineering steps wrapping the helper functions defined earlier.
date_encoder = FunctionTransformer(_encode_dates)
add_date_encoder = FunctionTransformer(_additional_date_variables)

categorical_encoder = OneHotEncoder(handle_unknown="ignore")

# Every selected column group is one-hot encoded; categories unseen during
# fitting are ignored rather than raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("add_date", OneHotEncoder(handle_unknown="ignore"), add_date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

regressor = Ridge()

# Order matters: `_encode_dates` keeps "date", which the second step reads
# and then drops before the column selection.
pipe = make_pipeline(
    date_encoder,
    add_date_encoder,
    preprocessor,
    regressor,
)
pipe.fit(X_train, y_train)

In [58]:
from sklearn.metrics import mean_squared_error


def _rmse(y_true, y_pred):
    """Root mean squared error, averaged uniformly over the outputs.

    Derived from the per-output MSE instead of the ``squared=False`` flag of
    ``mean_squared_error`` (deprecated in scikit-learn 1.4, removed in 1.6),
    so the cell runs on older and current releases alike and yields the same
    value the flag used to produce.
    """
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    return (per_output_mse ** 0.5).mean()


print(
    f"Train set, RMSE={_rmse(y_train, pipe.predict(X_train)):.2f}"
)
print(
    f"Test set, RMSE={_rmse(y_test, pipe.predict(X_test)):.2f}"
)

Train set, RMSE=0.80
Test set, RMSE=0.72
