In [1]:
from jours_feries_france import JoursFeries
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

from vacances_scolaires_france import SchoolHolidayDates

In [2]:
# When the notebook is launched from the "modeling" sub-folder, move up one
# level (presumably so that the `import problem` below resolves against the
# parent directory -- confirm against the repository layout).
# The folder name is read through pathlib so the check works with both "\\"
# and "/" path separators instead of only on Windows.
if Path.cwd().name == "modeling":
    os.chdir(Path.cwd().parent)

import problem

In [3]:
# Fetch the train and test splits (features, target) from the project's
# `problem` helper module; the train loader is called first, then the test one.
(X_train, y_train), (X_test, y_test) = (
    problem.get_train_data(),
    problem.get_test_data(),
)

In [4]:
# External table as returned by problem.read_weather_data(); kept in the
# namespace in case other cells inspect it directly.
weather_data_imp = problem.read_weather_data()

# Build the enriched train and test frames through the SAME helper so both
# splits are guaranteed to go through identical merge logic and end up with
# the same columns. (Previously the test split was joined by hand with
# pd.merge(..., on="date", how="left"), which could drift from the helper.)
# NOTE(review): assumes merge_external_data keeps the input row order so the
# frames stay aligned with y_train / y_test -- confirm in problem.py.
X_train_full = problem.merge_external_data(X_train)
X_test_full = problem.merge_external_data(X_test)

In [6]:
# Display the merged training frame as the cell output; the recorded output
# shows pandas eliding the middle rows, so only the first and last few appear.
X_train_full


Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,counter_technical_id,latitude,longitude,temp,dwpt,rhum,prcp,wdir,wspd,pres
0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,13.6,9.6,77.0,0.0,NNW,3.6,1020.1
30,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233,13.6,9.6,77.0,0.0,NNW,3.6,1020.1
31,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.853720,2.357020,13.6,9.6,77.0,0.0,NNW,3.6,1020.1
32,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.853720,2.357020,13.6,9.6,77.0,0.0,NNW,3.6,1020.1
33,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2020-09-01 01:00:00,2020-07-22,Y2H20073268,48.885290,2.326660,13.6,9.6,77.0,0.0,NNW,3.6,1020.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455129,100057329-103057329,Totem 85 quai d'Austerlitz SE-NO,100057329,Totem 85 quai d'Austerlitz,2021-08-09 23:00:00,2020-02-18,YTH19111508,48.842010,2.367290,18.4,13.5,73.0,0.0,SSW,7.6,1017.8
455130,100057380-104057380,Totem Cours la Reine E-O,100057380,Totem Cours la Reine,2021-08-09 23:00:00,2020-02-11,YTH19111509,48.864620,2.314440,18.4,13.5,73.0,0.0,SSW,7.6,1017.8
455119,100057380-103057380,Totem Cours la Reine O-E,100057380,Totem Cours la Reine,2021-08-09 23:00:00,2020-02-11,YTH19111509,48.864620,2.314440,18.4,13.5,73.0,0.0,SSW,7.6,1017.8
455136,100042374-110042374,Voie Georges Pompidou NE-SO,100042374,Voie Georges Pompidou,2021-08-09 23:00:00,2017-12-15,Y2H21025335,48.848400,2.275860,18.4,13.5,73.0,0.0,SSW,7.6,1017.8


Basic Ridge

In [4]:
# Baseline model: expand the timestamp into date-derived columns, one-hot
# encode those columns together with the counter/site identifiers, and fit a
# Ridge regression on the result.

# --- Column groups handed to the ColumnTransformer -------------------------
categorical_cols = ["counter_name", "site_name"]
# Names of the columns produced by the date encoding helper.
# NOTE(review): this call omits last_step=True while the pipeline step below
# passes it -- confirm both invocations yield the same column names.
date_cols = problem._encode_dates(X_train[["date"]]).columns.tolist()

# --- Individual transformers ------------------------------------------------
# First pipeline step: applies problem._encode_dates to the incoming frame.
date_encoder = FunctionTransformer(
    problem._encode_dates,
    kw_args={"last_step": True},
)
# handle_unknown="ignore": categories unseen during fit do not raise at
# predict time.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

# --- Assemble preprocessing + estimator ------------------------------------
# Columns not listed in either group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)
regressor = Ridge()

pipe = make_pipeline(date_encoder, preprocessor, regressor)
# Left as the last expression so the fitted pipeline is shown as cell output.
pipe.fit(X_train, y_train)

In [13]:
# Report the root-mean-squared error of the fitted pipeline on both splits.
# The square root is taken explicitly instead of passing ``squared=False``:
# that keyword was deprecated in scikit-learn 1.4 and removed in later
# releases, whereas ``mean_squared_error(...) ** 0.5`` works on every version.
# NOTE(review): equivalent to squared=False for a single-output target, which
# is what y_train / y_test appear to be -- confirm.
for split_name, X_split, y_split in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    rmse = mean_squared_error(y_split, pipe.predict(X_split)) ** 0.5
    print(f"{split_name} set, RMSE={rmse:.2f}")

Train set, RMSE=0.80
Test set, RMSE=0.73
