# Imports 

In [1]:
 import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils
from pathlib import Path
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from lockdowndates.core import LockdownDates
from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates

from catboost import CatBoostRegressor
import datetime
from datetime import timedelta, date

In [2]:
X, y = utils.get_train_data()
X_final_test = pd.read_parquet(Path("data") / "final_test.parquet")

In [3]:
X_train, y_train, X_test, y_test = utils.train_test_split_temporal(X, y, delta_threshold="30 days")

In [4]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = X["date"] <= cutoff_date
    X_train, X_test = X.loc[mask], X.loc[~mask]
    y_train, y_test = y[mask], y[~mask]

    return X_train, y_train, X_test, y_test

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

def filter_columns(X):
    columns_to_keep = ['date', 'latitude', 'longitude']
    return X[columns_to_keep]

# Cat Boost

In [23]:
def create_rain(X):
    X['rain'] = X['icon_encoded'].map(lambda x: 1 if x == 1 else 0)
    return X

In [18]:
cat = CatBoostRegressor(
    depth=10,
    iterations=250,
    rsm=0.25,
    sampling_frequency="PerTree",
    subsample=0.7,
    verbose=0,
)

date_cols = FunctionTransformer(utils._encode_dates)
add_weather = FunctionTransformer(utils.create_x_weather)
keep_specific_columns = FunctionTransformer(utils.filter_columns)
add_vacances_et_jours_feries = FunctionTransformer(utils.is_jour_ferie)
check_lockdown = FunctionTransformer(utils.isLockdown)
cycle_encoding = FunctionTransformer(utils.encode_cyclical_features)
couvre_feu = FunctionTransformer(utils.apply_couvre_feu)

num_features = ['temp', 'precip', 'windspeed', 'visibility']
cat_features = ['france_stay_at_home']


col_transformer = ColumnTransformer(
    transformers=[
        
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(), cat_features)
    ],
    remainder='passthrough'  # This keeps all other columns unchanged
)



pipe = Pipeline([
       # ('add_vacances_et_jours_feries', add_vacances_et_jours_feries),
    ('filter_columns',keep_specific_columns),
    ('is_couvre_feu',couvre_feu),
    ('add_weather', add_weather),
    ('check_lockdown',check_lockdown),
    ('date_encode', date_cols),
    ('rain', FunctionTransformer(create_rain)),
    #('cycle_encoding',cycle_encoding),
    #('scaling_num_features', col_transformer),
    
    ('model', cat)
])


pipe.fit(X_train, y_train)

predictions = pipe.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error: {rmse}')

Fetching lockdown dates...
Fetched lockdown dates for: France


CatBoostError: features data: pandas.DataFrame column 'france_stay_at_home' has dtype 'category' but is not in  cat_features list

In [None]:
0.5610473766023862

In [16]:
def isLockdown(X):
    ld = LockdownDates("France", "2020-09-01", "2021-09-09", ("stay_at_home", "masks"))
    lockdown_dates = ld.dates()
    ld = lockdown_dates.reset_index().rename(columns={'timestamp': 'date_merge'})
    X['date_merge'] = pd.to_datetime(X['date'].dt.strftime('%Y-%m-%d'), format='%Y-%m-%d')
    X_ld = X.merge(ld, how = 'left', on = 'date_merge')
    X_ld = X_ld.drop(['france_masks', 'france_country_code', 'date_merge'], axis=1)
    #X_ld['france_stay_at_home'] = X_ld['france_stay_at_home'].map(lambda x: 1 if x == 2 else 0)\n",
    return X_ld

X_train

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,date_merge
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,2020-09-01
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,2020-09-01
87516,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.853720,2.357020,2020-09-01
98518,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.853720,2.357020,2020-09-01
875137,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2020-09-01 01:00:00,2020-07-22,"48.88529,2.32666",Y2H20073268,48.885290,2.326660,2020-09-01
...,...,...,...,...,...,...,...,...,...,...,...
794947,100057329-103057329,Totem 85 quai d'Austerlitz SE-NO,100057329,Totem 85 quai d'Austerlitz,2021-08-10 23:00:00,2020-02-18,"48.84201,2.36729",YTH19111508,48.842010,2.367290,2021-08-10
804811,100057380-104057380,Totem Cours la Reine E-O,100057380,Totem Cours la Reine,2021-08-10 23:00:00,2020-02-11,"48.86462,2.31444",YTH19111509,48.864620,2.314440,2021-08-10
814413,100057380-103057380,Totem Cours la Reine O-E,100057380,Totem Cours la Reine,2021-08-10 23:00:00,2020-02-11,"48.86462,2.31444",YTH19111509,48.864620,2.314440,2021-08-10
125926,100042374-110042374,Voie Georges Pompidou NE-SO,100042374,Voie Georges Pompidou,2021-08-10 23:00:00,2017-12-15,"48.8484,2.27586",Y2H21025335,48.848400,2.275860,2021-08-10


# TEST PIPELINE OUTPUT

In [14]:
pipe = Pipeline([
    
    ('filter_columns',keep_specific_columns),
    #('add_vacances_et_jours_feries', add_vacances_et_jours_feries),
    ('add_weather', add_weather),
    ('check_lockdown',check_lockdown),
    ('date_encode', FunctionTransformer(_encode_dates)),
    #('cycle_encoding',cycle_encoding),
    #('scaling_num_features', col_transformer),
    #('rain', FunctionTransformer(create_rain))
])

pipe.fit_transform(X)

Unnamed: 0,latitude,longitude,temp,precip,windspeed,visibility,icon_encoded,index,Lockdown,year,month,day,weekday,hour
0,48.840801,2.333233,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
1,48.840801,2.333233,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
2,48.853720,2.357020,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
3,48.853720,2.357020,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
4,48.885290,2.326660,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496822,48.842010,2.367290,22.1,2.54,16.6,13.7,1,373,0,2021,9,9,3,23
496823,48.864620,2.314440,22.1,2.54,16.6,13.7,1,373,0,2021,9,9,3,23
496824,48.864620,2.314440,22.1,2.54,16.6,13.7,1,373,0,2021,9,9,3,23
496825,48.848400,2.275860,22.1,2.54,16.6,13.7,1,373,0,2021,9,9,3,23


In [None]:
def apply_couvre_feu(df, date_column='date'):

    couvre_feu_periods = [
        {"start": datetime.datetime(2020, 3, 17), "end": datetime.datetime(2020, 5, 29), "start_time": datetime.time(21, 0), "end_time": datetime.time(6, 0)},
        {"start": datetime.datetime(2021, 1, 16), "end": datetime.datetime(2021, 6, 20), "start_time": datetime.time(21, 0), "end_time": datetime.time(6, 0)},
    ]

    def is_couvre_feu(date_heure):

        if not isinstance(date_heure, str):
            date_heure = date_heure.strftime('%Y-%m-%d %H:%M')

        dt = datetime.datetime.strptime(date_heure, '%Y-%m-%d %H:%M')
        for period in couvre_feu_periods:
            if period["start"].date() <= dt.date() <= period["end"].date():
                if ((dt.time() >= period["start_time"]) or (dt.time() < period["end_time"])):
                    return 1
        return 0



# Format Output

In [10]:
pipe.fit(X,y)
submission = pipe.predict(X_final_test)
pd.Series(submission).to_frame().rename_axis('Id').rename(columns={0:'log_bike_count'}).to_csv('submission17_91223.csv')