# Imports 

In [101]:
 import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils
from pathlib import Path
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from lockdowndates.core import LockdownDates


from catboost import CatBoostRegressor
from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates
from datetime import timedelta, date

## Importing Data

In [60]:
X, y = utils.get_train_data()
X_final_test = pd.read_parquet(Path("data") / "final_test.parquet")

In [61]:
X.shape

(496827, 10)

## Importing Starter Kit Functions

In [62]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = X["date"] <= cutoff_date
    X_train, X_test = X.loc[mask], X.loc[~mask]
    y_train, y_test = y[mask], y[~mask]

    return X_train, y_train, X_test, y_test

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

def filter_columns(X):
    columns_to_keep = ['date', 'latitude', 'longitude']
    return X[columns_to_keep]

# Preprocessing

## Create X + Weather + Vacances

In [63]:
def create_x_weather(X):
    weather_wi = pd.read_csv('data/weather_data_paris_daily.csv')
    
    columns_to_keep = ['datetime', 'temp', 'precip', 'windspeed', 'visibility']
    weather = weather_wi[columns_to_keep].copy().rename(columns={'datetime':'date'})
    
    mapping = {'snow': 0, 'rain': 1, 'cloudy': 2, 'partly-cloudy-day': 3, 'clear-day': 4}
    weather.loc[:, 'icon_encoded'] = weather_wi['icon'].copy().map(mapping)
    
    
    weather['date'] = pd.to_datetime(weather['date'].values.astype('<M8[us]'), format='%Y-%m-%d')
    weather['date_merge'] = weather['date']
    X_weather = X.copy() 
    X_weather['date_merge'] = pd.to_datetime(X_weather['date'].dt.strftime('%Y-%m-%d'), format='%Y-%m-%d')
    X_weather = X_weather.merge(weather.drop(columns=['date']), how='left', on='date_merge').drop(columns=['date_merge'])
    
    return X_weather

In [64]:
def is_jour_ferie(X):
    d = SchoolHolidayDates()
    ho_20 = d.holidays_for_year_and_zone(2020, 'A')
    ho_21 = d.holidays_for_year_and_zone(2021, 'A')
    
    jf_20 = JoursFeries.for_year(2020)
    jf_21 = JoursFeries.for_year(2021)
    
    df_jf_21 =pd.DataFrame(jf_21.keys(),jf_21.values(),columns=['fete']).reset_index().rename(columns={'index':'date'})
    df_jf_20 =pd.DataFrame(jf_20.keys(),jf_20.values(),columns=['fete']).reset_index().rename(columns={'index':'date'})
    df_jf = pd.concat([df_jf_20,df_jf_21]).reset_index().drop('index',axis=1)
    df_jf['date'] = pd.to_datetime(df_jf['date'], format='%Y-%m-%d')
    
    df_ho20 = pd.DataFrame({'date': key, 'vacances': value['vacances_zone_a']} for key, value in ho_20.items())
    df_ho21 = pd.DataFrame({'date': key, 'vacances': value['vacances_zone_a']} for key, value in ho_20.items())
    df_ho = pd.concat([df_ho20,df_ho21]).reset_index().drop('index',axis=1)
    df_ho['date'] = pd.to_datetime(df_ho['date'], format='%Y-%m-%d')
    
    
    X['date_merge'] = pd.to_datetime(X['date'].dt.strftime('%Y-%m-%d'), format='%Y-%m-%d')
    
    X_final = X.merge(df_jf, how='left', left_on='date_merge', right_on='date', suffixes=('','_drop')).drop(columns={'date_drop'})
    X_final['is_ferie'] = X_final['fete'].map(lambda x: 0 if pd.isna(x) else 1)
    
    X_final = X_final.merge(df_ho, how='left', left_on='date_merge', right_on='date', suffixes=('','_drop')).drop(columns={'date_drop'})
    X_final['is_vacances'] = X_final['vacances'].map(lambda x: 0 if pd.isna(x) else 1)
    
    return X_final.drop_duplicates().drop(['fete','vacances'],axis=1)

In [120]:
def generate_lockdown_dates(X):
    date_range = pd.date_range(start='2020-09-01', end='2021-09-09', freq='D')
    df = pd.DataFrame({'Date': date_range})
    lockdown_periods = [('2020-10-31', '2020-12-14'), ('2021-04-04', '2021-05-02')]
    for period_start, period_end in lockdown_periods:
        df.loc[(df['Date'] >= period_start) & (df['Date'] <= period_end), 'Lockdown'] = True

    df['Lockdown'] = df['Lockdown'].fillna(0).map(lambda x: 1 if True else 0)
    df = df.reset_index().rename(columns={'Date': 'date_merge'})
    X['date_merge'] = pd.to_datetime(X['date'].dt.strftime('%Y-%m-%d'), format='%Y-%m-%d')
    X_ldd = X.merge(df, how = 'left', on = 'date_merge')
    X_ldd = X_ldd.drop(['date_merge'], axis=1)
    
    return X_ldd


In [113]:
def encode_cyclical_features(df):
    columns = ['hour', 'day', 'month', 'weekday']
    max_value = {'hour': 24, 'day': 31, 'month': 31, 'weekday': 7}

    for column in df:
        if column in columns:
            df[column + '_sin'] = np.sin(2 * np.pi * df[column]/max_value[column])
            df[column + '_cos'] = np.cos(2 * np.pi * df[column]/max_value[column])
            df.drop(columns=[column], inplace=True)  
    return df

# Pipelines

In [121]:
X_train, y_train, X_test, y_test = train_test_split_temporal(X, y, delta_threshold="30 days")

model = xgb.XGBRegressor(colsample_bynode=0.6,
        colsample_bytree=0.6,
        colsample_bylevel=0.5,
        learning_rate=0.1,
        n_estimators=400,
        max_depth=10,
        subsample=0.8)

date_cols = FunctionTransformer(_encode_dates)
add_weather = FunctionTransformer(create_x_weather)
keep_specific_columns = FunctionTransformer(filter_columns)
add_vacances_et_jours_feries = FunctionTransformer(is_jour_ferie)
check_lockdown = FunctionTransformer(generate_lockdown_dates)
cycle_encoding = FunctionTransformer(encode_cyclical_features)


#num_features = ['temp', 'precip', 'windspeed', 'visibility']

col_transformer = ColumnTransformer(
    transformers=[
        
        ('num', StandardScaler(), num_features)
    ],
    remainder='passthrough'  # This keeps all other columns unchanged
)

pipe = Pipeline([
    
    ('filter_columns',keep_specific_columns),
    #('add_vacances_et_jours_feries', add_vacances_et_jours_feries),
    #('add_weather', add_weather),
    ('check_lockdown',check_lockdown),
    ('date_encode', FunctionTransformer(_encode_dates)),
    ('cycle_encoding',cycle_encoding),
    #('scaling_num_features', col_transformer),
    ('model', model)
])


pipe.fit(X_train, y_train)

predictions = pipe.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 0.6154593175604542


# Cat Boost

In [66]:
is_jour_ferie(X).shape

(496827, 13)

In [129]:
cat = CatBoostRegressor(
    depth=10,
    iterations=1000,
    rsm=0.25,
    sampling_frequency="PerTree",
    subsample=0.7,
    verbose=0,
)

date_cols = FunctionTransformer(_encode_dates)
add_weather = FunctionTransformer(create_x_weather)
keep_specific_columns = FunctionTransformer(filter_columns)
add_vacances_et_jours_feries = FunctionTransformer(is_jour_ferie)
check_lockdown = FunctionTransformer(generate_lockdown_dates)
cycle_encoding = FunctionTransformer(encode_cyclical_features)


#num_features = ['temp', 'precip', 'windspeed', 'visibility']

col_transformer = ColumnTransformer(
    transformers=[
        
        ('num', StandardScaler(), num_features)
    ],
    remainder='passthrough'  # This keeps all other columns unchanged
)

pipe = Pipeline([
    
    ('filter_columns',keep_specific_columns),
    #('add_vacances_et_jours_feries', add_vacances_et_jours_feries),
    ('add_weather', add_weather),
    ('check_lockdown',check_lockdown),
    ('date_encode', FunctionTransformer(_encode_dates)),
    ('cycle_encoding',cycle_encoding),
    ('scaling_num_features', col_transformer),
    ('model', cat)
])


pipe.fit(X_train, y_train)

predictions = pipe.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 0.5813782282531081


# Format Output

In [130]:
pipe.fit(X,y)
submission = pipe.predict(X_final_test)
pd.Series(submission).to_frame().rename_axis('Id').rename(columns={0:'log_bike_count'}).to_csv('submission14_81223.csv')

In [68]:
pipe_test.fit_transform(X).to_csv('X_test_opti.csv'), pd.Series(y).to_csv('y_test_opti.csv')

(None, None)