# Imports 

In [17]:
 import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils
from pathlib import Path
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from lockdowndates.core import LockdownDates
from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates

from catboost import CatBoostRegressor

from datetime import timedelta, date

In [18]:
# Load the training features/target through the project helper, and read the
# final test frame (used for the submission) from data/final_test.parquet.
X, y = utils.get_train_data()
X_final_test = pd.read_parquet(Path("data") / "final_test.parquet")

In [19]:
# Split into train / hold-out sets with the helper from `utils`.
# NOTE(review): presumably holds out the last 30 days, as the notebook-local
# function of the same name does — confirm against the `utils` implementation.
X_train, y_train, X_test, y_test = utils.train_test_split_temporal(X, y, delta_threshold="30 days")

In [20]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    """Split ``X`` and ``y`` chronologically on the ``date`` column.

    Rows dated at or before ``X["date"].max() - delta_threshold`` form the
    training set; all later rows form the test set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a ``date`` column.
    y : array-like
        Target values, indexable with a boolean mask built from ``X``.
    delta_threshold : str, default "30 days"
        Length of the test window, in any format ``pd.Timedelta`` accepts.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    threshold = X["date"].max() - pd.Timedelta(delta_threshold)
    in_train = X["date"] <= threshold
    in_test = ~in_train

    return X.loc[in_train], y[in_train], X.loc[in_test], y[in_test]

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

def filter_columns(X):
    """Return only the ``date``, ``latitude`` and ``longitude`` columns of ``X``."""
    return X.loc[:, ["date", "latitude", "longitude"]]

# CatBoost

In [21]:
def create_rain(X):
    """Add a binary ``rain`` column derived from ``icon_encoded``.

    ``rain`` is 1 where ``icon_encoded`` equals 1 and 0 everywhere else.
    NOTE(review): presumably icon code 1 denotes rain — confirm against the
    encoding used when the weather data was prepared.

    The input frame is copied first, so the caller's DataFrame is never
    modified (consistent with ``_encode_dates``) and re-running the
    transformer is side-effect free.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing an ``icon_encoded`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the extra integer ``rain`` column.
    """
    X = X.copy()
    # Vectorised comparison instead of a per-row Python lambda.
    X["rain"] = X["icon_encoded"].eq(1).astype("int64")
    return X

In [24]:
# CatBoost regressor used as the final step of the modelling pipeline.
# See the CatBoost parameter reference for the meaning of each setting.
cat = CatBoostRegressor(
    depth=10,
    iterations=250,
    rsm=0.25,
    sampling_frequency="PerTree",
    subsample=0.7,
    verbose=0,
)

# Wrap the feature-engineering helpers from `utils` so that they can be used
# as scikit-learn pipeline steps.
date_cols = FunctionTransformer(utils._encode_dates)
add_weather = FunctionTransformer(utils.create_x_weather)
keep_specific_columns = FunctionTransformer(utils.filter_columns)
add_vacances_et_jours_feries = FunctionTransformer(utils.is_jour_ferie)
check_lockdown = FunctionTransformer(utils.isLockdown)
cycle_encoding = FunctionTransformer(utils.encode_cyclical_features)


# Columns that get standardised; the other weather columns are currently
# excluded from scaling.
num_features = ['temp']#, 'precip', 'windspeed', 'visibility']

# Standardise `num_features` and pass every other column through untouched.
# The columns are selected by name, so the preceding step must output a
# DataFrame that contains them.
col_transformer = ColumnTransformer(
    transformers=[
        
        ('num', StandardScaler(), num_features),
    ],
    remainder='passthrough'  # This keeps all other columns unchanged
)



# Full pipeline: feature engineering -> scaling -> CatBoost.
# NOTE(review): the holiday step runs *before* 'filter_columns'. If
# utils.filter_columns keeps only a fixed set of columns (as the
# notebook-local filter_columns does), any columns added by the holiday step
# would be dropped again — confirm the intended order.
pipe = Pipeline([
        ('add_vacances_et_jours_feries', add_vacances_et_jours_feries),

    ('filter_columns',keep_specific_columns),
    ('add_weather', add_weather),
    ('check_lockdown',check_lockdown),
    ('date_encode', date_cols),
    ('rain', FunctionTransformer(create_rain)),
    ('cycle_encoding',cycle_encoding),
    ('scaling_num_features', col_transformer),
    
    ('model', cat)
])


# Fit on the training split and evaluate on the held-out split.
pipe.fit(X_train, y_train)

predictions = pipe.predict(X_test)

# Root mean squared error on the hold-out set.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error: {rmse}')

Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Root Mean Squared Error: 0.5800550267212538


In [None]:
# NOTE(review): bare literal in an unexecuted cell — presumably an RMSE noted
# from another run; confirm what it refers to or remove the cell.
0.5610473766023862

# TEST PIPELINE OUTPUT

In [14]:
# Debug-only pipeline: runs a subset of the feature-engineering steps, with no
# scaling and no model, so the intermediate feature table can be inspected.
#
# It is bound to its own name on purpose. Rebinding `pipe` here would replace
# the model pipeline defined earlier, and a later `pipe.predict(...)` would
# then fail on a top-to-bottom run because this pipeline has no estimator as
# its final step.
preview_pipe = Pipeline([
    ('filter_columns', keep_specific_columns),
    ('add_weather', add_weather),
    ('check_lockdown', check_lockdown),
    ('date_encode', FunctionTransformer(_encode_dates)),
    # Deliberately left out of the preview: holiday flags, cyclical encoding,
    # numeric scaling and the rain flag.
])

preview_pipe.fit_transform(X)

Unnamed: 0,latitude,longitude,temp,precip,windspeed,visibility,icon_encoded,index,Lockdown,year,month,day,weekday,hour
0,48.840801,2.333233,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
1,48.840801,2.333233,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
2,48.853720,2.357020,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
3,48.853720,2.357020,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
4,48.885290,2.326660,16.1,0.00,13.6,25.1,3,0,0,2020,9,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496822,48.842010,2.367290,22.1,2.54,16.6,13.7,1,373,0,2021,9,9,3,23
496823,48.864620,2.314440,22.1,2.54,16.6,13.7,1,373,0,2021,9,9,3,23
496824,48.864620,2.314440,22.1,2.54,16.6,13.7,1,373,0,2021,9,9,3,23
496825,48.848400,2.275860,22.1,2.54,16.6,13.7,1,373,0,2021,9,9,3,23


# Format Output

In [12]:
# Refit on the complete training data, predict the final test set and write
# the predictions as a CSV with an `Id` index and a `log_bike_count` column.
pipe.fit(X, y)
submission = pipe.predict(X_final_test)
(
    pd.Series(submission, name='log_bike_count')
    .rename_axis('Id')
    .to_frame()
    .to_csv('submission16_81223.csv')
)

Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
