# Imports 

In [1]:
 import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils
from pathlib import Path
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from lockdowndates.core import LockdownDates
from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates

from catboost import CatBoostRegressor
import datetime
from datetime import timedelta, date

In [2]:
# Training data as returned by the project helper `utils.get_train_data()`.
X, y = utils.get_train_data()

# Final test features are read from data/final_test.parquet.
final_test_path = Path("data", "final_test.parquet")
X_final_test = pd.read_parquet(final_test_path)

# CatBoost

In [16]:
# Column groups routed to the different transformers below.
# NOTE(review): these columns are presumably created by `utils.prepro`, which runs
# before the ColumnTransformer in the pipeline — confirm against utils.
num_features = ['temp', 'precip', 'windspeed', 'visibility']
cat_features = ['counter_id']
time_features = ['hour', 'month', 'weekday', 'day']

col_transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4;
        # the old spelling only kept working because warnings are silenced globally.
        ('cat', OneHotEncoder(sparse_output=False), cat_features),
        ('time', FunctionTransformer(utils.encode_cyclical_features), time_features)
    ],
    # Columns not listed above are forwarded to the model unchanged.
    remainder='passthrough'
)

# Data is (re)loaded here so that this cell starts from freshly loaded frames.
X, y = utils.get_train_data()
X_final_test = pd.read_parquet(Path("data") / "final_test.parquet")

# Temporal hold-out via the project helper, used instead of a random
# `train_test_split(X, y, test_size=0.2, random_state=42)`.
# Note the return order (X_train, y_train, X_test, y_test) differs from sklearn's
# train_test_split, which returns (X_train, X_test, y_train, y_test).
X_train, y_train, X_test, y_test = utils.train_test_split_temporal(X, y, delta_threshold="30 days")

In [17]:
# Hyperparameters for the CatBoost regressor used in this run.
cat_params = dict(depth=12, iterations=1500, rsm=0.35, subsample=0.7)
cat = CatBoostRegressor(**cat_params)

# End-to-end pipeline: project preprocessing -> column-wise transforms -> regressor.
pipe = Pipeline(
    steps=[
        ('prepro', FunctionTransformer(utils.prepro)),
        ('col', col_transformer),
        ('model', cat),
    ]
)

# Fit on the training split, then score on the held-out split.
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))
print(f'Root Mean Squared Error: {rmse}')

Learning rate set to 0.077484
0:	learn: 1.5839698	total: 43.4ms	remaining: 1m 5s
1:	learn: 1.5017353	total: 73ms	remaining: 54.7s
2:	learn: 1.4263158	total: 102ms	remaining: 51s
3:	learn: 1.3582209	total: 130ms	remaining: 48.5s
4:	learn: 1.2961657	total: 159ms	remaining: 47.6s
5:	learn: 1.2412031	total: 189ms	remaining: 47s
6:	learn: 1.1899232	total: 217ms	remaining: 46.2s
7:	learn: 1.1440986	total: 244ms	remaining: 45.6s
8:	learn: 1.1026956	total: 273ms	remaining: 45.3s
9:	learn: 1.0644140	total: 302ms	remaining: 45s
10:	learn: 1.0247639	total: 329ms	remaining: 44.5s
11:	learn: 0.9936916	total: 360ms	remaining: 44.7s
12:	learn: 0.9655013	total: 393ms	remaining: 45s
13:	learn: 0.9355865	total: 423ms	remaining: 44.9s
14:	learn: 0.9108782	total: 451ms	remaining: 44.7s
15:	learn: 0.8884971	total: 477ms	remaining: 44.2s
16:	learn: 0.8698259	total: 504ms	remaining: 43.9s
17:	learn: 0.8520303	total: 536ms	remaining: 44.1s
18:	learn: 0.8323591	total: 564ms	remaining: 44s
19:	learn: 0.8171307	

In [19]:
# Refit the same pipeline on the full training data, then predict the final test set.
pipe.fit(X, y)
submission = pipe.predict(X_final_test)

# Write the predictions as a single 'log_bike_count' column with an index named 'Id'.
submission_frame = (
    pd.Series(submission, name='log_bike_count')
    .rename_axis('Id')
    .to_frame()
)
submission_frame.to_csv('submission42_101223.csv')

Learning rate set to 0.078527
0:	learn: 1.5690285	total: 37.1ms	remaining: 55.6s
1:	learn: 1.4862577	total: 65.8ms	remaining: 49.3s
2:	learn: 1.4105374	total: 95.4ms	remaining: 47.6s
3:	learn: 1.3422411	total: 123ms	remaining: 46s
4:	learn: 1.2811243	total: 153ms	remaining: 45.7s
5:	learn: 1.2264306	total: 185ms	remaining: 46s
6:	learn: 1.1757082	total: 216ms	remaining: 46s
7:	learn: 1.1292680	total: 243ms	remaining: 45.4s
8:	learn: 1.0885762	total: 273ms	remaining: 45.3s
9:	learn: 1.0512433	total: 303ms	remaining: 45.1s
10:	learn: 1.0178135	total: 332ms	remaining: 44.9s
11:	learn: 0.9876714	total: 363ms	remaining: 45s
12:	learn: 0.9614420	total: 395ms	remaining: 45.2s
13:	learn: 0.9378702	total: 428ms	remaining: 45.5s
14:	learn: 0.9149391	total: 458ms	remaining: 45.4s
15:	learn: 0.8933647	total: 486ms	remaining: 45.1s
16:	learn: 0.8712213	total: 515ms	remaining: 44.9s
17:	learn: 0.8484619	total: 544ms	remaining: 44.8s
18:	learn: 0.8284711	total: 574ms	remaining: 44.8s
19:	learn: 0.813

RMSE with cyclical time features: 0.4325580227568971; with iterations=1500 => 0.4403775311568222
RMSE without cyclical time features: 0.45848241663826306; with iterations=1500 => 0.43950108361677287


CatBoostRegressor(
    depth=12,
    iterations=1000,
    rsm=0.3,
    subsample=0.7,   
) ==> 0.4266953375361533

BEST MODEL ON KAGGLE
0.42

cat = CatBoostRegressor(
    depth=12,
    iterations=1500,
    rsm=0.35,
    subsample=0.7,   
) ==> 0.42067563445685924