# Imports 

In [1]:
 import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils
from pathlib import Path
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from lockdowndates.core import LockdownDates
from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates

from catboost import CatBoostRegressor
import datetime
from datetime import timedelta, date

In [2]:
X, y = utils.get_train_data()
X_final_test = pd.read_parquet(Path("data") / "final_test.parquet")

# Cat Boost

In [3]:
num_features = ['temp', 'precip', 'windspeed', 'visibility']
cat_features = ['counter_id']
time_features = ['hour','month','weekday','day']

col_transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(sparse=False), cat_features),
        ('time', FunctionTransformer(utils.encode_cyclical_features), time_features)
    ],
    remainder='passthrough'
)

X, y = utils.get_train_data()

X_final_test = pd.read_parquet(Path("data") / "final_test.parquet")
X_train, y_train, X_test, y_test = utils.train_test_split_temporal(X, y, delta_threshold="30 days")
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
cat = CatBoostRegressor(
    depth=12,
    iterations=1000,
    rsm=0.3,
    subsample=0.7,   
)

pipe = Pipeline([
    ('prepro',FunctionTransformer(utils.prepro)),
    ('col', col_transformer),
    ('model', cat)
])


pipe.fit(X_train, y_train)

predictions = pipe.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error: {rmse}')

Learning rate set to 0.107739
0:	learn: 1.5477099	total: 93.2ms	remaining: 1m 33s
1:	learn: 1.4371181	total: 122ms	remaining: 1m 1s
2:	learn: 1.3420752	total: 151ms	remaining: 50s
3:	learn: 1.2572751	total: 175ms	remaining: 43.6s
4:	learn: 1.1841485	total: 204ms	remaining: 40.5s
5:	learn: 1.1276760	total: 230ms	remaining: 38.1s
6:	learn: 1.0736074	total: 257ms	remaining: 36.5s
7:	learn: 1.0250647	total: 286ms	remaining: 35.5s
8:	learn: 0.9853124	total: 315ms	remaining: 34.7s
9:	learn: 0.9513224	total: 341ms	remaining: 33.8s
10:	learn: 0.9147655	total: 369ms	remaining: 33.2s
11:	learn: 0.8857464	total: 397ms	remaining: 32.7s
12:	learn: 0.8567974	total: 424ms	remaining: 32.2s
13:	learn: 0.8315996	total: 447ms	remaining: 31.4s
14:	learn: 0.8117770	total: 474ms	remaining: 31.1s
15:	learn: 0.7936416	total: 500ms	remaining: 30.7s
16:	learn: 0.7754401	total: 525ms	remaining: 30.3s
17:	learn: 0.7601766	total: 549ms	remaining: 30s
18:	learn: 0.7462145	total: 576ms	remaining: 29.7s
19:	learn: 0.

In [15]:
pipe.fit(X, y)
submission = pipe.predict(X_final_test)
pd.Series(submission).to_frame().rename_axis('Id').rename(columns={0:'log_bike_count'}).to_csv('submission41_101223.csv')

Learning rate set to 0.109189
0:	learn: 1.5339167	total: 34ms	remaining: 34s
1:	learn: 1.4220860	total: 65.2ms	remaining: 32.5s
2:	learn: 1.3292506	total: 94.2ms	remaining: 31.3s
3:	learn: 1.2490248	total: 121ms	remaining: 30.2s
4:	learn: 1.1764514	total: 151ms	remaining: 30s
5:	learn: 1.1195229	total: 180ms	remaining: 29.8s
6:	learn: 1.0639016	total: 208ms	remaining: 29.5s
7:	learn: 1.0157484	total: 238ms	remaining: 29.5s
8:	learn: 0.9752350	total: 267ms	remaining: 29.4s
9:	learn: 0.9399690	total: 296ms	remaining: 29.3s
10:	learn: 0.9076916	total: 324ms	remaining: 29.2s
11:	learn: 0.8752294	total: 350ms	remaining: 28.9s
12:	learn: 0.8472621	total: 379ms	remaining: 28.7s
13:	learn: 0.8263347	total: 407ms	remaining: 28.7s
14:	learn: 0.8107139	total: 436ms	remaining: 28.7s
15:	learn: 0.7940180	total: 474ms	remaining: 29.2s
16:	learn: 0.7754858	total: 500ms	remaining: 28.9s
17:	learn: 0.7611837	total: 525ms	remaining: 28.6s
18:	learn: 0.7471999	total: 552ms	remaining: 28.5s
19:	learn: 0.7

With cycle: 0.4325580227568971    1500 => 0.4403775311568222
Without cycle: 0.45848241663826306    1500 => 0.43950108361677287


CatBoostRegressor(
    depth=12,
    iterations=1000,
    rsm=0.3,
    subsample=0.7,   
) ==> 0.4266953375361533


0.42