In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split
from sklearn.model_selection import cross_val_score

from xgboost import XGBRegressor


from bikesharing.ml_logic.model import get_folds, train_test_indices
from bikesharing.interface.main import *
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error

In [2]:
preprocess()

[34m
Load preprocessed data from local CSV...[0m


(       Unnamed: 0  is_holiday  is_weekend  temperature_2m  \
 0               0           1           0        0.355408   
 1               1           1           0        0.357616   
 2               2           1           0        0.359823   
 3               3           1           0        0.359823   
 4               4           1           0        0.359823   
 ...           ...         ...         ...             ...   
 35035       35035           0           1        0.426049   
 35036       35036           0           1        0.412804   
 35037       35037           0           1        0.410596   
 35038       35038           0           1        0.417219   
 35039       35039           0           1        0.415011   
 
        apparent_temperature  windspeed_10m  precipitation  hour_sin  hour_cos  \
 0                  0.342007       0.227848       0.017391  0.629410  0.982963   
 1                  0.340149       0.245570       0.008696  0.750000  0.933013   
 2      

In [3]:
df_X = pd.read_csv('~/.lewagon/bikesharing/data/processed/X_processed_from_2019_to_2022.csv')
df_y = pd.read_csv('~/.lewagon/bikesharing/data/processed/y_processed_from_2019_to_2022.csv')

In [4]:
columns = list(df_y.columns)

In [5]:
df_X_y = pd.concat((df_X, df_y) , axis=1)

In [6]:
fold_dfs = get_folds(df_X_y, fold_length=FOLD_LENGTH, fold_stride=FOLD_STRIDE)

In [7]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV


models = []
district_score_means = []


folds = fold_dfs[:]

test_df = folds[3]
del folds[3]
train_dfs = folds
train_df = pd.concat(train_dfs, axis=0)

In [8]:
X_test = test_df.drop(columns= columns)
y_test = test_df['Laim']
X_train = train_df.drop(columns= columns)
y_train = train_df['Laim']


# Instantiate model
model = XGBRegressor(objective ='reg:squarederror')

params = {
    # Parameters that we are going to tune.
    'max_depth':range(3,10,2),
    'min_child_weight': range(1, 6, 1),
    'eta':[0.1, 0.01, 0.05],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0.5, 1, 1.5, 2, 5]
}

# Instantiate Grid Search
search = GridSearchCV(
    model,
    param_grid=params, 
    scoring = 'neg_mean_squared_error',
    cv = 5,
    n_jobs=-1 # parallelize computation
) 

# Fit data to Grid Search
search.fit(X_train, y_train);

In [9]:
# Best score
print("Best score: ", search.best_score_)

# Best Params
print("Best params: ", search.best_params_)

# Best estimator
search.best_estimator_

Best score:  -2.469793002257732
Best params:  {'colsample_bytree': 0.6, 'eta': 0.05, 'gamma': 0.5, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 1.0}


## Train Model

In [10]:
districts = ['Laim']

In [11]:
fold_dfs = get_folds(df_X_y, fold_length=FOLD_LENGTH, fold_stride=FOLD_STRIDE)

In [12]:
models = []
district_score_means_r2 = []
district_score_means_rmse = []

for dist in districts:
    scores_list_r2 = []
    scores_list_rmse = []
        
    for i, fold in enumerate(fold_dfs):
        xgb_r = XGBRegressor(objective ='reg:squarederror',
                  n_estimators = 100, seed = 123,
                  colsample_bytree=0.8, eta=0.1, gamma=5, max_depth=5, 
                  min_child_weight=3, subsample=0.6)        

        folds = fold_dfs[:]
        test_df = folds[i]
        del folds[i]
        train_dfs = folds  
        train_df = pd.concat(train_dfs, axis=0)  
        
        X_test = test_df.drop(columns= districts)
        y_test = test_df[dist]
        X_train = train_df.drop(columns= districts)
        y_train = train_df[dist]
        
        xgb_r.fit(X_train, y_train)
        y_pred = [x if x >= 0 else 0 for x in xgb_r.predict(X_test)]
        scores_r2 = r2_score(y_test , y_pred)
        scores_rmse = np.sqrt(mean_squared_error(y_test, xgb_r.predict(X_test)))
        scores_list_r2.append(scores_r2)
        scores_list_rmse.append(scores_rmse)
        print(f"{i} >>> scores R2={scores_list_r2}")
        print(f"{i} >>> scores RMSE={scores_list_rmse}")
        
        
    scores_array_r2 = np.array(scores_r2)
    scores_array_rmse = np.array(scores_rmse)
    district_score_means_r2.append(np.mean(scores_array_r2.flatten()))
    district_score_means_rmse.append(np.mean(scores_array_rmse.flatten()))

0 >>> scores R2=[0.4507314323467174]
0 >>> scores RMSE=[1.6038930892965435]
1 >>> scores R2=[0.4507314323467174, 0.507403321828391]
1 >>> scores RMSE=[1.6038930892965435, 1.5230810918736923]
2 >>> scores R2=[0.4507314323467174, 0.507403321828391, 0.48065654219718057]
2 >>> scores RMSE=[1.6038930892965435, 1.5230810918736923, 1.2813741644731926]
3 >>> scores R2=[0.4507314323467174, 0.507403321828391, 0.48065654219718057, 0.4213929518476013]
3 >>> scores RMSE=[1.6038930892965435, 1.5230810918736923, 1.2813741644731926, 1.3171290717277828]


In [13]:
print("LAIM:\n")
print(">>> R2 Scores", np.mean(district_score_means_r2))
print(">>> RMSE Scores", np.mean(district_score_means_rmse))

# scores_df_r2 = pd.DataFrame(district_score_means_r2, columns=['score'], index=districts)
# scores_df_r2.sort_values(by=['score'], inplace=True) 

# scores_df_rmse = pd.DataFrame(district_score_means_rmse, columns=['score'], index=districts)
# scores_df_rmse.sort_values(by=['score'], inplace=True) 



# fig, ax = plt.subplots(1,2, figsize=(14,5))
# ax[0].plot(scores_df_r2.index, scores_df_r2['score'])
# ax[0].set_title('r2 scores')
# ax[0].axhline(y=0, color='r', linestyle='-')
# ax[0].grid(c='#ededed')
# ax[0].tick_params(axis='x', rotation=90)

# ax[1].plot(scores_df_rmse.index, scores_df_rmse['score'])
# ax[1].set_title('rmse scores')
# ax[1].axhline(y=0, color='r', linestyle='-')
# ax[1].grid(c='#ededed')
# ax[1].tick_params(axis='x', rotation=90)



# plt.show()

LAIM:

>>> R2 Scores 0.4213929518476013
>>> RMSE Scores 1.3171290717277828
