In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from time import time

# XGBoost - Remplacement des Nans


On choisit XGBoost car on a déja obtenu de très bons résultats avec.


## Préparation des données V1 - 2023/11/19 - NaN en zéros


Stratégie :  
On remplace les heures par des floats, les identifiants de station par des entiers et on supprime la colonne 'date'.  
(La suppression de la colonne 'way' a été oubliée)
On a fait une division : 90% du dataset pour le train, 10% pour le test


- Import des données


In [67]:
path = "../../../data"
x_data = pd.read_csv(path + "/Xtrain_hgcGIrA.csv", sep=",")  # features
y_data = pd.read_csv(
    path + "/Ytrain_yL5OjS4.csv", sep=",", usecols=[1]
)  # occupancy rate

- Mise en forme des données


In [68]:
x_data = x_data.drop("date", axis=1)  # on supprime la colonne date
x_data = x_data.drop("way", axis=1) #  on supprime la colonne way
x_data["hour"] = x_data["hour"].apply(
    lambda x: 0 if type(x) == float else int(x[:2])
) # on remplace les Nan de hour par des 0 et les heures par des entiers

#on remplace les Nan des pxqx par 0
x_data[["p1q0", "p2q0", "p3q0", "p0q1", "p0q2", "p0q3"]] = x_data[["p1q0", "p2q0", "p3q0", "p0q1", "p0q2", "p0q3"]].fillna(0)

# on tranforme les identifiants de gare en entiers
x_data["station"] = x_data["station"].astype("category")
cat_columns = x_data.select_dtypes(["category"]).columns
x_data[cat_columns] = x_data[cat_columns].apply(lambda x: x.cat.codes)

- Création d'un dataset de test et de validation


In [69]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x_data.values, y_data.values, train_size=0.8
)

## XGBoostREGRESSOR - 2023/11/20 - Préparation des données V1


On utilise le regresseur et pas le classifieur car on ne cherche pas la classe de p0q0 mais sa valeur précise.


- Import de XGBRegresseur


In [17]:
from xgboost import XGBRegressor

- Test sans jouer sur les paramètres


In [71]:
XGBR = XGBRegressor()
XGBR.fit(X_train, y_train)
XGBR.score(X_test, y_test)

0.9849680434807259

Le score est légèrement supérieur à celui qu'on avait quand on conservait les NaN. En jouant sur les paramètres comme expliqué ci-dessous, on va essayer de l'augmenter.


- Choix de n_estimators


In [72]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from time import time

start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

n_estimators = np.arange(800, 1500, 20)

tuned_parameters = {"n_estimators": n_estimators}

XGBR_params = GridSearchCV(
    XGBRegressor(learning_rate=0.1), tuned_parameters, cv=my_kfold, n_jobs=-1
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

KeyboardInterrupt: 

Premier test avec

- n_estimators = [100, 300, 500, 1000, 2000]

On obtient :

0.9876833947427778 {'n_estimators': 1000}    
37.39019751548767 sec


Deuxième test avec :

- n_estimators = np.arange(800, 2000, 60)

On obtient :

0.9876712793712039 {'n_estimators': 860}   
158.15269088745117 sec


Troisième test avec :

- n_estimators = np.arange(800, 1500, 20)

On obtient :

0.9876745050395136 {'n_estimators': 840}    
236.91013956069946 sec

On choisit n_estimators = 840 pour un learning rate de 0.1. On reverra cet équilibre à la fin.

- choix de max_depth et min_child_weight


In [None]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

max_depth = np.arange(1,15,1)
min_child_weight = np.arange(1,15,1)

tuned_parameters = {
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
}

XGBR_params = GridSearchCV(
    XGBRegressor(n_estimators=840, learning_rate=0.1),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9876745050395136 {'max_depth': 6, 'min_child_weight': 1}
1005.280415058136 sec


Premier test avec

- max_depth = np.arange(1,15,1)
- min_child_weight = np.arange(1,15,1)

On obtient :

0.9876745050395136 {'max_depth': 6, 'min_child_weight': 1}    
1005.280415058136 sec

Le score n'a pas changé.


- choix de gamma

In [None]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

gamma = np.linspace(0.00001,0.008,15)

tuned_parameters = {"gamma": gamma}

XGBR_params = GridSearchCV(
    XGBRegressor(
        n_estimators=840, learning_rate=0.1, max_depth=6, min_child_weight=1
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9876429928422733 {'gamma': 1e-05}
65.31180882453918 sec


Premier test avec

- gamma = [0.1,0.5,1,5,10,30,50,100]

On obtient :

0.9751867200255847 {'gamma': 0.1}    
45.6101713180542 sec


Deuxième test avec

- gamma = np.linspace(0.01,1.1,15)

On obtient :

0.9830294800009547 {'gamma': 0.01}    
165.9118287563324 sec


Troisième test avec

- gamma = np.linspace(0.001,0.8,15)

On obtient :

0.9872501162550665 {'gamma': 0.001}    
76.28652811050415 sec


Quatrième test avec

- gamma = np.linspace(0.0001,0.08,15)

On obtient :

0.9878007798386781 {'gamma': 0.0001}
71.47756004333496 sec


Cinquième test avec

- gamma = np.linspace(0.00001,0.008,15)

On obtient :

0.9876429928422733 {'gamma': 1e-05}
65.31180882453918 sec

Le score a baissé, on choisira donc plutôt 0.0001

- choix de subsample et colsample_bytree

In [None]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

subsample = np.linspace(0.8,0.9,20)

tuned_parameters = {"subsample": subsample}

XGBR_params = GridSearchCV(
    XGBRegressor(
        n_estimators=840, learning_rate=0.1, max_depth=6, min_child_weight=1, gamma = 0.0001, colsample_bytree=1
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9878386938908292 {'subsample': 0.8210526315789474}
98.14557838439941 sec


Premier test avec

- subsample = np.linspace(0.01,1,10)
- colsample_bytree = np.linspace(0.01,1,10)

On obtient :

0.9877374857092629 {'colsample_bytree': 1.0, 'subsample': 0.78}    
261.4477083683014 sec


Deuxième test avec

- subsample = np.linspace(0.2,1,10)
- colsample_bytree = np.linspace(0.1,1,10)

On obtient :

0.9878834537187015 {'colsample_bytree': 1.0, 'subsample': 0.8222222222222222}    
301.5654721260071 sec


Troisième test avec

- subsample = np.linspace(0.5,1,10)
- colsample_bytree = np.linspace(0.5,1,10)

On obtient :

0.9879713624111819 {'colsample_bytree': 1.0, 'subsample': 0.8888888888888888}    
354.5157027244568 sec


Quatrième test avec

- subsample = np.linspace(0.7,0.9,15)
- colsample_bytree = np.linspace(0.8,1,15)

On obtient :

0.9880263402463824 {'colsample_bytree': 1.0, 'subsample': 0.8714285714285714}    
936.7044553756714 sec

On garde colsample_bytree = 1 et on essaye encore d'affiner subsample.

Cinquième test avec

- subsample = np.linspace(0.8,0.9,20)

On obtient :

0.9878386938908292 {'subsample': 0.8210526315789474}    
98.14557838439941 sec

Le score a diminué, on garde donc subsample = 0.8714285714285714

- choix de reg_alpha et reg_lambda


In [None]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

reg_alpha = np.linspace(0.03,0.1,15)
reg_lambda = np.linspace(0.01,0.5,15)

tuned_parameters = {"reg_alpha": reg_alpha, "reg_lambda": reg_lambda}

XGBR_params = GridSearchCV(
    XGBRegressor(
        n_estimators=840, learning_rate=0.1, max_depth=6, min_child_weight=1, gamma = 0.0001, colsample_bytree=1, subsample=0.8714285714285714 
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9880085018467166 {'reg_alpha': 0.034999999999999996, 'reg_lambda': 0.185}
1070.048173904419 sec


Premier test avec

- reg_alpha = [0.1,1,5,10,30,50,100]
- reg_lambda = [0.1,1,5,10,30,50,100]

On obtient :

0.9878612773031041 {'reg_alpha': 0.1, 'reg_lambda': 0.1}
119.39949417114258 sec

Deuxième test avec

- reg_alpha = np.linspace(0.01,1,15)
- reg_lambda = np.linspace(0.01,1,15)

On obtient :

0.9881048969848781 {'reg_alpha': 0.08071428571428571, 'reg_lambda': 0.15142857142857144}    
1078.1922311782837 sec

Troisième test avec

- reg_alpha = np.linspace(0.03,0.1,15)
- reg_lambda = np.linspace(0.01,0.5,15)

On obtient :

0.9880085018467166 {'reg_alpha': 0.034999999999999996, 'reg_lambda': 0.185}
1070.048173904419 sec


On conserve le deuxième test.

- Diminution du learning_rate et adaptation de n_estimators

Ayant réalisé cette recherche de meilleur paramètres sur deux jours, le split du dataset en dataset d'apprentissage et dataset de test a changé et le score en a été modifié. Il est ici plus bas que précédemment.

In [None]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

n_estimators = range(3500,6000,250)
tuned_parameters = {"n_estimators": n_estimators}

XGBR_params = GridSearchCV(
    XGBRegressor( 
        learning_rate = 0.015,
        max_depth=6, min_child_weight=1, 
        gamma = 0.0001, colsample_bytree=1, subsample=0.8714285714285714,
        reg_alpha = 0.08071428571428571, reg_lambda= 0.15142857142857144
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9868053089033629 {'n_estimators': 5750}
323.44272923469543 sec


Premier test avec

- n_estimators = [840,1000, 2000,3000]
- learning_rate = [0.02,0.05,0.08,0.1]

On obtient :

0.9867792770846329 {'learning_rate': 0.02, 'n_estimators': 3000}
194.5658085346222 sec

Deuxième test avec

- n_estimators = [2000,3000, 4000, 5000]
- learning_rate = [0.01,0.02,0.05,0.08]

On obtient :

0.9868273520059674 {'learning_rate': 0.02, 'n_estimators': 4000}
360.37468552589417 sec

Troisième test avec

- n_estimators = [3000,3500, 4000, 4500, 5000]
- learning_rate = [0.01,0.015,0.02,0.025, 0.03]

On obtient :

0.986800966197621 {'learning_rate': 0.015, 'n_estimators': 5000}
595.9117505550385 sec

Quatrième test avec

- n_estimators = range(3500,6000,250)

On a fixé learning_rate à 0.015    
On obtient :

0.9868053089033629 {'n_estimators': 5750}    
323.44272923469543 sec

##### Test finaux


Par curiosité, on compare avec les learning rate et n_estimators choisis au premier test de XGBRegressor.

In [None]:
XGBR1 = XGBRegressor( 
        n_estimators=4000,learning_rate=0.025,
        max_depth=6, min_child_weight=1, 
        gamma = 0.0001, colsample_bytree=1, subsample=0.8714285714285714,
        reg_alpha = 0.08071428571428571, reg_lambda= 0.15142857142857144
    )
XGBR1.fit(X_train, y_train)
XGBR1.score(X_test, y_test)

0.9865361034411432

In [None]:
XGBR2 = XGBRegressor( 
        n_estimators = 5750,learning_rate = 0.015,
        max_depth=6, min_child_weight=1, 
        gamma = 0.0001, colsample_bytree=1, subsample=0.8714285714285714,
        reg_alpha = 0.08071428571428571, reg_lambda= 0.15142857142857144
    )
XGBR2.fit(X_train, y_train)
XGBR2.score(X_test, y_test)

0.9868053089033629

In [None]:
XGBR3 = XGBRegressor( 
        n_estimators = 4000,learning_rate = 0.02,
        max_depth=6, min_child_weight=1, 
        gamma = 0.0001, colsample_bytree=1, subsample=0.8714285714285714,
        reg_alpha = 0.08071428571428571, reg_lambda= 0.15142857142857144
    )
XGBR3.fit(X_train, y_train)
XGBR3.score(X_test, y_test)

0.9868273520059674

On obtient un score meilleur pour XGBR3 donc on concervera ce modèle.


## Préparation des données V2 - 2023/11/20 - SimpleImputer - Mean

- Import des données

In [3]:
path = "../../../data"
x_data = pd.read_csv(path + "/Xtrain_hgcGIrA.csv", sep=",")  # features
y_data = pd.read_csv(
    path + "/Ytrain_yL5OjS4.csv", sep=",", usecols=[1]
)  # occupancy rate

- Préparation des données

In [4]:
x_data = x_data.drop("date", axis=1)  # on supprime la colonne date
x_data = x_data.drop("way", axis=1) #  on supprime la colonne way
x_data["hour"] = x_data["hour"].apply(
    lambda x: np.nan if type(x) == float else int(x[:2])
) # on remplace les heures par des entiers en conservant les NaNs

# on tranforme les identifiants de gare en entiers
x_data["station"] = x_data["station"].astype("category")
cat_columns = x_data.select_dtypes(["category"]).columns
x_data[cat_columns] = x_data[cat_columns].apply(lambda x: x.cat.codes)

- Remplacement des NaN par la moyenne de la colonne

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
# on cherche à remplacer les valeurs NaN par une moyenne, 
# et on veut ajouter une colonne de features qui indique si a valeur était manquante en premier lieu
SIM = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)
new_x_data = SIM.fit_transform(x_data)


In [7]:
x_data.head(1)

Unnamed: 0,train,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3
0,1,3,6.0,2,,,,0.201,0.138,0.091


In [8]:
new_x_data[406] # les valeurs NaNs ont bien été remplies. 7 colonnes ont été ajoutées

array([ 1.        , 23.        ,  7.3830557 ,  2.        ,  0.23476916,
        0.25139248,  0.31686684,  0.20720034,  0.22216716,  0.20984504,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ])

In [9]:
SIM.get_feature_names_out(SIM.feature_names_in_) 
# les 7 colonnes correspondent bien au missing_indicator pour les colonnes où nous avions des NaNs

array(['train', 'station', 'hour', 'composition', 'p1q0', 'p2q0', 'p3q0',
       'p0q1', 'p0q2', 'p0q3', 'missingindicator_hour',
       'missingindicator_p1q0', 'missingindicator_p2q0',
       'missingindicator_p3q0', 'missingindicator_p0q1',
       'missingindicator_p0q2', 'missingindicator_p0q3'], dtype=object)

In [10]:
new_x_data[406]

array([ 1.        , 23.        ,  7.3830557 ,  2.        ,  0.23476916,
        0.25139248,  0.31686684,  0.20720034,  0.22216716,  0.20984504,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ])

On rencontre cependant un problème pour hour, qui est une variable qu'on peut considérer catégorielle, mais qui à cause des NaNs, est considérée comme un float. La moyenne crée donc des valeurs probématiques. On va donc utiliser un autre Simple Imputer exclusivement pour hour.

In [11]:
x_data_full = SIM.fit_transform(x_data[['train', 'station', 'composition', 'p1q0', 'p2q0', 'p3q0', 'p0q1', 'p0q2', 'p0q3']])
# on utilise bien SIM pour les autres variables

In [12]:
x_data_full[0]

array([1.        , 3.        , 2.        , 0.23476916, 0.25139248,
       0.31686684, 0.201     , 0.138     , 0.091     , 1.        ,
       1.        , 1.        , 0.        , 0.        , 0.        ])

In [13]:
SIM.get_feature_names_out(SIM.feature_names_in_) 

array(['train', 'station', 'composition', 'p1q0', 'p2q0', 'p3q0', 'p0q1',
       'p0q2', 'p0q3', 'missingindicator_p1q0', 'missingindicator_p2q0',
       'missingindicator_p3q0', 'missingindicator_p0q1',
       'missingindicator_p0q2', 'missingindicator_p0q3'], dtype=object)

In [14]:
SIMF = SimpleImputer(missing_values=np.nan, strategy='most_frequent', add_indicator=True)
x_data_full = np.append(x_data_full, SIMF.fit_transform(x_data['hour'].to_numpy().reshape(-1, 1)), axis = 1)
x_data_full[406]

array([ 1.        , 23.        ,  2.        ,  0.23476916,  0.25139248,
        0.31686684,  0.20720034,  0.22216716,  0.20984504,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        7.        ,  1.        ])

In [15]:
x_data_full[0]

array([1.        , 3.        , 2.        , 0.23476916, 0.25139248,
       0.31686684, 0.201     , 0.138     , 0.091     , 1.        ,
       1.        , 1.        , 0.        , 0.        , 0.        ,
       6.        , 0.        ])

L'heure et sa colonne missing indicator son maintenant à la fin.

- Création d'un dataset de test et de validation

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x_data_full, y_data.values, train_size=0.8
)

## XGBoostREGRESSOR - 2023/11/20 - Préparation des données V2

- Test sans jouer sur les paramètres

In [26]:
from xgboost import XGBRegressor
XGBR = XGBRegressor()
XGBR.fit(X_train, y_train)
XGBR.score(X_test, y_test)

0.9869137961334914

- Choix de n_estimators

In [135]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from time import time

start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

n_estimators = [800,825,850,875,900,925,950]
tuned_parameters = {"n_estimators": n_estimators}

XGBR_params = GridSearchCV(
    XGBRegressor(learning_rate=0.1), tuned_parameters, cv=my_kfold, n_jobs=-1
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9876457285653106 {'n_estimators': 875}
60.46472883224487 sec


Premier test avec

- n_estimators = [100, 300, 500, 1000, 2000]

On obtient :

0.987631585074734 {'n_estimators': 1000}    
49.5891170501709 sec

Deuxième test avec

- n_estimators = [800,900, 1000, 1100, 1200,1500,1700, 2000]

On obtient :

0.9876367789297232 {'n_estimators': 900}    
86.77404356002808 sec

Troisième test avec

- n_estimators = [800,850,900,950,1000,1050,1100,1150,1200]

On obtient :

0.9876386771444841 {'n_estimators': 850}    
73.95417332649231 sec

Quatrième test avec

- n_estimators = [800,825,850,875,900,925,950]

On obtient :

0.9876457285653106 {'n_estimators': 875}
60.46472883224487 sec

On s'arrête là pour l'instant.

- choix de max_depth et min_child_weight

In [143]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]
min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]

tuned_parameters = {
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
}

XGBR_params = GridSearchCV(
    XGBRegressor(n_estimators=875, learning_rate=0.1),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9875100229787886 {'max_depth': 8, 'min_child_weight': 5}
1796.5100202560425 sec


Premier test avec :

- max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
- min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

On obtient :
0.9875100229787886 {'max_depth': 8, 'min_child_weight': 5}
1796.5100202560425 sec

- Choix de gamma

In [148]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

gamma = np.linspace(0.00001,0.0011,15)

tuned_parameters = {"gamma": gamma}

XGBR_params = GridSearchCV(
    XGBRegressor(
        n_estimators=875, learning_rate=0.1, max_depth=8, min_child_weight=5
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9874671853255977 {'gamma': 1e-05}
160.78628015518188 sec


Premier test avec :

- gamma = [0.1,0.5,1,5,10,30,50,100]

On obtient :
0.9778057482492879 {'gamma': 0.1}
80.03496360778809 sec

Deuxième test avec

- gamma = np.linspace(0.01,1.1,15)

On obtient :

0.9850280320319525 {'gamma': 0.01}
140.01156759262085 sec

Troisième test avec

- gamma = np.linspace(0.001,0.11,15)

On obtient :

0.9872425524482036 {'gamma': 0.001}
136.6466999053955 sec

Quatrième test avec

- gamma = np.linspace(0.0001,0.011,15)

On obtient :

0.9874916840730203 {'gamma': 0.0001}
162.4254322052002 sec

Cinquième test avec

- gamma = np.linspace(0.00001,0.0011,15)

On obtient :

0.9874671853255977 {'gamma': 1e-05}
160.78628015518188 sec

Le score ayant légèrement baissé, on prendra gamma = 0.0001

- choix de subsample et colsample_bytree

In [153]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

subsample = np.linspace(0.8,1,10)
colsample_bytree = np.linspace(0.8,1,10)

tuned_parameters = {"subsample": subsample, "colsample_bytree": colsample_bytree}

XGBR_params = GridSearchCV(
    XGBRegressor(
        n_estimators=875, learning_rate=0.1, max_depth=8, min_child_weight=5, gamma = 0.0001
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9874694820852845 {'colsample_bytree': 1.0, 'subsample': 0.9555555555555555}
926.539966583252 sec


Premier test avec :

- subsample = np.linspace(0.1,1,10)
- colsample_bytree = np.linspace(0.1,1,10)

On obtient :
0.9874820676563881 {'colsample_bytree': 1.0, 'subsample': 0.9}
578.6458947658539 sec

Deuxième test avec :

- subsample = np.linspace(0.5,1,10)
- colsample_bytree = np.linspace(0.5,1,10)

On obtient :
0.9873701809144972 {'colsample_bytree': 0.8333333333333333, 'subsample': 0.8888888888888888}
766.0954196453094 sec

Troisième test avec :

- subsample = np.linspace(0.7,1,10)
- colsample_bytree = np.linspace(0.7,1,10)

On obtient :
0.9877211237215808 {'colsample_bytree': 1.0, 'subsample': 0.8666666666666667}
881.7824146747589 sec

Quatrième test avec :

- subsample = np.linspace(0.8,1,10)
- colsample_bytree = np.linspace(0.8,1,10)

On obtient :
0.9874694820852845 {'colsample_bytree': 1.0, 'subsample': 0.9555555555555555}
926.539966583252 sec

On décide de garder le troisième test : colsample_bytree = 1, subsample = 0.8666666666666667

- choix de reg_alpha et reg_lambda

In [37]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

reg_alpha = np.linspace(0.05,0.3,10)
reg_lambda = np.linspace(0.6,1,10)

tuned_parameters = {"reg_alpha": reg_alpha, "reg_lambda": reg_lambda}

XGBR_params = GridSearchCV(
    XGBRegressor(
        n_estimators=875, learning_rate=0.1, max_depth=8, min_child_weight=5, gamma = 0.0001, colsample_bytree=1, subsample=0.8666666666666667
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9882135749094555 {'reg_alpha': 0.13333333333333333, 'reg_lambda': 0.8222222222222222}
967.8804669380188 sec


Premier test avec
- reg_alpha = [0.1,1,5,10,30,50,100]
- reg_lambda = [0.1,1,5,10,30,50,100]

On obtient :
0.9882620721116913 {'reg_alpha': 0.1, 'reg_lambda': 1}
219.1620478630066 sec

Deuxième test avec

- reg_alpha = np.linspace(0.01,1,10)
- reg_lambda = np.linspace(0.01,1,10)

On obtient :

0.9882449355818865 {'reg_alpha': 0.12, 'reg_lambda': 1.0}
1016.8563673496246 sec

Troisième test avec

- reg_alpha = np.linspace(0.01,0.7,10)
- reg_lambda = np.linspace(0.5,1,10)

On obtient :

0.9879991356922498 {'reg_alpha': 0.16333333333333333, 'reg_lambda': 0.8888888888888888}
1026.3961491584778 sec

Quatrième test avec

- reg_alpha = np.linspace(0.05,0.3,10)
- reg_lambda = np.linspace(0.6,1,10)

On obtient :

0.9882135749094555 {'reg_alpha': 0.13333333333333333, 'reg_lambda': 0.8222222222222222}
967.8804669380188 sec

On garde le quatrième test.

- Diminution du learning rate et adaptation de n_estimators

In [31]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

n_estimators = np.arange(2000,3000,150)
learning_rate = np.linspace(0.02,0.03,10)
tuned_parameters = {"n_estimators": n_estimators, 'learning_rate': learning_rate}

XGBR_params = GridSearchCV(
    XGBRegressor( 
        max_depth=8, min_child_weight=5, gamma = 0.0001, colsample_bytree=1, subsample=0.8666666666666667,
        reg_alpha= 0.13333333333333333, reg_lambda= 0.8222222222222222
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train) 
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9890216139609457 {'learning_rate': 0.025555555555555554, 'n_estimators': 2300}
1798.641760110855 sec


Premier test avec

- n_estimators = [875,1000, 2000,3000]
- learning_rate = [0.02,0.05,0.08,0.1]

On obtient :

0.989016337558562 {'learning_rate': 0.02, 'n_estimators': 3000}
331.3111243247986 sec

Deuxième test avec

- n_estimators = np.arange(1000,5000,500)
- learning_rate = np.linspace(0.01,0.05,10)

On obtient :

0.9890176223521759 {'learning_rate': 0.02333333333333333, 'n_estimators': 2500}
2382.4112935066223 sec

Troisième test avec

- n_estimators = np.arange(1500,4000,250)
- learning_rate = np.linspace(0.01,0.035,10)

On obtient :

0.989030871254338 {'learning_rate': 0.02388888888888889, 'n_estimators': 2250}
2727.478348970413 sec

Quatrième test avec
- n_estimators = np.arange(1500,3000,150)
- learning_rate = np.linspace(0.015,0.03,10)

On obtient

0.9891347744501855 {'learning_rate': 0.024999999999999998, 'n_estimators': 2250}
2188.679739713669 sec

Quatrième test avec
- n_estimators = np.arange(2000,3000,150)
- learning_rate = np.linspace(0.02,0.03,10)

On obtient
0.9890216139609457 {'learning_rate': 0.025555555555555554, 'n_estimators': 2300}    
1798.641760110855 sec

On garde le troisième test.

## Préparation des données V3 - 2023/11/22 - KNNImputer

- Import des données

In [74]:
path = "../../../data"
x_data = pd.read_csv(path + "/Xtrain_hgcGIrA.csv", sep=",")  # features
y_data = pd.read_csv(
    path + "/Ytrain_yL5OjS4.csv", sep=",", usecols=[1]
)  # occupancy rate

- Préparation des données

On décide de garder la première stratégie : mettre des zéros à la place des heures NaNs. Pour les pxqx vides, on utilisera par contre bien les KNN

In [75]:
x_data = x_data.drop("date", axis=1)  # on supprime la colonne date
x_data = x_data.drop("way", axis=1) #  on supprime la colonne way
x_data["hour"] = x_data["hour"].apply(
 lambda x: 0 if type(x) == float else int(x[:2])
) # on remplace les heures par des entiers en conservant les NaNs

# on tranforme les identifiants de gare en entiers
x_data["station"] = x_data["station"].astype("category")
cat_columns = x_data.select_dtypes(["category"]).columns
x_data[cat_columns] = x_data[cat_columns].apply(lambda x: x.cat.codes)

- Remplacement des NaN par KNN

In [76]:
from sklearn.impute import KNNImputer
KNNI = KNNImputer(missing_values=np.nan, add_indicator=True)
x_data_full = KNNI.fit_transform(x_data)

In [77]:
x_data.head(1)

Unnamed: 0,train,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3
0,1,3,6,2,,,,0.201,0.138,0.091


In [78]:
x_data_full[0] # les valeurs NaNs ont bien été remplies. 7 colonnes ont été ajoutées

array([1.        , 3.        , 6.        , 2.        , 0.2286    ,
       0.25389724, 0.29123332, 0.201     , 0.138     , 0.091     ,
       1.        , 1.        , 1.        , 0.        , 0.        ,
       0.        ])

In [79]:
KNNI.get_feature_names_out(KNNI.feature_names_in_) 
# les 7 colonnes correspondent bien au missing_indicator pour les colonnes où nous avions des NaNs

array(['train', 'station', 'hour', 'composition', 'p1q0', 'p2q0', 'p3q0',
       'p0q1', 'p0q2', 'p0q3', 'missingindicator_p1q0',
       'missingindicator_p2q0', 'missingindicator_p3q0',
       'missingindicator_p0q1', 'missingindicator_p0q2',
       'missingindicator_p0q3'], dtype=object)

- Création d'un dataset de test et de validation

In [80]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x_data_full, y_data.values, train_size=0.8
)

## XGBoostREGRESSOR - 2023/11/22 - Préparation des données V3

- Test sans jouer sur les paramètres

In [81]:
from xgboost import XGBRegressor
XGBR = XGBRegressor()
XGBR.fit(X_train, y_train)
XGBR.score(X_test, y_test)

0.9866072699818789

- choix de n_estimators

In [84]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from time import time

start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

n_estimators = [850,860,870,880,890,900,910,920,930,940,950,960,970,980,990,1000]
tuned_parameters = {"n_estimators": n_estimators}

XGBR_params = GridSearchCV(
    XGBRegressor(learning_rate=0.1), tuned_parameters, cv=my_kfold, n_jobs=-1
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9886325473238434 {'n_estimators': 930}
114.22175073623657 sec


Premier test avec

- n_estimators = [100, 300, 500, 1000, 2000]

On obtient :

0.9886483994296702 {'n_estimators': 1000}
42.81796193122864 sec

Deuxième test avec

- n_estimators = [800,850,900,950,1000,1050,1100,1150,1200,1500]

On obtient :

0.9886360824954366 {'n_estimators': 900}
81.56852078437805 sec

Troisième test avec

- n_estimators = [850,860,870,880,890,900,910,920,930,940,950,960,970,980,990,1000]

On obtient :

0.9886325473238434 {'n_estimators': 930}
114.22175073623657 sec

- choix de max_depth et min_child_weight

In [95]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

max_depth = np.arange(4,8,1)
min_child_weight = np.linspace(3,6,10)

tuned_parameters = {
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
}

XGBR_params = GridSearchCV(
    XGBRegressor(n_estimators=930, learning_rate=0.1),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.988217001877991 {'max_depth': 6, 'min_child_weight': 4.333333333333333}
238.46584463119507 sec


Premier test avec 

- max_depth = np.arange(1,15,1)
- min_child_weight = np.arange(1,15,1)

On obtient
0.988217001877991 {'max_depth': 6, 'min_child_weight': 5}
1339.3490736484528 sec

Deuxième test avec 

- max_depth = np.arange(1,10,1)
- min_child_weight = np.linspace(1,10,15)

On obtient
0.988217001877991 {'max_depth': 6, 'min_child_weight': 4.214285714285714}
750.9402289390564 sec

Troisième test avec 

- max_depth = np.arange(4,8,1)
- min_child_weight = np.linspace(3,6,10)

On obtient
0.988217001877991 {'max_depth': 6, 'min_child_weight': 4.333333333333333}
238.46584463119507 sec

Le résultat ne change pas du tout

- choix de gamma


In [104]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

gamma = np.linspace(0.00001,0.0001,50)

tuned_parameters = {"gamma": gamma}

XGBR_params = GridSearchCV(
    XGBRegressor(
        n_estimators=930, learning_rate=0.1, max_depth=6, min_child_weight=4.333333333333333
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9882517157839599 {'gamma': 8.53061224489796e-05}
282.04319047927856 sec


Premier test avec 
- gamma = np.linspace(0.1,1,15)
On obtient
0.9749181204994332 {'gamma': 0.1}
105.45138382911682 sec

Deuxième test avec 
- gamma = np.linspace(0.01,0.1,15)
On obtient
0.9835233622305644 {'gamma': 0.01}
92.13753747940063 sec

Troisième test avec 
- gamma = np.linspace(0.001,0.01,15)
On obtient
0.9876352652279332 {'gamma': 0.001}
102.16498756408691 sec

Quatrième test avec 
- gamma = np.linspace(0.0001,0.001,15)
On obtient
0.9881168961617299 {'gamma': 0.0001}
96.87253713607788 sec

Cinquième test avec 
- gamma = np.linspace(0.00001,0.0001,15)
On obtient
0.9880947378321867 {'gamma': 7.428571428571429e-05}
99.55928111076355 sec

Sixième test avec 
- gamma = np.linspace(0.00001,0.0001,50)
On obtient
0.9882517157839599 {'gamma': 8.53061224489796e-05}
282.04319047927856 sec

- choix de subsample et colsample_bytree

In [106]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

subsample = np.linspace(0.5,1,10)
colsample_bytree = np.linspace(0.5,1,10)

tuned_parameters = {"subsample": subsample, "colsample_bytree": colsample_bytree}

XGBR_params = GridSearchCV(
    XGBRegressor(
        n_estimators=930, learning_rate=0.1, max_depth=6, min_child_weight=4.333333333333333, gamma= 8.53061224489796e-05
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9882517157839599 {'colsample_bytree': 1.0, 'subsample': 1.0}
595.0831518173218 sec


Premier test avec :
- subsample = np.linspace(0.1,1,10)
- colsample_bytree = np.linspace(0.1,1,10)
On obtient
0.9882517157839599 {'colsample_bytree': 1.0, 'subsample': 1.0}
383.3048412799835 sec

Deuxième test avec :
- subsample = np.linspace(0.5,1,10)
- colsample_bytree = np.linspace(0.5,1,10)
On obtient
0.9882517157839599 {'colsample_bytree': 1.0, 'subsample': 1.0}
595.0831518173218 sec

- choix de reg_alpha et reg_lambda

In [115]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

reg_alpha = np.linspace(0.1,0.2,20)
reg_lambda = np.linspace(0.04,0.1,20)

tuned_parameters = {"reg_alpha": reg_alpha, "reg_lambda": reg_lambda}

XGBR_params = GridSearchCV(
    XGBRegressor(
        n_estimators=930, learning_rate=0.1, max_depth=6, min_child_weight=4.333333333333333, gamma= 8.53061224489796e-05,
        colsample_bytree=1, subsample=1
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train)
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9886270986331853 {'reg_alpha': 0.14444444444444443, 'reg_lambda': 0.052222222222222225}
63.54713201522827 sec


Premier test avec
- reg_alpha = [0.1,1,5,10,30,50,100]
- reg_lambda = [0.1,1,5,10,30,50,100]

On obtient
0.9885051436678425 {'reg_alpha': 0.1, 'reg_lambda': 10}
129.20628786087036 sec

Deuxième test avec
- reg_alpha = np.linspace(0.01,10,10)
- reg_lambda = np.linspace(0.1,50,10)

On obtient
0.9881017231449334 {'reg_alpha': 0.01, 'reg_lambda': 16.733333333333334}
244.0652163028717 sec

Troisième test avec
- reg_alpha = np.linspace(0.01,1,10)
- reg_lambda = np.linspace(0.1,20,10)

On obtient
0.9886693138278095 {'reg_alpha': 0.23, 'reg_lambda': 0.1}
416.92906308174133 sec

Quatrième test avec
- reg_alpha = np.linspace(0.05,0.8,10)
- reg_lambda = np.linspace(0.01,5,10)

On obtient
0.9884720588664335 {'reg_alpha': 0.13333333333333333, 'reg_lambda': 3.336666666666667}
423.8156259059906 sec

Cinquième test avec
- reg_alpha = np.linspace(0.1,0.5,10)
- reg_lambda = np.linspace(0.01,1,10)

On obtient
0.9885652780109305 {'reg_alpha': 0.14444444444444446, 'reg_lambda': 0.12}
476.68485736846924 sec

Sixième test avec
- reg_alpha = np.linspace(0.1,0.3,10)
- reg_lambda = np.linspace(0.01,0.2,10)

On obtient
0.9886270986331853 {'reg_alpha': 0.14444444444444443, 'reg_lambda': 0.052222222222222225}
570.3046953678131 sec

Sixième test avec
- reg_alpha = np.linspace(0.1,0.2,20)
- reg_lambda = np.linspace(0.04,0.1,20)

On obtient
0.9886043134006149 {'reg_alpha': 0.15263157894736842, 'reg_lambda': 0.08105263157894738}
2558.2963654994965 sec

- diminution de learning_rate et adaptation de n_estimators

In [120]:
start = time()

my_kfold = KFold(n_splits=5, shuffle=True, random_state=0)

n_estimators = np.arange(3700,4000,10)
learning_rate = [0.02]
tuned_parameters = {"n_estimators": n_estimators, 'learning_rate': learning_rate}

XGBR_params = GridSearchCV(
    XGBRegressor( 
        max_depth=6, min_child_weight=4.333333333333333, gamma= 8.53061224489796e-05,
        colsample_bytree=1, subsample=1,
        reg_alpha= 0.15263157894736842, reg_lambda= 0.08105263157894738
    ),
    tuned_parameters,
    cv=my_kfold,
    n_jobs=-1,
)

XGBR_params.fit(X_train, y_train) 
print(XGBR_params.score(X_test, y_test), XGBR_params.best_params_)
print(str(time() - start) + " sec")

0.9885325947953947 {'learning_rate': 0.02, 'n_estimators': 3760}
861.2512314319611 sec


Premier test avec

- n_estimators = np.arange(930,5000,500)
- learning_rate = [0.02]

On obtient :

0.9885404675773862 {'learning_rate': 0.02, 'n_estimators': 3930}
275.73846888542175 sec

Deuxième test avec

- n_estimators = np.arange(3000,5000,200)
- learning_rate = [0.02]

On obtient :

0.9885321172470223 {'learning_rate': 0.02, 'n_estimators': 3800}
318.14975929260254 sec

Troisième test avec

- n_estimators = np.arange(3000,4000,50)
- learning_rate = [0.02]

On obtient :

0.9885328509718507 {'learning_rate': 0.02, 'n_estimators': 3750}
588.039425611496 sec

Quatrième test avec

- n_estimators = np.arange(3700,4000,10)
- learning_rate = [0.02]

On obtient :

0.9885325947953947 {'learning_rate': 0.02, 'n_estimators': 3760}
861.2512314319611 sec

On s'arrête au quatrième test.