# Stack Regressor


In [1]:
import pandas as pd
import numpy as np

- chargement des données


In [7]:
path = "./../../data"
x_data = pd.read_csv(path + "/Xtrain_hgcGIrA.csv", sep=",")  # features
y_data = pd.read_csv(
    path + "/Ytrain_yL5OjS4.csv", sep=",", usecols=[1]
)  # occupancy rate

- traitement des données


In [8]:
x_data = x_data.drop("date", axis=1)  # suppression de la colonne date
x_data = x_data.drop("way", axis=1)  # suppression de la colonne way

# on tranforme les identifiants de gare en entiers
x_data["station"] = x_data["station"].astype("category")
cat_columns = x_data.select_dtypes(["category"]).columns
x_data[cat_columns] = x_data[cat_columns].apply(lambda x: x.cat.codes)


x_data["hour"] = x_data["hour"].apply(
    lambda x: 0 if type(x) == float else int(x[:2])
)  # on remplace les Nan de hour par des 0 et les heures par des entiers

# on remplace les Nan des pxqx par 0
x_data[["p1q0", "p2q0", "p3q0", "p0q1", "p0q2", "p0q3"]] = x_data[
    ["p1q0", "p2q0", "p3q0", "p0q1", "p0q2", "p0q3"]
].fillna(0)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x_data.values, y_data.values, train_size=0.8
)

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRFRegressor, XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

XGBRF = XGBRFRegressor(
    n_estimators=650,
    learning_rate=1,
    max_depth=25,
    min_child_weight=3,
    reg_alpha=0.001,
    reg_lambda=0.001,
)
XGBR = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.025,
    max_depth=7,
    min_child_weight=8,
    reg_alpha=0.15,
    reg_lambda=0.21,
)
KNN = KNeighborsRegressor(n_neighbors=4, weights="distance", p=1.0)

In [7]:
estimators = [
    ("xgbrf", XGBRF),
    ("xgbr", XGBR),
]

reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=100, random_state=42),
)


reg.fit(X_train, y_train)
reg.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


In [None]:
estimators = [
    ("knn", KNN),
    ("xgbr", XGBR),
]


reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=250, random_state=42),
)

reg.fit(X_train, y_train)
reg.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.9869493291949266

Score xgbr + kn & final_estimator=RandomForestRegressor(n_estimators=100, random_state=42)
0.9846857499934524


In [None]:
from sklearn.svm import LinearSVR

estimators = [
    ("knn", KNN),
    ("xgbr", XGBR),
]

reg = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearSVR(
        dual="auto",
        random_state=400,
        max_iter=2000,
        epsilon=0.01,
        C=10,
    ),
)

reg.fit(X_train, y_train)
reg.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.988891523316289

Premier test LinearSVR(dual="auto", random_state=42, epsilon=0.01)
score: 0.9864272252953467


Premier test LinearSVR(dual="auto", random_state=100, epsilon=0.01)
score: 0.988847060761807


Troisième test LinearSVR(dual="auto", random_state=200, epsilon=0.001,) 0.9889214270757255


0.988891523316289


## Optimisation de XGBRFRegressor

On fait des boucles for puisqu'il n'est pas possible de faire de GridSearch avec des StackRegressor


In [None]:
from sklearn.model_selection import KFold

estimators = [
    ("knn", KNN),
    ("xgbr", XGBR),
]

n_estimators = np.arange(700, 1000, 100)
learning_rate = np.linspace(1.0, 1.1, 3)
for estimator in n_estimators:
    for lr in learning_rate:
        reg = StackingRegressor(
            estimators=estimators,
            final_estimator=XGBRFRegressor(
                max_depth=10,
                learning_rate=lr,
                n_estimators=estimator,
            ),
        )

        reg.fit(X_train, y_train)
        print(
            reg.score(X_test, y_test),
            {"n_estimators": estimator, "learning_rate": lr},
        )

  y = column_or_1d(y, warn=True)


0.9884138692715975 {'n_estimators': 700, 'learning_rate': 1.0}


  y = column_or_1d(y, warn=True)


0.9794734069995766 {'n_estimators': 700, 'learning_rate': 1.05}


  y = column_or_1d(y, warn=True)


0.9524499946993443 {'n_estimators': 700, 'learning_rate': 1.1}


  y = column_or_1d(y, warn=True)


0.9884022731408081 {'n_estimators': 800, 'learning_rate': 1.0}


  y = column_or_1d(y, warn=True)


0.9794625413680064 {'n_estimators': 800, 'learning_rate': 1.05}


  y = column_or_1d(y, warn=True)


0.9524401021590339 {'n_estimators': 800, 'learning_rate': 1.1}


  y = column_or_1d(y, warn=True)


0.9884044885363472 {'n_estimators': 900, 'learning_rate': 1.0}


  y = column_or_1d(y, warn=True)


0.9794682141970142 {'n_estimators': 900, 'learning_rate': 1.05}


  y = column_or_1d(y, warn=True)


0.9524496481343714 {'n_estimators': 900, 'learning_rate': 1.1}


choisissons {'n_estimators': 700, 'learning_rate': 1.0}
Score : 0.9884138692715975


In [6]:
estimators = [
    ("knn", KNN),
    ("xgbr", XGBR),
]

max_depth = [7, 8, 9, 10, 11, 12]
min_child_weight = [7, 8, 9]

for md in max_depth:
    for mcw in min_child_weight:
        reg = StackingRegressor(
            estimators=estimators,
            final_estimator=XGBRFRegressor(
                learning_rate=1.0,
                n_estimators=700,
                min_child_weight=mcw,
                max_depth=md,
            ),
        )

        reg.fit(X_train, y_train)
        print(
            reg.score(X_test, y_test),
            {"max_depth": md, "min_child_weight": mcw},
        )

  y = column_or_1d(y, warn=True)


0.9886825772531682 {'max_depth': 7, 'min_child_weight': 7}


  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 

In [11]:
estimators = [
    ("knn", KNN),
    ("xgbr", XGBR),
]


reg_alpha = [0.3, 0.14, 0.15, 0.16, 0.17]
reg_lambda = [0.18, 0.19, 0.2, 0.21, 0.22]
scores = []
params = []

for ra in reg_alpha:
    for rl in reg_lambda:
        reg = StackingRegressor(
            estimators=estimators,
            final_estimator=XGBRFRegressor(
                learning_rate=1.0,
                n_estimators=700,
                min_child_weight=7,
                max_depth=9,
                reg_lambda=rl,
                reg_alpha=ra,
            ),
        )

        reg.fit(X_train, y_train)
        score = (reg.score(X_test, y_test),)
        print(
            score,
            {"reg_alpha": ra, "reg_lambda": rl},
        )
        scores.append(score)
        params.append({"reg_alpha": ra, "reg_lambda": rl})

  y = column_or_1d(y, warn=True)


(0.9845420758660834,) {'reg_alpha': 0.3, 'reg_lambda': 0.18}


  y = column_or_1d(y, warn=True)


(0.9845349788633778,) {'reg_alpha': 0.3, 'reg_lambda': 0.19}


  y = column_or_1d(y, warn=True)


(0.984530829947362,) {'reg_alpha': 0.3, 'reg_lambda': 0.2}


  y = column_or_1d(y, warn=True)


(0.9845224917442604,) {'reg_alpha': 0.3, 'reg_lambda': 0.21}


  y = column_or_1d(y, warn=True)


(0.9845166931669567,) {'reg_alpha': 0.3, 'reg_lambda': 0.22}


  y = column_or_1d(y, warn=True)


(0.985705828132948,) {'reg_alpha': 0.14, 'reg_lambda': 0.18}


  y = column_or_1d(y, warn=True)


(0.9856590245964629,) {'reg_alpha': 0.14, 'reg_lambda': 0.19}


  y = column_or_1d(y, warn=True)


(0.9855983486254292,) {'reg_alpha': 0.14, 'reg_lambda': 0.2}


  y = column_or_1d(y, warn=True)


(0.9855455047950413,) {'reg_alpha': 0.14, 'reg_lambda': 0.21}


  y = column_or_1d(y, warn=True)


(0.985505895369699,) {'reg_alpha': 0.14, 'reg_lambda': 0.22}


  y = column_or_1d(y, warn=True)


(0.9854644240939443,) {'reg_alpha': 0.15, 'reg_lambda': 0.18}


  y = column_or_1d(y, warn=True)


(0.9854363166328363,) {'reg_alpha': 0.15, 'reg_lambda': 0.19}


  y = column_or_1d(y, warn=True)


(0.9854208122548788,) {'reg_alpha': 0.15, 'reg_lambda': 0.2}


  y = column_or_1d(y, warn=True)


(0.9854054290777985,) {'reg_alpha': 0.15, 'reg_lambda': 0.21}


  y = column_or_1d(y, warn=True)


(0.9853932845033373,) {'reg_alpha': 0.15, 'reg_lambda': 0.22}


  y = column_or_1d(y, warn=True)


(0.9853620328326478,) {'reg_alpha': 0.16, 'reg_lambda': 0.18}


  y = column_or_1d(y, warn=True)


(0.9853474638476797,) {'reg_alpha': 0.16, 'reg_lambda': 0.19}


  y = column_or_1d(y, warn=True)


(0.985333713851679,) {'reg_alpha': 0.16, 'reg_lambda': 0.2}


  y = column_or_1d(y, warn=True)


(0.9853259947502511,) {'reg_alpha': 0.16, 'reg_lambda': 0.21}


  y = column_or_1d(y, warn=True)


(0.9853107925211361,) {'reg_alpha': 0.16, 'reg_lambda': 0.22}


  y = column_or_1d(y, warn=True)


(0.9852804601482108,) {'reg_alpha': 0.17, 'reg_lambda': 0.18}


  y = column_or_1d(y, warn=True)


(0.9852697294712933,) {'reg_alpha': 0.17, 'reg_lambda': 0.19}


  y = column_or_1d(y, warn=True)


(0.985263322454009,) {'reg_alpha': 0.17, 'reg_lambda': 0.2}


  y = column_or_1d(y, warn=True)


(0.9852485225389802,) {'reg_alpha': 0.17, 'reg_lambda': 0.21}


  y = column_or_1d(y, warn=True)


(0.9852380866640378,) {'reg_alpha': 0.17, 'reg_lambda': 0.22}


Premier essai reg_alpha': 0.14, 'reg_lambda': 0.18 Score : 0.985705828132948


Les résultats sont très variables puisque le score n'est pas évalué avec un KFold pour gagner en temps de calcul.

On conserve LinearCRV qui donne de meilleurs résultats
