# Stacking des meilleurs modèles

On va faire du stacking des meilleurs modèles avec les hyperparamètres optimisés qu'on a trouvés au notebook précédent.

Le méta-modèle de départ est une régression logistique ; selon les sections, il est remplacé par une forêt aléatoire ou par un XGBoost.

In [1]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the training set; later cells read the 'Cover_Type' column from it as the target.
data = pd.read_csv('train.csv')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except the target; the target is Cover_Type.
# NOTE(review): any identifier column in train.csv (e.g. the 'Id' column that the
# submission cells read from test-full.csv) stays in X as a feature — confirm
# that this is intended.
X = data.drop(columns=['Cover_Type'])
y = data['Cover_Type']

# Hold out 30% of the rows for evaluation, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Continuous columns that the standardisation step below would rescale.
continuous_columns = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
                      'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
                      'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']
scaler = StandardScaler()

# Scaling is currently disabled: the scaler is created but never fitted.
# Uncomment the two lines below to standardise the continuous columns only.
#X_train[continuous_columns] = scaler.fit_transform(X_train[continuous_columns])
#X_test[continuous_columns] = scaler.transform(X_test[continuous_columns])

# Shift the labels so that they start at 0, as required by XGBoost.
y_train_adj = y_train - 1
y_test_adj = y_test - 1

In [3]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Base learners. Per the notebook introduction, these hyperparameters come from
# the tuning done in the previous notebook.
rf_params = {
    'n_estimators': 300,
    'max_depth': None,
    'max_features': 'sqrt',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
}
random_forest = RandomForestClassifier(**rf_params)

xgb_params = {
    'n_estimators': 300,
    'max_depth': 9,
    'learning_rate': 0.24122412239323296,
    'gamma': 0.04924511492281908,
    'subsample': 0.9833984902058166,
    'colsample_bytree': 0.9284956459967804,
    'objective': 'multi:softmax',
    'num_class': 7,
    'random_state': 42,
}
xgboost = XGBClassifier(**xgb_params)

lgbm_params = {
    'n_estimators': 300,
    'max_depth': 15,
    'num_leaves': 100,
    'learning_rate': 0.09224159033088929,
    'subsample': 0.5190158468602316,
    'colsample_bytree': 0.9488048655607615,
    'objective': 'multiclass',
    'num_class': 7,
    'random_state': 42,
    'verbosity': -1,
}
lightgbm = LGBMClassifier(**lgbm_params)

# Meta-learner: a shallow random forest in this cell.
# Alternative meta-learner: LogisticRegression(max_iter=1000)
meta_model = RandomForestClassifier(random_state=42, max_depth=5, n_estimators=200)

# Stack the three base learners under the meta-learner; cv=10 means 10-fold
# cross-validation is used for the stacking.
base_learners = [
    ('rf', random_forest),
    ('xgb', xgboost),
    ('lgbm', lightgbm),
]
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_model, cv=10)

# Fit on the training split (0-based labels), then predict the held-out split.
stacking_model.fit(X_train, y_train_adj)
y_pred_stack = stacking_model.predict(X_test)

# Report accuracy and the per-class metrics on the held-out split.
accuracy_stack = accuracy_score(y_test_adj, y_pred_stack)
report_stack = classification_report(y_test_adj, y_pred_stack)
print("Précision du modèle de stacking :", accuracy_stack)
print("Rapport de classification du modèle de stacking :\n", report_stack)

KeyboardInterrupt: 

## Stacking de plus de modèles (5)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

# Base learners. Per the notebook introduction, these hyperparameters come from
# the tuning done in the previous notebook.
random_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
)

xgboost = XGBClassifier(
    colsample_bytree=0.9284956459967804,
    gamma=0.04924511492281908,
    learning_rate=0.24122412239323296,
    max_depth=9,
    n_estimators=300,
    subsample=0.9833984902058166,
    objective='multi:softmax',
    num_class=7,
    random_state=42
)

# NOTE(review): LightGBM reportedly applies `subsample` only when
# `subsample_freq` > 0 — verify that this setting has an effect.
lightgbm = LGBMClassifier(
    colsample_bytree=0.9488048655607615,
    learning_rate=0.09224159033088929,
    max_depth=15,
    n_estimators=300,
    num_leaves=100,
    subsample=0.5190158468602316,
    objective='multiclass',
    num_class=7,
    random_state=42,
    verbosity=-1
)

extra_trees = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=30,
    random_state=42,
    max_features=None,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
)

gradient_boosting = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.0526378911793133,
    max_depth=12,
    random_state=42,
    min_samples_split=10,
    min_samples_leaf=4,
    subsample=0.7897940854001194
)

# Meta-learner: a small XGBoost classifier (depth-2 trees) in this section.
# `use_label_encoder` is no longer passed: the installed xgboost ignores it and
# only emits a "Parameters: { "use_label_encoder" } are not used." warning.
meta_model = XGBClassifier(
    n_estimators=175,    # other values listed in the original cell: 150, 200
    max_depth=2,         # other values listed in the original cell: 3, 4
    learning_rate=0.1,
    colsample_bytree=0.8,
    subsample=0.8,
    random_state=42,
    eval_metric='mlogloss'
)
# Alternative meta-learners:
#   LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
#   RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)
#   LogisticRegression(max_iter=1000)

# Stack the five base learners under the meta-learner.
stacking_model = StackingClassifier(
    estimators=[
        ('rf', random_forest),
        ('xgb', xgboost),
        ('lgbm', lightgbm),
        ('et', extra_trees),
        ('gb', gradient_boosting)
    ],
    final_estimator=meta_model,
    cv=10  # 10-fold cross-validation is used for the stacking
)

In [68]:
# Fit the stacked ensemble on the training split (0-based labels).
stacking_model.fit(X_train, y_train_adj)

# Predict the held-out split and compute both metrics before printing them.
y_pred_stack = stacking_model.predict(X_test)
accuracy_stack = accuracy_score(y_test_adj, y_pred_stack)
report_stack = classification_report(y_test_adj, y_pred_stack)

print("Précision du modèle de stacking :", accuracy_stack)
print("Rapport de classification du modèle de stacking :\n", report_stack)

Parameters: { "use_label_encoder" } are not used.



Précision du modèle de stacking : 0.9018959435626103
Rapport de classification du modèle de stacking :
               precision    recall  f1-score   support

           0       0.83      0.82      0.82       648
           1       0.81      0.80      0.80       648
           2       0.89      0.92      0.91       648
           3       0.97      0.97      0.97       648
           4       0.95      0.94      0.94       648
           5       0.91      0.92      0.92       648
           6       0.96      0.94      0.95       648

    accuracy                           0.90      4536
   macro avg       0.90      0.90      0.90      4536
weighted avg       0.90      0.90      0.90      4536



Test complet

In [34]:
# Load the full test set that the final predictions are made on.
X_test_full = pd.read_csv('test-full.csv')

# 0-based copy of the complete target vector, used to refit on all the data.
y_adj = y.sub(1)

In [63]:
# Refit the stacked ensemble on the complete training data, predict the full
# test set, and shift the predictions back to the original 1-based labels.
fitted_stack = stacking_model.fit(X, y_adj)
y_pred_stack = fitted_stack.predict(X_test_full) + 1

Parameters: { "use_label_encoder" } are not used.



In [64]:
# Two-column submission table: the test-set Id and the predicted Cover_Type.
submission_columns = {
    'Id': X_test_full['Id'],
    'Cover_Type': y_pred_stack,
}
submission_df = pd.DataFrame(submission_columns)

# Write the submission as CSV, without the index column.
submission_path = 'soumissions/submission_big_stackingXGB2_125_0.1.csv'
submission_df.to_csv(submission_path, index=False)

## Stacking de 6 modèles

In [20]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier

# Base learners. Per the notebook introduction, these hyperparameters come from
# the tuning done in the previous notebook.
rf_params = {
    'n_estimators': 300,
    'max_depth': None,
    'max_features': 'sqrt',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
}
random_forest = RandomForestClassifier(**rf_params)

xgb_params = {
    'n_estimators': 300,
    'max_depth': 9,
    'learning_rate': 0.24122412239323296,
    'gamma': 0.04924511492281908,
    'subsample': 0.9833984902058166,
    'colsample_bytree': 0.9284956459967804,
    'objective': 'multi:softmax',
    'num_class': 7,
    'random_state': 42,
}
xgboost = XGBClassifier(**xgb_params)

lgbm_params = {
    'n_estimators': 300,
    'max_depth': 15,
    'num_leaves': 100,
    'learning_rate': 0.09224159033088929,
    'subsample': 0.5190158468602316,
    'colsample_bytree': 0.9488048655607615,
    'objective': 'multiclass',
    'num_class': 7,
    'random_state': 42,
    'verbosity': -1,
}
lightgbm = LGBMClassifier(**lgbm_params)

et_params = {
    'n_estimators': 300,
    'max_depth': 30,
    'max_features': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
    'n_jobs': -1,
}
extra_trees = ExtraTreesClassifier(**et_params)

gb_params = {
    'n_estimators': 400,
    'max_depth': 12,
    'learning_rate': 0.0526378911793133,
    'subsample': 0.7897940854001194,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'random_state': 42,
}
gradient_boosting = GradientBoostingClassifier(**gb_params)

cat_params = {
    'iterations': 500,
    'depth': 8,
    'learning_rate': 0.24345784647020918,
    'l2_leaf_reg': 1.0016215299386924,
    'bagging_temperature': 0.2195369395721741,
    'random_state': 42,
    'verbose': 0,
}
catboost = CatBoostClassifier(**cat_params)

# Meta-learner: a plain logistic regression, with the iteration cap raised to 1000.
meta_model = LogisticRegression(max_iter=1000)

# Stack the six base learners under the meta-learner; cv=10 means 10-fold
# cross-validation is used for the stacking.
base_learners = [
    ('rf', random_forest),
    ('xgb', xgboost),
    ('lgbm', lightgbm),
    ('et', extra_trees),
    ('gb', gradient_boosting),
    ('cat', catboost),
]
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_model, cv=10)

In [13]:
# Train the stacked ensemble on the training split (0-based labels).
stacking_model.fit(X_train, y_train_adj)

# Evaluate on the held-out split: overall accuracy plus the per-class report.
y_pred_stack = stacking_model.predict(X_test)
accuracy_stack = accuracy_score(y_test_adj, y_pred_stack)
report_stack = classification_report(y_test_adj, y_pred_stack)

print("Précision du modèle de stacking :", accuracy_stack)
print("Rapport de classification du modèle de stacking :\n", report_stack)

Précision du modèle de stacking : 0.8957231040564374
Rapport de classification du modèle de stacking :
               precision    recall  f1-score   support

           0       0.81      0.81      0.81       648
           1       0.80      0.77      0.79       648
           2       0.89      0.91      0.90       648
           3       0.96      0.97      0.97       648
           4       0.94      0.94      0.94       648
           5       0.91      0.91      0.91       648
           6       0.95      0.94      0.95       648

    accuracy                           0.90      4536
   macro avg       0.90      0.90      0.90      4536
weighted avg       0.90      0.90      0.90      4536



In [5]:
# Read the full test set used for the final predictions.
X_test_full = pd.read_csv('test-full.csv')

# Complete target vector shifted to 0-based labels.
y_adj = y.sub(1)

In [22]:
# Refit on the complete training data, predict the full test set, then map the
# 0-based predictions back to the original 1-based Cover_Type labels.
fitted_stack = stacking_model.fit(X, y_adj)
y_pred_stack = fitted_stack.predict(X_test_full) + 1

In [23]:
# Submission table: the test-set Id next to the predicted Cover_Type.
submission_columns = {
    'Id': X_test_full['Id'],
    'Cover_Type': y_pred_stack,
}
submission_df = pd.DataFrame(submission_columns)

# Save the submission as CSV, without the index column.
submission_path = 'soumissions/submission_big6_stacking.csv'
submission_df.to_csv(submission_path, index=False)

## Suppression de Hillshade_9am

In [8]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

# Base learners. Per the notebook introduction, these hyperparameters come from
# the tuning done in the previous notebook.
random_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
)

xgboost = XGBClassifier(
    colsample_bytree=0.9284956459967804,
    gamma=0.04924511492281908,
    learning_rate=0.24122412239323296,
    max_depth=9,
    n_estimators=300,
    subsample=0.9833984902058166,
    objective='multi:softmax',
    num_class=7,
    random_state=42
)

lightgbm = LGBMClassifier(
    colsample_bytree=0.9488048655607615,
    learning_rate=0.09224159033088929,
    max_depth=15,
    n_estimators=300,
    num_leaves=100,
    subsample=0.5190158468602316,
    objective='multiclass',
    num_class=7,
    random_state=42,
    verbosity=-1
)

extra_trees = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=30,
    random_state=42,
    max_features=None,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
)

gradient_boosting = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.0526378911793133,
    max_depth=12,
    random_state=42,
    min_samples_split=10,
    min_samples_leaf=4,
    subsample=0.7897940854001194
)

# Meta-learner: a small XGBoost classifier (depth-2 trees) in this section.
# `use_label_encoder` is no longer passed: the installed xgboost ignores it and
# only emits a "Parameters: { "use_label_encoder" } are not used." warning.
meta_model = XGBClassifier(
    n_estimators=150,    # other value listed in the original cell: 200
    max_depth=2,         # other values listed in the original cell: 3, 4
    learning_rate=0.1,
    colsample_bytree=0.8,
    subsample=0.8,
    random_state=42,
    eval_metric='mlogloss'
)
# Alternative meta-learners:
#   LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
#   RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)
#   LogisticRegression(max_iter=1000)

# Stack the five base learners under the meta-learner.
stacking_model = StackingClassifier(
    estimators=[
        ('rf', random_forest),
        ('xgb', xgboost),
        ('lgbm', lightgbm),
        ('et', extra_trees),
        ('gb', gradient_boosting)
    ],
    final_estimator=meta_model,
    cv=10  # 10-fold cross-validation is used for the stacking
)

In [6]:
# Read the full test set used for the final predictions.
X_test_full = pd.read_csv('test-full.csv')

# Complete target vector shifted to 0-based labels.
y_adj = y.sub(1)

In [7]:
# Remove Hillshade_9am from both the training features and the full test set.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
X = X.drop(columns=['Hillshade_9am'], errors='ignore')
X_test_full = X_test_full.drop(columns=['Hillshade_9am'], errors='ignore')

In [9]:
# Refit on the complete training data, predict the full test set, then map the
# 0-based predictions back to the original 1-based Cover_Type labels.
fitted_stack = stacking_model.fit(X, y_adj)
y_pred_stack = fitted_stack.predict(X_test_full) + 1

Parameters: { "use_label_encoder" } are not used.



In [10]:
# Submission table: the test-set Id next to the predicted Cover_Type.
submission_columns = {
    'Id': X_test_full['Id'],
    'Cover_Type': y_pred_stack,
}
submission_df = pd.DataFrame(submission_columns)

# Save the submission as CSV, without the index column.
submission_path = 'soumissions/submission_stacking_Hillshade.csv'
submission_df.to_csv(submission_path, index=False)