In [4]:
# Chargement des donn√©es brutes
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
import joblib



In [5]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# On garde uniquement les lignes du train qui ont un score `sii`
train = train.dropna(subset=["sii"]).copy()
train["sii"] = train["sii"].astype(int)

In [6]:
# Colonnes num√©riques
num_cols = [
    "Basic_Demos-Age",
    "Physical-BMI",
    "PreInt_EduHx-computerinternet_hoursday",
    "SDS-SDS_Total_T"
]

# Colonnes cat√©gorielles simples
cat_cols = [
    "Basic_Demos-Sex",                 # 0 = Gar√ßon, 1 = Fille
    "Basic_Demos-Enroll_Season",       # Saison d'inscription
    "Physical-Season"                  # Saison des mesures physiques
]

# S√©lection des colonnes + la cible
features = num_cols + cat_cols
target = "sii"

# ‚öôÔ∏è Cr√©ation des X, y
X = train[features].copy()
y = train[target]
X_test = test[features].copy()


In [7]:
print("üîç NaN dans X :\n", X.isnull().sum())

üîç NaN dans X :
 Basic_Demos-Age                             0
Physical-BMI                              209
PreInt_EduHx-computerinternet_hoursday     82
SDS-SDS_Total_T                           211
Basic_Demos-Sex                             0
Basic_Demos-Enroll_Season                   0
Physical-Season                           141
dtype: int64


In [8]:
# S√©paration num√©rique / cat√©goriel
num_cols = ["Basic_Demos-Age", "Physical-BMI", "PreInt_EduHx-computerinternet_hoursday", "SDS-SDS_Total_T"]
cat_cols = ["Basic_Demos-Sex", "Basic_Demos-Enroll_Season", "Physical-Season"]

# Imputation num√©rique (m√©diane)
num_imputer = SimpleImputer(strategy="median")
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

# Imputation cat√©gorielle (mode)
cat_imputer = SimpleImputer(strategy="most_frequent")
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

# V√©rification
print(" V√©rif NaN apr√®s imputation :")
print(X.isnull().sum())

 V√©rif NaN apr√®s imputation :
Basic_Demos-Age                           0
Physical-BMI                              0
PreInt_EduHx-computerinternet_hoursday    0
SDS-SDS_Total_T                           0
Basic_Demos-Sex                           0
Basic_Demos-Enroll_Season                 0
Physical-Season                           0
dtype: int64


In [9]:
# On encode chaque colonne cat√©gorielle une par une
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    X_test[col] = le.transform(X_test[col])  # attention : le test doit suivre l‚Äôordre du train
    encoders[col] = le  # on les garde si besoin de d√©coder plus tard

print("Encodage termin√©.")
print(X[cat_cols].head())

Encodage termin√©.
   Basic_Demos-Sex  Basic_Demos-Enroll_Season  Physical-Season
0                0                          0                0
1                0                          2                0
2                1                          2                0
3                0                          3                2
5                1                          1                2


In [10]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# üåü Mod√®le XGBoost de base
xgb = XGBClassifier(random_state=42, n_jobs=1)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# üîç Grille d'hyperparam√®tres
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'subsample': [0.7, 1.0]
}

# ‚öñÔ∏è GridSearch avec scoring QWK
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='roc_auc_ovr',  # approximation multiclasses
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Entra√Ænement avec GridSearch
grid.fit(X, y)

# Meilleurs param√®tres trouv√©s
print("Meilleurs param√®tres XGBoost:", grid.best_params_)

# Mod√®le final avec les meilleurs param√®tres
final_xgb = XGBClassifier(**grid.best_params_, random_state=42, n_jobs=1)
final_xgb.fit(X_train, y_train)

# üìà Pr√©dictions
y_pred = final_xgb.predict(X_val)

# üß™ √âvaluation
qwk = cohen_kappa_score(y_val, y_pred, weights='quadratic')
print(f"XGBoost - QWK Score : {qwk:.4f}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits




Meilleurs param√®tres XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
XGBoost - QWK Score : 0.3355


##  Conclusion ‚Äì Mod√®le XGBoost

Dans cette √©tape, nous avons test√© le mod√®le **XGBoost** avec une recherche d'hyperparam√®tres (GridSearch) :

- **Meilleurs param√®tres** trouv√©s :
  - `learning_rate = 0.1`
  - `max_depth = 3`
  - `subsample = 0.7`
  - `n_estimators = 100`
- **QWK score** avec XGBoost : **0.3355**

Bien que XGBoost soit un mod√®le performant, **LightGBM reste l√©g√®rement au-dessus avec un score QWK de 0.3499**. 

Conclusion : 
- **XGBoost est une bonne alternative**, mais pour ce cas pr√©cis, **LightGBM est l√©g√®rement plus performant** apr√®s optimisation.



In [12]:
# Sauvegarder le mod√®le XGBoost
joblib.dump(final_xgb, '../models/model_XGBoost.pkl')

['../models/model_XGBoost.pkl']