In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
import joblib


In [2]:
# Chargement des donn√©es brutes
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# On garde uniquement les lignes du train qui ont un score `sii`
train = train.dropna(subset=["sii"]).copy()
train["sii"] = train["sii"].astype(int)

In [3]:
# Colonnes num√©riques
num_cols = [
    "Basic_Demos-Age",
    "Physical-BMI",
    "PreInt_EduHx-computerinternet_hoursday",
    "SDS-SDS_Total_T"
]

# Colonnes cat√©gorielles simples
cat_cols = [
    "Basic_Demos-Sex",                 # 0 = Gar√ßon, 1 = Fille
    "Basic_Demos-Enroll_Season",       # Saison d'inscription
    "Physical-Season"                  # Saison des mesures physiques
]

# S√©lection des colonnes + la cible
features = num_cols + cat_cols
target = "sii"

# ‚öôÔ∏è Cr√©ation des X, y
X = train[features].copy()
y = train[target]
X_test = test[features].copy()


In [4]:
print("üîç NaN dans X :\n", X.isnull().sum())

üîç NaN dans X :
 Basic_Demos-Age                             0
Physical-BMI                              209
PreInt_EduHx-computerinternet_hoursday     82
SDS-SDS_Total_T                           211
Basic_Demos-Sex                             0
Basic_Demos-Enroll_Season                   0
Physical-Season                           141
dtype: int64


In [5]:
# S√©paration num√©rique / cat√©goriel
num_cols = ["Basic_Demos-Age", "Physical-BMI", "PreInt_EduHx-computerinternet_hoursday", "SDS-SDS_Total_T"]
cat_cols = ["Basic_Demos-Sex", "Basic_Demos-Enroll_Season", "Physical-Season"]

# Imputation num√©rique (m√©diane)
num_imputer = SimpleImputer(strategy="median")
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

# Imputation cat√©gorielle (mode)
cat_imputer = SimpleImputer(strategy="most_frequent")
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

# V√©rification
print(" V√©rif NaN apr√®s imputation :")
print(X.isnull().sum())

 V√©rif NaN apr√®s imputation :
Basic_Demos-Age                           0
Physical-BMI                              0
PreInt_EduHx-computerinternet_hoursday    0
SDS-SDS_Total_T                           0
Basic_Demos-Sex                           0
Basic_Demos-Enroll_Season                 0
Physical-Season                           0
dtype: int64


In [6]:
# On encode chaque colonne cat√©gorielle une par une
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    X_test[col] = le.transform(X_test[col])  # attention : le test doit suivre l‚Äôordre du train
    encoders[col] = le  # on les garde si besoin de d√©coder plus tard

print("Encodage termin√©.")
print(X[cat_cols].head())

Encodage termin√©.
   Basic_Demos-Sex  Basic_Demos-Enroll_Season  Physical-Season
0                0                          0                0
1                0                          2                0
2                1                          2                0
3                0                          3                2
5                1                          1                2


In [7]:
# ‚öñÔ∏è √âquilibrage + split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# üöÄ Mod√®le LightGBM simple
model = LGBMClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=1 
)
model.fit(X_train, y_train)

# üìà Pr√©dictions
y_pred = model.predict(X_val)

# üß™ √âvaluation
qwk = cohen_kappa_score(y_val, y_pred, weights='quadratic')
print(f"‚úÖ LightGBM - QWK Score : {qwk:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 334
[LightGBM] [Info] Number of data points in the train set: 2188, number of used features: 7
[LightGBM] [Info] Start training from score -0.540042
[LightGBM] [Info] Start training from score -1.320842
[LightGBM] [Info] Start training from score -1.980316
[LightGBM] [Info] Start training from score -4.394906
‚úÖ LightGBM - QWK Score : 0.3258


## Conclusion ‚Äì Mod√®le V2 avec LightGBM

Dans cette version, on a enrichi notre mod√®le en :

- ajoutant des **variables cat√©gorielles pertinentes** (`Sex`, `Season`, etc.)
- utilisant un mod√®le plus puissant : **LightGBM**

R√©sultat : un score **QWK de 0.3258**, meilleur que notre baseline RandomForest (0.3088)

Cela montre que des choix simples et logiques peuvent **am√©liorer significativement la qualit√© du mod√®le**, m√™me sans tuning avanc√©.

prochaine √©tape possible : optimiser les hyperparam√®tres et tester plus de features.


In [8]:
# Mod√®le LightGBM de base
lgbm = LGBMClassifier(random_state=42, n_jobs=1)

# Grille d'hyperparam√®tres
param_grid = {
    'num_leaves': [15, 31, 50],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200]
}

# GridSearch avec scoring personnalis√©
grid = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='roc_auc_ovr',  # approximation pour multiclass
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid.fit(X, y)
print(" Best params:", grid.best_params_)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000065 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 337
[LightGBM] [Info] Number of data points in the train set: 2736, number of used features: 7
[LightGBM] [Info] Start training from score -0.540250
[LightGBM] [Info] Start training from score -1.321208
[LightGBM] [Info] Start training from score -1.979358
[LightGBM] [Info] Start training from score -4.387892
 Best params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'num_leaves': 15}


In [9]:
# üåü Meilleurs param√®tres du GridSearch
best_params = grid.best_params_
print("Meilleurs param√®tres trouv√©s :", best_params)

# üî• Entra√Ænement final avec les meilleurs param√®tres
final_model = LGBMClassifier(**best_params, random_state=42, n_jobs=1)
final_model.fit(X_train, y_train)

# üìà Pr√©dictions
y_pred = final_model.predict(X_val)

# üß™ √âvaluation
qwk = cohen_kappa_score(y_val, y_pred, weights='quadratic')
print(f" QWK score avec optimisation : {qwk:.4f}")


Meilleurs param√®tres trouv√©s : {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'num_leaves': 15}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 334
[LightGBM] [Info] Number of data points in the train set: 2188, number of used features: 7
[LightGBM] [Info] Start training from score -0.540042
[LightGBM] [Info] Start training from score -1.320842
[LightGBM] [Info] Start training from score -1.980316
[LightGBM] [Info] Start training from score -4.394906
 QWK score avec optimisation : 0.3499


In [14]:
# Sauvegarder le mod√®le LightGBM
joblib.dump(final_model, '../models/model_lightgbm.pkl')


['../models/model_lightgbm.pkl']