In [1]:
# ============================================================
# NOTEBOOK 1 — Feature Engineering (encodage, scaling, création, sélection)
# Dataset : Wine (utilisé ici) ; alternative possible : Titanic (sans la colonne de survie)
# ============================================================

import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "whitegrid" style to the plots drawn later in the notebook.
# set_theme() is the preferred entry point; sns.set() is only a legacy alias
# for it that the seaborn documentation says may be removed.
sns.set_theme(style="whitegrid")


* 1. Charger un dataset réel

In [2]:
# Load the Wine dataset shipped with scikit-learn.
data = load_wine()

# Class labels to predict.
y = data.target

# Feature matrix wrapped in a DataFrame so every column carries its name.
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Preview the first rows.
X.head()


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


* 2. Séparation des features numériques / catégorielles

In [3]:
# Split the columns by dtype rather than hard-coding "everything is numeric".
# X holds only numeric columns here, so the result is unchanged (every column
# in num_features, cat_features empty), but the cell keeps working if a
# dataset with non-numeric columns is substituted.
num_features = X.select_dtypes(include="number").columns.tolist()
cat_features = X.select_dtypes(exclude="number").columns.tolist()


* 3. Standardisation

In [4]:
# Standardise every feature. The scaler is fitted on the full dataset here,
# which is fine for exploration; the Pipeline cell further down fits its own
# scaler on the training split only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Show the first three standardised rows.
X_scaled[:3]


array([[ 1.51861254, -0.5622498 ,  0.23205254, -1.16959318,  1.91390522,
         0.80899739,  1.03481896, -0.65956311,  1.22488398,  0.25171685,
         0.36217728,  1.84791957,  1.01300893],
       [ 0.24628963, -0.49941338, -0.82799632, -2.49084714,  0.01814502,
         0.56864766,  0.73362894, -0.82071924, -0.54472099, -0.29332133,
         0.40605066,  1.1134493 ,  0.96524152],
       [ 0.19687903,  0.02123125,  1.10933436, -0.2687382 ,  0.08835836,
         0.80899739,  1.21553297, -0.49840699,  2.13596773,  0.26901965,
         0.31830389,  0.78858745,  1.39514818]])

* 4. Polynomial Features (construction de features)

In [5]:
# Degree-2 expansion of the standardised features, without the constant
# bias column.
poly = PolynomialFeatures(include_bias=False, degree=2)
poly.fit(X_scaled)
X_poly = poly.transform(X_scaled)

# Compare the number of columns before and after the expansion.
print("Shape initial :", X_scaled.shape)
print("Shape avec polynômes :", X_poly.shape)


Shape initial : (178, 13)
Shape avec polynômes : (178, 104)


* 5. Sélection de features (ANOVA F-test)

In [6]:
# Keep the 10 features with the highest f_classif score against the target.
selector = SelectKBest(k=10, score_func=f_classif)
selector.fit(X_scaled, y)
X_best = selector.transform(X_scaled)

# Map the selection returned by get_support() back to the original column names.
selected_features = np.asarray(X.columns)[selector.get_support()]
selected_features


array(['alcohol', 'malic_acid', 'alcalinity_of_ash', 'total_phenols',
       'flavanoids', 'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline'], dtype=object)

* 6. Pipeline complet

In [7]:
# Hold out 20% of the samples for evaluation; stratify on y so both splits
# keep the class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Scaling, feature selection and the classifier are chained in one estimator,
# so the scaler and the selector are fitted on the training split only.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("selector", SelectKBest(score_func=f_classif, k=8)),
        ("model", LogisticRegression(max_iter=2000)),
    ]
)
pipe.fit(X_train, y_train)

# Accuracy on the held-out test split.
pipe.score(X_test, y_test)


0.9722222222222222

### TD : allez plus loin

1. Teste SelectKBest avec k=5, 10, 12.
    → Impact sur la performance ?

2. Teste PolynomialFeatures (degree=2 puis 3).
    → Gain ou sur-apprentissage ?

3. Ajoute une étape RobustScaler pour comparer.

4. Retire les features les moins corrélées à la cible.
    → La performance s’améliore-t-elle ?

5. Crée manuellement une nouvelle feature :
       ratio = proanthocyanins / flavanoids