In [None]:
# Reload the autoreload extension and let it re-import modified modules
# before each cell execution (mode 2).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Bibliothèques

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, LassoCV, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

## Lecture des données

In [None]:
# Load the ozone dataset from the relative data folder and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/ozone.csv')
df.head(n=5)

In [None]:
# Number of rows (observations) in the dataset.
df.shape[0]

In [None]:
# Pairwise scatter plots of every column against every other column.
sns.pairplot(data=df)

## Préparation des données

In [None]:
# Dependent variable (target): the 'ozone' column as a NumPy array.
# `.to_numpy()` is the accessor pandas recommends over `.values`.
y = df['ozone'].to_numpy()
# Preview only the first values instead of dumping the whole array in the output.
y[:5]

In [None]:
# Independent variables (features): every column except 'ozone', as a NumPy array.
# `.to_numpy()` is the accessor pandas recommends over `.values`.
X = df.drop(columns=['ozone']).to_numpy()
# Preview only the first rows instead of dumping the whole matrix in the output.
X[:5]

## Séparation Train-Test

In [None]:
# Hold out 20% of the observations as a test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2023,
)

## Modélisation

#### Modèle de référence

In [None]:
# Reference model: always predict the mean of the training target.
baseline = np.mean(y_train)
baseline

In [None]:
# RMSE of the constant baseline prediction on the training set.
# The square root is taken explicitly because the `squared=False` argument of
# `mean_squared_error` is deprecated since scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_train, baseline * np.ones(len(y_train))))

#### Régression linéaire

In [None]:
# Ordinary least squares on the raw training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# 5-fold cross-validation. The scorer returns the *negated* RMSE of each fold,
# so values are negative and closer to zero is better.
scores = cross_val_score(lr, X_train, y_train, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)

# Report the mean over the folds. Taking the max would keep only the single best
# fold, which is optimistic and not comparable with the fold-averaged scores
# reported for the other models in this notebook.
print(f'LR - RMSE = {scores.mean()}')

#### Régression Ridge

[RidgeCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html)

In [None]:
# Show how the size of the design matrix grows with the polynomial degree
# (default settings, so the constant bias column is included).
ordres = list(range(1, 6))
for ordre in ordres:
    poly = PolynomialFeatures(degree=ordre)
    X_poly = poly.fit_transform(X_train)
    print(X_poly.shape)

In [None]:
# First look at Ridge on unscaled polynomial features.
# The printed value is the score on the data the model was fitted on
# (R² for scikit-learn regressors), so it is not an out-of-sample estimate.
ordres = list(range(1, 6))
for ordre in ordres:
    poly = PolynomialFeatures(degree=ordre, include_bias=False)
    X_poly = poly.fit_transform(X_train)
    ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10], cv=5)
    ridge.fit(X_poly, y_train)
    score = ridge.score(X_poly, y_train)
    print(score)

[Metrics and scoring](https://scikit-learn.org/stable/modules/model_evaluation.html)

In [None]:
# Ridge with standardized polynomial features: for each degree, pick alpha by
# 5-fold cross-validation on the negated RMSE and report the best CV score.
# Note: the scaler is fitted on the full training set before the CV split,
# so validation folds contribute to the scaling statistics.
scaler = StandardScaler()

ordres = list(range(1, 6))
for ordre in ordres:
    poly = PolynomialFeatures(degree=ordre, include_bias=False)
    X_poly = poly.fit_transform(X_train)
    X_scale = scaler.fit_transform(X_poly)
    ridge = RidgeCV(
        alphas=[1e-9, 1e-1, 1, 10, 50],
        cv=5,
        scoring="neg_root_mean_squared_error",
    )
    ridge.fit(X_scale, y_train)
    print(f'Ordre {ordre} - alpha = {ridge.alpha_} - RMSE = {ridge.best_score_}')

#### Régression Lasso

* [LassoCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html)
* [cross_val_score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html#sklearn.model_selection.cross_val_score)

In [None]:
# Lasso with standardized polynomial features, one run per polynomial degree.
scaler = StandardScaler()

ordres = [1, 2, 3, 4, 5]
for ordre in ordres:
    poly = PolynomialFeatures(ordre, include_bias=False)
    X_poly = poly.fit_transform(X_train)
    X_scale = scaler.fit_transform(X_poly)
    # Fitted on the whole training set to report the alpha selected by the inner 5-fold CV.
    lasso = LassoCV(alphas=[1e-2, 1e-1, 1, 5, 7.5], cv=5, max_iter=1000, n_jobs=-1).fit(X_scale, y_train)

    # Outer 5-fold CV of the whole "select alpha + fit" procedure.
    # The scorer returns the negated RMSE of each fold (closer to zero is better).
    scores = cross_val_score(lasso, X_scale, y_train, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
    best_score = scores.mean()

    # The value is a (negated) RMSE, not an MSE: label it like the LR and Ridge cells do.
    print(f'Ordre {ordre} - alpha = {lasso.alpha_}, RMSE = {best_score}')

In [None]:
# Same Lasso experiment with a finer alpha grid; one summary line per degree is
# collected in `results` so it can be printed in a separate cell.
results = []

scaler = StandardScaler()

ordres = [1, 2, 3, 4, 5]
for ordre in ordres:
    poly = PolynomialFeatures(ordre, include_bias=False)
    X_poly = poly.fit_transform(X_train)
    X_scale = scaler.fit_transform(X_poly)
    # Fitted on the whole training set to report the alpha selected by the inner 5-fold CV.
    lasso = LassoCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=5, max_iter=1000, n_jobs=-1).fit(X_scale, y_train)

    # Outer 5-fold CV; the scorer returns the negated RMSE of each fold.
    scores = cross_val_score(lasso, X_scale, y_train, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
    best_score = scores.mean()

    # The value is a (negated) RMSE, not an MSE: label it like the LR and Ridge cells do.
    results.append(f'Ordre {ordre} - alpha = {lasso.alpha_}, RMSE = {best_score}')

In [None]:
# Print one summary line per polynomial degree.
for line in results:
    print(line)

#### Elastic Net

[Elastic Net](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html)

In [None]:
# Elastic Net with standardized polynomial features; alpha and l1_ratio are both
# selected by 5-fold CV. One summary line per degree is collected in `results`.
results = []

scaler = StandardScaler()

ordres = [1, 2, 3, 4, 5]
for ordre in ordres:
    poly = PolynomialFeatures(ordre, include_bias=False)
    X_poly = poly.fit_transform(X_train)
    X_scale = scaler.fit_transform(X_poly)
    # Fitted on the whole training set to report the selected alpha and l1_ratio.
    en = ElasticNetCV(
        l1_ratio=[0.1, 0.25, 0.5, 0.75, 0.9],
        alphas=[1e-3, 1e-2, 1e-1, 1],
        cv=5,
        max_iter=1000,
        n_jobs=-1,
    ).fit(X_scale, y_train)

    # Outer 5-fold CV; the scorer returns the negated RMSE of each fold.
    scores = cross_val_score(en, X_scale, y_train, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
    best_score = scores.mean()

    # The value is a (negated) RMSE, not an MSE: label it like the LR and Ridge cells do.
    results.append(f'Ordre {ordre} - alpha = {en.alpha_}, l1_ratio = {en.l1_ratio_}, RMSE = {best_score}')

In [None]:
# Print one summary line per polynomial degree.
for line in results:
    print(line)

#### Plus proches voisins

In [None]:
# Candidate numbers of neighbours: the integers 1 through 100.
K = np.arange(1, 101)

In [None]:
# k-nearest-neighbours regression on the raw (unscaled) training features:
# one 5-fold cross-validated score per candidate k, collected in `results`.
results = []

for k in K:
    # No explicit fit is needed here: cross_val_score clones the estimator and
    # fits the clone on each training fold.
    neigh = KNeighborsRegressor(n_neighbors=k)
    # The scorer returns the negated RMSE of each fold (closer to zero is better).
    # NOTE(review): k must not exceed the size of a training fold — confirm the
    # upper end of K against the dataset size.
    scores = cross_val_score(neigh, X_train, y_train, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
    best_score = scores.mean()

    # The value is a (negated) RMSE, not an MSE: label it accordingly.
    results.append(f'K {k} - RMSE = {best_score}')

In [None]:
# Print one summary line per value of k.
for line in results:
    print(line)

And the winner is **Ridge Ordre 3 - alpha = 10, RMSE = 3.9801757450439816**

## Entraînement du modèle sélectionné

In [None]:
# Preprocessing for the selected model: degree-3 polynomial expansion followed
# by standardization, both fitted on the training set.
poly = PolynomialFeatures(degree=3, include_bias=False)
scaler = StandardScaler()
X_poly = poly.fit_transform(X_train)
X_scale = scaler.fit_transform(X_poly)

In [None]:
# Final model: Ridge regression with the selected regularization strength,
# trained on the standardized polynomial training features.
model_final = Ridge(
    alpha=10,
    max_iter=10000,
    random_state=2023,
)
model_final.fit(X_scale, y_train)

In [None]:
# Coefficients learned by the final Ridge model.
model_final.coef_

## Performances sur le jeu de test

In [None]:
# Apply the transformers that were fitted on the training set to the test set.
# Re-fitting them here (fit_transform) would standardize the test features with
# the test set's own mean and variance, so they would no longer be on the scale
# the model was trained on, and test information would leak into preprocessing.
X_test_poly = poly.transform(X_test)
X_test_scale = scaler.transform(X_test_poly)

In [None]:
# Score of the final model on the held-out test set
# (coefficient of determination R² for scikit-learn regressors).
model_final.score(X_test_scale, y_test)

In [None]:
# RMSE of the final model on the held-out test set.
# The square root is taken explicitly because the `squared=False` argument of
# `mean_squared_error` is deprecated since scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, model_final.predict(X_test_scale)))