### Load Dataset & imports

In [105]:
import numpy as np
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score


# Charger les données
X_train = np.load("../data/classification/X_train.npy")
y_train = np.load("../data/classification/y_train.npy")
X_test = np.load('../data/classification/X_test.npy')
y_test = np.load('../data/classification/y_test.npy')

## Gridsearch for each model

### Logistic Regression

In [None]:
# Paramètres du modèle
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=5000))
])
param_grid = {"clf__C": [0.01, 0.05, 0.1, 0.2 ,0.4, 0.8, 1, 2, 5, 10], "clf__penalty": ["l2", "l1", "elasticnet"]}

# Exécuter GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)
print(f"Logistic Regression:\n  Best CV Accuracy: {grid.best_score_:.4f}")
print(f"  Best Params: {grid.best_params_}")
best_model = grid.best_estimator_

# Évaluation sur le test set
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"  Test set accuracy: {test_accuracy}\n")
    

### SVC

With some basic sets of hyperparameters we observed that the most performing kernel was "rbf" with a test accuracy of ~0.80. But this couldn't be improved since "rbf" kernel doesn't handle other hyperparameters that could be optimized unless we try to find other values with a very little gap. So we decided to focus on "poly" kernel, which was the second most performing kernel, this was nice because we can now try to optimize 'coef0' and 'degree' which exclusively used with "poly" kernel. We noticed that the best degree was always 3 which is the default one so we don't try to optimize it more. We reach a peak of 0.822 of test accuracy with params: {'clf__C': 1.0, 'clf__coef0': 0.12, 'clf__gamma': 'scale', 'clf__kernel': 'poly'}

In [138]:
# Model parameters

# First step
# pipe = Pipeline([
#     ('scaler', StandardScaler()),
#     ('clf', SVC())
# ])
# param_grid = {"clf__C": [0.05, 0.1, 0.5, 1.0, 3.0, 5.0, 10.0], "clf__kernel": ["poly", "rbf", "linear", "sigmoid"], "clf__gamma": ["scale", "auto", 0, 0.5, 1, 5, 10]}

# Focus on "poly" kernel
pipe = Pipeline([
    ('clf', SVC())
])
param_grid = {"clf__C": [0.9, 1.0, 1.2, 1.7, 1.8], "clf__kernel": ["poly"], "clf__gamma": ["scale", "auto", 5], "clf__coef0": [0.05, 0.07, 0.12, 0.13, 0.15, 0.17, 0.2]}

# Exécuter GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=7, scoring='accuracy', n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)
print(f"SVC:\n  Best CV Accuracy: {grid.best_score_:.4f}")
print(f"  Best Params: {grid.best_params_}")
best_model = grid.best_estimator_

# Évaluation sur le test set
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"  Test set accuracy: {test_accuracy}\n")


Fitting 7 folds for each of 105 candidates, totalling 735 fits
[CV 1/7] END clf__C=0.9, clf__coef0=0.05, clf__gamma=scale, clf__kernel=poly;, score=0.783 total time=   0.1s
[CV 4/7] END clf__C=0.9, clf__coef0=0.05, clf__gamma=scale, clf__kernel=poly;, score=0.780 total time=   0.1s
[CV 6/7] END clf__C=0.9, clf__coef0=0.05, clf__gamma=scale, clf__kernel=poly;, score=0.761 total time=   0.1s
[CV 7/7] END clf__C=0.9, clf__coef0=0.05, clf__gamma=scale, clf__kernel=poly;, score=0.779 total time=   0.1s
[CV 1/7] END clf__C=0.9, clf__coef0=0.05, clf__gamma=auto, clf__kernel=poly;, score=0.808 total time=   0.1s
[CV 5/7] END clf__C=0.9, clf__coef0=0.05, clf__gamma=scale, clf__kernel=poly;, score=0.811 total time=   0.1s
[CV 3/7] END clf__C=0.9, clf__coef0=0.05, clf__gamma=scale, clf__kernel=poly;, score=0.755 total time=   0.1s
[CV 2/7] END clf__C=0.9, clf__coef0=0.05, clf__gamma=scale, clf__kernel=poly;, score=0.780 total time=   0.1s
[CV 4/7] END clf__C=0.9, clf__coef0=0.05, clf__gamma=auto,

### KNeighbors

All the possible 'metrics' (no warning, compatible with the data) are used. We observ better performance arounf 30 neighbors, ~0.786 test accuracy.

In [127]:
# Paramètres du modèle
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier())
])
param_grid = {"clf__n_neighbors": [5, 15, 25, 27, 29, 30, 35, 50], "clf__weights": ["uniform", "distance"], "clf__metric": ["euclidean", "manhattan", "cityblock", "l1", "l2"], "clf__algorithm": ["auto", "ball_tree", "kd_tree", "brute"]}

# Exécuter GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=7, scoring='accuracy', n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)
print(f"KNeighbors:\n  Best CV Accuracy: {grid.best_score_:.4f}")
print(f"  Best Params: {grid.best_params_}")
best_model = grid.best_estimator_

# Évaluation sur le test set
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"  Test set accuracy: {test_accuracy}\n")


Fitting 7 folds for each of 320 candidates, totalling 2240 fits
[CV 2/7] END clf__algorithm=auto, clf__metric=euclidean, clf__n_neighbors=5, clf__weights=uniform;, score=0.710 total time=   0.0s
[CV 3/7] END clf__algorithm=auto, clf__metric=euclidean, clf__n_neighbors=5, clf__weights=uniform;, score=0.710 total time=   0.0s
[CV 1/7] END clf__algorithm=auto, clf__metric=euclidean, clf__n_neighbors=5, clf__weights=uniform;, score=0.738 total time=   0.0s
[CV 5/7] END clf__algorithm=auto, clf__metric=euclidean, clf__n_neighbors=5, clf__weights=uniform;, score=0.759 total time=   0.0s
[CV 6/7] END clf__algorithm=auto, clf__metric=euclidean, clf__n_neighbors=5, clf__weights=uniform;, score=0.705 total time=   0.1s
[CV 7/7] END clf__algorithm=auto, clf__metric=euclidean, clf__n_neighbors=5, clf__weights=uniform;, score=0.688 total time=   0.1s
[CV 1/7] END clf__algorithm=auto, clf__metric=euclidean, clf__n_neighbors=5, clf__weights=distance;, score=0.738 total time=   0.1s
[CV 4/7] END clf__

### MLPClassifier

In [135]:
# Paramètres du modèle
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', MLPClassifier(max_iter=5000))
])
param_grid = {"clf__hidden_layer_sizes": [(50,), (100,), (100, 50)], "clf__alpha": [0.1, 1, 2, 5], "clf__learning_rate": ["invscaling", "adaptive"], "clf__activation": ["relu", "tanh", "logistic"]}

# Exécuter GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)
print(f"MLPClassifier:\n  Best CV Accuracy: {grid.best_score_:.4f}")
print(f"  Best Params: {grid.best_params_}")
best_model = grid.best_estimator_

# Évaluation sur le test set
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"  Test set accuracy: {test_accuracy}\n")


Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 2/5] END clf__activation=relu, clf__alpha=0.1, clf__hidden_layer_sizes=(50,), clf__learning_rate=adaptive;, score=0.723 total time=   4.0s
[CV 3/5] END clf__activation=relu, clf__alpha=0.1, clf__hidden_layer_sizes=(50,), clf__learning_rate=adaptive;, score=0.728 total time=   4.1s
[CV 5/5] END clf__activation=relu, clf__alpha=0.1, clf__hidden_layer_sizes=(50,), clf__learning_rate=invscaling;, score=0.645 total time=   4.2s
[CV 3/5] END clf__activation=relu, clf__alpha=0.1, clf__hidden_layer_sizes=(50,), clf__learning_rate=invscaling;, score=0.720 total time=   4.2s
[CV 2/5] END clf__activation=relu, clf__alpha=0.1, clf__hidden_layer_sizes=(50,), clf__learning_rate=invscaling;, score=0.713 total time=   4.3s
[CV 1/5] END clf__activation=relu, clf__alpha=0.1, clf__hidden_layer_sizes=(50,), clf__learning_rate=invscaling;, score=0.710 total time=   4.4s
[CV 4/5] END clf__activation=relu, clf__alpha=0.1, clf__hidden_layer_siz

### AdaBoost

In [137]:
# Paramètres du modèle
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', AdaBoostClassifier())
])
param_grid = {"clf__n_estimators": [100, 200, 300, 500, 700], "clf__learning_rate": [0.01, 0.02, 0.05, 0.1, 0.05, 1]}

# Exécuter GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=7, scoring='accuracy', n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)
print(f"AdaBoost:\n  Best CV Accuracy: {grid.best_score_:.4f}")
print(f"  Best Params: {grid.best_params_}")
best_model = grid.best_estimator_

# Évaluation sur le test set
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"  Test set accuracy: {test_accuracy}\n")

Fitting 7 folds for each of 30 candidates, totalling 210 fits
[CV 2/7] END clf__learning_rate=0.01, clf__n_estimators=100;, score=0.692 total time=   0.9s
[CV 7/7] END clf__learning_rate=0.01, clf__n_estimators=100;, score=0.698 total time=   1.0s
[CV 3/7] END clf__learning_rate=0.01, clf__n_estimators=100;, score=0.650 total time=   0.9s
[CV 1/7] END clf__learning_rate=0.01, clf__n_estimators=100;, score=0.647 total time=   1.0s
[CV 4/7] END clf__learning_rate=0.01, clf__n_estimators=100;, score=0.668 total time=   1.0s
[CV 6/7] END clf__learning_rate=0.01, clf__n_estimators=100;, score=0.653 total time=   1.0s
[CV 5/7] END clf__learning_rate=0.01, clf__n_estimators=100;, score=0.710 total time=   1.0s
[CV 1/7] END clf__learning_rate=0.01, clf__n_estimators=200;, score=0.650 total time=   2.1s
[CV 2/7] END clf__learning_rate=0.01, clf__n_estimators=200;, score=0.717 total time=   2.2s
[CV 7/7] END clf__learning_rate=0.01, clf__n_estimators=200;, score=0.712 total time=   2.1s
[CV 3/7]