# Python tests
Some examples of tests applied during the development of ML models :
- preprocessing
- training
- gridsearch & hyperparameters tuning

## Preprocessing of data with sklearn

In [14]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Fonction pour normaliser les données
def normalize_data(data):
    scaler = StandardScaler()
    return scaler.fit_transform(data)

# Fonction pour encoder les variables catégorielles
def encode_categorical(data):
    encoder = OneHotEncoder(sparse_output=False)
    return encoder.fit_transform(data)

# Exemple de test pour la normalisation des données
def test_normalize_data():
    data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    # Generate ERROR with :
    #data = np.array([[0, 0, 0], [4, 5, 6], [7, 8, 9]])
    normalized_data = normalize_data(data)
    assert np.allclose(normalized_data.mean(axis=0), 0, rtol=0, atol=0), "Error : scaling (with mean) process failed"
    assert np.allclose(normalized_data.std(axis=0), 1, rtol=0, atol=0), "Error : scaling (with sd) process failed"

# Exemple de test pour l'encodage des variables catégorielles
def test_encode_categorical():
    data = np.array([['cat'], ['dog'], ['cat']])
    encoded_data = encode_categorical(data)
    expected_data = np.array([[1, 0], [0, 1], [1, 0]])
    # Generate ERROR with :
    # expected_data = np.array([[0, 0], [0, 1], [1, 0]])
    assert np.array_equal(encoded_data, expected_data), "Error : encoding process failed"


In [15]:
test_normalize_data()

In [7]:
test_encode_categorical()

## Train a classifier model with sklearn

In [18]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fonction pour entraîner un modèle de classification
def train_classifier(X_train, y_train):
    model = SVC()
    model.fit(X_train, y_train)
    return model

# Exemple de test pour l'entraînement du modèle
def test_train_classifier():
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = train_classifier(X_train, y_train)
    
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    assert accuracy > 0.7, "Error - Performance not greater than 0.7 (70%)"
    # Generate ERROR with :
    assert accuracy > 1, "Error - Performance not greater than 1 (100%) :)"

In [19]:
test_train_classifier()

AssertionError: Error - Performance not greater than 1 (100%) :)

## Grid search & Hyperparameters tuning with GridSearchCV & sklearn

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Fonction pour effectuer une recherche d'hyperparamètres
def hyperparameter_search(X, y):
    param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20, 30]}
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
    grid_search.fit(X, y)
    return grid_search

# Exemple de test pour la recherche d'hyperparamètres
def test_hyperparameter_search():
    X, y = load_iris(return_X_y=True)
    grid_search = hyperparameter_search(X, y)
    
    best_params = grid_search.best_params_
    assert 'n_estimators' in best_params, "Error - Failed to retrieve 'n_estimators' in best params"
    assert 'max_depth' in best_params, "Error - Failed to retrieve 'max_depth' in best params"

# Exemple de test pour évaluer le modèle avec les meilleurs paramètres
def test_best_model_performance():
    X, y = load_iris(return_X_y=True)
    grid_search = hyperparameter_search(X, y)
    
    best_model = grid_search.best_estimator_
    baseline_model = RandomForestClassifier()
    baseline_model.fit(X, y)
    
    best_score = best_model.score(X, y)
    baseline_score = baseline_model.score(X, y)
    
    assert best_score >= baseline_score
    # Generate ERROR with :
    assert best_score > 1, "Error - Performance of best model not greater than baseline model :)"


In [21]:
test_hyperparameter_search()

In [22]:
test_best_model_performance()

AssertionError: Error - Performance of best model not greater than baseline model :)