In [1]:
import pandas as pd  
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score
import mlflow
from mlflow.tracking import MlflowClient 
import pickle
import datetime as dt
from xgboost import XGBClassifier


import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Clients data non based on CO2 data

In [2]:
clients = pd.read_pickle('clients_labeled_cleaned_08_03_2023.pkl') 
X, y = clients.drop(columns=["label"]), clients['label']

In [3]:
# Load data preprocessor
with open("data_transformer.pkl", 'rb') as file:
    transformer = pickle.load(file)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [5]:
def compute_metrics(y_train, pred_train, y_test, pred_test, model="", verbose=True) :
    acc_train = accuracy_score(y_train, pred_train)
    acc_test = accuracy_score(y_test, pred_test)
    prec_train = precision_score(y_train, pred_train, average='macro')
    prec_test = precision_score(y_test, pred_test, average='macro') 
    f1_train = f1_score(y_train, pred_train, average='micro')
    f1_test = f1_score(y_test, pred_test, average='micro') 

    mlflow.log_metric("accuracy_train", acc_train)
    mlflow.log_metric("accuracy_test", acc_test)
    mlflow.log_metric("precision_train", prec_train)
    mlflow.log_metric("precision_test", prec_test)
    mlflow.log_metric("f1_score_train", f1_train)
    mlflow.log_metric("f1_score_test", f1_test)

    if(verbose):
        display_metrics(acc_train, acc_test, prec_train, prec_test, f1_train, f1_test)


def display_metrics(acc_train, acc_test, prec_train, prec_test, f1_train, f1_test):
    print(f"train accuracy : {acc_train*100:.2f}%")
    print(f"test accuracy : {acc_test*100:.2f}%")
    print()
    print(f"train precision : {prec_train*100:.2f}%")
    print(f"test precision : {prec_test*100:.2f}%")
    print()
    print(f"train f1 : {f1_train*100:.2f}%")
    print(f"test f1 : {f1_test*100:.2f}%")
    print()

In [6]:
# Define models
p = {'C': 1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}
model1 = LogisticRegression(**p, random_state=RANDOM_STATE, n_jobs=-1)

p = {'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 300}
model2 = RandomForestClassifier(**p, bootstrap=True, n_jobs=-1, random_state=RANDOM_STATE) #bootsrap=Fasle : use all dataset to train each tree

p =  {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 200}
model3 = XGBClassifier(**p, random_state=RANDOM_STATE)

p = {'n_neighbors': 7, 'weights': 'uniform'}
model5 = KNeighborsClassifier(n_jobs=-1)


In [7]:
EXPERIMENT_ID = 2

# Load expermient or create new
experiment_name = 'experiment_'+ str(EXPERIMENT_ID)+ '_voting_clf_on_data_without_co2'
experiment_id = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment_id.experiment_id if experiment_id else mlflow.create_experiment(experiment_name)
    
with mlflow.start_run(experiment_id=experiment_id, run_name="voting_classifier_" + str(dt.datetime.now())[0:-10] ) :  
    # Define voting classifier and train
    voting_classifier = VotingClassifier(estimators=[('lr', model1), ('rf', model2), ('ada', model3), ('knn', model5)], voting='soft')
    voting_classifier.fit(X_train, y_train)
    
    pred_train = voting_classifier.predict(X_train)
    pred_test = voting_classifier.predict(X_test)
    
    # compute and logs metrics for each model
    compute_metrics(y_train, pred_train, y_test, pred_test, model="voting_clf")
    
    mlflow.sklearn.log_model(voting_classifier, "voting_clf")
    mlflow.log_param("model", "voting_clf (logistic_regression - random_forest - adaboost - gaussian_nb - knn)")
    mlflow.set_tag('estimator_class', type(voting_classifier))

train accuracy : 76.47%
test accuracy : 71.51%

train precision : 79.42%
test precision : 72.67%

train f1 : 76.47%
test f1 : 71.51%



# Clients data based on CO2 data

In [11]:
clients = pd.read_pickle('clients_labeled_cleaned_05_04_2023_co2_based.pkl') 
X, y = clients.drop(columns=["label"]), clients['label']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [13]:
EXPERIMENT_ID = 3
# Load expermient or create new
experiment_name = 'experiment_'+ str(EXPERIMENT_ID)+ '_voting_clf_on_data_with_co2'
experiment_id = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment_id.experiment_id if experiment_id else mlflow.create_experiment(experiment_name)
    
with mlflow.start_run(experiment_id=experiment_id, run_name="voting_classifier_" + str(dt.datetime.now())[0:-10] ) :  
    # Define voting classifier and train
    voting_classifier = VotingClassifier(estimators=[('lr', model1), ('rf', model2), ('ada', model3), ('knn', model5)], voting='soft')
    voting_classifier.fit(X_train, y_train)
    
    pred_train = voting_classifier.predict(X_train)
    pred_test = voting_classifier.predict(X_test)
    
    # compute and logs metrics for each model
    compute_metrics(y_train, pred_train, y_test, pred_test, model="voting_clf")
    
    mlflow.sklearn.log_model(voting_classifier, "voting_clf")
    mlflow.log_param("model", "voting_clf (logistic_regression - random_forest - adaboost - gaussian_nb - knn)")
    mlflow.set_tag('estimator_class', type(voting_classifier))

train accuracy : 73.25%
test accuracy : 69.48%

train precision : 70.16%
test precision : 65.40%

train f1 : 73.25%
test f1 : 69.48%



<font color=blue size=5>L'ajout des données de CO2 semble impacter négativement les perfomances des modèles </font>