# <a id='1'>0. Sommaire</a>

- <a href='#1'>0. Sommaire</a>  
- <a href='#2'>1. Librairies</a>
- <a href='#3'>2. Paramètres MLFLOW</a>
- <a href='#4'>3. Data</a>
- <a href='#5'>4. Modelisation</a>

# <a id='1'>1. Librairies</a>

In [15]:
import os

import lightgbm_with_simple_features as fe
from importlib import reload


import numpy as np
import pandas as pd
import pickle
import mlflow
import dagshub
import verstack
from sklearn.metrics import fbeta_score, make_scorer


from pycaret.classification import *

# <a id='2'>2. Paramètres MLFLOW</a>

In [2]:
dagshub.init("OC-DS-P7_mlflow", "sefirotha", mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/sefirotha/OC-DS-P7_mlflow.mlflow")
mlflow.start_run()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=54b8d1cb-433e-4d48-9a3d-67f7ad55ac67&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=f3a84616b3946c8053a9e95bc214e226f85559ff271118eeb111115a61e43cf4




Output()

<ActiveRun: >

# <a id='3'>3. Data</a>

In [4]:
df = pd.read_pickle(r"..\Data\Processed_data\df_final_prot5.pkl")

In [7]:
with open(r'..\Data\Processed_data\var_cons_df.pickle', 'rb') as f:
    features_to_keep = pickle.load(f)

In [8]:
features_to_keep = features_to_keep['variables'].to_list()

In [9]:
features_to_ignore = list(set(df.keys().to_list()) - set(features_to_keep))
features_to_ignore.remove('TARGET')

In [10]:
train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]

In [11]:
test_df.drop('TARGET', axis = 1, inplace = True)

# <a id='5'>4. Modelisation</a>

In [12]:
s = setup(data = train_df, 
          target = 'TARGET', 
          train_size = 0.8,
          ignore_features = features_to_ignore,
          log_data = True,
          log_experiment = "dagshub",
          experiment_name = "final comparison",
          normalize = True,
          normalize_method="robust",
          fix_imbalance = True,
          fix_imbalance_method = 'SMOTE',
          session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,TARGET
2,Target type,Binary
3,Original data shape,"(307507, 546)"
4,Transformed data shape,"(513792, 160)"
5,Transformed train set shape,"(452290, 160)"
6,Transformed test set shape,"(61502, 160)"
7,Ignore features,386
8,Numeric features,159
9,Preprocess,True


Please insert your repository owner_name/repo_name: sefirotha/OC-DS-P7_mlflow


2023/06/21 16:45:05 INFO mlflow.tracking.fluent: Experiment with name 'final comparison' does not exist. Creating a new experiment.


In [13]:
def Score_Metier(y_true, y_pred):
    # coût d'un faux positif et un faux négatif
    cout_fp = 1
    cout_fn = 10
    
    # nombre total d'exemples positifs et négatifs
    n_pos = (y_true==1).sum()
    n_neg = (y_true==0).sum()
    # calcul du coût maximum possible
    max_cout = cout_fp * n_neg + cout_fn * n_pos
    
    # Calcul du nombre de faux positifs et faux négatifs
    fp = ((y_pred == 1) & (y_true == 0)).sum()
    fn = ((y_pred == 0) & (y_true == 1)).sum()
    
    # calcul du coût total    
    cout = (cout_fp * fp + cout_fn * fn)  # / (fn + fp)
    cout_notmalise = cout/max_cout
    score = round(1 - cout_notmalise, 2)
    return score

#score_metier = make_scorer(Score_Metier, greater_is_better=True)

In [16]:
add_metric("f10", "F10", fbeta_score, beta = 3.16)
add_metric("score_metier", "Score Métier", Score_Metier)

Name                                                  Score Métier
Display Name                                          Score Métier
Score Function       <function Score_Metier at 0x0000022ED5624310>
Scorer                                   make_scorer(Score_Metier)
Target                                                        pred
Args                                                            {}
Greater is Better                                             True
Multiclass                                                    True
Custom                                                        True
Name: score_metier, dtype: object

In [17]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,F10,Score Métier,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9194,0.7714,0.0222,0.5278,0.0425,0.0362,0.0956,0.0243,0.54,13.587
rf,Random Forest Classifier,0.9193,0.7227,0.0025,0.5305,0.0049,0.0041,0.0313,0.0027,0.53,123.68
gbc,Gradient Boosting Classifier,0.9193,0.7364,0.0075,0.4957,0.0148,0.0124,0.0527,0.0082,0.537,561.487
et,Extra Trees Classifier,0.9193,0.7245,0.0028,0.5372,0.0056,0.0048,0.0342,0.0031,0.53,67.239
dummy,Dummy Classifier,0.9193,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.53,1.97
xgboost,Extreme Gradient Boosting,0.9184,0.7673,0.0532,0.4568,0.0953,0.0798,0.1345,0.0579,0.553,358.59
ada,Ada Boost Classifier,0.911,0.6976,0.0461,0.2378,0.0769,0.0521,0.0723,0.0497,0.547,106.047
dt,Decision Tree Classifier,0.8432,0.5383,0.1747,0.1352,0.1524,0.0676,0.0682,0.1702,0.562,29.863
knn,K Neighbors Classifier,0.7415,0.5848,0.335,0.1167,0.173,0.0605,0.0724,0.2862,0.571,491.975
lda,Linear Discriminant Analysis,0.7002,0.761,0.6855,0.1678,0.2696,0.1608,0.2242,0.5352,0.696,14.014


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [18]:
results = pull()

In [19]:
results.to_csv("model_comparison_final.csv")

In [20]:
mlflow.end_run()