In [17]:
# import libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score, accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
from sklearn.feature_selection import SelectKBest
from collections import Counter

# ignore warning
import warnings
warnings.filterwarnings('ignore')
import matplotlib.ticker as mtick # pour voir les pourcentages dans les plots
import pickle # pour sauvegarder le modèle

In [18]:
data = pd.read_csv("../data/processed/df_encoded.csv")
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


In [60]:
df_en = pd.read_csv("../data/processed/df_encoded.csv")

<small>

#### Séparation en train et test

In [19]:
# splitting dataset into dependent and independent feature
X = data.drop(columns='Churn')
y = data['Churn']

In [20]:
X.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65


#### Sélection des variables

<small>

Nous selectionnons que 10 variables ayant les plus fortes corrélations

In [21]:
# Feature Selection
selection = SelectKBest()  # k=10 par défaut
X = selection.fit_transform(X,y)

In [22]:
#  
selection.get_support()

array([False, False, False,  True,  True, False, False, False,  True,
        True,  True,  True, False, False,  True,  True, False,  True,
        True])

<small>


Selon la sélection de variables, nous avons retenu **10 variables sur 21**.  

Les variables sélectionnées sont :  

- **Dependents**  
- **tenure**  
- **OnlineSecurity**  
- **OnlineBackup**  
- **DeviceProtection**  
- **TechSupport**  
- **Contract**  
- **PaperlessBilling**  
- **MonthlyCharges**  
- **TotalCharges**


In [23]:
# Separation for train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [24]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5634, 10), (1409, 10), (5634,), (1409,))

In [25]:
y.value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

<small>

#### Apply into machine learning algorithm:


In [26]:
Log_reg = LogisticRegression(C=150, max_iter=150)
Log_reg.fit(X_train, y_train)
log_pred = Log_reg.predict(X_test)

print(f'Accuracy score : {accuracy_score(log_pred, y_test)}')
print(f'Confusion matrix :\n {confusion_matrix(log_pred, y_test)}')
print(f'Classification report :\n {classification_report(log_pred, y_test)}')

Accuracy score : 0.7991483321504613
Confusion matrix :
 [[925 173]
 [110 201]]
Classification report :
               precision    recall  f1-score   support

           0       0.89      0.84      0.87      1098
           1       0.54      0.65      0.59       311

    accuracy                           0.80      1409
   macro avg       0.72      0.74      0.73      1409
weighted avg       0.82      0.80      0.81      1409



In [27]:
# Random forest classifier
Rfc = RandomForestClassifier(n_estimators=120,criterion='gini', max_depth=15, min_samples_leaf=10, min_samples_split=5)
Rfc.fit(X_train, y_train)
rfc_pred = Rfc.predict(X_test)

print(f'Accuracy score : {accuracy_score(rfc_pred, y_test)}')
print(f'Confusion matrix :\n {confusion_matrix(rfc_pred, y_test)}')
print(f'Classification report :\n {classification_report(rfc_pred, y_test)}')

Accuracy score : 0.7885024840312278
Confusion matrix :
 [[932 195]
 [103 179]]
Classification report :
               precision    recall  f1-score   support

           0       0.90      0.83      0.86      1127
           1       0.48      0.63      0.55       282

    accuracy                           0.79      1409
   macro avg       0.69      0.73      0.70      1409
weighted avg       0.82      0.79      0.80      1409



In [28]:
# decisionTree Classifier
Dtc = DecisionTreeClassifier(criterion='gini', splitter='random', min_samples_leaf=15)
Dtc.fit(X_train, y_train)
dtc_pred = Dtc.predict(X_test)

print(f'Accuracy score : {accuracy_score(dtc_pred, y_test)}')
print(f'Confusion matrix :\n {confusion_matrix(dtc_pred, y_test)}')
print(f'Classification report :\n {classification_report(dtc_pred, y_test)}')

Accuracy score : 0.7778566359119943
Confusion matrix :
 [[913 191]
 [122 183]]
Classification report :
               precision    recall  f1-score   support

           0       0.88      0.83      0.85      1104
           1       0.49      0.60      0.54       305

    accuracy                           0.78      1409
   macro avg       0.69      0.71      0.70      1409
weighted avg       0.80      0.78      0.79      1409



<small>

#### Gestion du déséquilibre des classes

En comparant avec le jeu de données déséquilibré, notre modèle obtient des performances correctes,  
mais pas suffisamment solides pour un projet de bout en bout.  

Nous devons donc appliquer une **technique de sur-échantillonnage** afin de :  
- Réduire les **TN** (True Negative) et **FN** (False Negative),  
- Augmenter les **FP** (False Positive) et **TP** (True Positive).  

Cela permet d’améliorer l’apprentissage du modèle et d’obtenir des prédictions plus équilibrées.


<small>


#### Utilisation de SMOTEENN pour le jeu de données déséquilibré

- **SMOTE (Synthetic Minority Over-sampling Technique)** : permet le sur-échantillonnage en générant de nouvelles instances synthétiques de la classe minoritaire.  
- **ENN (Edited Nearest Neighbours)** : nettoie les données en supprimant les exemples ambigus ou mal classés, ce qui correspond à un sous-échantillonnage.  

La méthode **SMOTEENN** combine ces deux approches pour :  
- **Sur-échantillonner** la classe minoritaire,  
- **Sous-échantillonner/nettoyer** la classe majoritaire,  
- Produire ainsi un jeu de données mieux équilibré et plus adapté à l’entraînement du modèle.


In [29]:
st=SMOTEENN()
X_train_st,y_train_st = st.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_st)))

The number of classes before fit Counter({0: 4139, 1: 1495})
The number of classes after fit Counter({1: 2429, 0: 2173})


<small>

#### Répartition des classes avant et après SMOTEENN

- **Avant l’application de SMOTEENN** :  
  - Classe 0 : 4139  
  - Classe 1 : 1495  

Forte dominance de la classe 0 (déséquilibre important).

- **Après l’application de SMOTEENN** :  
  - Classe 0 : 2173  
  - Classe 1 : 2429  

Les classes sont désormais beaucoup plus équilibrées, ce qui améliore l’apprentissage du modèle et sa capacité à détecter correctement la classe minoritaire.


In [30]:
# splitting the over sampling dataset 
X_train_sap, X_test_sap, y_train_sap, y_test_sap = train_test_split(X_train_st, y_train_st, test_size=0.2)

In [31]:
# decisionTree Classifier
Dtc_sampling = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=7, min_samples_leaf=15)
Dtc_sampling.fit(X_train_sap, y_train_sap)
dtc_sampling_pred = Dtc_sampling.predict(X_test_sap)

print(f'Accuracy score : {accuracy_score(dtc_sampling_pred, y_test_sap)}')
print(f'Confusion matrix :\n {confusion_matrix(dtc_sampling_pred, y_test_sap)}')
print(f'Classification report :\n {classification_report(dtc_sampling_pred, y_test_sap)}')

Accuracy score : 0.9196525515743756
Confusion matrix :
 [[402  30]
 [ 44 445]]
Classification report :
               precision    recall  f1-score   support

           0       0.90      0.93      0.92       432
           1       0.94      0.91      0.92       489

    accuracy                           0.92       921
   macro avg       0.92      0.92      0.92       921
weighted avg       0.92      0.92      0.92       921



In [32]:
# Random forest classifier
Rfc_sampling = RandomForestClassifier(n_estimators=150,criterion='gini', max_depth=15, min_samples_leaf=10, min_samples_split=6)
Rfc_sampling.fit(X_train_sap, y_train_sap)
rfc_sampling_pred = Rfc_sampling.predict(X_test_sap)

print(f'Accuracy score : {accuracy_score(rfc_sampling_pred, y_test_sap)}')
print(f'Confusion matrix :\n {confusion_matrix(rfc_sampling_pred, y_test_sap)}')
print(f'Classification report :\n {classification_report(rfc_sampling_pred, y_test_sap)}')

Accuracy score : 0.9348534201954397
Confusion matrix :
 [[411  25]
 [ 35 450]]
Classification report :
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       436
           1       0.95      0.93      0.94       485

    accuracy                           0.93       921
   macro avg       0.93      0.94      0.93       921
weighted avg       0.94      0.93      0.93       921



In [34]:
# logistic regression
Log_reg_sampling = LogisticRegression(C=10, max_iter=150)
Log_reg_sampling.fit(X_train_sap, y_train_sap)
Log_sampling_pred = Log_reg_sampling.predict(X_test_sap)

print(f'Accuracy score : {accuracy_score(Log_sampling_pred, y_test_sap)}')
print(f'Confusion matrix :\n {confusion_matrix(Log_sampling_pred, y_test_sap)}')
print(f'Classification report :\n {classification_report(Log_sampling_pred, y_test_sap)}')

Accuracy score : 0.9033659066232356
Confusion matrix :
 [[392  35]
 [ 54 440]]
Classification report :
               precision    recall  f1-score   support

           0       0.88      0.92      0.90       427
           1       0.93      0.89      0.91       494

    accuracy                           0.90       921
   macro avg       0.90      0.90      0.90       921
weighted avg       0.90      0.90      0.90       921



In [35]:
# GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train_sap, y_train_sap)
pred = gbc.predict(X_test_sap)

print(f'Accuracy score : {accuracy_score(pred, y_test_sap)}')
print(f'Confusion matrix :\n {confusion_matrix(pred, y_test_sap)}')
print(f'Classification report :\n {classification_report(pred, y_test_sap)}')

Accuracy score : 0.9457111834961998
Confusion matrix :
 [[415  19]
 [ 31 456]]
Classification report :
               precision    recall  f1-score   support

           0       0.93      0.96      0.94       434
           1       0.96      0.94      0.95       487

    accuracy                           0.95       921
   macro avg       0.95      0.95      0.95       921
weighted avg       0.95      0.95      0.95       921



In [36]:
param_grid = {'n_estimators':[100, 150, 200, 250, 300],
             'criterion': ['friedman_mse', 'squared_error', 'mse', 'mae'],
             'min_samples_split': [2,3,4,5,6,7,8,9,10],
             'min_samples_leaf': [1,3,5,7,9,11,13,15],'max_leaf_nodes': [3,6,8,9,12,15,18,24],
              'max_depth': [3,5,7,9,11,13,15,17,19],
              'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
              'loss': ['deviance', 'exponential']
              }

In [37]:
gbc_optm = RandomizedSearchCV(estimator=gbc, param_distributions=param_grid,n_iter=100, verbose=3)
gbc_optm.fit(X_train_sap, y_train_sap)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=9, max_leaf_nodes=9, min_samples_leaf=13, min_samples_split=4, n_estimators=250;, score=0.953 total time=   4.1s
[CV 2/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=9, max_leaf_nodes=9, min_samples_leaf=13, min_samples_split=4, n_estimators=250;, score=0.961 total time=   4.7s
[CV 3/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=9, max_leaf_nodes=9, min_samples_leaf=13, min_samples_split=4, n_estimators=250;, score=0.962 total time=   4.1s
[CV 4/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=9, max_leaf_nodes=9, min_samples_leaf=13, min_samples_split=4, n_estimators=250;, score=0.951 total time=   4.0s
[CV 5/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=9, max_leaf_nodes=9, min_samples_leaf=13, min_samples_split=

0,1,2
,estimator,GradientBoostingClassifier()
,param_distributions,"{'criterion': ['friedman_mse', 'squared_error', ...], 'learning_rate': [0.05, 0.1, ...], 'loss': ['deviance', 'exponential'], 'max_depth': [3, 5, ...], ...}"
,n_iter,100
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,3
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,loss,'exponential'
,learning_rate,0.5
,n_estimators,250
,subsample,1.0
,criterion,'squared_error'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,11
,min_impurity_decrease,0.0


In [38]:
# GradientBoostingClassifier
gbc_tunning = GradientBoostingClassifier(criterion='squared_error', learning_rate=0.3,
                           max_depth=19, max_leaf_nodes=24, min_samples_leaf=9,
                           min_samples_split=7, n_estimators=150)
gbc_tunning.fit(X_train_sap, y_train_sap)
pred = gbc_tunning.predict(X_test_sap)

print(f'Accuracy score : {accuracy_score(pred, y_test_sap)}')
print(f'Confusion matrix :\n {confusion_matrix(pred, y_test_sap)}')
print(f'Classification report :\n {classification_report(pred, y_test_sap)}')

Accuracy score : 0.9619978284473398
Confusion matrix :
 [[424  13]
 [ 22 462]]
Classification report :
               precision    recall  f1-score   support

           0       0.95      0.97      0.96       437
           1       0.97      0.95      0.96       484

    accuracy                           0.96       921
   macro avg       0.96      0.96      0.96       921
weighted avg       0.96      0.96      0.96       921



<small>

#### Model Saving

In [67]:
import pickle
import joblib

In [68]:
import pickle
import os

# chemin relatif 
filename = os.path.join("..", "models", "Model.sav")

with open(filename, "wb") as f:
    joblib.dump(gbc_tunning, f)

print(f"Modèle sauvegardé dans {filename}")


Modèle sauvegardé dans ..\models\Model.sav


In [46]:
load_model = pickle.load(open(filename, 'rb'))

In [47]:
load_model.score(X_test_sap, y_test_sap)

0.9619978284473398

In [48]:
Dependents = 'Yes'
tenure = 1
OnlineSecurity = 'No'
OnlineBackup = 'Yes'
DeviceProtection = 'No'
TechSupport = 'No'
Contract = 'Month-to-month'
PaperlessBilling = 'No'
MonthlyCharges = 29.85
TotalCharges = 556.85

In [49]:
data = [[Dependents, tenure, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, Contract, PaperlessBilling, MonthlyCharges, TotalCharges]]

In [50]:
df = pd.DataFrame(data, columns=['Dependents', 'tenure', 'OnlineSecurity',
        'OnlineBackup', 'DeviceProtection', 'TechSupport', 'Contract',
        'PaperlessBilling', 'MonthlyCharges', 'TotalCharges'])
df.head()
print(df.dtypes)

Dependents           object
tenure                int64
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
Contract             object
PaperlessBilling     object
MonthlyCharges      float64
TotalCharges        float64
dtype: object


In [51]:
for feature in df.columns:
    if df[feature].dtypes == 'O':
        categorical_feature = feature
        print(categorical_feature)

Dependents
OnlineSecurity
OnlineBackup
DeviceProtection
TechSupport
Contract
PaperlessBilling


In [52]:
encoder = LabelEncoder()
for feature in df.columns:
    if df[feature].dtypes == 'O':
        df[feature] = encoder.fit_transform(df[feature])

In [53]:
df.head()

Unnamed: 0,Dependents,tenure,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,Contract,PaperlessBilling,MonthlyCharges,TotalCharges
0,0,1,0,0,0,0,0,0,29.85,556.85


In [54]:
single = load_model.predict(df)
probability = load_model.predict_proba(df)[:,1]

In [55]:
print(single)

[1]


In [56]:
print(probability)

[0.63423622]


In [57]:
if single == 1:
    print("This Customer is likely to be Churned!")
    print(f"Confidence level is {np.round(probability*100, 2)}")
else:
    print("This Customer is likely to be Continue!")
    print(f"Confidence level is {np.round(probability*100, 2)}")

This Customer is likely to be Churned!
Confidence level is [63.42]


In [66]:
import os
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import pickle
import joblib

df_encoder = df_en.copy()

# 
project_root = Path(os.getcwd()).parent
models_dir = project_root / "models"

# 
models_dir.mkdir(exist_ok=True)

# Colonnes catégorielles
categorical_features = [
    'Dependents', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'Contract', 'PaperlessBilling'
]

encoders = {}

for feature in categorical_features:
    le = LabelEncoder()
    df_encoder[feature] = le.fit_transform(df_encoder[feature])
    encoders[feature] = le  

# Sauvegarde dans models
with open(models_dir / "encoders.pkl", "wb") as f:
    joblib.dump(encoders, f)

print(f"encoders.pkl sauvegardé dans {models_dir}")


encoders.pkl sauvegardé dans c:\Users\papes\OneDrive - Ecoles Galiléo Global Education France\Bureau\Data scientist\churn_prediction\models


In [69]:
import numpy as np
print(np.__version__)


2.3.3
