In [1]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE

imblearn is a Python library that provides tools for dealing with imbalanced data in machine learning. One of the methods for dealing with imbalanced data is Synthetic Minority Over-sampling Technique (SMOTE), which is implemented in imblearn library.

SMOTE is an over-sampling technique that generates synthetic examples for the minority class by creating new examples along the line segments that join the minority class examples. The basic idea is to create synthetic examples that are similar to the existing minority class examples, but are slightly different. This can help to balance the class distribution and improve the performance of machine learning models.



In [3]:
cust_churn = pd.read_csv('cust_churn1.csv')

In [4]:
cust_churn.head()

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,1,0,0.0,0,0,0,1,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,0,0.464789,1,0,1,0,...,1,0,0,0,1,0,0,0,0,1
2,2,0,0,0,0,0.014085,1,0,1,1,...,1,0,0,1,0,0,0,0,0,1
3,3,0,0,0,0,0.619718,0,0,1,0,...,1,0,0,0,1,0,1,0,0,0
4,4,1,0,0,0,0.014085,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0


In [5]:
cust_churn = cust_churn.drop(columns = 'Unnamed: 0' ,axis=1)

In [11]:
x = cust_churn.drop('Churn',axis=1)
x

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,0.000000,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0
1,0,0,0,0,0.464789,1,0,1,0,1,...,1,0,0,0,1,0,0,0,0,1
2,0,0,0,0,0.014085,1,0,1,1,0,...,1,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0.619718,0,0,1,0,1,...,1,0,0,0,1,0,1,0,0,0
4,1,0,0,0,0.014085,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,0,1,1,0.323944,1,1,1,0,1,...,1,0,0,0,1,0,0,0,0,1
7028,1,0,1,1,1.000000,1,1,0,1,1,...,0,1,0,0,1,0,0,1,0,0
7029,1,0,1,1,0.140845,0,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
7030,0,1,1,0,0.042254,1,1,0,0,0,...,0,1,0,1,0,0,0,0,0,1


In [13]:
y = cust_churn['Churn']
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

# decision tree classifier
class sklearn.tree.DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)

criterion: default=”gini”
The function to measure the quality of a split.gini calculates the impurity
splitter: random to choose the best random split
max_depthint, default=None
The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
min_samples_splitint or float, default=2
The minimum number of samples required to split an internal node.consider min_samples_split as the minimum number.
min_samples_leafint or float, default=1
The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

In [15]:
dt = DecisionTreeClassifier(random_state=10,max_depth=6,min_samples_leaf=8)


In [16]:
dt.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=10)

In [17]:
y_pred = dt.predict(x_test)

In [18]:
y_pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [19]:
print(classification_report(y_test,y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1021
           1       0.66      0.48      0.56       386

    accuracy                           0.79      1407
   macro avg       0.74      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407



Precision = TP / (TP + FP),
The F1 score is the harmonic mean of precision and recall,
F1 score = 2 * (precision * recall) / (precision + recall)
recall is the proportion of positive examples in the dataset that are correctly identified by the mode,
Precision measures the proportion of positive predictions that are correct, while recall measures the proportion of actual positive examples that are correctly identified by the model. The F1 score provides a way to balance these two metrics, giving equal weight to precision and recall.

In [14]:
confusion_matrix(y_test,y_pred)

array([[922, 116],
       [170, 199]], dtype=int64)

In [15]:
sd = SMOTE()
x_resampled,y_resampled = sd.fit_resample(x,y)

In [16]:
xr_train, xr_test, yr_train, yr_test = train_test_split(x_resampled,y_resampled,test_size=0.2)

In [17]:
dt.fit(xr_train,yr_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=10)

In [18]:
yr_pred_smote= dt.predict(xr_test)

In [19]:
len(xr_train)

8260

In [20]:
print(classification_report(yr_test,yr_pred_smote))

              precision    recall  f1-score   support

           0       0.85      0.77      0.81      1056
           1       0.78      0.86      0.82      1010

    accuracy                           0.81      2066
   macro avg       0.81      0.81      0.81      2066
weighted avg       0.81      0.81      0.81      2066



In [21]:
#  as we have performed the swote analysis now the data is balanced

# random forest classifier

In [22]:
 from sklearn.ensemble import RandomForestClassifier

In [23]:
rf =RandomForestClassifier(n_estimators=100, random_state=10,max_depth=6,min_samples_leaf=8)


In [24]:
rf.fit(x_train,y_train)
y_pred_rf=rf.predict(x_test)

In [25]:
print(classification_report(y_test,y_pred_rf,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88      1038
           1       0.70      0.47      0.56       369

    accuracy                           0.81      1407
   macro avg       0.76      0.70      0.72      1407
weighted avg       0.80      0.81      0.79      1407



In [26]:
# as we can see that 1 probablity is less

In [27]:
sd = SMOTE()
xr_resampled , yr_resampled = sd.fit_resample(x,y)

In [28]:
smote_rf = RandomForestClassifier(n_estimators=100,random_state=10,max_depth=6,min_samples_leaf=8)

In [29]:
xr_train, xr_test, yr_train, yr_test = train_test_split(xr_resampled,yr_resampled, test_size=0.2)

In [30]:
smote_rf.fit(xr_train,yr_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=10)

In [31]:
yr_pred_smote = smote_rf.predict(xr_test)

In [32]:
yr_pred_smote

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [33]:
print(classification_report(yr_test,yr_pred_smote,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.85      0.77      0.81      1003
           1       0.80      0.87      0.84      1063

    accuracy                           0.82      2066
   macro avg       0.83      0.82      0.82      2066
weighted avg       0.83      0.82      0.82      2066



In [34]:
# as we can see after smoting, we are getting nice values compared then the unsmoted values

In [35]:
# saving our model
import pickle

In [36]:
filename='model.sav'
pickle.dump(smote_rf,open(filename,'wb'))

In [37]:
load_model= pickle.load(open(filename, 'rb'))

In [39]:
load_model.score(xr_test,yr_test)

0.8247821878025169