In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [None]:
!pip install imblearn  # specifically designed to deal with imbalanced datasets

In [2]:
from imblearn.combine import SMOTEENN #By generating new synthetic samples, instead of simply duplicating existing samples, SMOTE can help to reduce the risk of overfitting which commonly accompanies random oversampling.

In [None]:
data = pd.read_csv('cust_churn_dummy_var.csv')
data

In [None]:
data.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [None]:
data

In [None]:
x = data.drop('Churn',axis=True)
y = data['Churn']

In [None]:
y

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

## Using DecisionTreeClassifier

In [None]:
dtc = DecisionTreeClassifier(criterion='gini',random_state=42, max_depth=6,min_samples_leaf=8) # used random parameters

In [None]:
dtc.fit(x_train,y_train)

In [None]:
y_pred = dtc.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
accuracy_score(dtc.predict(x_train),y_train)

In [None]:
# Hperparameter Tuning
from sklearn.tree import DecisionTreeClassifier
dtc_dummy = DecisionTreeClassifier(random_state=42)
from sklearn.model_selection import GridSearchCV
params = {
    'max_depth': [2, 3,4, 5,7,8,9,10, 20],
    'min_samples_leaf': [5,6,7,8,9,10,11,12,13,14,15,20, 50, 100,101,102,103,105,110],
    'criterion': ["gini", "entropy"]
}
grid_search = GridSearchCV(estimator=dtc_dummy, 
                           param_grid=params, 
                           cv=5, n_jobs=-1, verbose=1, scoring = "accuracy")
grid_search.fit(x_train, y_train)
grid_search.best_estimator_

In [None]:
dtc_hyp_tuned = DecisionTreeClassifier(criterion='entropy',random_state=42, max_depth=7,min_samples_leaf=101) # tuned parameters 

In [None]:
dtc_hyp_tuned.fit(x_train,y_train)
y_pred = dtc_hyp_tuned.predict(x_test)

In [None]:
print(accuracy_score(y_test,y_pred))
print(accuracy_score(dtc_hyp_tuned.predict(x_train),y_train))

dtc --> random parameters||
dtc_hyp_tuned --> Tuned Parameters||
dtc_dummy --> dummy function made for cross validation

In [None]:
# since the model is not performing well either on training as well as test even after Hyperparameter Tuning

In [None]:
# to make data balanced we will use smoteenn technique

In [None]:
smn = SMOTEENN()
x_new, y_new = smn.fit_resample(x,y)

In [None]:
x_new_train, x_new_test, y_new_train, y_new_test = train_test_split(x_new,y_new,test_size=0.2)

Results after SMOTEENN

In [None]:
# Result with Random hyperparameters
dtc.fit(x_new_train,y_new_train)
y_new_pred = dtc.predict(x_new_test)
accuracy_score(y_new_test,y_new_pred)

In [None]:
# Result with Tuned hyperparameters
dtc_hyp_tuned.fit(x_new_train,y_new_train)
y_new_pred = dtc_hyp_tuned.predict(x_new_test)
accuracy_score(y_new_test,y_new_pred)

In [None]:
confusion_matrix(y_new_test,y_new_pred)

K-Fold Cross Validation

In [None]:
# K-Fold Cross-Validation
from sklearn.model_selection import cross_validate
def cross_validation(model, x, y, cv=10):
    
    score = ['accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(estimator=model,
                               X=x,
                               y=y,
                               cv=cv,
                               scoring=score,
                               return_train_score=True)
      
    print(
          "Mean Validation Accuracy ", results['test_accuracy'].mean(),"\n",
          "Mean Validation Precision ", results['test_precision'].mean(),"\n",
          "Mean Validation Recall ", results['test_recall'].mean(),"\n",
          "Mean Validation F1 Score ", results['test_f1'].mean())

In [None]:
# without using SMOTEENN data

In [None]:
print("UnTuned_parameters Results:\n")

decision_tree_result = cross_validation(dtc_dummy,x_train, y_train, 5)
print(decision_tree_result)
# very bad score for recall, F1 score

In [None]:
decision_tree_result = cross_validation(dtc,x_train, y_train, 5)
print(decision_tree_result)

In [None]:
print("Tuned_parameters Results:\n")
decision_tree_result = cross_validation(dtc_hyp_tuned,x_train, y_train, 5)
print(decision_tree_result)

In [None]:
# dtc_hyp_tuned is comparatively performinng well for imbalanced data

In [None]:
# now apply on SMOOTEENN data
decision_tree_result = cross_validation(dtc,x_new_train, y_new_train, 5)
print(decision_tree_result)
print("\n")

In [None]:
# SMOOTEENN improves the overall score

In [None]:
# saving the model
import pickle
filename = 'dtc.sav'
pickle.dump(dtc,open(filename,'wb'))

## Now Checking with Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
y_pred = rfc.predict(x_test)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
# Bit of score is improved in comparison to decision tree classifier

In [None]:
rfc.fit(x_new_train,y_new_train)
y_new_pred = rfc.predict(x_new_test)

In [None]:
accuracy_score(y_new_pred,y_new_test)

In [None]:
print(classification_report(y_new_pred,y_new_test))

In [None]:
# amazing score by using the SMOOTEENN data

In [None]:
# K-Fold Cross-Validation
from sklearn.model_selection import cross_validate
def cross_validation(model, x, y, cv):
    
    score = ['accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(estimator=model,
                               X=x,
                               y=y,
                               cv=cv,
                               scoring=score,
                               return_train_score=True)
      
    print(
          "Mean Validation Accuracy ", results['test_accuracy'].mean(),"\n",
          "Mean Validation Precision ", results['test_precision'].mean(),"\n",
          "Mean Validation Recall ", results['test_recall'].mean(),"\n",
          "Mean Validation F1 Score ", results['test_f1'].mean())

In [None]:
result1 = cross_validation(rfc,x_new_train, y_new_train, 10)
print(result1)
print("\n")

In [None]:
print(confusion_matrix(y_new_pred,y_new_test))

In [None]:
# random forest is performing more than decisiontree classifier by some amount

In [None]:
# saving the model
import pickle
filename = 'rfc.sav'
pickle.dump(rfc,open(filename,'wb'))

In [None]:
load_rfc = pickle.load(open(filename,'rb'))

In [None]:
load_rfc

In [None]:
load_rfc.score(x_new_test,y_new_test)

### Using XGBClassifier

In [16]:
from xgboost import XGBClassifier
xgb= XGBClassifier()

In [17]:
xgb.fit(x_new_train,y_new_train)
y_new_pred = xgb.predict(x_new_test)
print(classification_report(y_new_pred,y_new_test))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       563
           1       0.97      0.97      0.97       678

    accuracy                           0.97      1241
   macro avg       0.97      0.97      0.97      1241
weighted avg       0.97      0.97      0.97      1241



In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
              'learning_rate': [ 0.25, 0.3,0.4,0.5],
              'max_depth': [7,8,9,10,11],
              'n_estimators': [113,114,115]
             }

gsc = GridSearchCV(XGBClassifier(n_jobs=-1), param_grid=param_grid, n_jobs=-1, cv=3,scoring='accuracy')
gsc.fit(x_new_train, y_new_train)

print('Best score:', gsc.best_score_)
print('Best score:', gsc.best_params_)

In [20]:
xgb= XGBClassifier(learning_rate = 0.4, max_depth =9,n_estimators =114)
xgb.fit(x_new_train,y_new_train)
y_new_pred = xgb.predict(x_new_test)
print(classification_report(y_new_pred,y_new_test))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       559
           1       0.97      0.96      0.97       682

    accuracy                           0.96      1241
   macro avg       0.96      0.96      0.96      1241
weighted avg       0.96      0.96      0.96      1241



In [21]:
accuracy_score(y_new_pred,y_new_test)

0.9637389202256245

In [22]:
# XGB Classifer is giving best score

In [23]:
import pickle
filename = 'xgb_Classifier.pkl'
pickle.dump(xgb,open(filename,'wb'))

### Using Naive Bayes Classfier

In [3]:
from sklearn.naive_bayes import GaussianNB

In [24]:
df1 = pd.read_csv('Customer-Churn-Encoded.csv')
df1.columns

Index(['Unnamed: 0', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges',
       'Churn', 'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [5]:
df1.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [6]:
df1

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,0.013889,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,0.472222,1,0,1,0,1,...,1,0,0,0,1,0,0,0,0,1
2,1,0,0,0,0.027778,1,0,1,1,0,...,1,0,0,1,0,0,0,0,0,1
3,1,0,0,0,0.625000,0,0,1,0,1,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0,0.027778,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,1,0,1,1,0.333333,1,1,1,0,1,...,1,0,0,0,1,0,0,0,0,1
7028,0,0,1,1,1.000000,1,1,0,1,1,...,0,1,0,0,1,0,0,1,0,0
7029,0,0,1,1,0.152778,0,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
7030,1,1,1,0,0.055556,1,1,0,0,0,...,0,1,0,1,0,0,0,0,0,1


In [7]:
x = df1.drop('Churn',axis=True)
y = df1['Churn']

In [8]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [9]:
GNB_clf = GaussianNB()
GNB_clf.fit(x_train,y_train)
y_pred = GNB_clf.predict(x_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.72      0.89      0.80       829
           1       0.76      0.49      0.60       578

    accuracy                           0.73      1407
   macro avg       0.74      0.69      0.70      1407
weighted avg       0.74      0.73      0.72      1407



In [10]:
smn = SMOTEENN()
x_new, y_new = smn.fit_resample(x,y)
x_new_train, x_new_test, y_new_train, y_new_test = train_test_split(x_new,y_new,test_size=0.2,random_state=0)

In [11]:
GNB_clf_smooteenn = GaussianNB()
GNB_clf_smooteenn.fit(x_new_train,y_new_train)
y_new_pred = GNB_clf_smooteenn.predict(x_new_test)
print(classification_report(y_new_pred,y_new_test))

              precision    recall  f1-score   support

           0       0.92      0.87      0.90       603
           1       0.88      0.93      0.91       638

    accuracy                           0.90      1241
   macro avg       0.90      0.90      0.90      1241
weighted avg       0.90      0.90      0.90      1241



In [12]:
accuracy_score(y_new_pred,y_new_test)

0.9024979854955681

In [55]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'var_smoothing': np.logspace(0,-9, num=100)
}
nb_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, verbose=1, cv=10, n_jobs=-1)
nb_grid.fit(x_new_train, y_new_train)
print(nb_grid.best_estimator_)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
GaussianNB(var_smoothing=0.0533669923120631)


In [13]:
GNB_clf_smooteenn = GaussianNB(var_smoothing=0.23101297000831597) # tuning parameter
GNB_clf_smooteenn.fit(x_new_train,y_new_train)
y_new_pred = GNB_clf_smooteenn.predict(x_new_test)
print(classification_report(y_new_pred,y_new_test))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       570
           1       0.91      0.92      0.91       671

    accuracy                           0.91      1241
   macro avg       0.91      0.91      0.91      1241
weighted avg       0.91      0.91      0.91      1241



In [14]:
# Almost Same result
accuracy_score(y_new_pred,y_new_test)

0.9065269943593876

In [15]:
import pickle
filename = 'naive_Bayes_Classifier.sav'
pickle.dump(GNB_clf_smooteenn,open(filename,'wb'))

### Using SVC

In [20]:
from sklearn.svm import SVC
svc_clf = SVC(kernel='rbf')
svc_clf.fit(x_new_train,y_new_train)
y_new_pred = svc_clf.predict(x_new_test)
print(classification_report(y_new_pred,y_new_test))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       549
           1       0.94      0.94      0.94       697

    accuracy                           0.93      1246
   macro avg       0.93      0.93      0.93      1246
weighted avg       0.93      0.93      0.93      1246



In [21]:
accuracy_score(y_new_pred,y_new_test)

0.9341894060995185

In [None]:
# SVC is also giving a Good Score

In [59]:
import pickle
filename = 'SV_Classifier.sav'
pickle.dump(svc_clf,open(filename,'wb'))

## Till now XGBClassifier is the best model

In [22]:
# Lest's check using K-Nearest Neighbours

In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [24]:
knc = KNeighborsClassifier()
knc.fit(x_new_train,y_new_train)
y_new_pred = knc.predict(x_new_test)
print(classification_report(y_new_pred,y_new_test))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94       515
           1       0.98      0.93      0.95       731

    accuracy                           0.95      1246
   macro avg       0.94      0.95      0.95      1246
weighted avg       0.95      0.95      0.95      1246



In [30]:
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,10))
n_neighbors = list(range(1,5))
p=[1,2]
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

knn_2 = KNeighborsClassifier()
clf = GridSearchCV(knn_2, hyperparameters, cv=10)

best_model = clf.fit(x_new_train,y_new_train)
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

Best leaf_size: 1
Best p: 1
Best n_neighbors: 1


In [64]:
knc = KNeighborsClassifier(leaf_size=1,p=1,n_neighbors=1)
knc.fit(x_new_train,y_new_train)
y_new_pred = knc.predict(x_new_test)
print(classification_report(y_new_pred,y_new_test))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       527
           1       0.99      0.97      0.98       703

    accuracy                           0.98      1230
   macro avg       0.98      0.98      0.98      1230
weighted avg       0.98      0.98      0.98      1230



In [65]:
accuracy_score(y_new_pred,y_new_test)

0.9796747967479674

In [62]:
# K-Fold Cross-Validation
from sklearn.model_selection import cross_validate
def cross_validation(model, x, y, cv):
    
    score = ['accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(estimator=model,
                               X=x,
                               y=y,
                               cv=cv,
                               scoring=score,
                               return_train_score=True)
      
    print(
          "Mean Accuracy ", results['test_accuracy'].mean(),"\n",
          "Mean Precision ", results['test_precision'].mean(),"\n",
          "Mean Recall ", results['test_recall'].mean(),"\n",
          "Mean F1 Score ", results['test_f1'].mean())
result1 = cross_validation(knc,x_new_train, y_new_train, 5)
print(result1)
print("\n")

Mean Accuracy  0.9684777394569469 
 Mean Precision  0.9569534233226824 
 Mean Recall  0.9865573999163063 
 Mean F1 Score  0.9715137293877578
None




Best score, but it may be overfitting

In [36]:
import pickle
filename = 'KNN_Classifier.sav'
pickle.dump(knc,open(filename,'wb'))