# **Importing libraries and loading data from database**

In [1]:
import logging

logger = logging.getLogger()

file_handler = logging.FileHandler(filename='ecc_model_log.log', mode='w')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

logger.setLevel(logging.DEBUG)
logger.info('FILE STARTS RUNNING!')

INFO:root:FILE STARTS RUNNING!


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from pymongo import MongoClient

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [4]:
logger.info('Libraries imported!')

INFO:root:Libraries imported!


In [5]:
# Connect to Mongodb cluster
client = MongoClient("mongodb+srv://shrutibalan1991:mycluster@mycluster.g3bp8fr.mongodb.net/?retryWrites=true&w=majority")

logger.info('Connected to database!')

# List of databases
db_list = client.list_database_names()
db_list

INFO:root:Connected to database!


['CHURN', 'loan_db', 'admin', 'local']

In [None]:
# List of collections/tables in 'CHURN'
mydb = client.CHURN
mydb.list_collection_names()

['ecomm_churn', 'ecomm_churn_bin_encoded', 'ecomm_churn_encoded', 'bank_churn']

In [None]:
# Load 'ecomm_churn_encoded'
cursor = mydb.ecomm_churn_encoded.find()
df_e = pd.DataFrame(list(cursor)).drop('_id', axis=1)


In [None]:
df_e.head()

Unnamed: 0,Churn,Tenure,WarehouseToHome,NumberOfDeviceRegistered,NumberOfAddress,DaySinceLastOrder,CashbackAmount,PreferredLoginDevice_Computer,PreferredLoginDevice_Mobile,CityTier_1,...,SatisfactionScore_3,SatisfactionScore_4,SatisfactionScore_5,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Gender_Female,Gender_Male,Complain_0,Complain_1
0,1,10.0,12.0,3,5,2.0,120.73,0,1,0,...,0,0,1,0,1,0,0,1,0,1
1,1,0.0,22.0,5,2,7.0,139.19,1,0,1,...,0,0,1,0,0,1,1,0,0,1
2,1,13.0,9.0,4,2,2.0,126.83,0,1,0,...,1,0,0,1,0,0,0,1,0,1
3,0,5.0,14.0,3,2,7.0,189.98,1,0,0,...,0,0,0,0,0,1,1,0,1,0
4,0,0.0,13.0,4,3,0.0,161.32,0,1,1,...,1,0,0,1,0,0,0,1,1,0


In [6]:
logger.info('Data loaded from database!')

INFO:root:Data loaded from database!


# **Modeling**

In [7]:
logger.info('Modeling starts!')

INFO:root:Modeling starts!


## **Handling imbalance**

In [None]:
# Initialise SMOTE object
smote = SMOTE()

In [None]:
# Separate independent and dependent variables
X = df_e.drop('Churn', axis=1)
y = df_e['Churn']

In [None]:
# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X, y)

In [None]:
# Combine balanced X and y
df_e = pd.DataFrame(x_smote, columns=df_e.drop('Churn', axis=1).columns)
df_e['Churn'] = y_smote

df_e.head()

Unnamed: 0,Tenure,WarehouseToHome,NumberOfDeviceRegistered,NumberOfAddress,DaySinceLastOrder,CashbackAmount,PreferredLoginDevice_Computer,PreferredLoginDevice_Mobile,CityTier_1,CityTier_2,...,SatisfactionScore_4,SatisfactionScore_5,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Gender_Female,Gender_Male,Complain_0,Complain_1,Churn
0,4.0,6.0,3,9,5.0,159.93,0,1,0,0,...,0,0,0,0,1,1,0,0,1,1
1,10.0,8.0,4,7,0.0,120.9,0,1,1,0,...,0,0,0,0,1,0,1,0,1,1
2,10.0,30.0,4,6,3.0,120.28,0,1,1,0,...,0,0,0,0,1,0,1,0,1,1
3,0.0,15.0,4,8,3.0,134.07,0,1,0,0,...,0,1,0,0,1,0,1,1,0,1
4,0.0,12.0,3,3,3.0,129.6,0,1,1,0,...,0,1,0,0,1,0,1,1,0,1


In [None]:
df_e.Churn.value_counts()

1    4682
0    4682
Name: Churn, dtype: int64

## **Modeling using df_e**

### **Split train-test sets**

In [None]:
# Separate dependent and independent variables
X = df_e.drop('Churn', axis=1)
y= df_e.Churn

In [None]:
#standardise the x value by using standardscaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.30,random_state =100)

In [None]:
# Checking the target rate in the population, train sample and test sample
print("Population risk rate :", 
      round(sum(df_e.Churn)*100/len(df_e), 2),"%")
print("Train set risk rate :", 
      round(sum(y_train)*100/len(y_train), 2),"%")
print("Test set risk rate :", 
      round(sum(y_test)*100/len(y_test), 2),"%")

Population risk rate : 16.84 %
Train set risk rate : 17.25 %
Test set risk rate : 15.87 %


### **DecisionTreeClassifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Training the model
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [None]:
# Predictions on test data
y_pred = dt_clf.predict(X_test)

In [None]:
# Evaluation metrics
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print('Accuracy :', metrics.accuracy_score(y_test, y_pred))
print('F1-score : ', metrics.f1_score(y_test, y_pred))
print('R2-score : ', metrics.r2_score(y_test, y_pred))
print('ROC-AUC Score :', metrics.roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1421
           1       0.86      0.89      0.87       268

    accuracy                           0.96      1689
   macro avg       0.92      0.93      0.92      1689
weighted avg       0.96      0.96      0.96      1689

[[1382   39]
 [  30  238]]
Accuracy : 0.9591474245115453
F1-score :  0.873394495412844
R2-score :  0.6939799594567626
ROC-AUC Score : 0.9303071202747697


In [8]:
logger.info('Modeling ends!')

INFO:root:Modeling ends!


# **Save the model**

In [None]:
import pickle

pickle.dump(dt_clf, open('churn.pkl', 'wb'))

In [9]:
logger.info('Saved the model!')
logger.info('FILE ENDS RUNNING!')

INFO:root:Saved the model!
INFO:root:FILE ENDS RUNNING!
