In [2]:
'''
Here we will be importing our cleaned dataset and start building an ML model to predict the customer churn
First set path
'''
path = r'F:\Machine learning\Machine learning practice\Minimize Churn rate'
#### Importing Libraries ####

import pandas as pd
import numpy as np
import random
import seaborn as sn
import matplotlib.pyplot as plt
import os
os.chdir(path)
dataset = pd.read_csv('new_churn_data.csv')

In [3]:
'''
We are separating the user_ID field as this will not be input into the ML model.
The field is stored for later use
'''
## Data Preparation
user_identifier = dataset['user']
dataset = dataset.drop(columns = ['user'])

In [4]:
# One-Hot Encoding: This will introduce new columns for all the categorical columns. Very smart!!
dataset = pd.get_dummies(dataset)
dataset.columns

Index(['churn', 'age', 'deposits', 'withdrawal', 'purchases_partners',
       'purchases', 'cc_taken', 'cc_recommended', 'cc_disliked', 'cc_liked',
       'cc_application_begin', 'app_downloaded', 'web_user', 'android_user',
       'registered_phones', 'waiting_4_loan', 'cancelled_loan',
       'received_loan', 'rejected_loan', 'left_for_two_month_plus',
       'left_for_one_month', 'reward_rate', 'is_referred', 'housing_O',
       'housing_R', 'housing_na', 'payment_type_Bi-Weekly',
       'payment_type_Monthly', 'payment_type_Semi-Monthly',
       'payment_type_Weekly', 'payment_type_na', 'zodiac_sign_Aquarius',
       'zodiac_sign_Aries', 'zodiac_sign_Cancer', 'zodiac_sign_Capricorn',
       'zodiac_sign_Gemini', 'zodiac_sign_Leo', 'zodiac_sign_Libra',
       'zodiac_sign_Pisces', 'zodiac_sign_Sagittarius', 'zodiac_sign_Scorpio',
       'zodiac_sign_Taurus', 'zodiac_sign_Virgo', 'zodiac_sign_na'],
      dtype='object')

In [5]:
#prevent dummy variable trap
dataset = dataset.drop(columns = ['housing_na', 'zodiac_sign_na', 'payment_type_na'])

In [6]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns = 'churn'), dataset['churn'],
                                                    test_size = 0.2,
                                                    random_state = 0)

In [8]:
# Balancing the Training Set
'''
suppose the training set which was prepared is having data with around 60% churn = 0
In our case we have data as such.
Churn 1 was around 12K+ and churn 0 was around 8K+. This can create a bias
Our model will be performing great even though the accuracy is 60%
But there is some bias towards churn = 0. So our DS is pretty spread out.
But in some case this doesn't happen after train test split.
Thus it is needed to perform Training set balancing
'''
print('Before balancing churn column distribution:\n' + str(y_train.value_counts()))

pos_index = y_train[y_train.values == 1].index
neg_index = y_train[y_train.values == 0].index

'''
Here we are balancing the positive and negative churns in the training set at random
'''

if len(pos_index) > len(neg_index):
    higher = pos_index
    lower = neg_index
else:
    higher = neg_index
    lower = pos_index


'''
Length of churn 0 is more than length of churn 1.
Thus we are chopping churn 0 to the length of churn 1
'''
random.seed(0)
higher = np.random.choice(higher, size=len(lower))
lower = np.asarray(lower)
new_indexes = np.concatenate((lower, higher))

X_train = X_train.loc[new_indexes,]
y_train = y_train[new_indexes]

'''
Now churn 1 count = churn 0 count = 8940 nice!!!
Run the code below
'''
print('After balancing churn column distribution:\n' + str(y_train.value_counts()))

Before balancing churn column distribution:
0    12656
1     8940
Name: churn, dtype: int64
After balancing churn column distribution:
1    8940
0    8940
Name: churn, dtype: int64


In [9]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
X_test2 = pd.DataFrame(sc_X.transform(X_test))
X_train2.columns = X_train.columns.values
X_test2.columns = X_test.columns.values
X_train2.index = X_train.index.values
X_test2.index = X_test.index.values
X_train = X_train2
X_test = X_test2

In [11]:
#### Model Building ####


# Fitting Model to the Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting Test Set
y_pred = classifier.predict(X_test)


'''
accuracy_score: Overall how many times our model predicted accurate = (tp+tn)/no. of obs
precision_score: When it predicts yes, how often is it correct?
'''
# Evaluating Results
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n' + str(cm))
print('accuracy_score:' + str(accuracy_score(y_test, y_pred)))
print('precision_score:' + str(precision_score(y_test, y_pred))) # tp / (tp + fp)
print('recall_score:' + str(recall_score(y_test, y_pred))) # tp / (tp + fn)
print('f1_score:' + str(f1_score(y_test, y_pred)))

Confusion matrix:
[[1667 1499]
 [ 591 1643]]
accuracy_score:0.6129629629629629
precision_score:0.5229153405474221
recall_score:0.7354521038495971
f1_score:0.6112351190476191
