## Develop a Deep Learning Based Churn Prediction Engine


### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

#### Load & understand the data 

In [2]:
# Read the data
data = pd.read_csv("TelcoChurn1.csv")

# Look at a snapshot of data
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# See the summary stats and frequency distribution of features
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [4]:
# Lets' see if there are any missing values

print (data.apply(lambda x: sum(x.isnull()),axis=0))
print (np.where(data.applymap(lambda x: x == ' ')))

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
(array([ 488,  753,  936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754]), array([19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19]))


In [5]:
# Let's see if there is class imbalance in the target variable
print (data['Churn'].value_counts(ascending=True))

Yes    1869
No     5174
Name: Churn, dtype: int64


In [6]:
data['TotalCharges'].replace(to_replace = ' ', value= np.nan, inplace = True)
data['TotalCharges'] = data['TotalCharges'].astype(float)
data.dropna(axis=0, inplace=True)

### Observations:

**On Type conversions:**

- Columns like CustomerID can be removed from the analysis
- We see that 'Tenure' and 'MonthlyCharges' are numeric columns present in the data, with the data close to normal distribution. 
- Along with them, 'TotalCharges' is also a numeric column but contains some info missing, but still is not a nan.
- The column 'SeniorCitizen' is a categorical column by its nature with 'Yes' as 1, and No as 0. So it shuold be converted into Categorical type
- All the categorical attribtues are strings. Hence there is need to convert them into numbers, by a way of encoding.
- Among the categorical attribtues, majority of them have binary classes(2 levels). Label encoding would help assign labels 0,1 for the levels as appropriate.
- But attributes like 'PaymentMethod', 'Contract', 'InternetService' are nominal and have more than 2 levels. So along with label encoding, we need to convert them into equidistant levels.

**On Missingness of data:**
 The data is clean and there are no missing values in the data
 
**On the class imbalance in the target attribute**
There are more instances where the customers din't churn than those that have custoemrs churned out. Class imbalance is clearly seen.

## Data Preprocessing

### Split the data into train and test sets


In [7]:
from sklearn.model_selection import train_test_split

y = data['Churn']
X = data.loc[:, data.columns != 'Churn']

X_train, X_test, y_train, y_test =   train_test_split(X, y, test_size=0.20, random_state=111)

print(X_train.shape, X_test.shape)

(5625, 20) (1407, 20)


In [8]:
### Remove customerID
X_train.drop(['customerID'], axis = 1, inplace=True)
X_test.drop(['customerID'], axis = 1, inplace=True)

### Missing values

In [9]:
# Impute missing values, if any!. Check number of missing values
print("Num missing values before imputation:")
print(pd.DataFrame(X_train['TotalCharges']).isnull().sum())

print("Num missing values before imputation:")
print(pd.DataFrame(X_test['TotalCharges']).isnull().sum())

Num missing values before imputation:
TotalCharges    0
dtype: int64
Num missing values before imputation:
TotalCharges    0
dtype: int64


### Type Conversions

In [10]:
# Convert 'SeniorCitizen' column into categorical
X_train['SeniorCitizen']=pd.Categorical(X_train['SeniorCitizen'])
X_test['SeniorCitizen']=pd.Categorical(X_test['SeniorCitizen'])

In [11]:
# Encode target variables to 0, 1
y_train = y_train.map(dict(Yes=1, No=0))
y_test = y_test.map(dict(Yes = 1, No=0))
print(y_train.shape, y_test.shape)

(5625,) (1407,)


### Standardizing numeric attributes

In [12]:
# Divide the columns into 3 categories, one ofor standardisation, one for label encoding and one for one hot encoding
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
cat_cols_ohe =['PaymentMethod', 'Contract', 'InternetService'] # those that need one-hot encoding
cat_cols_le = list(set(X_train.columns)- set(num_cols) - set(cat_cols_ohe)) #those that need label encoding

In [13]:
from sklearn.preprocessing import StandardScaler

scaler= StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


### Encoding attributes

In [14]:
X_train = pd.DataFrame(X_train)
X_test= pd.DataFrame(X_test)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
ohe = OneHotEncoder()

for col in cat_cols_le:
    le.fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])



In [15]:
print(X_train.shape, X_test.shape)

(5625, 19) (1407, 19)


In [17]:
ohe.fit(X_train[cat_cols_ohe])
tr_cols= ohe.transform(X_train[cat_cols_ohe])
te_cols = ohe.transform(X_test[cat_cols_ohe])

X_train.drop(columns=cat_cols_ohe, inplace=True)
X_test.drop(columns=cat_cols_ohe, inplace=True)

X_train = np.hstack((X_train,tr_cols.toarray()))
X_test = np.hstack((X_test, te_cols.toarray()))
print(X_train.shape, X_test.shape)




(5625, 26) (1407, 26)


In [18]:
print(y_train.shape, y_test.shape)

(5625,) (1407,)


In [19]:
print(y_train.value_counts(), '\n', y_test.value_counts())

0    4148
1    1477
Name: Churn, dtype: int64 
 0    1015
1     392
Name: Churn, dtype: int64


## Building the ANN Model

In [20]:
#Importing necessary modules
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout, Activation
from sklearn.model_selection import train_test_split

In [21]:
seed = 7
np.random.seed(seed)

In [22]:

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

input_shape = X_train.shape[1]

In [53]:
model = Sequential()
model.add(Dense(32, input_dim=input_shape, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))



model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


model.fit(X_train, y_train,
              epochs=25,
          batch_size=24, class_weight={0:0.2, 1:0.8})



W0716 04:06:54.175837 140735948321664 nn_ops.py:4372] Large dropout rate: 0.7 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W0716 04:06:54.252901 140735948321664 data_adapter.py:1091] sample_weight modes were coerced from
  ...
    to  
  ['...']
W0716 04:06:54.320842 140735948321664 nn_ops.py:4372] Large dropout rate: 0.7 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Train on 5625 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1476b1908>

In [54]:

score = model.evaluate(X_test, y_test, batch_size=20)



In [55]:
print(score)
print ("Accuracy : %s" % "{0:.3%}".format(score[1]))


[0.5559938355973195, 0.69722813]
Accuracy : 69.723%


In [56]:

train_pred_dl=model.predict_classes(X_train)
test_pred_dl=model.predict_classes(X_test)

In [57]:
from sklearn import metrics
mlp_conf_matrix = metrics.confusion_matrix(y_test, test_pred_dl)
print (mlp_conf_matrix)

[[642 373]
 [ 53 339]]


In [58]:

accuracy = metrics.accuracy_score(y_test,test_pred_dl)
    
print ("Accuracy : %s" % "{0:.3%}".format(accuracy))

#Print Recall
recall = metrics.recall_score(y_test,test_pred_dl)
    
print ("Recall : %s" % "{0:.3%}".format(recall))

Accuracy : 69.723%
Recall : 86.480%


## MLP using features from AutoEncoders

In [29]:
encoding_dim  = 32
# this is our input placeholder
input_img = Input(shape=(input_shape,))

# "encoded" is the encoded representation of the input
encoded = Dense(encoding_dim, activation='relu')(input_img)

# "decoded" is the lossy reconstruction of the input
decoded = Dense(input_shape, activation='sigmoid')(encoded)

# this model maps an input to its reconstruction
autoencoder = Model(inputs=input_img, outputs=decoded)

In [30]:
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy') #optimizer = adam --- can also be used 

In [31]:
autoencoder.fit(X_train, X_train,
                epochs=50,
                batch_size=24,
                shuffle=True,
                validation_data=(X_test, X_test))

Train on 5625 samples, validate on 1407 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1473b3390>

In [33]:
# this model maps an input to its encoded representation
encoder = Model(inputs=input_img, outputs=encoded)


In [34]:
x_train_encoded = encoder.predict(X_train)
x_test_encoded = encoder.predict(X_test)

In [35]:
x_test_encoded

array([[0.        , 0.18937707, 0.14511806, ..., 0.        , 0.        ,
        0.6969709 ],
       [0.        , 0.        , 0.5015595 , ..., 0.        , 0.        ,
        0.1034191 ],
       [0.        , 0.21934086, 0.        , ..., 0.        , 0.        ,
        0.85755384],
       ...,
       [0.10965183, 0.5013616 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.37887457],
       [0.        , 0.38840058, 0.        , ..., 0.6153631 , 0.        ,
        0.9018389 ]], dtype=float32)

In [36]:
x_train_encoded.shape

(5625, 32)

In [37]:
x_test_encoded.shape

(1407, 32)

In [38]:
model2 = Sequential()

model2.add(Dense(64, input_dim = 32, kernel_initializer='uniform', activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(16, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

In [39]:
model2.compile(loss='binary_crossentropy',
              optimizer='Adam',
              metrics=['accuracy'])

In [40]:
x_train_encoded.shape

(5625, 32)

In [48]:
model2.fit(x_train_encoded, y_train, batch_size=32, epochs=25, class_weight={0:0.2, 1:0.8})


W0716 04:06:25.269119 140735948321664 data_adapter.py:1091] sample_weight modes were coerced from
  ...
    to  
  ['...']


Train on 5625 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x14a313668>

In [49]:
score2 = model2.evaluate(x_test_encoded, y_test)
print (score2)

[0.5406732543953446, 0.7199716]


In [50]:

train_pred_dlac=model2.predict_classes(x_train_encoded)
test_pred_dlac=model2.predict_classes(x_test_encoded)

In [51]:
dlac_conf_matrix = metrics.confusion_matrix(y_test, test_pred_dlac)
print (dlac_conf_matrix)

[[701 314]
 [ 80 312]]


In [52]:

accuracy = metrics.accuracy_score(y_test,test_pred_dlac)
    
print ("Accuracy : %s" % "{0:.3%}".format(accuracy))

#Print Recall
recall = metrics.recall_score(y_test,test_pred_dlac)
    
print ("Recall : %s" % "{0:.3%}".format(recall))

Accuracy : 71.997%
Recall : 79.592%
