## Develop a Deep Learning Based Churn Prediction Engine


### Import required libraries

In [None]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

#### Load & understand the data 

In [None]:
# Load the Telco customer-churn dataset from the Kaggle input directory.
csv_path = "../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
data = pd.read_csv(csv_path)

# Preview the first rows (shown as the notebook cell output).
data.head()

In [None]:
# (rows, columns) of the raw dataset.
data.shape

In [None]:
# Summary statistics of the features. With default arguments, describe()
# reports on the numeric columns only.
data.describe()

In [None]:
# Let's see if there are any missing values.

# 1) Number of NaN entries per column.
print(data.isnull().sum())
# 2) (row, column) positions of cells that hold a single blank string (' ').
#    A vectorised comparison is used instead of the deprecated
#    DataFrame.applymap; non-string cells simply compare as False.
print(np.where(data == ' '))

In [None]:
# Count the rows per target class, smallest class first, to check for imbalance.
churn_counts = data['Churn'].value_counts(ascending=True)
print(churn_counts)

In [None]:
# 'TotalCharges' contains blank strings (' ') instead of NaN for missing
# entries. Turn those blanks into NaN and cast the column to float. The result
# is assigned back to the column rather than calling replace(..., inplace=True)
# on the column selection, because an in-place call on a selection is not
# guaranteed to write through to `data` (pandas copy-on-write).
data['TotalCharges'] = data['TotalCharges'].replace(' ', np.nan).astype(float)

# Drop the rows that now contain a missing value.
data.dropna(axis=0, inplace=True)

### Observations:

**On Type conversions:**

- Columns like CustomerID can be removed from the analysis
- We see that 'Tenure' and 'MonthlyCharges' are numeric columns present in the data, with the data close to normal distribution. 
- Along with them, 'TotalCharges' is also a numeric column, but its missing entries are stored as blank strings (' ') rather than NaN, so it has to be cleaned and converted to a numeric type.
- The column 'SeniorCitizen' is categorical by nature, with 1 for 'Yes' and 0 for 'No', so it should be converted into a categorical type.
- All the other categorical attributes are strings, so they need to be converted into numbers by encoding.
- Among the categorical attributes, the majority are binary (2 levels). Label encoding assigns the labels 0 and 1 to those levels as appropriate.
- Attributes like 'PaymentMethod', 'Contract' and 'InternetService' are nominal and have more than 2 levels, so they need to be one-hot encoded to keep their levels equidistant.

**On Missingness of data:**
 There are no NaN values in the raw data, but 'TotalCharges' contains a few blank strings; these are converted to NaN and the affected rows are dropped.
 
**On the class imbalance in the target attribute**
There are more instances where the customers didn't churn than instances where they churned. Class imbalance is clearly seen.

## Data Preprocessing

### Split the data into train and test sets


In [None]:
from sklearn.model_selection import train_test_split

# The target is the 'Churn' column; every other column is a feature.
X = data.drop(columns='Churn')
y = data['Churn']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=111
)

print(X_train.shape, X_test.shape)

In [None]:
# customerID is an identifier, not a feature: remove it from both splits.
for split in (X_train, X_test):
    split.drop(['customerID'], axis=1, inplace=True)

### Missing values

In [None]:
# Report how many 'TotalCharges' values are missing in each split. No
# imputation is performed in this cell; the two counts are labelled separately
# so the train and test figures can be told apart in the output.
print("Num missing values in train set:")
print(X_train[['TotalCharges']].isnull().sum())

print("Num missing values in test set:")
print(X_test[['TotalCharges']].isnull().sum())

### Type Conversions

In [None]:
# 'SeniorCitizen' is a categorical flag, so store it with a categorical dtype
# in both splits.
X_train['SeniorCitizen'] = X_train['SeniorCitizen'].astype('category')
X_test['SeniorCitizen'] = X_test['SeniorCitizen'].astype('category')

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
churn_codes = {'Yes': 1, 'No': 0}
y_train = y_train.map(churn_codes)
y_test = y_test.map(churn_codes)
print(y_train.shape, y_test.shape)

### Standardizing numeric attributes

In [None]:
# Divide the columns into 3 groups: one for standardisation, one for label
# encoding and one for one-hot encoding.
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']  # standardised
cat_cols_ohe = ['PaymentMethod', 'Contract', 'InternetService']  # one-hot encoded

# Every remaining column is label encoded. The list is built by filtering the
# columns in their existing order, so its order is the same on every run
# (a set difference has no defined order).
_excluded = set(num_cols) | set(cat_cols_ohe)
cat_cols_le = [col for col in X_train.columns if col not in _excluded]

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


### Encoding attributes

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

le = LabelEncoder()
ohe = OneHotEncoder()

# Label-encode each column in cat_cols_le: the mapping is learned on the
# training split and then reused for the test split.
for col in cat_cols_le:
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])



In [None]:
# Shapes of the feature splits after label encoding.
print(X_train.shape, X_test.shape)

In [None]:
# One-hot encode the multi-level nominal columns; the encoder is fitted on the
# training split only.
ohe.fit(X_train[cat_cols_ohe])
tr_cols = ohe.transform(X_train[cat_cols_ohe])
te_cols = ohe.transform(X_test[cat_cols_ohe])

# Remove the original columns, then append the dense one-hot columns on the
# right. np.hstack returns NumPy arrays, so X_train / X_test are no longer
# DataFrames after this cell.
for frame in (X_train, X_test):
    frame.drop(columns=cat_cols_ohe, inplace=True)

X_train = np.hstack((X_train, tr_cols.toarray()))
X_test = np.hstack((X_test, te_cols.toarray()))
print(X_train.shape, X_test.shape)




In [None]:
# Shapes of the target splits.
print(y_train.shape, y_test.shape)

In [None]:
# Class counts of the encoded target in the train and test splits.
print(y_train.value_counts(), '\n', y_test.value_counts())

## Building the ANN Model

In [None]:
#Importing necessary modules
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout, Activation
from sklearn.model_selection import train_test_split

In [None]:
# Fix the random seeds so runs are repeatable. Seeding NumPy alone does not
# cover the random operations TensorFlow performs itself (e.g. Keras weight
# initialisation and dropout), so TensorFlow's global seed is set as well.
import tensorflow as tf

seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:

# Make sure features and targets are NumPy arrays before passing them to Keras.
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

# Number of input features (columns of the feature matrix).
input_shape = X_train.shape[1]

In [None]:
# Feed-forward binary classifier: three ReLU hidden layers of increasing
# width, each followed by dropout, and a single sigmoid output unit.
model = Sequential([
    Dense(32, input_dim=input_shape, kernel_initializer='uniform', activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# class_weight gives class 1 four times the weight of class 0 in the loss.
model.fit(
    X_train,
    y_train,
    epochs=25,
    batch_size=24,
    class_weight={0: 0.2, 1: 0.8},
)



In [None]:

# Evaluate on the test split; with the compile settings used for `model` the
# result is [loss, accuracy].
score = model.evaluate(X_test, y_test, batch_size=20)

In [None]:
# Show the raw evaluation result, then its second entry as a percentage.
print(score)
print(f"Accuracy : {score[1]:.3%}")


In [None]:

# Predicted class labels (0/1) for both splits.
# Sequential.predict_classes was removed from tf.keras; for a single sigmoid
# output, thresholding the predicted probability at 0.5 gives the same labels.
train_pred_dl = (model.predict(X_train) > 0.5).astype("int32")
test_pred_dl = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
from sklearn import metrics
# Confusion matrix on the test split (true labels first, predictions second).
mlp_conf_matrix = metrics.confusion_matrix(y_test, test_pred_dl)
print (mlp_conf_matrix)

In [None]:

# Accuracy and recall of the plain MLP on the test split.
accuracy = metrics.accuracy_score(y_test, test_pred_dl)
recall = metrics.recall_score(y_test, test_pred_dl)

print(f"Accuracy : {accuracy:.3%}")
print(f"Recall : {recall:.3%}")

## MLP using features from AutoEncoders

In [None]:
encoding_dim  = 32
# this is our input placeholder
input_img = Input(shape=(input_shape,))

# "encoded" is the encoded representation of the input
encoded = Dense(encoding_dim, activation='relu')(input_img)

# "decoded" is the lossy reconstruction of the input
decoded = Dense(input_shape, activation='sigmoid')(encoded)

# this model maps an input to its reconstruction
autoencoder = Model(inputs=input_img, outputs=decoded)

In [None]:
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy') #optimizer = adam --- can also be used 

In [None]:
# Train the autoencoder to reproduce its input: the features are both the
# input and the target. The test split is passed as validation data for the
# reconstruction loss.
autoencoder.fit(X_train, X_train,
                epochs=50,
                batch_size=24,
                shuffle=True,
                validation_data=(X_test, X_test))

In [None]:
# This model maps an input to its encoded representation. It is built from the
# same `input_img` and `encoded` tensors as the autoencoder.
encoder = Model(inputs=input_img, outputs=encoded)


In [None]:
# Encode both splits; these encoded features are the inputs of the second classifier.
x_train_encoded = encoder.predict(X_train)
x_test_encoded = encoder.predict(X_test)

In [None]:
# Inspect the encoded test features (notebook cell output).
x_test_encoded

In [None]:
# Shape of the encoded training features.
x_train_encoded.shape

In [None]:
# Shape of the encoded test features.
x_test_encoded.shape

In [None]:
# Classifier trained on the autoencoder's encoded features instead of the raw
# columns. The input width is read from the encoded data rather than
# hard-coded, so it stays correct if the encoding size is changed.
model2 = Sequential()

model2.add(Dense(64, input_dim=x_train_encoded.shape[1], kernel_initializer='uniform', activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(16, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model2.compile(loss='binary_crossentropy',
              optimizer='Adam',
              metrics=['accuracy'])

In [None]:
# Shape of the training input for model2.
x_train_encoded.shape

In [None]:
# Train on the encoded features; class 1 is weighted 0.8 and class 0 is weighted 0.2.
model2.fit(x_train_encoded, y_train, batch_size=32, epochs=25, class_weight={0:0.2, 1:0.8})


In [None]:
# Evaluate model2 on the encoded test features and show the result.
score2 = model2.evaluate(x_test_encoded, y_test)
print (score2)

In [None]:

# Predicted class labels (0/1) from the classifier trained on encoded features.
# Sequential.predict_classes was removed from tf.keras; for a single sigmoid
# output, thresholding the predicted probability at 0.5 gives the same labels.
train_pred_dlac = (model2.predict(x_train_encoded) > 0.5).astype("int32")
test_pred_dlac = (model2.predict(x_test_encoded) > 0.5).astype("int32")

In [None]:
# Confusion matrix on the test split (true labels first, predictions second).
dlac_conf_matrix = metrics.confusion_matrix(y_test, test_pred_dlac)
print (dlac_conf_matrix)

In [None]:

# Accuracy and recall on the test split for the classifier trained on encoded features.
accuracy = metrics.accuracy_score(y_test, test_pred_dlac)
recall = metrics.recall_score(y_test, test_pred_dlac)

print(f"Accuracy : {accuracy:.3%}")
print(f"Recall : {recall:.3%}")