Customer Churn Prediction Using an Artificial Neural Network

In [None]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Telco customer-churn CSV into a DataFrame
csv_path = "../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)

# Preview the first few rows
df.head()

In [None]:
# Dimensions of the raw dataset, displayed as (rows, columns)
n_rows, n_cols = df.shape
(n_rows, n_cols)

In [None]:
# See the summary stats and frequency distribution of features.
# include='all' is required for the frequency part: by default describe() reports
# only numeric columns, so the text-valued features (count / unique / top / freq)
# would be left out of the table.
df.describe(include='all')

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Check the target for class imbalance: label counts, rarest class first
churn_counts = df['Churn'].value_counts(ascending=True)
print(churn_counts)

In [None]:
# Blank strings (' ') in TotalCharges stand for missing values: turn them into NaN,
# convert the column to float, then drop every row that still has a missing value.
#
# The result of replace() is assigned back to the column rather than calling
# replace(..., inplace=True) on the df['TotalCharges'] selection. That chained
# in-place form operates on a temporary object under pandas Copy-on-Write and
# leaves df untouched, so the astype(float) below would fail on the ' ' entries.
df['TotalCharges'] = df['TotalCharges'].replace(to_replace=' ', value=np.nan).astype(float)
df.dropna(axis=0, inplace=True)

In [None]:
# customerID is only a row identifier, so it is removed before modelling
df.drop(columns=['customerID'], inplace=True)

DATA PREPROCESSING

In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'Churn' column; the features are every other column
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows for testing (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=111,
)

print(X_train.shape, X_test.shape)

In [None]:
# Treat 'SeniorCitizen' as a categorical feature in both splits
for features in (X_train, X_test):
    features['SeniorCitizen'] = pd.Categorical(features['SeniorCitizen'])

In [None]:
# Map the Yes/No churn labels onto 1/0 for both splits
churn_codes = {'Yes': 1, 'No': 0}
y_train, y_test = y_train.map(churn_codes), y_test.map(churn_codes)
print(y_train.shape, y_test.shape)

Standardizing numeric attributes

In [None]:
# Divide the columns into 3 categories: one for standardisation, one for one-hot
# encoding, and one (every remaining column) for label encoding.
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
cat_cols_ohe = ['PaymentMethod', 'Contract', 'InternetService']  # those that need one-hot encoding

# Build the label-encoding list in DataFrame column order. Taking a set difference
# here would give an arbitrary order that can change between kernel sessions.
_already_assigned = set(num_cols) | set(cat_cols_ohe)
cat_cols_le = [col for col in X_train.columns if col not in _already_assigned]  # those that need label encoding

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the same
# transformation to both splits so no test-set statistics reach the scaler.
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

ENCODING ATTRIBUTES

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

le = LabelEncoder()
ohe = OneHotEncoder()  # fitted later, on the one-hot columns

# Integer-encode each label-encoded column. The encoder is refitted on the
# training values of every column and then reused for the matching test column.
for col in cat_cols_le:
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [None]:
# Shapes of both feature sets after label encoding
train_shape, test_shape = X_train.shape, X_test.shape
print(train_shape, test_shape)

In [None]:
# Fit the one-hot encoder on the training split, then encode both splits
ohe.fit(X_train[cat_cols_ohe])
tr_cols = ohe.transform(X_train[cat_cols_ohe])
te_cols = ohe.transform(X_test[cat_cols_ohe])

# Swap the original categorical columns for their dense one-hot expansion.
# From here on X_train / X_test are plain NumPy arrays rather than DataFrames.
X_train = np.hstack((X_train.drop(columns=cat_cols_ohe), tr_cols.toarray()))
X_test = np.hstack((X_test.drop(columns=cat_cols_ohe), te_cols.toarray()))
print(X_train.shape, X_test.shape)

In [None]:
# Shapes of both target vectors
train_target_shape, test_target_shape = y_train.shape, y_test.shape
print(train_target_shape, test_target_shape)

In [None]:
# Class counts in the training and test targets
train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()
print(train_class_counts, '\n', test_class_counts)

ANN MODEL BUILDING

In [None]:
#Importing necessary modules
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout, Activation
from sklearn.model_selection import train_test_split

In [None]:
# Seed every random number generator the notebook relies on. Seeding NumPy alone
# does not make training repeatable, because Keras takes its weight initialisation,
# dropout masks and batch shuffling from TensorFlow's own generator.
# NOTE(review): some GPU kernels can still be non-deterministic even when seeded.
import random

import tensorflow as tf

seed = 0
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Make sure features and targets are NumPy arrays before handing them to Keras
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

# Number of feature columns; used as the input dimension of the first layer
input_shape = X_train.shape[1]

In [None]:
# Feed-forward binary classifier: three ReLU hidden layers (32 -> 64 -> 128 units),
# each followed by dropout, ending in a single sigmoid output unit.
model = Sequential([
    Dense(32, input_dim=input_shape, kernel_initializer='uniform', activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.25),
    Dense(128, activation='relu'),
    Dropout(0.25),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Class 1 is weighted four times as heavily as class 0 in the loss
churn_class_weight = {0: 0.2, 1: 0.8}

model.fit(
    X_train,
    y_train,
    epochs=25,
    batch_size=24,
    class_weight=churn_class_weight,
)

In [None]:
# Loss and compiled metrics on the held-out test split
score = model.evaluate(
    x=X_test,
    y=y_test,
    batch_size=20,
)

In [None]:
# Raw evaluate() output, followed by its second entry formatted as a percentage
print(score)
test_accuracy_text = "{0:.3%}".format(score[1])
print("Accuracy : %s" % test_accuracy_text)

In [None]:
# Hard 0/1 class predictions for both splits.
# Sequential.predict_classes was removed in TensorFlow 2.6, so the sigmoid
# probabilities are thresholded at 0.5 instead. This is the replacement its
# deprecation notice recommends for a single sigmoid output, and it keeps the
# same (n_samples, 1) int32 result.
train_pred_dl = (model.predict(X_train) > 0.5).astype("int32")
test_pred_dl = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
from sklearn import metrics

# Confusion matrix of the test-set predictions (true labels vs. predicted labels)
mlp_conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=test_pred_dl)
print(mlp_conf_matrix)

In [None]:
# Test-set accuracy and recall, each reported as a percentage
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=test_pred_dl)
recall = metrics.recall_score(y_true=y_test, y_pred=test_pred_dl)

print("Accuracy : %s" % "{0:.3%}".format(accuracy))
print("Recall : %s" % "{0:.3%}".format(recall))