 # Predicting Churn with ANN (Acc 90%, Recall 89%)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler,MinMaxScaler
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,matthews_corrcoef,precision_score,recall_score
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.utils import class_weight
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction: Preparing the Dataset

In this project we will build an Artificial Neural Network model which predicts credit card churn.

The dataset consists of 10,000 clients (entries), each described by attributes such as age, salary, marital status, credit card limit, credit card category, etc. These 19 attributes (features) will be our input to the neural network. As the features come in multiple formats, mainly string and integer, the dataset requires preparation.

Only 16.07% of customers have abandoned credit card services, so the dataset is unbalanced. To deal with this imbalance we will assign weights to the two classes of the target variable to balance them out.

We prepare the dataset for the ANN by replacing the string values in the feature columns with integers. We also drop the 'CLIENTNUM' column, as it isn't a feature that will affect our target variable.

In [None]:
# Load the raw customer table from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')

# Drop the columns at positions 21-22 (presumably the two precomputed classifier
# columns shipped with this CSV -- verify against the file) and the CLIENTNUM
# identifier, which is not used as a feature.
df = df.drop(df.columns[21:23], axis=1)
df = df.drop('CLIENTNUM', axis=1)

# Integer code assigned to each string category, per column.
# NOTE(review): in Card_Category, Gold (1) is coded below Silver (2); if these codes
# are meant to be ordinal by card tier, confirm this order is intended.
CATEGORY_CODES = {
    'Gender': {'M': 1, 'F': 0},
    'Education_Level': {
        'Unknown': 0,
        'Uneducated': 1,
        'High School': 2,
        'College': 3,
        'Graduate': 4,
        'Post-Graduate': 5,
        'Doctorate': 6,
    },
    'Marital_Status': {'Unknown': 0, 'Single': 1, 'Married': 2, 'Divorced': 3},
    'Card_Category': {'Blue': 0, 'Gold': 1, 'Silver': 2, 'Platinum': 3},
    'Income_Category': {
        'Unknown': 0,
        'Less than $40K': 1,
        '$40K - $60K': 2,
        '$60K - $80K': 3,
        '$80K - $120K': 4,
        '$120K +': 5,
    },
    'Attrition_Flag': {'Existing Customer': 0, 'Attrited Customer': 1},
}

# Assign the encoded column back to the frame instead of calling
# df[col].replace(..., inplace=True): the in-place form mutates a temporary
# selection and does not update df under pandas Copy-on-Write.
for column, codes in CATEGORY_CODES.items():
    unexpected = set(df[column].unique()) - set(codes)
    if unexpected:
        # Fail here with a clear message rather than later inside the scaler.
        raise ValueError(f"Unexpected values in column {column!r}: {sorted(map(str, unexpected))}")
    df[column] = df[column].map(codes).astype('int64')

df.head()

# Preprocessing the Dataset

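At this stage we split our dataset into a feature matrix and a target vector. We scale the feature matrix with `RobustScaler`, which centers each feature on its median and scales it by its interquartile range, so that all features are on comparable scales and outliers have less influence.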


In [None]:
#%%
# Column 0 is taken as the target (presumably Attrition_Flag -- confirm column order);
# columns 1 through 19 form the feature matrix.
y = df.iloc[:, 0]
x = df.iloc[:, 1:20]

#%%
# Fit the scaler on the feature frame, then replace x with its scaled array form.
rs = RobustScaler()
x = rs.fit(x).transform(x)



Now we split our preprocessed feature matrix and target vector into training data and testing data.

As the dataset is unbalanced, we need to assign class weights to it. This is done using the ratio of churned customers to the total number of customers.

Next, we build a 3-layer neural network. The input layer contains the same number of neurons as the number of columns in the feature matrix. The output layer consists of a single neuron which predicts the output, i.e. 1 for a churned customer and 0 for an existing customer.

The number of neurons in the hidden layer is usually a value between the number of neurons in the input layer and in the output layer. It is considered safe to take the number of neurons in the hidden layer to be the mean of the number of neurons in the input and output layers.

In [None]:
#%%
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=103)

#%%
# 'balanced' class weights computed from the training labels only.
# Arguments are passed by keyword: recent scikit-learn releases reject the
# positional form of compute_class_weight.
classes = np.unique(Y_train)
cw = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=Y_train)
# Map each class label to its weight using plain Python numbers for Keras.
class_weights = dict(zip(classes.tolist(), cw.tolist()))

# Share of label-1 rows in the full target column; informational only.
a = y.value_counts()
ratio = a[1] / (a[1] + a[0])
# Kept for reference; no longer passed to compile(). loss_weights maps one weight
# per model OUTPUT, and this model has a single output, so a two-element list is
# not a valid way to weight the two classes. Class imbalance is handled by
# class_weight in fit() below.
weights = [ratio, 1 - ratio]

#%%
model = Sequential()
model.add(Dense(19, activation="sigmoid"))
model.add(Dense(10, activation="sigmoid"))
# Sigmoid on the output so that binary_crossentropy (which expects probabilities
# by default) and the 0.5 decision threshold below operate on values in [0, 1].
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer='rmsprop', loss="binary_crossentropy", metrics=["BinaryAccuracy"])

history = model.fit(x=X_train, y=Y_train, epochs=100, class_weight=class_weights)

# Sequential.predict_classes was removed from Keras; threshold the predicted
# probabilities at 0.5 to obtain 0/1 labels (same (n, 1) integer shape as before).
predictions = (model.predict(X_test) > 0.5).astype("int32")

As seen below, we get a highly accurate model that successfully predicts the correct labels for the churned customers.

In [None]:
# Flatten the predictions to 1-D: the metric functions expect a flat label array,
# and the predictions may arrive as a single-column 2-D array.
y_pred = np.ravel(predictions)

cm = confusion_matrix(Y_test, y_pred)
mcc = matthews_corrcoef(Y_test, y_pred)
print('\n')
print('Neural Network Accuracy: ', accuracy_score(Y_test, y_pred))
print('Neural Network Recall score: ', recall_score(Y_test, y_pred))
# The Matthews correlation coefficient was computed but never shown; report it too.
print('Neural Network MCC: ', mcc)
#%%

# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, ax=ax, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Neural Network Confusion Matrix');