In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

Importing the necessary libraries:

In [None]:
# for plotting:
import seaborn as sns
import matplotlib.pyplot as plt

# for preprocessing/ feature engineering:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# getting some algos from sci-kit learn's libraries:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# for building a fantastic neural network!:
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam, RMSprop
from keras.losses import binary_crossentropy
from keras.metrics import Accuracy


In [None]:
# Load the churn dataset, indexed by its 'CustomerId' column.
CSV_PATH = "../input/churn-modelling/Churn_Modelling.csv"
data = pd.read_csv(CSV_PATH, index_col='CustomerId')

In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

The 'Exited' column holds the labels, so we set it aside as the target and drop it from the features. We also drop the 'Surname' column, because it has far too many unique values to be encoded usefully and would only get in the way of the preprocessing. The 'RowNumber' column does not look useful as a feature either, so it is dropped as well.

In [None]:
# Keep the 'Exited' labels as the target, then rebind `data` to a frame
# without the label column and the two columns not used as features.
unused_cols = ['Exited', 'RowNumber', 'Surname']
y = data['Exited']
data = data.drop(columns=unused_cols)

Let's see the distribution of males and females in our data:

In [None]:
# Bar chart of the number of customers in each 'Gender' category.
# The column is named by keyword: in seaborn >= 0.12 the only positional
# argument is `data`, so a Series passed positionally is not read as `x`.
plt.figure(figsize=(6,6))
sns.countplot(data=data, x='Gender')

Let's look at the distribution of the customers across the countries:

In [None]:
# Bar chart of customers per 'Geography' value, split by 'Gender'.
# Columns are named by keyword: in seaborn >= 0.12 the only positional
# argument is `data`, so a Series passed positionally is not read as `x`.
plt.figure(figsize=(6,6))
sns.countplot(data=data, x='Geography', hue='Gender')

It's time to split the data into training and test sets!

In [None]:
# Hold out 10% of the rows for testing.
# A fixed seed makes the split, and every score computed from it, repeatable
# across kernel restarts; stratifying on the labels keeps the proportion of
# each class the same in the training and test parts.
xtrain, xtest, ytrain, ytest = train_test_split(
    data, y, train_size=0.9, test_size=0.1, random_state=42, stratify=y
)

Let's see how the training data is distributed by the customers' credit score and age, colored by the 'Exited' label:

In [None]:
# Scatter of Age against CreditScore for the training rows.
# Points are colored by the training labels so that the hue values cover
# exactly the rows being plotted (the full label Series `y` also contains
# the test rows).
plt.figure(figsize=(10,10))
sns.scatterplot(data=xtrain, y='Age', x='CreditScore', hue=ytrain)

The points here look thoroughly mixed, with no clear separation between the classes; this might be due to the low correlation between the two features plotted above.

Below is the list of categorical columns that the LabelEncoder will be applied to:

In [None]:
# Feature columns that will be run through the label encoder.
cat_cols = [
    'Geography',
    'Gender',
]

Encoding the categorical features:

In [None]:
# Encode the categorical columns as integer codes on copies of the splits,
# leaving xtrain / xtest themselves untouched. For each column the encoder
# is fitted on the training values only and then reused on the test values.
# NOTE(review): integer codes give 'Geography' an arbitrary numeric order;
# confirm that is acceptable for the models trained below.
enctrain, enctest = xtrain.copy(), xtest.copy()

lab = LabelEncoder()
for col in cat_cols:
    lab.fit(enctrain[col])
    enctrain[col] = lab.transform(enctrain[col])
    enctest[col] = lab.transform(enctest[col])

In [None]:
# Preview the first five rows of the encoded training frame.
enctrain.head(n=5)

Okay, so let's check the correlation between the columns (features):

In [None]:
# Heatmap of the pairwise correlations between the encoded training columns.
corr_matrix = enctrain.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix)

Alrighty, so all pairs of columns have a low correlation, if not a negative one. This tells me that there is no need to drop any columns or features. Let's build a neural network now, shall we?


In [None]:
# Show the (rows, columns) size of the training features; the column count
# is the width the network's input layer has to match.
xtrain.shape

In [None]:
# First network: two hidden ReLU layers (128 and 32 units) followed by a
# single sigmoid unit for the binary label.
# The input width is read from the encoded training frame rather than being
# hard-coded, so it keeps matching if the feature columns change.
Model = Sequential([
    Dense(128, input_shape=(enctrain.shape[1],), activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [None]:
# Adam optimizer for the first network.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
adam = Adam(learning_rate=0.1)

# Binary cross-entropy matches the single sigmoid output; accuracy is
# reported during training.
Model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [None]:
# Train the first network on the encoded (unscaled) features for 50 epochs,
# evaluating on the test split after every epoch. The returned training
# history is kept in `hist`.
hist = Model.fit(x=enctrain, y=ytrain, epochs=50, validation_data=(enctest, ytest))

In [None]:
# Baseline: decision tree on the encoded (unscaled) features.
# A fixed seed makes the fitted tree, and therefore its score, repeatable.
tremod = DecisionTreeClassifier(random_state=42)
tremod.fit(enctrain, ytrain)
preds = tremod.predict(enctest)
# accuracy_score takes the true labels first, then the predictions.
accuracy_score(ytest, preds)

In [None]:
# Baseline: logistic regression on the encoded (unscaled) features.
logmodel = LogisticRegression()
logmodel.fit(enctrain, ytrain)
logpreds = logmodel.predict(enctest)
# accuracy_score takes the true labels first, then the predictions.
accuracy_score(ytest, logpreds)


In [None]:
# Baseline: support vector classifier on the encoded (unscaled) features.
svm = SVC()
svm.fit(enctrain, ytrain)
svmpreds = svm.predict(enctest)
# accuracy_score takes the true labels first, then the predictions.
accuracy_score(ytest, svmpreds)

As we can see, none of these algorithms works particularly well on this dataset; the accuracy is mediocre. Let's scale the data:



In [None]:
# Standardize the encoded features. The scaler is fitted on the training
# split only, and the same fitted scaler is then applied to both splits.
std = StandardScaler()
std.fit(enctrain)

scaledtrain = std.transform(enctrain)
scaledtest = std.transform(enctest)

Let's try running the Support Vector Classifier on the scaled data:


In [None]:
# Support vector classifier on the standardized features.
# A fresh estimator is fitted here so that the model trained on the unscaled
# data (`svm`) is not overwritten and the two remain comparable.
scaledsvm = SVC()
scaledsvm.fit(scaledtrain, ytrain)
scaledsvmpreds = scaledsvm.predict(scaledtest)
# accuracy_score takes the true labels first, then the predictions.
accuracy_score(ytest, scaledsvmpreds)

In [None]:
# Logistic regression on the standardized features.
# A fresh estimator is fitted here so that the model trained on the unscaled
# data (`logmodel`) is not overwritten and the two remain comparable.
scaledlogmodel = LogisticRegression()
scaledlogmodel.fit(scaledtrain, ytrain)
scaledlogpreds = scaledlogmodel.predict(scaledtest)
# accuracy_score takes the true labels first, then the predictions.
accuracy_score(ytest, scaledlogpreds)

Let's build another neural network but change its composition a little bit, tinker with it and...

In [None]:
# Second network, trained on the standardized features: four hidden ReLU
# layers of 128 units each, followed by a single sigmoid output unit.
# The input width is read from the scaled training array rather than being
# hard-coded, so it keeps matching if the feature columns change.
ScaledModel = Sequential([
    Dense(128, input_shape=(scaledtrain.shape[1],), activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [None]:
# RMSprop optimizer for the second network.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
rms = RMSprop(learning_rate=0.004)

# Binary cross-entropy matches the single sigmoid output; accuracy is
# reported during training.
ScaledModel.compile(
    optimizer=rms, 
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [None]:

# Train the second network on the standardized features for 220 epochs,
# evaluating on the scaled test split after every epoch. The returned
# training history is kept in `histscaled`.
histscaled = ScaledModel.fit(
    x=scaledtrain,
    y=ytrain,
    validation_data=(scaledtest, ytest),
    epochs=220,
)

The model now shows an accuracy of 91% on the training data and 85% on data that it has not seen before.