In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# These two bad boys will help us plot our data and understand it a bit graphically:
import seaborn as sns
import matplotlib.pyplot as plt

# Importing some tools to preprocess the data:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Lets import some algorithms now:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, r2_score

# Importing tools form Keras library in order to build Neural Network:
from keras.losses import binary_crossentropy
from keras.layers import Dense
from keras.models import Sequential
from keras.metrics import Accuracy
from keras.optimizers import RMSprop, SGD, Adam
from keras.optimizers.schedules import ExponentialDecay 


Reading the data:


In [None]:
# Load the Telco churn table, using the customer id as the row index.
data = pd.read_csv("../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv", index_col='customerID')

# NOTE(review): TotalCharges is expected to load as text in this dataset because a
# few rows hold a blank string instead of a number -- confirm with data.dtypes.
# Blank strings are not NaN, so a null check would not report them, and a text
# column is treated as categorical by the plots and by the label encoding.
# Coerce the column to float; unparseable cells become NaN and are then set to 0
# (presumably customers who have not been billed yet -- verify against tenure).
# If the column is already numeric, this line leaves it unchanged.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce').fillna(0)

data.head()


Let's check the data for missing values, because plotting tools such as seaborn and matplotlib don't handle them gracefully (i.e. even a single missing value can raise an error).

In [None]:
# Count the missing (NaN) entries in each column.
data.isna().sum()

No missing values! :D

Let's visualize it a little bit now, checking for the proportion of males and females:

In [None]:
# Set the seaborn context and palette first so they are in effect when the axes are created.
sns.set_context("poster", font_scale=0.7)
sns.set_palette(['pink', 'skyblue'])

fig, ax = plt.subplots(figsize=(7, 7))
# Name the column by keyword: recent seaborn releases take the first positional
# argument of countplot as `data` and accept `x` by keyword only, so passing the
# Series positionally either warns or no longer plots the per-category counts.
sns.countplot(data=data, x='gender', ax=ax)
ax.set_title('Customers by gender')
plt.show()


How many senior citizens are there compared to non-senior citizens?

In [None]:
# Set the seaborn context and palette first so they are in effect when the axes are created.
sns.set_context("poster", font_scale=0.7)
sns.set_palette(['k', 'darkgrey'])

fig, ax = plt.subplots(figsize=(8, 8))
# Name the column by keyword: recent seaborn releases take the first positional
# argument of countplot as `data` and accept `x` by keyword only.
sns.countplot(data=data, x='SeniorCitizen', ax=ax)
# Replace the 0/1 tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not a SeniorCitizen', 'SeniorCitizen'])
ax.set_title('Customers by senior-citizen status')
plt.show()

Way more non-senior citizens than senior citizens here.


Here's a little code to make a list of all the categorical columns:

In [None]:
# Boolean mask over the columns: True where the column dtype is `object`.
c = data.dtypes.eq('object')
# Names of the flagged columns, in their original order.
catcol = [name for name, is_object in c.items() if is_object]

Since I am trying to find the correlation between the features in this dataset I will have to convert the categorical values into numerical values using the LabelEncoder.

In [None]:
# Encode a copy, so the frame used by the later plots keeps its original values.
columns = data.columns
encdata = data.copy()
enc = LabelEncoder()

# Replace every column listed in catcol with integer codes; the single encoder
# is re-fitted on each column in turn.
for col in catcol:
    codes = enc.fit_transform(encdata[col])
    encdata[col] = codes

# Rebuild the frame with the original column order.
encdata = pd.DataFrame(encdata, columns=columns)

Finally plotting the HeatMap:

In [None]:
# Pairwise correlations of the encoded columns, drawn as a blue-scale heatmap.
corr_matrix = encdata.corr()
plt.figure(figsize=(9, 9))
sns.heatmap(data=corr_matrix, cmap='Blues')

In [None]:
# Scatter of total charges against tenure, coloured by churn.
sns.set_context("poster", font_scale=0.7)
sns.set_palette(['pink', 'skyblue'])
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(x='TotalCharges', y='tenure', hue='Churn', data=data, ax=ax)

In [None]:
# Scatter of monthly charges against tenure, coloured by churn.
sns.set_context("poster", font_scale=0.7)
sns.set_palette(['pink', 'skyblue'])
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(x='MonthlyCharges', y='tenure', hue='Churn', data=data, ax=ax)

From the plots above, there appears to be little correlation between these features; in my view, nothing can be concluded with the naked eye. (Feel free to correct me.)

Let's redo the whole preprocessing once again, for my own satisfaction:

In [None]:
# Reload the raw table for modelling (customerID stays a regular column here).
data = pd.read_csv("../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# NOTE(review): TotalCharges is expected to load as text in this dataset because a
# few rows hold a blank string instead of a number -- confirm with data.dtypes.
# Left as text, it would be picked up as a categorical column and label-encoded
# into arbitrary integer codes instead of being used as a continuous feature.
# Coerce the column to float; unparseable cells become NaN and are then set to 0
# (presumably customers who have not been billed yet -- verify against tenure).
# If the column is already numeric, this line leaves it unchanged.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce').fillna(0)

data.head()

Let's use Churn as the label and predict it:


In [None]:
# Encode the Churn column as integer class labels; this becomes the target.
enc = LabelEncoder()
y = enc.fit_transform(data['Churn'])

# Remove the target and the customer identifier from the feature table.
data = data.drop(columns=['Churn', 'customerID'])

Defining the new list of categorical features since we dropped a couple of features:

In [None]:
# Boolean mask over the remaining columns: True where the column dtype is `object`.
c = data.dtypes.eq('object')
# Names of the flagged columns, in their original order.
catcol = [name for name, is_object in c.items() if is_object]

Label encoding the data:

In [None]:
# Replace each column listed in catcol with integer codes; the shared encoder
# is re-fitted on every column in turn.
for feature in catcol:
    codes = enc.fit_transform(data[feature])
    data[feature] = codes

Splitting the data into training and test sets now:

In [None]:
# Hold out 5% of the rows for testing.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each run; stratify=y keeps the class ratio of the label equal
# in the training and test parts, which matters with such a small test set.
xtrain, xtest, ytrain, ytest = train_test_split(
    data, y, train_size=0.95, test_size=0.05, random_state=42, stratify=y
)

Let's predict on the data using a Decision Tree Classifier:

In [None]:
# Fit a decision tree on the training split; random_state makes the fitted tree
# (and therefore the reported score) repeatable.
DecModel = DecisionTreeClassifier(random_state=42)
DecModel.fit(xtrain, ytrain)

# Predict the held-out rows.
DecPreds = DecModel.predict(xtest)

# accuracy_score takes (y_true, y_pred) in that order.
accuracy_score(ytest, DecPreds)

Let's predict on the data using a Random Forest Classifier:

In [None]:
# Fit a 1500-tree random forest on the training split.
# random_state makes the forest repeatable; n_jobs=-1 builds the trees on all
# available cores. Note that DecModel / DecPreds are reused here, so the
# decision-tree model and its predictions are overwritten.
DecModel = RandomForestClassifier(n_estimators=1500, random_state=42, n_jobs=-1)
DecModel.fit(xtrain, ytrain)

# Predict the held-out rows.
DecPreds = DecModel.predict(xtest)

# accuracy_score takes (y_true, y_pred) in that order.
accuracy_score(ytest, DecPreds)

Defining a Neural Network for binary classification now:

In [None]:
def neuralnet(xtrain, xtest, ytrain, ytest, epochs=50, learning_rate=0.007):
    """Build, compile and train a small dense network for binary classification.

    The network has three ReLU hidden layers (128, 240 and 32 units) followed by
    a single sigmoid unit. It is compiled with the Adam optimizer, the
    'binary_crossentropy' loss and the 'accuracy' metric.

    Parameters
    ----------
    xtrain : 2-D array-like exposing ``.shape``
        Training features. The network input width is read from
        ``xtrain.shape[1]`` instead of being fixed, so any number of feature
        columns is accepted.
    xtest, ytest
        Features and labels passed to ``fit`` as ``validation_data``.
    ytrain
        Training labels.
    epochs : int, default 50
        Number of training epochs passed to ``fit``.
    learning_rate : float, default 0.007
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    The object returned by the model's ``fit`` call.
    """
    # Take the input width from the data so the model follows the feature table.
    n_features = xtrain.shape[1]

    NModel = Sequential([
        Dense(128, input_shape=(n_features,), activation='relu'),
        Dense(240, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    adam = Adam(learning_rate=learning_rate)

    NModel.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    # The second split is used only for per-epoch validation.
    Fit = NModel.fit(xtrain, ytrain, epochs=epochs, validation_data=(xtest, ytest))
    return Fit

Now it's time to use the Standard Scaler:

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# scaling to both the training and the test features.
scale = StandardScaler().fit(xtrain)
scaledtrain = scale.transform(xtrain)
scaledtest = scale.transform(xtest)

Let's train a neural network on the scaled data:

In [None]:
# Train on the scaled training split; the scaled test split is used for validation.
neuralnet(xtrain=scaledtrain, xtest=scaledtest, ytrain=ytrain, ytest=ytest)

Thank you for going through this notebook! I hope this helped you.