**Import all the necessary packages**

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

# Apply seaborn's default styling to every figure drawn below.
sns.set()

# List the contents of the input directory so the CSV path can be checked.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
# Load the churn data set; the RowNumber column becomes the frame's index.
churn_data = pd.read_csv(
    '../input/churn-modelling/Churn_Modelling.csv',
    index_col='RowNumber',
)

In [None]:
# Summarize column dtypes and non-null counts of the raw frame.
churn_data.info()

In [None]:
# Preview the first rows of the raw frame.
churn_data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
churn_data.describe()

We used the describe() function to get the statistics associated with each column. This will be helpful for the purpose of scaling.

In [None]:
# Frequency of each distinct credit score.
churn_data['CreditScore'].value_counts()

We will start by preprocessing each feature.
1) Credit Score

In [None]:
# True if at least one credit score is missing.
churn_data['CreditScore'].isna().any()

Let's remove the CustomerId and Surname columns, as they won't be of any help for a good analysis.

In [None]:
# Remove the identifier columns, which the analysis below does not use.
# The frame is reassigned (rather than mutated with inplace=True) and
# errors='ignore' is passed so that this cell can be executed more than once:
# a second run finds the columns already gone and no longer raises KeyError.
churn_data = churn_data.drop(columns=['CustomerId', 'Surname'],
                             errors='ignore')

In [None]:
# Preview the frame again to confirm CustomerId and Surname are no longer present.
churn_data.head()

In [None]:
# Class frequencies of Geography; dropna=False makes missing values show up as their own row.
churn_data['Geography'].value_counts(dropna=False)

In [None]:
# Class frequencies of Gender; dropna=False makes missing values show up as their own row.
churn_data['Gender'].value_counts(dropna=False)

Now we need to one-hot encode the two categorical columns above. We can see that there are 3 classes in Geography and 2 classes in Gender. Let's convert these string columns into integer indicator columns.

In [None]:
# One-hot encode the two categorical columns into integer indicator columns
# named Geo_<class> and Gen_<class>.
# prefix_sep, dummy_na, sparse and drop_first are left at their pandas
# defaults ('_', False, False, False).
churn_data_cleaned = pd.get_dummies(
    churn_data,
    columns=['Geography', 'Gender'],
    prefix=['Geo', 'Gen'],
    dtype=int,
)

In [None]:
# Inspect the encoded frame, which now carries the Geo_* and Gen_* indicator columns.
churn_data_cleaned

Now that we have cleaned the data, and converted all the categorical data to numeric, we are ready for further analysis.

In [None]:
# One histogram per column of the encoded (unscaled) frame; x tick labels
# are rotated by 30 degrees.
churn_data_cleaned.hist(figsize=(20, 20), bins=10, xrot=30)

In [None]:
# Keep the column names so they can be re-attached to the scaler's output.
labels = churn_data_cleaned.columns
print(labels)

# Standardize every column of the encoded frame.
# NOTE(review): the frame still contains the 'Exited' target at this point,
# so the target is standardized together with the features - confirm that
# this is intended.
scaler = preprocessing.StandardScaler()
scaler.fit(churn_data_cleaned)
scaled_churn_data_cleaned = scaler.transform(churn_data_cleaned)

In [None]:
# Wrap the scaled values in a DataFrame again and restore the column names.
scaled_churn_data_cleaned = pd.DataFrame(scaled_churn_data_cleaned,
                                         columns=labels)

In [None]:
# One histogram per column of the standardized frame, for comparison with
# the unscaled histograms; x tick labels are rotated by 30 degrees.
scaled_churn_data_cleaned.hist(figsize=(20, 20), bins=10, xrot=30)

In [None]:
# Overlay the kernel density estimate of every standardized column on a
# single set of axes.
fig, ax = plt.subplots(1, 1, figsize=(20, 20))
for column_name in scaled_churn_data_cleaned.columns:
    sns.kdeplot(scaled_churn_data_cleaned[column_name],
                # A plain string label: a one-element list would be rendered
                # in the legend as "['CreditScore']".
                label=column_name,
                # `bw` is deprecated in seaborn; `bw_method` is its replacement.
                bw_method=1.5,
                ax=ax)

# All curves share one axis, so label it generically instead of with the
# name of whichever column happened to be drawn last.
ax.set_xlabel('standardized value')
ax.set_title('Kernel density estimate of each standardized column')
# Draw the legend explicitly so each curve can be identified.
ax.legend()

We can now see that, after standardization, the values of all columns fall roughly within the range [-4, 4].

Now that we have done the necessary scaling, we can safely move forward and try to identify the correlation among the features.

First we will calculate the correlation matrix, including the Exited (output feature) column, and see what the dependencies between the columns look like.

In [None]:
# Pairwise Pearson correlation between all columns (target included).
corr = scaled_churn_data_cleaned.corr(method='pearson')

In [None]:
# Annotated heatmap of the correlation matrix on a red-yellow-green colour map.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 10))
sns.heatmap(data=corr, cmap='RdYlGn', annot=True, ax=ax)

Now, let's visualize how each column relates to Exited using box plots from the seaborn package.

In [None]:
# Grid of box plots: one panel per column of the standardized frame, each
# showing that column's distribution split by the 'Exited' column.
# The 7 x 2 grid provides 14 panels, filled in row-major order with the
# frame's columns taken by position.
nr, nc = 7, 2
fig, ax = plt.subplots(nrows=nr, ncols=nc, figsize=(20, 20))

exited_column = scaled_churn_data_cleaned['Exited']
for column_pos, panel in enumerate(ax.ravel()):
    sns.boxplot(x=exited_column,
                y=scaled_churn_data_cleaned.iloc[:, column_pos],
                ax=panel)

Now we are ready to build the inputs for modelling the data set.
Before proceeding, let's remove the output feature, Exited, from the dataset.

In [None]:
# Remove the target so that only input features remain for PCA.
# errors='ignore' keeps this self-referential cell re-runnable: on a second
# execution 'Exited' is already gone and the drop becomes a no-op instead of
# raising KeyError.
scaled_churn_data_cleaned = scaled_churn_data_cleaned.drop(columns='Exited',
                                                           errors='ignore')

In [None]:
# List the columns that remain after removing the target.
scaled_churn_data_cleaned.columns

Now we can apply PCA to the given input dataset.

In [None]:
from sklearn.decomposition import PCA

# Project the standardized features onto their first two principal components.
n_comp = 2
pca = PCA(n_components=n_comp)
principal_components = pca.fit_transform(scaled_churn_data_cleaned)

# Number of projected rows (one per customer record).
principal_components.shape[0]

In [None]:
# Wrap the projected values in a DataFrame.
# The number of column names is taken from the array itself rather than from
# the global `n_comp`, which a later cell reassigns to 10; re-running this
# cell afterwards would otherwise fail with a column-count mismatch.
n_projected = principal_components.shape[1]

# Reuse churn_data's own row index instead of assuming the labels are exactly
# 1..N, so the later index-aligned concat with churn_data.Exited cannot
# silently misalign rows.
pc_df = pd.DataFrame(principal_components,
                     columns=['principal_components_%s' % (k + 1)
                              for k in range(n_projected)],
                     index=churn_data.index)
print(pc_df)

In [None]:
# Model inputs are the principal components; the target is the unscaled
# Exited column of the original frame.
input_components = pc_df
output_components = churn_data['Exited']
print(input_components.shape, output_components.shape)

# Place components and target side by side (rows are matched on the index).
final_df = pd.concat([input_components, output_components], axis=1)

Visualizing in 2D using the Principal components 1 & principal components 2 as y and x axis.

In [None]:
# Scatter plot of the customers in the plane of the first two principal
# components, coloured by whether they exited.
fig, ax = plt.subplots(1, 1, figsize=(20, 20))
ax.set_xlabel('principal_components_1', fontsize=20)
ax.set_ylabel('principal_components_2', fontsize=20)
ax.set_title('Customers Exited on PC1 & PC2', fontsize=20)

Targets = [0, 1]
colors = ['r', 'k']

for target, color in zip(Targets, colors):
    index_no_target = final_df['Exited'] == target
    ax.scatter(final_df.loc[index_no_target, 'principal_components_1'],
               final_df.loc[index_no_target, 'principal_components_2'],
               c=color,
               # Label each artist so the legend is tied to the data it
               # describes rather than to the order of the scatter calls.
               label=str(target))

# Legend and grid are configured once, after all points are drawn.
# A bare ax.grid() toggles the grid's visibility, so calling it on every
# loop iteration flips it back and forth; ax.grid(True) states the intent.
ax.legend()
ax.grid(True)

In [None]:
# Fraction of the total variance captured by each of the two principal components.
pca.explained_variance_ratio_

We observe that only 15.44% and 14.3% of the variance is attributed to the first and second principal components, respectively.
Since we have 13 dimensions in the feature space, we will try to retain most of the variance using 10 principal components.

In [None]:
# Repeat the projection with 10 principal components to retain more variance.
n_comp = 10
pca_10 = PCA(n_components=n_comp)
pca10_comp = pca_10.fit_transform(scaled_churn_data_cleaned)

# Reuse churn_data's own row index instead of assuming the labels are exactly
# 1..N, so the components stay aligned with churn_data.Exited whatever the
# RowNumber values are.
df_PCA_10 = pd.DataFrame(pca10_comp,
                         columns=['Principal_component_%s' % (k + 1)
                                  for k in range(pca10_comp.shape[1])],
                         index=churn_data.index)
print(df_PCA_10)

In [None]:
# Total fraction of the variance retained by the 10 principal components.
sum(pca_10.explained_variance_ratio_)

In [None]:
# Train/test split of the dataset: 60% train, 40% test, with a fixed seed
# so the split is reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df_PCA_10,
    output_components,
    test_size=0.4,
    random_state=0,
)

We can see here that 95.8% of the variance is retained in these 10 Principal Components. 

Now we can proceed for creating the model development using the ten principal components created.
First we will start with Logistic Regression.

In [None]:
# Logistic regression baseline on the 10 principal components
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,classification_report
from sklearn.metrics import confusion_matrix,precision_recall_curve,roc_curve,auc,log_loss

model = LogisticRegression()
model.fit(x_train, y_train)

# Hard class predictions plus the predicted probability of the positive class.
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:, 1]

# ROC curve points, used below to compute the area under the curve.
fpr, tpr, thr = roc_curve(y_test, y_pred_proba)

model_name = type(model).__name__
print('Train/Test split results:')
print(model_name + " accuracy is %2.3f" % accuracy_score(y_test, y_pred))
print(model_name + " log_loss is %2.3f" % log_loss(y_test, y_pred_proba))
print(model_name + " auc is %2.3f" % auc(fpr, tpr))
print(model_name + " score is  %.2f" % model.score(x_test, y_test))

The score is fairly good; let's now try some other algorithms and see if we can get better results.

Now let's start with a Decision Tree model and see the results.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Decision tree with a fixed seed; fit() returns the fitted estimator itself.
model = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
y_pred = model.predict(x_test)

# Mean accuracy on the held-out test split.
score = model.score(x_test, y_test)
print(score)

In [None]:
# 10-fold cross-validated scores of the current model on the training split.
cross_val_score(estimator=model, X=x_train, y=y_train, cv=10)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 bootstrapped trees, each split considering
# sqrt(n_features) candidate features.
# random_state is fixed (as for the train/test split and the decision tree)
# so that the reported score is reproducible across runs.
model = RandomForestClassifier(n_estimators=100,
                               bootstrap=True,
                               max_features='sqrt',
                               random_state=0)
model.fit(x_train, y_train)

# Predict on the held-out test split; predicting on x_train would only
# describe how well the forest fits the data it was trained on.
y_pred = model.predict(x_test)
print(model.score(x_test, y_test))

We see a significant improvement in the model accuracy on going from Decision Tree to Random Forest classifier.