<a id="1.1"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Libraries And Utilities</h3>

In [None]:
# --- Numerics and tabular data ---
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# --- Static plotting ---
import matplotlib.pyplot as plt
import seaborn as sns
# --- Interactive plotting (plotly) ---
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
# Initialise plotly's offline notebook mode before any figure is shown.
pyo.init_notebook_mode()
sns.set_style('darkgrid')
# --- scikit-learn: decomposition, splitting/CV, models, pipeline, metrics ---
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix
import scikitplot as skplt

# Default size for every matplotlib/seaborn figure created below.
plt.rc('figure',figsize=(18,9))
# imbalanced-learn supplies SMOTE; it is installed (unpinned) right before its import.
%pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

<a id="1.1"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Data Loading</h3>


In [None]:
# Read the raw customer table from the Kaggle input directory.
c_data = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')
# Discard the trailing two columns; only the remaining ones are analysed.
c_data = c_data.drop(columns=c_data.columns[-2:])
c_data.head()

<a id="1.1"></a>
<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Exploratory Data Analysis</h1>


In [None]:
# Customer age: box plot (with mean marker) stacked above a histogram.
tr1 = go.Box(x=c_data['Customer_Age'], name='Age Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Customer_Age'], name='Age Histogram')

fig = (
    make_subplots(rows=2, cols=1)
    .add_trace(tr1, row=1, col=1)
    .add_trace(tr2, row=2, col=1)
    .update_layout(height=700, width=1200, title_text="Distribution of Customer Ages")
)
fig.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>We can see that the distribution of customer ages in our dataset follows a fairly normal distribution, thus further use of the age feature can be done with the normality assumption.</span></p>

In [None]:
# Share of customers per gender (title spelling corrected: "Proportion").
ex.pie(c_data, names='Gender', title='Proportion Of Customer Genders')

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>There are more samples of females in our dataset compared to males but the percentage of difference is not that significant so we can say that genders are uniformly distributed.</span></p>

In [None]:
# Dependent count: box plot (with mean marker) stacked above a histogram.
tr1 = go.Box(x=c_data['Dependent_count'], name='Dependent count Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Dependent_count'], name='Dependent count Histogram')

fig = (
    make_subplots(rows=2, cols=1)
    .add_trace(tr1, row=1, col=1)
    .add_trace(tr2, row=2, col=1)
    .update_layout(height=700, width=1200, title_text="Distribution of Dependent counts (close family size)")
)
fig.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>The distribution of Dependent counts is fairly normally distributed with a slight right skew.</span></p>

In [None]:
# Share of customers per education level (title spelling corrected: "Proportion").
ex.pie(c_data, names='Education_Level', title='Proportion Of Education Levels')

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>Assuming that most of the customers with an unknown education status lack any sort of education, we can state that more than 70% of the customers have a formal education level, of which about 35% have a higher level of education.</span></p>

In [None]:
# Share of customers per marital status (title spelling corrected: "Proportion").
ex.pie(c_data, names='Marital_Status', title='Proportion Of Different Marriage Statuses')


<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>Almost half of the bank's customers are married and, interestingly enough, almost the entire other half are single. Only about 7% of the customers are divorced, which is surprising considering the worldwide divorce rate statistics! (Let me know where the bank is located and sign me up!)</span></p>

In [None]:
# Share of customers per income bracket (title spelling corrected: "Proportion").
ex.pie(c_data, names='Income_Category', title='Proportion Of Different Income Levels')

In [None]:
# Share of customers per card tier (title spelling corrected: "Proportion").
ex.pie(c_data, names='Card_Category', title='Proportion Of Different Card Categories')

In [None]:
# Months on book: box plot (with mean marker) stacked above a histogram.
tr1 = go.Box(x=c_data['Months_on_book'], name='Months on book Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Months_on_book'], name='Months on book Histogram')

fig = (
    make_subplots(rows=2, cols=1)
    .add_trace(tr1, row=1, col=1)
    .add_trace(tr2, row=2, col=1)
    .update_layout(height=700, width=1200, title_text="Distribution of months the customer is part of the bank")
)
fig.show()

In [None]:
# Report the kurtosis of Months_on_book as computed by pandas' Series.kurt().
print(f"Kurtosis of Months on book features is : {c_data['Months_on_book'].kurt()}")

<p style="text-align: center;"><span style='font-size: 24px; font-family: "Times New Roman", Times, serif;'>We have a low kurtosis value pointing to a very flat shaped distribution (as can be seen in the plots above as well) meaning we cannot assume normality of the feature.</span></p>

In [None]:
# Number of products held: box plot (with mean marker) stacked above a histogram.
tr1 = go.Box(x=c_data['Total_Relationship_Count'], name='Total no. of products Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Total_Relationship_Count'], name='Total no. of products Histogram')

fig = (
    make_subplots(rows=2, cols=1)
    .add_trace(tr1, row=1, col=1)
    .add_trace(tr2, row=2, col=1)
    .update_layout(height=700, width=1200, title_text="Distribution of Total no. of products held by the customer")
)
fig.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>The distribution of the total number of products held by the customer seems to be closer to a uniform distribution and may appear useless as a predictor for churn status.</span></p>

In [None]:
# Inactive months (last 12): box plot (with mean marker) stacked above a histogram.
tr1 = go.Box(x=c_data['Months_Inactive_12_mon'], name='number of months inactive Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Months_Inactive_12_mon'], name='number of months inactive Histogram')

fig = (
    make_subplots(rows=2, cols=1)
    .add_trace(tr1, row=1, col=1)
    .add_trace(tr2, row=2, col=1)
    .update_layout(height=700, width=1200, title_text="Distribution of the number of months inactive in the last 12 months")
)
fig.show()

In [None]:
# Credit limit: box plot (with mean marker) stacked above a histogram.
tr1 = go.Box(x=c_data['Credit_Limit'], name='Credit_Limit Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Credit_Limit'], name='Credit_Limit Histogram')

fig = (
    make_subplots(rows=2, cols=1)
    .add_trace(tr1, row=1, col=1)
    .add_trace(tr2, row=2, col=1)
    .update_layout(height=700, width=1200, title_text="Distribution of the Credit Limit")
)
fig.show()

In [None]:
# Total transaction amount: box plot (with mean marker) stacked above a histogram.
tr1 = go.Box(x=c_data['Total_Trans_Amt'], name='Total_Trans_Amt Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Total_Trans_Amt'], name='Total_Trans_Amt Histogram')

fig = (
    make_subplots(rows=2, cols=1)
    .add_trace(tr1, row=1, col=1)
    .add_trace(tr2, row=2, col=1)
    .update_layout(height=700, width=1200, title_text="Distribution of the Total Transaction Amount (Last 12 months)")
)
fig.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>We see that the total transaction amount (last 12 months) has a multimodal distribution, meaning there are some underlying groups in our data. It could be an interesting experiment to cluster these groups, examine the similarities between them, and identify what best describes the groups that create the different modes in our distribution.</span></p>

In [None]:
# Class balance of the target: churned vs. existing customers.
ex.pie(
    data_frame=c_data,
    names='Attrition_Flag',
    title='Proportion of churn vs not churn customers',
)

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>As we can see only 16% of the data samples represent churn customers, in the following steps I will use SMOTE to upsample the churn samples to match them with the regular customer sample size in order to give the later selected models a better chance of catching on small details which will almost definitely be missed out with such a size difference.</span></p>

<a id="1.1"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Data Preprocessing</h3>


In [None]:
# Encode the raw customer table numerically. Run once on the freshly loaded
# frame: the source categorical columns are removed at the end of this cell.

# Binary columns -> 0/1. `map` yields integers directly instead of relying on
# `replace`'s object-dtype downcasting, and the `astype` fails loudly if an
# unexpected label is present (it would surface as NaN).
c_data = c_data.assign(
    Attrition_Flag=c_data['Attrition_Flag']
    .map({'Attrited Customer': 1, 'Existing Customer': 0})
    .astype('int64'),
    Gender=c_data['Gender'].map({'F': 1, 'M': 0}).astype('int64'),
)

# (categorical column, level left out of the encoding), in the order in which
# the indicator columns are appended. Later cells slice these columns by
# position, so this order must not change.
dummy_specs = [
    ('Education_Level', 'Unknown'),
    ('Income_Category', 'Unknown'),
    ('Marital_Status', 'Unknown'),
    ('Card_Category', 'Platinum'),
]

# dtype is pinned to uint8 so the indicators are integer 0/1 regardless of the
# pandas version (the default became bool in pandas 2.0).
dummy_frames = [
    pd.get_dummies(c_data[column], dtype='uint8').drop(columns=[omitted_level])
    for column, omitted_level in dummy_specs
]

# Append the indicators, then drop the encoded source columns and the
# CLIENTNUM identifier.
c_data = pd.concat([c_data, *dummy_frames], axis=1).drop(
    columns=[column for column, _ in dummy_specs] + ['CLIENTNUM']
)

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>Here we one hot encode all the categorical features describing different statuses of a customer.</span></p>

In [None]:
# Annotated Pearson correlation matrix of the encoded customer table.
sns.heatmap(data=c_data.corr(method='pearson'), annot=True)

<a id="1.1"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Data Upsampling Using SMOTE</h3>


In [None]:
# Oversample the minority (churn) class with SMOTE.
# The seed is fixed so the synthetic rows -- and every score derived from them --
# are reproducible across kernel restarts, matching the random_state=42 used by
# the split and the models.
# NOTE(review): resampling happens before the train/test split, so synthetic
# rows derived from test-set neighbours can end up in training; the test scores
# below are therefore optimistic.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(
    c_data.drop(columns=['Attrition_Flag']),  # every feature column
    c_data['Attrition_Flag'],                 # binary churn target
)
# Re-attach the resampled target as the last column, named 'Churn'.
usampled_df = X.assign(Churn=y)

In [None]:
# Positional split: everything from column 15 up to (not including) the final
# column is copied out as `ohe_data` and then removed from `usampled_df`.
# Not idempotent -- re-running this cell slices the already-reduced frame.
ohe_data = usampled_df.iloc[:, 15:-1].copy()
usampled_df = usampled_df.drop(columns=ohe_data.columns)

In [None]:
# Annotated Pearson correlation matrix of the upsampled, non-one-hot columns.
sns.heatmap(data=usampled_df.corr(method='pearson'), annot=True)


<a id="1.1"></a>
<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Principal Component Analysis Of One Hot Encoded Data </h1>


<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>We will use principal component analysis to reduce the dimensionality of the one-hot encoded categorical variables. This loses some of the variance, but using a couple of principal components instead of tens of one-hot encoded features will help me construct a better model.</span></p>

In [None]:

N_COMPONENTS = 10

# Fit PCA on the one-hot block and keep the projected coordinates.
pca_model = PCA(n_components=N_COMPONENTS)
pc_matrix = pca_model.fit_transform(ohe_data)

# Per-component explained-variance ratio and its running total.
evr = pca_model.explained_variance_ratio_
cumsum_evr = evr.cumsum()

# Both curves share one axes: running total first, then the per-component values.
component_index = np.arange(len(cumsum_evr))
ax = sns.lineplot(x=component_index, y=cumsum_evr, label='Explained Variance Ratio')
ax.set_title(f'Explained Variance Ratio Using {N_COMPONENTS} Components')
ax = sns.lineplot(x=component_index, y=evr, label='Explained Variance Of Component X')
ax.set_xticks(list(range(len(cumsum_evr))))
ax.set_xlabel('Component number #')
ax.set_ylabel('Explained Variance')
plt.show()

In [None]:
# Append the principal-component scores as columns PC-0 .. PC-(N_COMPONENTS-1).
pc_columns = [f'PC-{i}' for i in range(N_COMPONENTS)]
usampled_df_with_pcs = pd.concat(
    [usampled_df, pd.DataFrame(pc_matrix, columns=pc_columns)],
    axis=1,
)
usampled_df_with_pcs

In [None]:
# Annotated Pearson correlation matrix including the principal components.
sns.heatmap(data=usampled_df_with_pcs.corr(method='pearson'), annot=True)


<a id="1.1"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Model Selection</h3>


In [None]:
# Feature columns fed to every model, in this order.
X_features = [
    'Total_Trans_Ct',
    'PC-3',
    'PC-4',
    'PC-1',
    'PC-0',
    'PC-2',
    'Total_Ct_Chng_Q4_Q1',
    'Total_Relationship_Count',
]

# Design matrix and target taken from the upsampled frame.
X = usampled_df_with_pcs.loc[:, X_features]
y = usampled_df_with_pcs.loc[:, 'Churn']

In [None]:
# Hold-out split with a fixed seed; the test fraction is left at the library default.
(train_x, test_x,
 train_y, test_y) = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# Optional boosted-tree libraries, left disabled (previously parked inside a
# stray string literal; none are used in the cells shown here):
#   from xgboost import XGBClassifier
#   from lightgbm import LGBMClassifier
#   from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


<a id="1.1"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Cross Validation</h3>


In [None]:
def _scaled_pipeline(estimator):
    """Wrap `estimator` behind a StandardScaler step.

    The estimator step keeps the name "RF" for every model, as the pipelines
    in this notebook always have.
    """
    return Pipeline(steps=[('scale', StandardScaler()), ("RF", estimator)])


def _cv_f1(pipeline):
    """Return the 5-fold cross-validated F1 scores of `pipeline` on the training split."""
    return cross_val_score(pipeline, train_x, train_y, cv=5, scoring='f1')


# One scaled pipeline per candidate model.
rf_pipe = _scaled_pipeline(RandomForestClassifier(random_state=42))
ada_pipe = _scaled_pipeline(AdaBoostClassifier(random_state=42, learning_rate=0.7))
svm_pipe = _scaled_pipeline(SVC(random_state=42, kernel='rbf'))

grd_pipe = _scaled_pipeline(GradientBoostingClassifier(random_state=42))
knn_pipe = _scaled_pipeline(KNeighborsClassifier())
dtr_pipe = _scaled_pipeline(DecisionTreeClassifier(random_state=42))
gnb_pipe = _scaled_pipeline(GaussianNB())

# Per-fold F1 scores for each pipeline.
rf_f1_cross_val_scores = _cv_f1(rf_pipe)
ada_f1_cross_val_scores = _cv_f1(ada_pipe)
svm_f1_cross_val_scores = _cv_f1(svm_pipe)

grd_f1_cross_val_scores = _cv_f1(grd_pipe)
knn_f1_cross_val_scores = _cv_f1(knn_pipe)
dtr_cross_val_scores = _cv_f1(dtr_pipe)
gnb_cross_val_scores = _cv_f1(gnb_pipe)


In [None]:
# (display name, per-fold F1 scores) pairs, in plotting order.
classifiers_score = list(zip(
    [
        'Random Forest',
        'Adaboost',
        'SVM',
        'Gradient Boosting',
        'KNeighbors',
        'DecisionTree',
        'GaussianNB',
    ],
    [
        rf_f1_cross_val_scores,
        ada_f1_cross_val_scores,
        svm_f1_cross_val_scores,
        grd_f1_cross_val_scores,
        knn_f1_cross_val_scores,
        dtr_cross_val_scores,
        gnb_cross_val_scores,
    ],
))


In [None]:

# Plot the per-fold F1 scores of every classifier, one row per model.
# The grid is created once and shown once: calling plt.show() inside the loop
# flushes the figure on every iteration, so the subplot grid is never assembled.
n_models = len(classifiers_score)
cv_fig, cv_axes = plt.subplots(
    n_models, 1,
    figsize=(18, 4 * n_models),  # scale the height with the number of rows
    squeeze=False,               # always a 2-D array, even for a single model
)

for ax, (model_name, fold_scores) in zip(cv_axes[:, 0], classifiers_score):
    fold_ids = list(range(len(fold_scores)))
    sns.lineplot(x=fold_ids, y=fold_scores, ax=ax)
    ax.set_title(model_name)
    ax.set_xticks(fold_ids)
    ax.set_xlabel('Fold Number')
    ax.set_ylabel('F1 Score')

cv_fig.tight_layout()  # keep titles and axis labels of adjacent rows from overlapping
plt.show()
    

<a id="1.1"></a>
<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Model Evaluation</h1>


In [None]:
# Fit the three shortlisted pipelines on the training split and predict the
# held-out split.
rf_pipe.fit(train_x, train_y)
rf_prediction = rf_pipe.predict(test_x)

ada_pipe.fit(train_x, train_y)
ada_prediction = ada_pipe.predict(test_x)

svm_pipe.fit(train_x, train_y)
svm_prediction = svm_pipe.predict(test_x)

# f1_score expects (y_true, y_pred). The arguments were reversed before; the
# binary F1 value is the same either way, but the order now matches the
# library signature and the baseline loop at the end of the notebook.
print('F1 Score of Random Forest Model On Test Set - {}'.format(f1(test_y, rf_prediction)))
print('F1 Score of AdaBoost Model On Test Set - {}'.format(f1(test_y, ada_prediction)))
print('F1 Score of SVM Model On Test Set - {}'.format(f1(test_y, svm_prediction)))

<a id="1.1"></a>
<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Model Evaluation On Original Data (Before Upsampling)</h1>


In [None]:
# Score the fitted pipelines on the original (non-upsampled) customers.

# One-hot indicator columns of the original frame (positional: column 16 onward).
ohe_data = c_data[c_data.columns[16:]].copy()

# Project with the PCA that was fitted on the upsampled data. Re-fitting here
# (fit_transform) would produce a different basis, so 'PC-k' would no longer
# mean what it meant when the models were trained.
pc_matrix = pca_model.transform(ohe_data)

original_df_with_pcs = pd.concat(
    [
        c_data,
        pd.DataFrame(
            pc_matrix,
            columns=['PC-{}'.format(i) for i in range(0, N_COMPONENTS)],
            index=c_data.index,  # align the component rows with c_data explicitly
        ),
    ],
    axis=1,
)

# NOTE(review): these rows were also part of the SMOTE input, so many of them
# were seen during training; treat the scores as optimistic.
unsampled_data_prediction_RF = rf_pipe.predict(original_df_with_pcs[X_features])
unsampled_data_prediction_ADA = ada_pipe.predict(original_df_with_pcs[X_features])
unsampled_data_prediction_SVM = svm_pipe.predict(original_df_with_pcs[X_features])

In [None]:
# f1_score expects (y_true, y_pred); the arguments were reversed before. The
# binary F1 value is unchanged, but the call now follows the library signature.
print('F1 Score of Random Forest Model On Original Data (Before Upsampling) - {}'.format(f1(original_df_with_pcs['Attrition_Flag'], unsampled_data_prediction_RF)))
print('F1 Score of AdaBoost Model On Original Data (Before Upsampling) - {}'.format(f1(original_df_with_pcs['Attrition_Flag'], unsampled_data_prediction_ADA)))
print('F1 Score of SVM Model On Original Data (Before Upsampling) - {}'.format(f1(original_df_with_pcs['Attrition_Flag'], unsampled_data_prediction_SVM)))

In [None]:
# Confusion matrix laid out with predictions on the rows and true labels on
# the columns: computed as (y_true, y_pred) and transposed, which is the
# layout the tick labels below describe.
rf_confusion = confusion_matrix(
    original_df_with_pcs['Attrition_Flag'],
    unsampled_data_prediction_RF,
).T

ax = sns.heatmap(rf_confusion, annot=True, cmap='coolwarm', fmt='d')
ax.set_title('Prediction On Original Data With Random Forest Model Confusion Matrix')
ax.set_xticklabels(['Not Churn', 'Churn'], fontsize=18)
ax.set_yticklabels(['Predicted Not Churn', 'Predicted Churn'], fontsize=18)

plt.show()

In [None]:
# Class probabilities from the random-forest pipeline on the original data.
# Note: this rebinds `unsampled_data_prediction_RF` from hard labels to a
# probability matrix.
unsampled_data_prediction_RF = rf_pipe.predict_proba(original_df_with_pcs[X_features])
skplt.metrics.plot_precision_recall(
    y_true=original_df_with_pcs['Attrition_Flag'],
    y_probas=unsampled_data_prediction_RF,
)

In [None]:

# Baseline comparison: fit several untuned classifiers on the training split
# and report their F1 on the held-out split.
# NOTE(review): unlike the pipelines used for cross-validation, these models
# are fitted without a StandardScaler step.
models = [
    ('Naive Bayes', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier(random_state=1)),
    ('Random Forest', RandomForestClassifier(random_state=1)),
    ('SVM', SVC(gamma='auto', random_state=1)),
]

# Evaluate each model in turn. `names[k]` and `results[k]` describe the same
# model; both lists used to be created but never filled.
results = []
names = []
for name, model in models:
    model.fit(train_x, train_y)
    y_pred = model.predict(test_x)
    f1_ = f1(test_y, y_pred)
    results.append(f1_)
    names.append(name)
    print("{} : {}".format(name, f1_))
  