# BANK CHURNERS

In [None]:
#Import needed Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import csv
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score,precision_score,precision_recall_curve,confusion_matrix,recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier


In [None]:
# The CSV ends with two non-informative columns: read the file, then keep
# every column except those last two.
data=pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv',na_values='NULL')
data=data.iloc[:,:-2]
data.head(5)

In [None]:
# Seed NumPy's global random generator. `np.random.seed` has to be *called*;
# assigning 42 to it replaces the function and seeds nothing.
np.random.seed(42);seed=42

The target column is `Attrition_Flag`.

In [None]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

In [None]:
# Number of rows for each distinct Attrition_Flag value
data.Attrition_Flag.value_counts()

The data is imbalanced, as the target value counts above show:
only 1627 of the 10127 customers are attrited customers.


In [None]:
# Per-column dtype and non-null count summary
data.info()

The data has no missing values.

In [None]:
# Customer age: take the column as 64-bit integers and summarise it
age=data['Customer_Age'].astype('int64')
age.describe()

In [None]:
# Histogram of customer ages with the 0.25, 0.50 and 0.75 quantiles drawn as
# red vertical lines.
fig=plt.figure(figsize=(5,5))
plt.hist(age,color='g')
plt.title('Age_Plot')
plt.xlabel('Age_In_Years')
plt.grid(True)
for q in (0.25,0.50,0.75):
    plt.axvline(age.quantile(q),c='r',lw=2.0)
plt.show()

The age plot above shows an approximately normal distribution. The vertical lines mark the quartiles (the 25th, 50th and 75th percentiles).

In [None]:
# Store the integer ages back on the frame and release the temporary Series
data['Customer_Age']=age
del age

In [None]:
# Column summary again, after Customer_Age was written back as int64
data.info()

In [None]:
#Gender: how many customers fall under each gender code
gender=data['Gender']
gender.value_counts()

In [None]:
# Histogram of the gender codes. Matplotlib positions string categories in
# order of first appearance, so the tick labels are built from that same order
# instead of assuming which code lands on tick 0.
gender_names={'M':'Male','F':'Female'}
gender_order=list(pd.unique(data.Gender))
plt.hist(data.Gender,bins=4)
plt.xticks(ticks=list(range(len(gender_order))),labels=[gender_names.get(g,g) for g in gender_order])
# grid() expects a boolean; the string 'True' only worked because it is truthy
plt.title('Gender_Plot');plt.grid(True)
plt.show()

In [None]:
# Encode gender as integers (F -> 0, M -> 1). The result is assigned back to
# the column: replace(..., inplace=True) on `data.Gender` works on an
# intermediate Series, which pandas warns about and which leaves `data`
# unchanged when Copy-on-Write is enabled.
data['Gender']=data['Gender'].replace(['F','M'],[0,1])

In [None]:
#Dependent count: take the column as 64-bit integers and summarise it
dependency=data['Dependent_count'].astype('int64')
dependency.describe()

In [None]:
# Histogram of the number of dependents per customer
plt.figure(figsize=(8,10))
plt.hist(dependency,color='g')
plt.title('Dependent_Plot')
plt.grid('True')
plt.show()

In [None]:
# Store the integer dependent counts back on the frame
data['Dependent_count']=dependency

In [None]:
# Column summary again, after Dependent_count was written back as int64
data.info()

Most of the remaining columns hold numeric values and can be converted directly to floating-point numbers.

In [None]:
#Cast the last twelve columns of the frame to float64 in a single astype call
data=data.astype({name:'float64' for name in data.columns[-12:]})
data.head(5)

In [None]:
# Column summary again, after the last twelve columns were cast to float64
data.info()

In [None]:
#Credit Limit: descriptive statistics of the column
data.Credit_Limit.describe()

In [None]:
#Credit limit drawn three ways side by side: histogram, box plot, violin plot
fig,axes=plt.subplots(1,3,figsize=(20,10))
ax1,ax2,ax3=axes
fig.suptitle('Credit_Limit_Plot')

ax1.hist(data.Credit_Limit,color='r')
ax1.set_title('Hist_Of _Credit_Limit')

sns.boxplot(y=data.Credit_Limit,orient='v',color='g',ax=ax2)
ax2.set_title('Boxplot_Of_Credit_Limit')

sns.violinplot(y=data.Credit_Limit,orient='v',color='b',ax=ax3)
ax3.set_title('Violin_Plot_Of_Credit_Limit')

# Label and grid every panel after plotting so the settings are not overwritten
for panel in (ax1,ax2,ax3):
    panel.set_xlabel('Credit_Limit')
    panel.grid('True')
plt.show()

The histogram above shows that the majority of customers have credit limits below 10000, and the violin plot shows that most customers have limits around 5000. The box plot reveals outliers: some customers have credit limits as high as 30000. These outliers have to be dealt with, since this feature will most likely be an informative one.

In [None]:
#Box plots of the continuous features, grouped into two panels
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,10))
fig.suptitle('BoxPlots')

left_cols=['Total_Revolving_Bal','Total_Trans_Amt']
sns.boxplot(data=data[left_cols],ax=ax1)
ax1.set_title('Total_Rev_Bal & Trans_Amt')

right_cols=['Total_Amt_Chng_Q4_Q1','Total_Ct_Chng_Q4_Q1','Avg_Utilization_Ratio']
sns.boxplot(data=data[right_cols],ax=ax2)
ax2.set_title('Tot_Amt_Chg & Ct & Avg_ut_Rat')

# Shorter tick labels on the right-hand panel (the current axes)
ax2.set_xticks([0,1,2])
ax2.set_xticklabels(['Total_Amt_Chng','Total_Ct_Chng','Avg_Utilization'])
plt.show()

Box plots of several features with continuous values are shown above.
They reveal a large number of outliers in some of the features.

# Handling Features with non-numeric dtypes

In [None]:
# Collect the columns whose dtype is still `object` into a separate frame
object_data=data.select_dtypes(include='object')
object_data.head(5)

In [None]:
# Encode the categorical columns as integer codes. Every result is assigned
# back with object_data[...] = ...: calling replace(..., inplace=True) on
# `object_data.<column>` works on an intermediate Series, which pandas warns
# about and which leaves object_data unchanged when Copy-on-Write is enabled.
#1.Attrition_Flag Mapping
object_data['Attrition_Flag']=object_data['Attrition_Flag'].replace(['Existing Customer','Attrited Customer'],[0,1])
# Card Category Mapping
object_data['Card_Category']=object_data['Card_Category'].replace(['Blue','Silver','Gold','Platinum'],[0,1,2,3])
#Income Category Mapping
# NOTE(review): the codes follow value_counts() order (most frequent category
# first), not income order - confirm that a frequency rank is intended.
income_levels=list(data.Income_Category.value_counts().index)
# Build the code list from the categories found instead of assuming exactly six
object_data['Income_Category']=object_data['Income_Category'].replace(income_levels,list(range(len(income_levels))))
# Education Level Mapping
object_data['Education_Level']=object_data['Education_Level'].replace(['Unknown','Uneducated','High School','College','Graduate','Post-Graduate','Doctorate'],[0,1,2,3,4,5,6])
object_data.head(5)

In [None]:
# Number of rows for each distinct Marital_Status value
data.Marital_Status.value_counts()

In [None]:
#Marital Status Mapping
# Assigned back to the column rather than replaced in place on
# `object_data.Marital_Status`, which acts on an intermediate Series and does
# not update object_data when Copy-on-Write is enabled.
object_data['Marital_Status']=object_data['Marital_Status'].replace(['Unknown','Single','Married','Divorced'],[0,1,2,3])
object_data.head(5)

In [None]:
# Copy every column of object_data back onto the main frame under its own name
for name,column in object_data.items():
    data[name]=column
data.head(5)

In [None]:
# Remove the CLIENTNUM column from the frame in place
data.drop(columns=['CLIENTNUM'],inplace=True)

In [None]:
#HeatMap of correlations
fig=plt.figure(figsize=(20,20))
corrmat=data.corr()
# seaborn hides the cells where the mask is True. Build an explicit boolean
# mask for the strict upper triangle; reusing the float correlation values as
# the mask relied on truthiness, so an exactly-zero correlation stayed visible.
mask=np.triu(np.ones(corrmat.shape,dtype=bool),k=1)
sns.heatmap(corrmat,mask=mask,annot=True,vmax=1.0,square=True)
plt.show()

The heatmap of correlations between the features is shown above.
From the heatmap, some features show very weak correlations with the target variable (Attrition_Flag): Education_Level, Credit_Limit, Card_Category and Avg_Open_To_Buy.
The heatmap also shows a perfect correlation between Credit_Limit and Avg_Open_To_Buy, which suggests that one of the two features will not be useful. Features such as Dependent_count and Marital_Status show weak correlations with all other features.

In [None]:
# New feature: open-to-buy amount per unit of revolving balance. The +1 in the
# denominator avoids division by zero; the result is rounded to 3 decimals.
revolving_plus_one=data['Total_Revolving_Bal']+1
data['Avg_per_Total_Rev']=(data['Avg_Open_To_Buy']/revolving_plus_one).round(3)
data.head(5)

In [None]:
# New feature: (open-to-buy + 1) divided by (1 + total transaction amount),
# rounded to 3 decimals.
trans_plus_one=1+data['Total_Trans_Amt']
data['Avg_per_Total_Trans']=((data['Avg_Open_To_Buy']+1)/trans_plus_one).round(3)

In [None]:
# Preview the first rows, including the two derived ratio columns
data.head(5)

In [None]:
#Dropping the features that will not be used for modelling
unused_cols=['Avg_Open_To_Buy','Marital_Status','Education_Level','Card_Category','Dependent_count']
data.drop(columns=unused_cols,inplace=True)
data.head(5)

In [None]:
#Replace each listed feature with log10(value + 1)
col=['Credit_Limit','Customer_Age','Months_on_book','Total_Relationship_Count','Total_Revolving_Bal','Total_Trans_Amt','Total_Trans_Ct','Avg_per_Total_Rev','Avg_per_Total_Trans']
data=data.assign(**{name:np.log10(data[name]+1) for name in col})
data.head(5)

The features contain values of varying magnitude. Values with higher magnitudes will carry higher weights when building models. For this reason, features with higher magnitudes were replaced by the base-10 logarithm of their values, to shrink their weights, put them on par with the other features and help prevent overfitting.

In [None]:
#HeatMap of Transformed features correlations
fig=plt.figure(figsize=(15,15))
corrmat=data.corr()
# seaborn hides the cells where the mask is True. Build an explicit boolean
# mask for the strict upper triangle; reusing the float correlation values as
# the mask relied on truthiness, so an exactly-zero correlation stayed visible.
mask=np.triu(np.ones(corrmat.shape,dtype=bool),k=1)
sns.heatmap(corrmat,mask=mask,annot=True,vmax=1.0,square=True)
plt.title('Heatmap Of Transformed Features Correlation',size=30)
plt.show()

The heatmap of the transformed and extracted features is shown above.

In [None]:
#Split the Data Into test and train using StratifiedShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit as sss
split=sss(n_splits=1,test_size=0.20,random_state=42)
# split() yields *positional* row indices, so select with .iloc (label-based
# .loc only matches while the index is the default 0..n-1 range). The slices
# are copied so that later in-place edits work on independent frames.
train_idx,test_idx=next(split.split(data,data['Attrition_Flag']))
train=data.iloc[train_idx].copy();test=data.iloc[test_idx].copy()
print('Train',train.shape,'\n','Test',test.shape)

In [None]:
#Pull the target column out of each split: pop() returns the column and
#removes it from the frame in one step
y_train=train.pop('Attrition_Flag').values
y_test=test.pop('Attrition_Flag').values
y_train.shape

In [None]:
# Preview the first rows of the training features
train.head(5)

In [None]:
# STANDARD SCALING
# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
scaler=StandardScaler()
train=scaler.fit_transform(train)
test=scaler.transform(test)
train[:3]

# MODEL BUILDING 

A few helper functions are defined first to avoid repeating code.


In [None]:
#Cross-validated evaluation helper
def f(classifier,x,y,Method):
    """Evaluate `classifier` with 5-fold cross-validated predictions.

    Runs cross_val_predict twice on (x, y): once with the default method to
    get predicted labels, and once with `method=Method` to get the
    per-sample output of that estimator method. Prints the confusion matrix
    and the precision and recall (rounded to 3 decimals) of the labels.

    Returns:
        (labels, method_output): the two cross_val_predict results.
    """
    labels=cross_val_predict(classifier,x,y,cv=5)
    method_output=cross_val_predict(classifier,x,y,cv=5,method=Method)
    report=(
        'confusion_matrix:','\n',confusion_matrix(y,labels),'\n','\n',
        'precision_score:',round(precision_score(y,labels),3),'\n','\n',
        'recall_score:',round(recall_score(y,labels),3),
    )
    print(*report)
    return labels,method_output

In [None]:
#Function To Display Precision_Recall_Curve
def curve(thresh,title,y_true=None):
    """Plot precision and recall against the decision threshold.

    Args:
        thresh: per-sample scores passed to precision_recall_curve.
        title: title of the figure.
        y_true: true labels matching `thresh`. Defaults to the notebook-level
            `y_train`, which keeps existing two-argument calls working.
    """
    if y_true is None:
        y_true=y_train
    precision,recall,threshold=precision_recall_curve(y_true,thresh)
    plt.figure(figsize=(8,10))
    # precision and recall have one more entry than threshold; drop the last
    # value of each so the arrays line up.
    plt.plot(threshold,precision[:-1],c='r',label='Precision',lw=2)
    plt.plot(threshold,recall[:-1],c='g',label='Recall',lw=2)
    plt.legend(loc='best')
    plt.xlabel('Threshold')
    plt.ylabel('Precision_Recall')
    plt.title(title)
    # grid() expects a boolean; the string 'True' only worked because it is truthy
    plt.grid(True)
    plt.show()

In [None]:
#Selecting Best Number Of Estimators
# For every candidate forest size, compute the 5-fold cross-validated recall
# on the training data.
estimators=[2,5,10,20,35,50,75,100,150,200,220,250,300]
scores=[
    recall_score(
        y_train,
        cross_val_predict(RandomForestClassifier(n_estimators=n,random_state=42),train,y_train,cv=5),
    )
    for n in estimators
]
# Plot of Scores vs Estimators
plt.figure(figsize=(8,8))
plt.plot(estimators,scores,'r*--',lw=2.5)
plt.title('Random_Forest_Estimators_vs_Scores_Plot')
plt.xlabel('Number of Estimators')
plt.ylabel('Scores')
plt.grid('True')
plt.show()
          

In [None]:
# Random forest with 220 trees, fitted on the training features and labels
forest_clf=RandomForestClassifier(random_state=42,n_estimators=220)
forest_clf.fit(train,y_train)

In [None]:
# Cross-validated labels and predict_proba output for the forest on the
# training data; f() also prints the confusion matrix, precision and recall.
forest_pre,forest_proba=f(forest_clf,train,y_train,'predict_proba')

In [None]:
# Precision/recall vs threshold, using the last column of the predict_proba output
curve(forest_proba[:,-1],'RandomForest_Precision_Recall_Curve')

In [None]:
# Predict labels for the test features with the fitted random forest
forest_test=forest_clf.predict(test)

In [None]:
#Recall of the random forest predictions on the test labels
recall_score(y_test,forest_test)

In [None]:
#Finding Best Estimators for GradientBoostingClassifier
# For every candidate number of boosting stages, compute the 5-fold
# cross-validated recall on the training data.
estimators=[2,5,10,20,35,50,100,150,200,250,270]
scores=[
    recall_score(
        y_train,
        cross_val_predict(GradientBoostingClassifier(n_estimators=n,random_state=42),train,y_train,cv=5),
    )
    for n in estimators
]
plt.figure(figsize=(8,8))
plt.plot(estimators,scores,'r*--',lw=2.5)
plt.title('GradientBoost_Estimators_Vs_Scores')
plt.xlabel('Number Of Estimators')
plt.ylabel('Scores')
plt.grid('True')
plt.show()

In [None]:
# Gradient boosting model with 250 estimators, fitted on the training features and labels
grad_clf=GradientBoostingClassifier(random_state=42,n_estimators=250)
grad_clf.fit(train,y_train)

In [None]:
# Cross-validated labels and predict_proba output for gradient boosting on the
# training data; f() also prints the confusion matrix, precision and recall.
grad_pre,grad_proba=f(grad_clf,train,y_train,'predict_proba')

In [None]:
# Print the figure label, then plot precision/recall vs threshold using the
# last column of the predict_proba output.
print('Fig 1.5')
curve(grad_proba[:,-1],'GradientBoosting Precision Recall Curve')

In [None]:
# Predict labels for the test features with the fitted gradient boosting model
grad_test=grad_clf.predict(test)

In [None]:
#Recall of the gradient boosting predictions on the test labels
recall_score(y_test,grad_test)

The Gradient Boosting classifier gives a recall of 0.885 and a precision of 0.943 on the training data under cross-validation, and a recall of 0.852 on the test data.

In [None]:
# Print each feature's importance in the gradient boosting model.
# The model was fitted on the frame *without* Attrition_Flag, so that column
# must be excluded from the names; zipping against all of `data.columns`
# shifted every importance onto the wrong feature name by one position.
feature_names=data.columns.drop('Attrition_Flag')
for name,importance in zip(feature_names,grad_clf.feature_importances_):
    print(name,'\t',round(importance,5))

The results above show that features such as Total_Trans_Amt and Total_Chng_Q4_Q1 are the most informative and contribute significantly to the Gradient Boosting model, while features such as Gender, Customer_Age and Total_Ct_Chng_Q4_Q1 contribute little to the model, can be considered inconsequential and can be dropped.

PRECISION-RECALL TRADE-OFF: Precision and recall vary with each other and with the threshold used for decision making; as precision increases, recall tends to decrease. If the decision threshold is lowered, a higher recall is achieved while precision drops; if the threshold is raised, a higher precision is achieved at the cost of recall. This can be seen in Fig 1.5 (Gradient Boosting Precision_Recall_Curve): the threshold is at 0.6, which gives a precision of 0.943 and a recall of 0.882. If the threshold is reduced to 0.2, we achieve a recall of 0.9+ while precision will be 0.83. To achieve a recall of 1.0 the threshold must be 0.0, at which point precision will be between 0.35 and 0.40. The business objective is to identify as many as possible of the customers who are likely to churn (be attrited); this is achieved by setting the threshold to obtain a high recall while keeping precision at a reasonable level.
  

In [None]:
#Set The Threshold for Decision Making at 0.2 For GradientBoosting
threshold=0.2
# Scores are the last column of the cross-validated predict_proba output
grad_tradeoff_dec=grad_proba[:,-1]
# Compare against `threshold` itself so changing the value above takes effect
# (the literal 0.2 was repeated here before).
grad_tradeoff_pre=np.where(grad_tradeoff_dec>=threshold,1,0)
print('confusion_matrix:','\n',confusion_matrix(y_train,grad_tradeoff_pre),'\n','\n','Precision_Score:',round(precision_score(y_train,grad_tradeoff_pre),3),'\n','\n','Recall_Score:',round(recall_score(y_train,grad_tradeoff_pre),3))

In [None]:
#PREDICTING TEST DATA
# Scores are the last column of predict_proba on the test features
grad_tradeoff_test_proba=grad_clf.predict_proba(test)[:,-1]
# Reuse the notebook-level `threshold` so the train and test evaluations always
# use the same cut-off (the literal 0.2 was repeated here before).
grad_tradeoff_test_pre=np.where(grad_tradeoff_test_proba>=threshold,1,0)
print('Test_Precision_Score:',round(precision_score(y_test,grad_tradeoff_test_pre),3),'\n','\n','Test Recall Score:',round(recall_score(y_test,grad_tradeoff_test_pre),3))

The result above shows that a threshold of 0.2 yields a recall of 0.932 and a precision of 0.863.


There are no hard rules for choosing decision thresholds, and there are numerous ways to go about it. Different trade-offs between recall and precision can be obtained by continuously varying the threshold.