In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from xgboost import XGBClassifier

# Notebook-wide display options: show up to 100 columns and allow wide rows so
# frame previews are not truncated.
pd.set_option("display.max_columns", 100)
pd.set_option('display.width', 1000)

path = '/kaggle/input/credit-card-customers/BankChurners.csv'
df = pd.read_csv(path)


#-------------some pre data processing(quite obvious ones)-----------------------
# Encode the target: 0 = existing customer, 1 = attrited (churned) customer.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `df['Attrition_Flag']`: that chained
# in-place form is deprecated and leaves `df` unchanged under pandas
# Copy-on-Write. `.map` turns any label outside the mapping into NaN, so the
# `astype` below fails loudly instead of letting a stray string through.
df['Attrition_Flag'] = (
    df['Attrition_Flag']
    .map({'Existing Customer': 0, 'Attrited Customer': 1})
    .astype('int64')
)

# Drop the first column and the last two columns, selected by position.
# NOTE(review): presumably the client id and two precomputed classifier
# columns -- confirm against the CSV header.
df = df.drop(columns=df.columns[[0, -1, -2]])

df.head(4)

# **Exploratory Data Analysis**

In [None]:
#------------Exploratory Data Analysis---------------------
# Age distribution: the histogram is roughly bell-shaped.
fig, ax = plt.subplots()
ax.hist(df['Customer_Age'], bins=40, density=True)
ax.set_xticks(range(25, 75, 1))
ax.set_title('Customer age distribution')
ax.set_xlabel('Customer_Age')
ax.set_ylabel('Density')
plt.show()

# Overall churn share. `value_counts()` orders rows by frequency, so the
# counts are pinned to the order [0, 1] to guarantee that the positional
# labels ('Existing', 'Churned') always match the encoded classes.
sizes = df['Attrition_Flag'].value_counts().reindex([0, 1], fill_value=0).tolist()
fig, ax = plt.subplots()
ax.pie(sizes, explode=[0, 0.1], shadow=True, autopct='%1.2f%%', labels=['Existing', 'Churned'])
ax.set_title('Existing vs churned customers')
plt.show()

#______gender based division
# Churn share within each gender, one pie per gender.
fig, axes = plt.subplots(1, 2)
for ax, (gender_code, title) in zip(axes, (('F', 'Females'), ('M', 'Males'))):
    # Pin the class order to [0, 1]: value_counts() sorts by frequency, and a
    # subgroup with a single class would otherwise yield one wedge and clash
    # with the two-element `explode` / `labels`.
    counts = (
        df.loc[df['Gender'] == gender_code, 'Attrition_Flag']
        .value_counts()
        .reindex([0, 1], fill_value=0)
    )
    ax.pie(counts, explode=[0, 0.1], shadow=True, autopct='%1.2f%%', labels=['Existing', 'Churned'])
    ax.set_title(title)
plt.show()

# Overall gender split: the two shares are close, i.e. gender is roughly
# equally distributed. Labels come from the counts' own index, so they always
# match the wedges.
sizes_gender = df['Gender'].value_counts()
fig, ax = plt.subplots()
ax.pie(
    sizes_gender.tolist(),
    autopct='%1.2f%%',
    labels=sizes_gender.index.values,
    explode=[0.1, 0],
    shadow=True,
)
ax.set_title('Customers by gender')
plt.show()

#______card category based
# Churn share within each card category, laid out on a 2x2 grid
# (row-major: Blue, Silver, Gold, Platinum).
card_panels = (
    ('Blue', 'Blue Card'),
    ('Silver', 'Silver Card'),
    ('Gold', 'Gold Card'),
    ('Platinum', 'Platinum Card'),
)
fig, axes = plt.subplots(2, 2)
for ax, (card, title) in zip(axes.flat, card_panels):
    # Pin the class order to [0, 1]: value_counts() sorts by frequency, so in
    # a subgroup where churners outnumber existing customers the positional
    # labels would be swapped, and a single-class subgroup would not match the
    # two-element `explode`.
    counts = (
        df.loc[df['Card_Category'] == card, 'Attrition_Flag']
        .value_counts()
        .reindex([0, 1], fill_value=0)
    )
    ax.pie(counts, explode=[0, 0.1], shadow=True, autopct='%1.2f%%', labels=['Existing', 'Churned'])
    ax.set_title(title)
plt.show()


#______income category based

# Churn share within each income band, laid out on a 2x3 grid (row-major).
# Each entry is (value in `Income_Category`, panel title).
income_panels = (
    ('Less than $40K', '<40K'),
    ('$40K - $60K', '40-60K'),
    ('$60K - $80K', '60-80K'),
    ('$80K - $120K', '80-120K'),
    ('$120K +', '>120K'),
    ('Unknown', 'Unknown'),
)

fig, axes = plt.subplots(2, 3)
for ax, (income_band, title) in zip(axes.flat, income_panels):
    # Pin the class order to [0, 1] so the positional labels always match the
    # encoded classes, regardless of which class is more frequent in the band.
    counts = (
        df.loc[df['Income_Category'] == income_band, 'Attrition_Flag']
        .value_counts()
        .reindex([0, 1], fill_value=0)
    )
    ax.pie(counts, explode=[0, 0.1], shadow=True, autopct='%1.2f%%', labels=['Existing', 'Churned'])
    ax.set_title(title)
plt.show()

#-----education based----
# Share of customers at each education level; wedge labels are taken from the
# index of the counts themselves.
education_counts = df['Education_Level'].value_counts()
plt.pie(
    list(education_counts),
    labels=list(education_counts.index),
    autopct='%1.2f%%',
)
plt.show()

#-------------------correlation between numeric variables and target--------------
# Pairwise correlation of every numeric column, drawn as an annotated heatmap.
correlations = df.select_dtypes(include='number').corr()
sbn.heatmap(correlations, annot=True, cmap="YlGnBu")
plt.xticks(rotation=45)
plt.show()


# **DATA PROCESSING**

**Conversion of categorical variables into numerical**

In [None]:
# #--------------let's convert some categorical variables into numerical--------------
# NOTE: every transformation below assigns its result back to `df` instead of
# calling `df[col].replace(..., inplace=True)`. The chained in-place form is
# deprecated and does not modify `df` under pandas Copy-on-Write.

# #ordinal to numerical
# Ordered categories become integer ranks. 'Unknown' is intentionally absent
# from the education and income maps, so `.replace` leaves it as the string
# 'Unknown' in those columns.
map_education_level = {'High School':1,'Graduate':3,'Uneducated':0,'College':2,'Post-Graduate':4,'Doctorate':5}
map_income_level = {'$60K - $80K':3,'Less than $40K':1, '$80K - $120K':4,'$40K - $60K':2,'$120K +':5}
map_card_category = {'Blue':1,'Gold':3,'Silver':2,'Platinum':4}
df['Education_Level'] = df['Education_Level'].replace(map_education_level)
df['Income_Category'] = df['Income_Category'].replace(map_income_level)
df['Card_Category'] = df['Card_Category'].replace(map_card_category)

# #hot encoding of gender category
# Two 0/1 indicator columns: `Gender_M` is inserted at position 2 and the
# original `Gender` column is renamed to `Gender_F` and overwritten.
gender = df['Gender']
df.insert(2, 'Gender_M', gender.eq('M').astype(int))
df = df.rename(columns={'Gender': 'Gender_F'})
df['Gender_F'] = gender.eq('F').astype(int)

# #hot encoding of marital status
# Each indicator is inserted at position 7, so the final left-to-right order
# is Married, Unknown, Divorced, Single (the last one inserted ends up first).
marital_status = df['Marital_Status']
for status in ('Single', 'Divorced', 'Unknown'):
    df.insert(7, status, marital_status.eq(status).astype(int))
df = df.rename(columns={'Marital_Status': 'Married'})
df['Married'] = marital_status.eq('Married').astype(int)


df.head()

**Dealing with missing values**

In [None]:
# At this point the two ordinal columns hold numeric codes mixed with the
# string 'Unknown'. Coercing to numeric turns every non-numeric entry
# ('Unknown') into NaN, which gives clean float columns and lets pandas'
# NaN-aware statistics and `fillna` do the imputation.
income_codes = pd.to_numeric(df['Income_Category'], errors='coerce')
education_codes = pd.to_numeric(df['Education_Level'], errors='coerce')

plt.hist(income_codes.dropna())   # income is right-skewed, so the median is used as the central value
plt.show()

plt.hist(education_codes.dropna())   # education looks roughly bell-shaped, so the mean is used as the central value
plt.show()

#Missing values in education column
# `mean()` skips NaN, so it is computed over the known education levels only.
mean_education = education_codes.mean()
df['Education_Level'] = education_codes.fillna(mean_education)

#Missing values in income column
# `median()` skips NaN, so it is computed over the known income bands only.
median_salaries = income_codes.median()
df['Income_Category'] = income_codes.fillna(median_salaries)

df.head()


# Dataset Split

In [None]:
# The first column is used as the target and every remaining column as a
# feature (presumably the first column is Attrition_Flag -- confirm).
y = df.iloc[:, 0]
x = df.iloc[:, 1:]

# Hold out 33% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

# Feature scaling and upsampling using SMOTE

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)


#-----Upsampling----
from sklearn.utils import resample
from collections import Counter

print("Before Upsampling:-")
print(Counter(y_train))

# Alternative (not used): plain random oversampling of the minority class.
# X_train_upsampled, y_train_upsampled = resample(x_train[y_train == 1],
#                                                 y_train[y_train == 1],
#                                                 replace=True,
#                                                 n_samples=x_train[y_train == 0].shape[0],
#                                                 random_state=123)


# Let's use SMOTE to oversample
# Only the training split is resampled; the test split is left untouched.
# A fixed `random_state` makes the synthetic samples -- and therefore every
# downstream score -- identical from run to run.
from imblearn.over_sampling import SMOTE
oversample = SMOTE(random_state=42)
x_train_upsampled, y_train_upsampled = oversample.fit_resample(x_train,y_train)

print("After Upsampling:-")
print(Counter(y_train_upsampled))

# MODEL FITTING

In [None]:

#-----Random Forest after upsampling------
print("\n\n\n\n AFTER UPSAMPLING\n\n")

# Fit a 50-tree forest on the upsampled training data (`fit` returns the
# estimator itself, so construction and fitting are chained).
classifier = RandomForestClassifier(n_estimators=50, random_state=0).fit(
    x_train_upsampled, y_train_upsampled
)

# Predictions are made for the held-out test split only.
predict_val_rf = classifier.predict(x_test)


# Model Performance
# Headline metrics, each reported as a percentage.
for label, metric in (
    ("Accuracy : ", accuracy_score),
    ("Recall : ", recall_score),
    ("Precision : ", precision_score),
):
    print(label, metric(y_test, predict_val_rf) * 100)

print(confusion_matrix(y_test, predict_val_rf))
print(classification_report(y_test, predict_val_rf))


Using XGBoost

In [None]:
# Fit a gradient-boosted classifier with default settings on the upsampled
# training data.
model = XGBClassifier()
model.fit(x_train_upsampled, y_train_upsampled)

# Predictions are made for the held-out test split only.
predict_val_rf2 = model.predict(x_test)


# Model Performance
# Headline metrics, each reported as a percentage.
for label, metric in (
    ("Accuracy : ", accuracy_score),
    ("Recall : ", recall_score),
    ("Precision : ", precision_score),
):
    print(label, metric(y_test, predict_val_rf2) * 100)

print(confusion_matrix(y_test, predict_val_rf2))
print(classification_report(y_test, predict_val_rf2))


Each run of this notebook can give slightly different scores, because SMOTE (when run without a fixed random seed) does not generate exactly the same synthetic observations every time. I ran it about 10 times and the recall was always around 91%, so I am taking that as my final result.