In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the credit-card customer data from the Kaggle input directory.

import pandas as pd

BANK_CSV_PATH = "/kaggle/input/credit-card-customers/BankChurners.csv"
bank = pd.read_csv(BANK_CSV_PATH)

In [None]:
# (rows, columns) of the frame as loaded from the CSV.
bank.shape

In [None]:
# Column labels of the loaded frame.
bank.columns

In [None]:
# The columns listed below are not needed for the analysis, so drop them.

unused_columns = [
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
    "CLIENTNUM",
]
bank = bank.drop(columns=unused_columns)

In [None]:
# Preview the first rows after dropping the unused columns.
bank.head()

In [None]:
# Summary statistics of the frame.
bank.describe()

In [None]:
#heatmap for correlation 

import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))

# Correlation is only defined for numeric columns. Selecting them explicitly keeps
# this cell working on pandas >= 2.0, where DataFrame.corr() raises on string
# columns (e.g. Gender, Attrition_Flag) instead of silently dropping them.
numeric_corr = bank.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Count the missing values in each column.

bank.isna().sum()

In [None]:
# Number of rows per Attrition_Flag value.
bank["Attrition_Flag"].value_counts()

In [None]:
# The data is severely imbalanced, so SMOTE will be applied to balance it.

# The column is passed as `x=`: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional column name combined with `data=bank`
# raises a TypeError there.
sns.countplot(x="Attrition_Flag", data=bank)

In [None]:
# Encode Attrition_Flag as integers: "Existing Customer" -> 1, "Attrited Customer" -> 0.
#
# The result is assigned back to the column instead of calling
# bank["Attrition_Flag"].replace(..., inplace=True): that form mutates a Series
# obtained by indexing and does not update `bank` when pandas Copy-on-Write is
# active. Re-running this cell is harmless because already-encoded values are
# left untouched.
attrition_codes = {"Existing Customer": 1, "Attrited Customer": 0}

# The explicit cast keeps the column numeric even when replace() does not
# downcast the result from object dtype, so later corr() calls still see it.
bank["Attrition_Flag"] = bank["Attrition_Flag"].replace(attrition_codes).astype("int64")

In [None]:
# Re-check the Attrition_Flag counts after the replacement above.
bank["Attrition_Flag"].value_counts()

In [None]:
# Correlation of Attrition_Flag (which records whether the customer kept the
# credit card) with the other numeric columns.

# The categorical columns (Gender, Income_Category, ...) are still strings at this
# point, so restrict the frame to numeric columns first: DataFrame.corr() raises
# on non-numeric data in pandas >= 2.0.
attrition_corr = bank.select_dtypes(include="number").corr()["Attrition_Flag"]
attrition_corr.sort_values().plot(kind="bar")

In [None]:
# Explore the Income_Category column: number of rows per category.

bank["Income_Category"].value_counts()

In [None]:
# EDA for the Gender column: share of existing vs. churned customers per gender.

# Attrition_Flag is encoded as 1 = existing, 0 = churned. The counts are reindexed
# to a fixed [1, 0] order so the hard-coded labels always match their wedges
# instead of depending on value_counts() happening to sort "existing" first.
flag_order = [1, 0]
flag_labels = ["Existing", "Churned"]

size_m = (
    bank.loc[bank["Gender"] == "M", "Attrition_Flag"]
    .value_counts()
    .reindex(flag_order, fill_value=0)
)
size_f = (
    bank.loc[bank["Gender"] == "F", "Attrition_Flag"]
    .value_counts()
    .reindex(flag_order, fill_value=0)
)

fig, (ax1, ax2) = plt.subplots(1, 2)
for ax, sizes, title in ((ax1, size_m, "Male"), (ax2, size_f, "Female")):
    # The second wedge (churned) is pulled out slightly to highlight it.
    ax.pie(sizes, shadow=True, explode=[0, 0.1], labels=flag_labels, autopct='%1.2f%%')
    ax.title.set_text(title)
plt.show()



In [None]:
# The male/female split is not even, so the per-gender churn ratio can be
# ignored as well.

size = bank["Gender"].value_counts()
gender_labels = size.index.values
plt.pie(size, labels=gender_labels, explode=[0.1, 0], autopct='%1.2f%%', shadow=True)
plt.show()



In [None]:
# EDA for the Income_Category column, split by Attrition_Flag.

plt.figure(figsize=(10, 5))
# The column is passed as `x=`: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional column name combined with `data=bank`
# raises a TypeError there.
sns.countplot(x="Income_Category", hue="Attrition_Flag", data=bank)

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column.

categorical_columns = [
    "Gender",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
]
bank = pd.get_dummies(bank, columns=categorical_columns, drop_first=True)

In [None]:
# Correlation of Attrition_Flag with the other fields after one-hot encoding.
plt.figure(figsize=(15, 5))
encoded_attrition_corr = bank.corr()["Attrition_Flag"].sort_values()
encoded_attrition_corr.plot.bar()

In [None]:
# Split into features and label: the features are every column except Attrition_Flag.

X = bank.drop(columns="Attrition_Flag")

In [None]:
# (rows, columns) of the feature frame.
X.shape

In [None]:
# Label: the Attrition_Flag column.
y = bank.loc[:, "Attrition_Flag"]

In [None]:
# Shape of the label Series.
y.shape

In [None]:
# do train test split for features and labels

from sklearn.model_selection import train_test_split

In [None]:
# 70/30 train/test split with a fixed seed.
# stratify=y keeps the Attrition_Flag class ratio identical in both splits; the
# label is imbalanced, so an unstratified random split can leave the test set
# with a different churn ratio than the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101, stratify=y
)

In [None]:
# Number of training samples per label value (Attrition_Flag).
from collections import Counter

train_label_counts = Counter(y_train)
print(train_label_counts)

In [None]:
# Standard-scale the features, then apply SMOTE to handle the class imbalance.

from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only; the test split is transformed
# with the statistics learned from the training data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


from imblearn.over_sampling import SMOTE

# SMOTE generates its synthetic samples randomly. Fixing random_state (same seed
# as the train/test split) makes the resampled training set, and every score
# derived from it, reproducible across runs.
oversample = SMOTE(random_state=101)
X_train, y_train = oversample.fit_resample(X_train, y_train)

# NOTE(review): this cell overwrites X_train / y_train, so re-running it without
# re-running the train/test split scales and oversamples the data a second time.

In [None]:
# Number of training samples per label value (Attrition_Flag) after SMOTE;
# the classes are now equal in size.

from collections import Counter

resampled_label_counts = Counter(y_train)
print(resampled_label_counts)

In [None]:
#perform grid Search CV to find best hyperparameters

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter grid searched for RandomForestClassifier.
# "auto" is deliberately absent from max_features: for classifiers it was an alias
# of "sqrt" (so it only duplicated fits), it was deprecated in scikit-learn 1.1
# and removed in 1.3, where it is rejected as an invalid parameter value.
param = {
    "n_estimators": [10, 50, 100, 200],
    "max_features": ["sqrt", "log2"],
    "criterion": ["gini", "entropy"],
}

In [None]:
# Build the grid search over `param` with a random forest as the estimator.
# random_state pins the forest's internal randomness so that the selected
# hyperparameters and the reported scores are reproducible between runs.

grid = GridSearchCV(RandomForestClassifier(random_state=101), param_grid=param, verbose=2)

In [None]:
# Run the grid search on the scaled, SMOTE-resampled training data.
grid.fit(X_train,y_train)

In [None]:
# Hyperparameter combination selected by the grid search.
grid.best_params_

In [None]:
# Predict labels for the scaled test split using the fitted grid search.
grid_pred = grid.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Evaluate the test-split predictions: confusion matrix first, then the
# classification report.
test_confusion = confusion_matrix(y_test, grid_pred)
test_report = classification_report(y_test, grid_pred)
print(test_confusion)
print(test_report)

In [None]:
# I tried the same approach with Naive Bayes and SVM as well, but their accuracy was lower,
# so my final model is the Random Forest classifier, with 96% accuracy.