In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found;
# the middle element of each os.walk() tuple (the sub-directory names) is unused.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

In [None]:
# Load the BankChurners CSV from the Kaggle input directory into the working DataFrame `df_cc`
df_cc = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')

In [None]:
# Summary statistics from pandas' default describe() call
df_cc.describe()

In [None]:
# Remove the two Naive_Bayes_Classifier_* columns from df_cc (in place, single call)
naive_bayes_cols = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
]
df_cc.drop(columns=naive_bayes_cols, inplace=True)

In [None]:
# Remove the CLIENTNUM column from df_cc (in place)
df_cc.drop(columns=['CLIENTNUM'], inplace=True)

In [None]:
# Preview the first rows of the frame
df_cc.head()

In [None]:
# Visual null check: draw the boolean missing-value mask of df_cc as a heatmap
# (row tick labels and the colour bar are switched off)
null_mask = df_cc.isna()
sns.heatmap(null_mask, cmap="Blues", cbar=False, yticklabels=False)

In [None]:
# Data Visualization: three side-by-side count plots on one 15x5 figure
fig, (ax_attrition, ax_gender, ax_marital) = plt.subplots(1, 3, figsize=(15, 5))
# Left: overall class counts; middle and right: counts split by Attrition_Flag
sns.countplot(x='Attrition_Flag', data=df_cc, label='Counts', ax=ax_attrition)
sns.countplot(x='Gender', data=df_cc, hue='Attrition_Flag', ax=ax_gender)
sns.countplot(x='Marital_Status', data=df_cc, hue='Attrition_Flag', ax=ax_marital)

In [None]:
# Distribution of attrition across ages. Looks pretty spread
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(x='Customer_Age', data=df_cc, hue='Attrition_Flag', ax=ax)

In [None]:
# Histogram of Dependent_count, grouped into 5 equal-width bins.
# NOTE(review): if the column holds more than 5 distinct integer values, at least two
# of them will share a bin — confirm bins=5 is intended.
df_cc['Dependent_count'].hist(bins=5)

In [None]:
def update_edu_unknown(data):
    """Return the education level held in ``data``, relabelling 'Unknown' as 'Edu_Unknown'.

    The relabelling exists because education level and marital status both use
    'Unknown' as a value, and the two need to stay distinguishable.

    Parameters
    ----------
    data : pandas.Series or sequence
        A row whose first element (by position) is the education level.

    Returns
    -------
    'Edu_Unknown' when the first element equals 'Unknown'; otherwise the first
    element unchanged.
    """
    # Positional access has to be explicit for a Series: plain data[0] on a
    # label-indexed Series is a deprecated fallback in pandas 2.x.
    # Lists, tuples and NumPy rows have no .iloc, so they keep ordinary indexing.
    edu_level = data.iloc[0] if hasattr(data, 'iloc') else data[0]
    return 'Edu_Unknown' if edu_level == 'Unknown' else edu_level

In [None]:
# Relabel 'Unknown' education levels as 'Edu_Unknown' so they stay distinguishable from
# 'Unknown' values in other categorical columns. Series.replace matches whole values and
# works on the entire column at once, so no row-wise apply (and no Marital_Status column)
# is needed here.
df_cc['Education_Level'] = df_cc['Education_Level'].replace('Unknown', 'Edu_Unknown')

In [None]:
# Checking impact of other parameters on attrition. No bias seen
# Three count plots, each split by Attrition_Flag, occupy the first three slots of a 2x2 grid.
fig = plt.figure(figsize=(18, 8))
ax_education = fig.add_subplot(2, 2, 1)
sns.countplot(x='Education_Level', data=df_cc, hue='Attrition_Flag', ax=ax_education)
ax_income = fig.add_subplot(2, 2, 2)
sns.countplot(x='Income_Category', data=df_cc, hue='Attrition_Flag', ax=ax_income)
ax_card = fig.add_subplot(2, 2, 3)
sns.countplot(x='Card_Category', data=df_cc, hue='Attrition_Flag', ax=ax_card)

In [None]:
# Swap the categorical Attrition_Flag column for its dummy encoding.
# pop() removes the column from df_cc and hands it to get_dummies; drop_first=True
# leaves out the indicator of the first level. The dummy column keeps the bare level
# name because later cells select it as 'Existing Customer'.
attir_flag = pd.get_dummies(df_cc.pop('Attrition_Flag'), drop_first=True)
df_cc = pd.concat([df_cc, attir_flag], axis=1)

In [None]:
# One-hot encode Gender and swap the encoded column(s) in for the original.
# prefix='Gender' gives the dummy columns self-describing names ('Gender_<level>') instead of
# bare level names, so they cannot collide with dummies built from other columns.
# drop_first=True leaves out the first level, which is implied when the remaining indicators are 0.
gender_flag = pd.get_dummies(df_cc['Gender'], prefix='Gender', drop_first=True)
df_cc = pd.concat([df_cc.drop(columns=['Gender']), gender_flag], axis=1)

In [None]:
# One-hot encode Marital_Status and swap the encoded columns in for the original.
# Marital_Status includes an 'Unknown' level; without a prefix its dummy column would be
# named just 'Unknown', which says nothing about its origin and would duplicate any other
# column that ends up with the same label. prefix='Marital_Status' keeps the names unique.
# drop_first=True leaves out the first level, which is implied when the remaining indicators are 0.
marital_flag = pd.get_dummies(df_cc['Marital_Status'], prefix='Marital_Status', drop_first=True)
df_cc = pd.concat([df_cc.drop(columns=['Marital_Status']), marital_flag], axis=1)

In [None]:
# One-hot encode Education_Level and swap the encoded columns in for the original.
# prefix='Education_Level' gives the dummy columns self-describing names
# ('Education_Level_<level>') so they cannot collide with dummies built from other columns.
# drop_first=True leaves out the first level, which is implied when the remaining indicators are 0.
edu_flag = pd.get_dummies(df_cc['Education_Level'], prefix='Education_Level', drop_first=True)
df_cc = pd.concat([df_cc.drop(columns=['Education_Level']), edu_flag], axis=1)

In [None]:
# One-hot encode Card_Category and swap the encoded columns in for the original.
# prefix='Card_Category' gives the dummy columns self-describing names
# ('Card_Category_<level>') so they cannot collide with dummies built from other columns.
# drop_first=True leaves out the first level, which is implied when the remaining indicators are 0.
cccat_flag = pd.get_dummies(df_cc['Card_Category'], prefix='Card_Category', drop_first=True)
df_cc = pd.concat([df_cc.drop(columns=['Card_Category']), cccat_flag], axis=1)

In [None]:
# One-hot encode Income_Category and swap the encoded columns in for the original.
# prefix='Income_Category' gives the dummy columns self-describing names
# ('Income_Category_<level>') so they cannot collide with dummies built from other columns.
# NOTE(review): Income_Category presumably also has an 'Unknown' level, which un-prefixed
# would duplicate the 'Unknown' dummy produced from Marital_Status — confirm against the data.
# drop_first=True leaves out the first level, which is implied when the remaining indicators are 0.
inccat_flag = pd.get_dummies(df_cc['Income_Category'], prefix='Income_Category', drop_first=True)
df_cc = pd.concat([df_cc.drop(columns=['Income_Category']), inccat_flag], axis=1)

In [None]:
# Preview the frame now that each categorical column has been replaced by dummy columns
df_cc.head()

In [None]:
# Build the feature matrix X (every column except the target) and the target vector y.
# The dtypes are requested explicitly: pandas >= 2.0 creates boolean dummy columns, and calling
# .values on a frame that mixes numeric and boolean columns yields an object-dtype array.
# to_numpy(dtype=float) gives a plain float matrix instead, and fails right here if a
# non-numeric column is still present.
X = df_cc.drop(columns='Existing Customer').to_numpy(dtype=float)
# y is 1 where the 'Existing Customer' indicator is set and 0 otherwise
y = df_cc['Existing Customer'].to_numpy(dtype=int)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state pins the shuffle so that re-running the
# notebook reproduces the same split (and therefore the same scores); stratify=y keeps the
# class proportions of y the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [None]:
# Standardise a fixed range of feature columns.
from sklearn.preprocessing import StandardScaler
# NOTE(review): the range 0:9 is hard-coded by position; presumably it is meant to select the
# continuous columns at the front of X — verify it covers all of them and none of the dummies.
scaled_cols = slice(0, 9)
sc = StandardScaler()
# The scaler is fitted on the training rows only and then reused unchanged on the test rows.
# Results are written back into the existing arrays, so they take on those arrays' dtype.
X_train[:, scaled_cols] = sc.fit_transform(X_train[:, scaled_cols])
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])

In [None]:
# Fit a random forest (10 trees, entropy split criterion, fixed seed) on the training split
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the fitted random forest (`classifier`)
y_predict = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import classification_report
# Confusion matrix for the predictions currently held in y_predict.
cm = confusion_matrix(y_test,y_predict)
# fmt='d' prints each cell count as a plain integer; seaborn's default annotation format
# ('.2g') abbreviates larger counts to two significant digits / scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts the true labels on the rows and the predicted labels on the columns
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Text report of per-class metrics for the predictions currently held in y_predict
print(classification_report(y_test, y_predict))

In [None]:
# Running the data through Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
# random_state fixes the estimator's internal randomness so repeated runs build the same model,
# matching the seeded RandomForestClassifier fitted earlier in the notebook.
# fit() returns the estimator itself, so `grd` is the fitted model.
grd = GradientBoostingClassifier(n_estimators = 999, random_state = 0).fit(X_train,y_train)

In [None]:
# Predict test-set labels with the gradient boosting model; this rebinds y_predict,
# replacing the predictions stored by the earlier classifier.predict(...) cell
y_predict = grd.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import classification_report
# Confusion matrix for the predictions currently held in y_predict.
cm = confusion_matrix(y_test,y_predict)
# fmt='d' prints each cell count as a plain integer; seaborn's default annotation format
# ('.2g') abbreviates larger counts to two significant digits / scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts the true labels on the rows and the predicted labels on the columns
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Text report of per-class metrics for the predictions currently held in y_predict
print(classification_report(y_test, y_predict))