# **Exploring the data**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import date,datetime
from collections import Counter
from sklearn.metrics import confusion_matrix,accuracy_score
import warnings
warnings.filterwarnings('ignore')
sns.set_style("darkgrid")

In [None]:
def auto(axis, offset=5):
    """Write each bar's height as a text label centred just above the bar.

    Parameters
    ----------
    axis : matplotlib Axes (or any object exposing ``patches`` and ``text``)
        The axes whose bar patches should be annotated.
    offset : number, default 5
        Vertical distance, in data units, between the top of a bar and the
        bottom of its label.

    Bars whose height is NaN are skipped, because a label cannot be placed
    at a NaN coordinate. Whole-number heights are shown without a trailing
    ``.0`` so that counts read as ``42`` rather than ``42.0``.
    """
    for bar in axis.patches:
        height = bar.get_height()
        # NaN is the only value that is not equal to itself.
        if height != height:
            continue
        label = int(height) if float(height).is_integer() else height
        axis.text(bar.get_x() + bar.get_width() / 2, height + offset, label, ha='center', va='bottom')

In [None]:
# Read the churn dataset, then discard the first column together with the
# columns at positions 21 and 22 (all selected by position in the raw file).
bank = pd.read_csv('../input/credit-card-customers/BankChurners.csv')
bank = bank.drop(columns=[bank.columns[0], *bank.columns[21:23]])
bank.head()


# **Check for missing (N/A) data**

In [None]:
# Summarise every column's dtype and non-null count to spot missing values.
# NOTE(review): Income_Category is later plotted with an 'Unknown' level;
# placeholder strings like that are counted as non-null here.
bank.info()

# **Number of male and female customers**

In [None]:
# Left panel: bar chart of customers per gender, each bar labelled with its count.
# Right panel: the same split as a pie chart, smallest group first.
fig, a = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
sns.countplot(data=bank, x='Gender', ax=a[0])
auto(a[0])
gender_counts = bank.Gender.value_counts().sort_values()
a[1].pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%')
a[1].legend()

# **Statistics on churned customers**

In [None]:
# Three views of Attrition_Flag: overall counts, share as a pie, counts by gender.
churn_colors = ['lightblue', 'lightgreen']
churn_counts = bank.Attrition_Flag.value_counts()
fig1, a1 = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))

# Overall counts, each bar labelled with its value.
sns.countplot(data=bank, x='Attrition_Flag', palette=churn_colors, ax=a1[0])
auto(a1[0])

# Percentage share of each attrition class.
a1[1].pie(churn_counts, labels=churn_counts.index, autopct='%1.1f%%', colors=churn_colors)

# Counts split by gender, each bar labelled with its value.
sns.countplot(data=bank, x='Attrition_Flag', hue='Gender', ax=a1[2])
auto(a1[2])

# **Age of customers**

In [None]:
# Left panel: distribution of customer age; right panel: age by gender.
fig2, a2 = plt.subplots(1, 2, figsize=(15, 5))
# sns.distplot is deprecated (seaborn >= 0.11) and slated for removal.
# histplot with kde=True and stat='density' keeps the same kind of figure:
# a density-normalised histogram with a KDE curve on top.
sns.histplot(bank['Customer_Age'], ax=a2[0], kde=True, stat='density')
sns.boxplot(y='Customer_Age', x='Gender', data=bank, ax=a2[1])

# **Dependent count data**

In [None]:
# Count customers per number of dependents and label each bar with its count.
a3 = sns.countplot(data=bank, x='Dependent_count')
auto(a3)

# **Education level of customers**

In [None]:
# Education level: overall counts (labelled) on the left, split by gender on the right.
fig4, a4 = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
sns.countplot(data=bank, x='Education_Level', ax=a4[0])
auto(a4[0])
sns.countplot(data=bank, x='Education_Level', hue='Gender', ax=a4[1])

# **Marital Status**

In [None]:
# Marital status: overall counts (labelled) on the left, split by gender on the right.
fig5, a5 = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
sns.countplot(data=bank, x='Marital_Status', ax=a5[0])
auto(a5[0])
sns.countplot(data=bank, x='Marital_Status', hue='Gender', ax=a5[1])

# **Income for customers**

In [None]:
# Fixed x-axis order for the income brackets, lowest to highest, 'Unknown' last.
income_order = [
    'Less than $40K',
    '$40K - $60K',
    '$60K - $80K',
    '$80K - $120K',
    '$120K +',
    'Unknown',
]
# Income category: overall counts (labelled) on the left, split by gender on the right.
fig6, a6 = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
sns.countplot(data=bank, x='Income_Category', order=income_order, ax=a6[0])
auto(a6[0])
sns.countplot(data=bank, x='Income_Category', hue='Gender', order=income_order, ax=a6[1])

In [None]:
# Card category: overall counts (labelled) on the left, split by gender on the right.
fig7, a7 = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
sns.countplot(data=bank, x='Card_Category', ax=a7[0])
auto(a7[0])
sns.countplot(data=bank, x='Card_Category', hue='Gender', ax=a7[1])

# **Convert categorical data to numerical**

In [None]:
# Categorical columns to one-hot encode (the first level of each is dropped).
col = [
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]
# Target: indicator column for 'Attrited Customer', kept as a one-column frame.
data1 = pd.get_dummies(bank.Attrition_Flag)[['Attrited Customer']]
# Features: everything except the raw Attrition_Flag label, with dummies added.
data2 = pd.get_dummies(bank.drop('Attrition_Flag', axis=1), columns=col, drop_first=True)
# The target goes first so that it is column 0 of the combined frame.
data = pd.concat([data1, data2], axis=1)
data.head()

# **Check correlations to find the features for the prediction models**

In [None]:
# Open a large figure, then draw the annotated pairwise-correlation matrix of
# every column in data on it.
a8 = plt.figure(figsize=(15, 10))
corr = data.corr()
sns.heatmap(corr, annot=True)

In [None]:
# Keep the columns whose absolute correlation with the first column of data
# (the churn indicator) exceeds 0.1; the indicator itself is retained too.
target_corr = corr.iloc[:, 0]
col_use = target_corr[target_corr.abs() > 0.1].index
Use = data[col_use]
Use.head(3)

In [None]:
# Annotated correlation matrix restricted to the columns kept in Use.
sns.heatmap(Use.corr(),annot=True)

# **Build the prediction models**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
# Set seed for reproducibility; it is passed as random_state to the train/test
# split and to the LogisticRegression and DecisionTreeClassifier instances.
SEED = 1

In [None]:
# Split data into 70% train and 30% test.
# Column 0 of Use is the churn indicator (the target); every remaining column
# is a feature. Slicing with 1: instead of a fixed 1:10 keeps all selected
# features even if the correlation threshold or the data changes.
X_train, X_test, y_train, y_test = train_test_split(
    Use.iloc[:, 1:],
    Use.iloc[:, 0],
    test_size=0.3,
    random_state=SEED,
)
# Instantiate individual classifiers
lr = LogisticRegression(random_state=SEED)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=SEED)
# Define a list called classifiers that contains the tuples (classifier_name, classifier)
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
]

# **Accuracy of the Logistic Regression, KNN and Decision Tree models**

In [None]:
# Train every base classifier in turn and report its accuracy on the test set
for name, clf in classifiers:
    # Learn from the training split
    clf.fit(X_train, y_train)
    # Score the held-out predictions against the true labels
    y_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print('{:s} : {:.3f}'.format(name, test_accuracy))

# **Update results with the Voting Classifier**

In [None]:
# Instantiate a VotingClassifier 'vc'
vc = VotingClassifier(estimators=classifiers)
# Fit 'vc' to the traing set and predict test set labels
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
# Evaluate the test-set accuracy of 'vc'
print('Voting Classifier: {:.3f}'.format(accuracy_score(y_test, y_pred)))

# **Using Adaptive Boosting (AdaBoost) for the Logistic Regression and Decision Tree models**

In [None]:
# KNN is carried over unboosted; the boosted models are appended to this list
# so that it can be passed to a VotingClassifier afterwards.
classifiers2 = [('K Nearest Neighbours', knn)]
# Boost logistic regression and the decision tree, reporting test accuracy for each
for name, clf in [('Logistic Regression', lr), ('Classification Tree', dt)]:
    # The base learner is passed positionally: its keyword was `base_estimator`
    # before scikit-learn 1.2 and is `estimator` afterwards (the old keyword
    # was removed in 1.4), so the positional form works on both.
    # random_state=SEED keeps the boosted models reproducible like the others.
    adb_clf = AdaBoostClassifier(clf, n_estimators=100, random_state=SEED)
    classifiers2.append((name, adb_clf))
    # Fit the boosted model to the training set
    adb_clf.fit(X_train, y_train)
    # Predict the labels of the test set
    y_pred = adb_clf.predict(X_test)
    # Evaluate the accuracy of the boosted model on the test set
    print('{:s} : {:.3f}'.format(name, accuracy_score(y_test, y_pred)))

# **Combine Adaptive Boosting with the Voting Classifier**

In [None]:
# Instantiate a VotingClassifier 'vc'
vc = VotingClassifier(estimators=classifiers2)
# Fit 'vc' to the traing set and predict test set labels
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
# Evaluate the test-set accuracy of 'vc'
print('Voting Classifier: {:.3f}'.format(accuracy_score(y_test, y_pred)))