In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import squarify
from wordcloud import WordCloud
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Load the raw credit-card customer table and preview its first rows.
DATA_PATH = '../input/credit-card-customers/BankChurners.csv'
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the loaded table.
data.describe()

In [None]:
# Column names available in the raw table.
data.columns

In [None]:
# Number of missing values per column.
data.isna().sum()

# EDA

## Age

In [None]:
# Distribution of customer age, with the mean marked by a vertical line.
age = data['Customer_Age']
age_mean = age.mean()
age_median = age.median()

plt.figure(figsize=(15,8))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) when upgrading.
sns.distplot(age, color = 'green')
# The line height (0.05) is a fixed value on the density axis.
plt.vlines(age_mean, 0, 0.05, color = 'green')
print('Age mean : ', age_mean)
print('Age median : ', age_median)

In [None]:
# Bucket customer age into 5 equal-width intervals and store them as a new column.
age_buckets = pd.cut(x=data['Customer_Age'], bins=5)
data['Customer_Age_Categorized'] = age_buckets
data.head()

In [None]:
# Customers per card category, split by age bucket.
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=data, y='Card_Category', hue='Customer_Age_Categorized', ax=ax)
ax.legend(loc='center right')

In [None]:
# Number of customers in each age bucket, per card category.
data.groupby('Card_Category')['Customer_Age_Categorized'].value_counts()

## Customer_Age_Categorized - Credit_Limit

In [None]:
# Credit-limit distribution for every age bucket, one colour per bucket.
# Every colour must be visible on the default white background
# ('white' previously made the first bucket invisible).
colors = ['red', 'yellow', 'blue', 'green', 'black']
# Sort the buckets so colours follow increasing age instead of the order in
# which the buckets happen to appear in the data.
categories = sorted(data['Customer_Age_Categorized'].dropna().unique())

plt.figure(figsize = (15,8))
for color, category in zip(colors, categories):
    bucket_credit_limit = data.loc[data['Customer_Age_Categorized'] == category, 'Credit_Limit']
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
    sns.distplot(bucket_credit_limit, rug=True, rug_kws={"color": color},
                 kde_kws={"color": color, "lw": 3, "label": str(category)},
                 hist_kws={"histtype": "step", "linewidth": 3,
                           "alpha": .8, "color": color})
# Build the legend from the labelled KDE curves. Passing a bare list of labels
# assigns them positionally to the first artists on the axes, which are not
# one-per-bucket, so colours and labels would not correspond.
plt.legend(loc='upper right')

In [None]:
# Credit-limit summary per age bucket: mean/median/min/max plus the quartiles.
credit_by_age = data.groupby('Customer_Age_Categorized')['Credit_Limit']
a = credit_by_age.agg(['mean', 'median', 'min', 'max'])
# unstack() turns the (bucket, quantile) index into one column per quantile.
b = credit_by_age.quantile([0.25, 0.75]).unstack()
c = pd.concat([a, b], axis=1)
c

## Gender

In [None]:
# Number of customers per gender.
data['Gender'].value_counts()

In [None]:
# Pie chart of the gender split.
# Sizes are looked up by label so every wedge is guaranteed to match its
# label; positional [1]/[0] indexing on the label-indexed value_counts()
# result relied on a deprecated fallback and on the two orders coinciding.
labels = data['Gender'].dropna().unique()
gender_counts = data['Gender'].value_counts()
sizes = [gender_counts[label] for label in labels]
plt.figure(figsize = (10,10))
# The first wedge is pulled out slightly for emphasis.
plt.pie(sizes,labels=labels, explode=(0.08,0))

In [None]:
# Mean credit limit and number of customers per gender.
data.groupby('Gender')['Credit_Limit'].agg(['mean','count'])

Note: There are more female than male customers, but male customers have a higher average credit limit.

## Income_Category - Card_Category

In [None]:
# Number of customers per card category within each income category.
data.groupby('Income_Category')['Card_Category'].value_counts()

## Attrition_Flag

In [None]:
# Number of customers per attrition status.
data['Attrition_Flag'].value_counts()

In [None]:
# Bar chart of the number of customers per attrition status.
# value_counts() is computed once; labels and heights are taken from the same
# Series so they always line up. Positional [0]/[1] lookups on a
# label-indexed Series are deprecated in recent pandas.
attrition_counts = data['Attrition_Flag'].value_counts()
customer_type = attrition_counts.index
customer_type_values = attrition_counts.tolist()
plt.figure(figsize=(12,8))
plt.bar(customer_type,customer_type_values, color = 'maroon', width = .5)

In [None]:
# Split the table into customers who left and customers who stayed.
attrition_flag = data["Attrition_Flag"]
churn = data.loc[attrition_flag == 'Attrited Customer']
non_churn = data.loc[attrition_flag == 'Existing Customer']

In [None]:
# Preview the first rows of the attrited-customer subset.
churn.head()

In [None]:
# Per-gender profile of attrited customers: mean credit limit, mean age, head count.
churn_summary_spec = {
    "Credit_Limit": "mean",
    "Customer_Age": "mean",
    "Attrition_Flag": "count",
}
churn.groupby("Gender").agg(churn_summary_spec)

In [None]:
# Per-gender profile of existing customers: mean credit limit, mean age, head count.
non_churn_summary_spec = {
    "Credit_Limit": "mean",
    "Customer_Age": "mean",
    "Attrition_Flag": "count",
}
non_churn.groupby("Gender").agg(non_churn_summary_spec)

In [None]:
# Mean credit limit for every (attrition status, gender, card category) combination.
data.groupby(['Attrition_Flag',"Gender","Card_Category"]).agg({"Credit_Limit":"mean"})

In [None]:
# Attrition-status counts within each (gender, age bucket) group.
data.groupby(["Gender","Customer_Age_Categorized"])["Attrition_Flag"].value_counts()

In [None]:
# Column names again, now including the Customer_Age_Categorized column added earlier.
data.columns

In [None]:
# Treemap of the number of customers per age bucket.
# Sizes and labels are both taken from the same value_counts() result.
# Previously the labels came from unique() (first-appearance order) while the
# sizes came from value_counts() (sorted by count), so rectangles could be
# labelled with the wrong bucket.
age_bucket_counts = data.Customer_Age_Categorized.value_counts()
plt.figure(figsize = (12, 8))
squarify.plot(sizes = age_bucket_counts.values, alpha = 0.8,
              label = age_bucket_counts.index.astype(str))
plt.axis('off')
plt.show()

In [None]:
# Word cloud of education levels, sized by how many customers have each level.
plt.subplots(figsize=(15,22))
# Feed the per-level counts directly. Joining the raw values with "" fused
# neighbouring levels into meaningless tokens (e.g. "SchoolGraduate"), and
# plain text tokenisation would also split multi-word levels.
education_counts = data.Education_Level.value_counts().to_dict()
wordcloud = WordCloud(background_color = 'white',
                     width=512,
                     height=384).generate_from_frequencies(education_counts)

plt.imshow(wordcloud)
plt.axis('off')

In [None]:
# Sunburst of total credit limit broken down by attrition status -> gender -> card category.
# (The unrelated plotly demo dataset that used to be loaded here was never used.)
fig = px.sunburst(data, path=['Attrition_Flag', 'Gender', 'Card_Category'], values='Credit_Limit',title="Dont Forget to Click Chart to Examine Deeply ")
fig.show()

In [None]:
# Sunburst with wedges sized by the summed Customer_Age, broken down by
# attrition status -> gender -> card category.
# (The unrelated plotly demo dataset that used to be loaded here was never used.)
fig = px.sunburst(data, path=['Attrition_Flag', 'Gender', 'Card_Category'], values='Customer_Age',title="Dont Forget to Click Chart to Examine Deeply ")
fig.show()

In [None]:
# Correlation Matrix
# Restrict to numeric columns: the table still contains string and categorical
# columns at this point, and DataFrame.corr() raises on those in pandas >= 2.0.
numeric_data = data.select_dtypes(include='number')
f, ax = plt.subplots(figsize= [20,15])
sns.heatmap(numeric_data.corr(), annot=True, fmt=".2f", ax=ax, cmap = "Spectral" )
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

# Data Preprocessing

In [None]:
# Remove columns that must not reach the model.
# Avg_Open_To_Buy: presumably dropped because it is redundant with
# Credit_Limit in the correlation matrix -- TODO confirm.
# Naive_Bayes_Classifier_*: outputs of a previously trained attrition
# classifier. Dropping by prefix removes every such column; the original code
# removed only the "..._1" one, so a sibling "..._2" column (present in the
# published dataset -- confirm against data.columns) would leak the target.
# NOTE(review): an identifier column such as CLIENTNUM, if present, is still
# kept as a feature -- confirm whether that is intended.
leakage_cols = [col for col in data.columns if col.startswith('Naive_Bayes_Classifier_')]
# errors='ignore' keeps the cell re-runnable after the columns are already gone.
data.drop(columns=['Avg_Open_To_Buy', *leakage_cols], inplace=True, errors='ignore')

In [None]:
# Column dtypes after dropping the unused columns.
data.dtypes

In [None]:
# Distinct values of the target column before it is encoded.
data.Attrition_Flag.unique()

In [None]:
# Integer codes for the low-cardinality string columns (including the target).
gender_codes = {"M": 0, "F": 1}
marital_status_codes = {"Single": 0, "Married": 1, "Unknown": 2, "Divorced": 3}
attrition_codes = {"Existing Customer": 0, "Attrited Customer": 1}

cleanup_nums = {
    "Gender": gender_codes,
    "Marital_Status": marital_status_codes,
    "Attrition_Flag": attrition_codes,
}

# Column-wise replacement; values not listed in a mapping are left untouched.
data.replace(to_replace=cleanup_nums, inplace=True)

In [None]:
# Preview the table after the label encoding.
data.head()

In [None]:
# One-hot encode the remaining categorical columns; each source column maps to
# the prefix used for its indicator columns.
dummy_prefix_by_column = {
    "Education_Level": "education",
    "Income_Category": "income",
    "Card_Category": "card",
    "Customer_Age_Categorized": "age_cat",
}
data = pd.get_dummies(data, columns=list(dummy_prefix_by_column), prefix=dummy_prefix_by_column)

In [None]:
# Column dtypes after one-hot encoding.
data.dtypes

In [None]:
# One-hot indicator columns, kept aside so they are not passed through the scaler.
# They are selected by the prefixes given to pd.get_dummies rather than by a
# hard-coded list: the age-bucket names embed the pd.cut interval edges
# (e.g. "age_cat_(25.953, 35.4]") and would raise a KeyError as soon as the
# data, and therefore the bin edges or category levels, change.
dummy_prefixes = ("education_", "income_", "card_", "age_cat_")
dummy_columns = [col for col in data.columns if col.startswith(dummy_prefixes)]
categorical_data = data[dummy_columns]

In [None]:
# Target: the integer-encoded attrition flag.
y = data["Attrition_Flag"]
# Features to be scaled: everything except the target and the one-hot
# indicator columns. The indicators are identified by the prefixes given to
# pd.get_dummies instead of a hard-coded list, because the age-bucket names
# embed the pd.cut interval edges and break when the data changes.
dummy_prefixes = ("education_", "income_", "card_", "age_cat_")
dummy_columns = [col for col in data.columns if col.startswith(dummy_prefixes)]
X = data.drop(columns=["Attrition_Flag", *dummy_columns])
# Remember labels so the scaled array can be turned back into a DataFrame.
cols = X.columns
index = X.index

In [None]:
# Preview the feature columns that will be scaled.
X.head()

In [None]:
# Preview the target values.
y.head()

In [None]:
# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

In [None]:
# Scale the non-indicator features with RobustScaler and rebuild a DataFrame
# carrying the original column names and row index.
# NOTE(review): the scaler is fitted on the full table before the train/test
# split, so test-set statistics influence the transformation -- consider
# fitting on the training split only.
transformer = RobustScaler()
X_scaled = transformer.fit(X).transform(X)
X = pd.DataFrame(data=X_scaled, index=index, columns=cols)

In [None]:
# Re-attach the unscaled one-hot indicator columns next to the scaled features.
# NOTE(review): re-running this cell appends the indicator columns a second time.
feature_parts = [X, categorical_data]
X = pd.concat(feature_parts, axis="columns")

In [None]:
# Preview the final feature matrix (scaled features plus indicator columns).
X.head()

In [None]:
# Final shapes of the feature matrix and the target before splitting.
print("X.shape:",X.shape)
print("y.shape",y.shape)

# Train - Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# stratify=y keeps the attrited/existing ratio identical in both splits, so the
# reported metrics are not distorted by a lucky or unlucky draw of the
# (presumably much smaller) attrited class -- confirm with the value_counts above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Choose Base Model

In [None]:
# Candidate baseline classifiers as (display name, estimator) pairs, all with
# default hyper-parameters and a fixed seed where the estimator accepts one.
models = []
models.append(('Naive Bayes', GaussianNB()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier(random_state = 42)))
models.append(('Random Forest', RandomForestClassifier(random_state = 42)))
models.append(('SVM', SVC(gamma='auto', random_state = 42)))
# This is scikit-learn's GradientBoostingClassifier, not XGBoost; it was
# previously reported under the misleading name 'XGBoost'.
# NOTE(review): XGBClassifier and LGBMClassifier are imported but never evaluated.
models.append(('Gradient Boosting', GradientBoostingClassifier(random_state = 42)))
models.append(("CatBoost", CatBoostClassifier(random_state = 42, verbose = False)))

# evaluate each model in turn
# NOTE(review): these two lists are never filled by the evaluation loop.
results = []
names = []

In [None]:
# Fit every candidate on the training split and print its test-set accuracy.
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"{name} : {accuracy}")

In [None]:
# Refit a default KNN and print per-class precision / recall / F1 on the test split.
# The names follow the 0 = existing, 1 = attrited encoding of the target.
class_names = ["Existing Customer","Attrited Customer"]
clf = KNeighborsClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

In [None]:
# Confusion matrix for the most recent predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)