# Credit Card Churn Prediction

## Loading Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
# Global plot styling for the whole notebook: select matplotlib's 'classic'
# style sheet, then apply seaborn's default theme.
# NOTE(review): sns.set() runs second, so wherever the two overlap seaborn's
# settings presumably take precedence — confirm this is the intended look.
plt.style.use('classic')
sns.set()


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

## Loading The Data

In [None]:
# Read the bank-churners CSV into the working DataFrame used by every later cell.
csv_path = '../input/credit-card-customers/BankChurners.csv'
df = pd.read_csv(csv_path)

In [None]:
# Report (rows, columns), raise the column display limit to 23, then preview
# the first five rows (head() defaults to n=5).
print(df.shape)
pd.options.display.max_columns = 23
df.head()

In [None]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

## Exploratory Data Analysis

In [None]:

# Overview of customer demographics and account types on a 3x2 grid of panels.
fig = plt.figure(constrained_layout=False, figsize=(17, 20))
spec = gridspec.GridSpec(ncols=2, nrows=3, figure=fig)
ax1 = fig.add_subplot(spec[0, 0])
ax2 = fig.add_subplot(spec[0, 1])
ax3 = fig.add_subplot(spec[1, 0])
ax4 = fig.add_subplot(spec[1, 1])
ax5 = fig.add_subplot(spec[2, 0])
ax6 = fig.add_subplot(spec[2, 1])

# Derive the wedge labels from the counts themselves: value_counts() orders
# its result by frequency, so a fixed ['Female', 'Male'] list is only correct
# when 'F' is the more frequent code. Unmapped codes are shown as-is.
gender_names = {'F': 'Female', 'M': 'Male'}
gender_counts = df['Gender'].value_counts()
labels = [gender_names.get(code, str(code)) for code in gender_counts.index]
ax1.pie(gender_counts, labels=labels, autopct='%.1f%%',
        shadow=True, wedgeprops={'edgecolor': 'black'})
ax1.set_title('Proportion of Gender')

# Compute the counts once so wedge sizes and labels come from the same Series.
education_counts = df['Education_Level'].value_counts()
labels = education_counts.index.tolist()
ax2.pie(education_counts, autopct='%.1f%%', labels=labels,
        shadow=True, wedgeprops={'edgecolor': 'black'})
ax2.set_title('Proportion of Education Level')

sns.countplot(ax=ax3, x=df['Marital_Status'])
ax3.set_title('Marital Status of Customers')

sns.countplot(ax=ax4, x=df['Income_Category'])
ax4.set_title('Income Category of Customers')

# Card category split by income bracket (horizontal bars).
sns.countplot(y='Card_Category', data=df, hue='Income_Category', ax=ax5)
ax5.set_title('Card Category based on Income Status')

# Class balance of the prediction target.
sns.countplot(x='Attrition_Flag', data=df, ax=ax6)
ax6.set_title('type of customers')

plt.tight_layout()
plt.show()

In [None]:
# Customer age: notched box plot on top, histogram underneath.
fig = plt.figure(figsize=(8, 5), constrained_layout=True)
spec = gridspec.GridSpec(nrows=2, ncols=1, figure=fig)
ax1 = fig.add_subplot(spec[0, 0])
ax2 = fig.add_subplot(spec[1, 0])

sns.boxplot(x='Customer_Age', data=df, color='red', notch=True, linewidth=1, ax=ax1)
# Title spelling corrected ('Distribustion' -> 'Distribution'); the x axis is
# left blank here because the histogram below carries the axis label.
ax1.set(title='Distribution of Customer Age', xticks=[], xlabel='')

ax2.hist(df['Customer_Age'], color='red', alpha=0.4, edgecolor='black')
ax2.set_xlabel('Customer Age')
# Render explicitly, as the other plotting cells do, instead of relying on the
# cell's last expression.
plt.show()


In [None]:
# Credit limit: notched box plot on top, histogram underneath.
fig = plt.figure(figsize=(8, 5), constrained_layout=True)
spec = gridspec.GridSpec(ncols=1, nrows=2, figure=fig)
ax1 = fig.add_subplot(spec[0, 0])
ax2 = fig.add_subplot(spec[1, 0])

sns.boxplot(x='Credit_Limit', data=df, color='blue', notch=True, linewidth=1, ax=ax1)
ax1.set(title='Distribution of Credit Limit', xlabel='', xticks=[])

credit_limit = df['Credit_Limit']
bins = [2500, 5000, 8000, 10000, 12000, 15000, 18000, 20000, 22000,
        25000, 28000, 30000, 33000, 35000]
# hist() ignores observations outside the outermost edges, so with fixed edges
# any limit below 2500 or above 35000 would silently vanish from the plot.
# Widen the outer bins to the observed range when needed.
if credit_limit.min() < bins[0]:
    bins.insert(0, credit_limit.min())
if credit_limit.max() > bins[-1]:
    bins.append(credit_limit.max())
ax2.hist(credit_limit, bins=bins, color='blue', alpha=0.6, edgecolor='black')
ax2.set_xlabel('Credit Limit')

In [None]:
# Inactivity and tenure: box plots on the top row, matching histograms below.
fig = plt.figure(figsize=(15, 5), constrained_layout=True)
spec = gridspec.GridSpec(nrows=2, ncols=2, figure=fig)

ax1 = fig.add_subplot(spec[0, 0])
ax2 = fig.add_subplot(spec[0, 1])
ax3 = fig.add_subplot(spec[1, 0])
ax4 = fig.add_subplot(spec[1, 1])

sns.boxplot(x='Months_Inactive_12_mon', data=df, color='orange', notch=True, linewidth=0.7, ax=ax1)
ax1.set(title='Distribution of number of inactive months in the last 12 months',
        xlabel='', xticks=[])
# One bar per month count, centred on the integer. The former edges
# [1, 2, ..., 6] put 5 and 6 into the same (closed) last bin and left out 0.
# Assumes the column holds whole-number month counts — TODO confirm.
inactive_months = df['Months_Inactive_12_mon']
bins = np.arange(inactive_months.min(), inactive_months.max() + 2) - 0.5
ax3.hist(inactive_months, bins=bins, color='orange', alpha=0.5, edgecolor='black')

sns.boxplot(x='Months_on_book', data=df, color='yellow', notch=True, linewidth=0.8, ax=ax2)
ax2.set(title='Distribution of months the customer is part of the bank',
        xlabel='', xticks=[])
ax4.hist(df['Months_on_book'], color='yellow', alpha=0.6, edgecolor='black')
plt.show()

# Total transaction amount gets its own full-width figure.
fig2 = plt.figure(figsize=(15, 5), constrained_layout=True)
spec2 = gridspec.GridSpec(nrows=2, ncols=1, figure=fig2)

ax5 = fig2.add_subplot(spec2[0, 0])
ax6 = fig2.add_subplot(spec2[1, 0])

sns.boxplot(x='Total_Trans_Amt', data=df, color='green', notch=True, linewidth=1, ax=ax5)
ax5.set(title='Distribution of Total Transaction Amount', xlabel='', xticks=[])

trans_amt = df['Total_Trans_Amt']
bins = [3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000,
        13000, 14000, 15000, 18000, 20000]
# hist() discards values outside the outermost edges; widen the outer bins to
# the observed range so amounts below 3000 or above 20000 are still counted.
if trans_amt.min() < bins[0]:
    bins.insert(0, trans_amt.min())
if trans_amt.max() > bins[-1]:
    bins.append(trans_amt.max())
ax6.hist(trans_amt, bins=bins, color='green', alpha=0.6, edgecolor='black')
plt.show()

## Data Preprocessing

In [None]:
# Encode the binary columns, turn unwanted categories into NaN, drop the
# affected rows, and trim the trailing columns.
#
# Results are assigned back explicitly: `df[col].replace(..., inplace=True)`
# operates on an intermediate Series and is not guaranteed to update `df`
# (it is a no-op under pandas Copy-on-Write).

# Target: 1 = attrited (churned) customer, 0 = existing customer.
# astype(int) fails loudly on any unexpected label instead of leaving a
# mixed-type column behind.
df['Attrition_Flag'] = df['Attrition_Flag'].replace(
    {'Existing Customer': 0, 'Attrited Customer': 1}).astype(int)

df['Gender'] = df['Gender'].replace({'M': 0, 'F': 1}).astype(int)

# 'Unknown' is treated as missing in every column.
df = df.replace({'Unknown': np.nan})
# Mark Platinum card holders as missing so dropna() removes them as well.
# This call previously discarded its result, so it had no effect at all.
df['Card_Category'] = df['Card_Category'].replace({'Platinum': np.nan})

df = df.dropna()
# Drop the last two columns by position.
# NOTE(review): presumably auxiliary columns appended to the CSV — confirm,
# and prefer dropping them by name if the layout can change.
df = df.iloc[:, :-2]

In [None]:

# Assemble the modelling frame: the retained columns first, followed by one
# indicator block per categorical column (unprefixed, in the order listed).
categorical_cols = ['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
unused_cols = ['CLIENTNUM', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1']

kept_part = df.drop(columns=unused_cols + categorical_cols)
dummy_parts = [pd.get_dummies(df[col]) for col in categorical_cols]

# Rows were removed during preprocessing, so renumber the index from zero.
df2 = pd.concat([kept_part, *dummy_parts], axis=1).reset_index(drop=True)

In [None]:
# Show five randomly chosen rows of the modelling frame.
df2.sample(n=5)

In [None]:
# List the column labels of the modelling frame (retained columns plus the
# indicator columns created by get_dummies).
df2.columns

In [None]:
# Pearson correlation between every pair of columns, drawn as an annotated
# heatmap on an extra-large canvas so the cell values stay legible.
corr_matrix = df2.corr(method='pearson')
plt.figure(figsize=(40, 20))
sns.heatmap(corr_matrix, annot=True)

## Modeling

In [None]:
# Separate the target from the features, then hold out 20% of the rows for
# testing (fixed seed so the split is repeatable).
target_col = 'Attrition_Flag'
y = df2[target_col]
x = df2.drop(columns=[target_col])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

### Logistic Regression

In [None]:
# Logistic regression with weak regularisation (C=500) and a generous
# iteration cap; report train/test accuracy and the per-class metrics.
log = LogisticRegression(max_iter=50000, C=500)
log.fit(x_train, y_train)
yhat1 = log.predict(x_test)

train_acc = log.score(x_train, y_train)
test_acc = log.score(x_test, y_test)
print('Logistic Regression :\n\n\t', f'The Training model accuracy :{train_acc}\n\t',
      f'The Test model accuracy: {test_acc}')
print(classification_report(y_test, yhat1))

### Decision Tree Classifier

In [None]:
# Sweep max_depth from 1 to 9 and plot the resulting accuracy per depth.
#
# The depth axis lives in its own variable: it used to be assigned to `x`,
# which overwrote the feature matrix created for the train/test split.
# NOTE(review): accuracy is measured on the test split, so choosing a depth
# from this curve tunes on test data — consider cross-validating on the
# training split instead (GridSearchCV is already imported).
depths = list(range(1, 10))
s = []
for depth in depths:
    # Fixed seed so the curve is reproducible between runs.
    churn_tree = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=42)
    churn_tree.fit(x_train, y_train)
    s.append(churn_tree.score(x_test, y_test))

fig = plt.figure(figsize=(8, 4))
plt.plot(depths, s, '--o', markersize=22, color='#e74c3c')
plt.xlabel('max depth of tree')
plt.ylabel('Accuracy')
plt.xticks(depths)
plt.title('Churn Tree Accuracy')
plt.show()

In [None]:
# Decision tree at max_depth=7: confusion matrix, accuracies, class report.
# random_state is pinned (as for the other seeded models in this notebook) so
# the fitted tree and its reported scores are reproducible across runs.
churn_tree = DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=42)
churn_tree.fit(x_train, y_train)
yhat2 = churn_tree.predict(x_test)

cf_mat = confusion_matrix(y_test, yhat2)
plt.figure(figsize=(5, 4))
# fmt='g' prints the counts as plain numbers rather than in scientific notation.
sns.heatmap(cf_mat, annot=True, fmt='g')
plt.show()

train_acc = churn_tree.score(x_train, y_train)
test_acc = churn_tree.score(x_test, y_test)
print('Decision Tree Classifier :\n\n\t', f'The Training model accuracy :{train_acc}\n\t',
      f'The Test model accuracy: {test_acc}')
print(classification_report(y_test, yhat2))

### Random Forest Classifier

In [None]:
# Sweep max_depth from 4 to 24 for a 700-tree forest and plot accuracy per depth.
#
# The depth axis lives in its own variable: it used to be assigned to `x`,
# which overwrote the feature matrix created for the train/test split.
# NOTE(review): accuracy is measured on the test split, so choosing a depth
# from this curve tunes on test data — consider cross-validating on the
# training split instead (GridSearchCV is already imported).
depths = np.arange(4, 25)
s = []
for depth in depths:
    # n_jobs=-1 only parallelises tree building; with a fixed random_state the
    # fitted forest is the same as in a single-threaded run.
    churn_forest = RandomForestClassifier(criterion='gini', n_estimators=700,
                                          max_depth=int(depth), random_state=42, n_jobs=-1)
    churn_forest.fit(x_train, y_train)
    s.append(churn_forest.score(x_test, y_test))

plt.figure(figsize=(13, 7))
plt.plot(depths, s, '--o', markersize=15, color='blue')
plt.xlabel('max depth of forest')
plt.ylabel('Accuracy')
plt.xticks(depths)
plt.title('Churn forest Accuracy')
plt.show()

In [None]:
# 700-tree random forest at max_depth=20: confusion matrix, accuracies,
# per-class report.
churn_forest = RandomForestClassifier(n_estimators=700, max_depth=20,
                                      criterion='gini', random_state=42)
churn_forest.fit(x_train, y_train)
yhat3 = churn_forest.predict(x_test)

cf_mat = confusion_matrix(y_test, yhat3)
plt.figure(figsize=(5, 4))
sns.heatmap(cf_mat, annot=True, fmt='g')
plt.show()

train_acc = churn_forest.score(x_train, y_train)
test_acc = churn_forest.score(x_test, y_test)
print('Random Forest Classifier :\n\n\t', f'The Training model accuracy :{train_acc}\n\t',
      f'The Test model accuracy: {test_acc}')
print(classification_report(y_test, yhat3))

### XGBoost Classifier

In [None]:
# Gradient-boosted trees via XGBoost: confusion matrix, accuracies, class report.
#
# eval_metric is configured on the estimator: passing it to fit() was
# deprecated and later removed from the scikit-learn wrapper, where it raises
# a TypeError. No eval_set is supplied, so the metric does not influence
# training here.
xgb = XGBClassifier(learning_rate=0.1, n_estimators=700, max_depth=13,
                    min_child_weight=3, gamma=0.3, subsample=0.6, colsample_bytree=1.0,
                    objective='binary:logistic', nthread=4, scale_pos_weight=1,
                    eval_metric='auc', random_state=42)
xgb.fit(x_train, y_train)
yhat4 = xgb.predict(x_test)

plt.figure(figsize=(5, 4))
cf_mat = confusion_matrix(y_test, yhat4)
sns.heatmap(cf_mat, annot=True, fmt='g')
plt.show()

train_acc = xgb.score(x_train, y_train)
test_acc = xgb.score(x_test, y_test)
print('XGBoost Classifier :\n\n\t', f'The Training model accuracy :{train_acc}\n\t',
      f'The Test model accuracy: {test_acc}\n')
print(classification_report(y_test, yhat4))

### AdaBoost Classifier

In [None]:
# AdaBoost with 500 estimators: confusion matrix, accuracies, per-class report.
abc = AdaBoostClassifier(random_state=42, n_estimators=500)
abc.fit(x_train, y_train)
yhat5 = abc.predict(x_test)

cf_mat = confusion_matrix(y_test, yhat5)
plt.figure(figsize=(5, 4))
sns.heatmap(cf_mat, annot=True, fmt='g')
plt.show()

train_acc = abc.score(x_train, y_train)
test_acc = abc.score(x_test, y_test)
print('AdaBoost Classifier :\n\n\t', f'The Training model accuracy :{train_acc}\n\t',
      f'The Test model accuracy: {test_acc}\n')
print(classification_report(y_test, yhat5))

### KNeighbors Classifier

In [None]:
# k-nearest neighbours (k=10): confusion matrix, accuracies, per-class report.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# KNN classifies by distance, and the feature matrix mixes raw amounts (e.g.
# Credit_Limit) with 0/1 indicator columns, so unscaled distances are driven
# almost entirely by the large-valued columns. Standardise inside a pipeline:
# the scaler is fitted on the training split only and re-applied at predict time.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))
knn.fit(x_train, y_train)
yhat6 = knn.predict(x_test)

plt.figure(figsize=(5, 4))
cf_mat = confusion_matrix(y_test, yhat6)
sns.heatmap(cf_mat, annot=True, fmt='g')
plt.show()

train_acc = knn.score(x_train, y_train)
test_acc = knn.score(x_test, y_test)
print('KNeighbors Classifier :\n\n\t', f'The Training model accuracy :{train_acc}\n\t',
      f'The Test model accuracy: {test_acc}\n')
print(classification_report(y_test, yhat6))

### Gradient Boosting Classifier

In [None]:
# Gradient boosting with default hyper-parameters: confusion matrix,
# accuracies, per-class report.
# random_state is pinned (as for the other seeded ensembles in this notebook)
# so the reported scores are reproducible across runs.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(x_train, y_train)
yhat7 = gbc.predict(x_test)

plt.figure(figsize=(5, 4))
cf_mat = confusion_matrix(y_test, yhat7)
sns.heatmap(cf_mat, annot=True, fmt='g')
plt.show()

train_acc = gbc.score(x_train, y_train)
test_acc = gbc.score(x_test, y_test)
print('Gradient Boosting Classifier :\n\n\t', f'The Training model accuracy :{train_acc}\n\t',
      f'The Test model accuracy: {test_acc}\n')
print(classification_report(y_test, yhat7))