In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex

# Load and View the Data

In [None]:
# Read the BankChurners CSV from the relative input directory into `df`
df = pd.read_csv("../input/credit-card-customers/BankChurners.csv")

In [None]:
# (rows, columns) of the frame as loaded
df.shape

In [None]:
# Column names, dtypes and non-null counts of the frame as loaded
df.info()

In [None]:
# Drop the last 2 columns.
# NOTE: the selection is positional, so re-running this cell removes two more columns each time.
df = df.drop(columns=df.columns[-2:])

In [None]:
# Confirm the column count after dropping the two trailing columns
df.shape

In [None]:
# Re-check dtypes and non-null counts on the trimmed frame
df.info()

In [None]:
# Preview the first rows of the trimmed frame
df.head()

In [None]:
# Pie chart of the Attrition_Flag value shares
ex.pie(
    data_frame=df,
    names='Attrition_Flag',
    title='Proportion of Existing vs Attrited Customers',
)

# Exploratory Data Analysis

In [None]:
# Distribution of independent variables: one subplot per feature, stacked vertically.
# Columns in `count_cols` get a count plot; every other column gets a horizontal box plot.
feature_cols = [
    'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level', 'Marital_Status',
    'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count',
    'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
]
count_cols = {
    'Gender', 'Dependent_count', 'Education_Level',
    'Marital_Status', 'Income_Category', 'Card_Category',
}

fig, ax = plt.subplots(len(feature_cols), 1, figsize=(8, 85))
for axis, col in zip(ax, feature_cols):
    plot_fn = sns.countplot if col in count_cols else sns.boxplot
    # Pass data= and x= by keyword: a Series given positionally is treated as `data`
    # (not `x`) by newer seaborn releases, which changes the orientation of the plot.
    plot_fn(data=df, x=col, ax=axis)

In [None]:
# Distribution of variables, Attrited vs Existing Customer: one subplot per feature.
# Columns in `count_cols` get a count plot split by Attrition_Flag (hue); every other
# column gets a box plot per Attrition_Flag group.
feature_cols = [
    'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level', 'Marital_Status',
    'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count',
    'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
]
count_cols = {
    'Gender', 'Dependent_count', 'Education_Level',
    'Marital_Status', 'Income_Category', 'Card_Category',
}

fig, ax = plt.subplots(len(feature_cols), 1, figsize=(11, 90))
for axis, col in zip(ax, feature_cols):
    if col in count_cols:
        # Keyword data=/x=/hue= by column name: a Series given positionally is treated
        # as `data` (not `x`) by newer seaborn releases.
        sns.countplot(data=df, x=col, hue='Attrition_Flag', ax=axis)
    else:
        sns.boxplot(data=df, x='Attrition_Flag', y=col, ax=axis)

# Preprocessing

In [None]:
# One Hot Encoding of Categorical Variables

In [None]:
# Work on an independent copy: a plain `df3 = df` only binds a second name to the same
# object, so the column assignments in the encoding cell would also overwrite `df`.
df3 = df.copy()
df3.head()

In [None]:
# Encode categorical variables.
# Attrition_Flag and Gender are binary-coded, Income_Category gets a custom ordinal code,
# and only Education_Level, Marital_Status and Card_Category are one-hot encoded.
attrition_codes = {'Attrited Customer': 1, 'Existing Customer': 0}
gender_codes = {'F': 1, 'M': 0}
# Custom coding for Income Category ('Unknown' shares code 1 with 'Less than $40K')
income_codes = {'Less than $40K': 1, '$40K - $60K': 2, '$60K - $80K': 3, '$80K - $120K': 4,
                '$120K +': 5, 'Unknown': 1}

# .assign() builds a new frame instead of writing columns into df3 in place (df3 may still
# be the very same object as df). .map() + astype('int64') guarantees integer columns --
# Series.replace leaves an object column on pandas versions that no longer downcast --
# and raises immediately if a category is missing from its mapping.
df3 = df3.assign(
    Attrition_Flag=df3['Attrition_Flag'].map(attrition_codes).astype('int64'),
    Gender=df3['Gender'].map(gender_codes).astype('int64'),
    Income_Category=df3['Income_Category'].map(income_codes).astype('int64'),
)

# One-hot encode the remaining categoricals; the 'Unknown' indicator column is dropped
# for Education_Level and Marital_Status.
one_hot_cols = ['Education_Level', 'Marital_Status', 'Card_Category']
dummy_frames = [
    pd.get_dummies(df3['Education_Level']).drop(columns=['Unknown']),
    pd.get_dummies(df3['Marital_Status']).drop(columns=['Unknown']),
    pd.get_dummies(df3['Card_Category']),
]
# Source columns are removed and the indicator columns appended, in the order listed above
df3 = pd.concat([df3.drop(columns=one_hot_cols)] + dummy_frames, axis=1)


In [None]:
# Client Number not useful, dropping column.
# errors='ignore' keeps the cell re-runnable: a second execution is a no-op, not a KeyError.
df3 = df3.drop(columns=['CLIENTNUM'], errors='ignore')

In [None]:
# Preview the first rows of df3 after the encoding and column drops
df3.head()

In [None]:
# Frequency of each Income_Category code after the custom encoding
df3['Income_Category'].value_counts()

In [None]:
# Pearson correlation between all columns of df3, drawn on an explicit 15x15 axes
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df3.corr(method='pearson'), annot=False, ax=ax)
ax.set_title("Correlation Heatmap")

In [None]:
# Importing libraries for building model and metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Assigning X,y variables
X = df3.drop(['Attrition_Flag'], axis=1)
y = df3['Attrition_Flag']

In [None]:
# Feature standardizer; used as the first step of the Random Forest pipeline further down
scaler = StandardScaler()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

# Random Forest Classifier

In [None]:
# Create the model and pipeline, tuned with a randomized hyperparameter search

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest (same seed as the split and the search) so repeated runs of the notebook
# give the same model; an unseeded forest changes its scores on every execution.
rf = RandomForestClassifier(random_state=9)
# Pipeline: standardize the features, then fit the forest
rf_pipe = Pipeline([('scaler', scaler), ('rf', rf)])
# Candidate values sampled by the hyperparameter search ('rf__' targets the 'rf' step)
rf_param = {
    'rf__n_estimators': [100, 200, 300, 500, 700, 1000],
    'rf__max_features': [2, 3, 5, 7, 9],
}

In [None]:
# Randomized search over rf_param: 10 sampled settings, 7-fold CV, scored on accuracy
rf_grid = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=rf_param,
    n_iter=10,
    scoring="accuracy",
    cv=7,
    random_state=9,
)
rf_grid.fit(X_train, y_train)

In [None]:
# Collect the search's cv_results_ into a DataFrame for inspection
rf_grid_results = pd.DataFrame(rf_grid.cv_results_)

In [None]:
# Display the cross-validation results table
rf_grid_results

In [None]:
# Best cross-validated score found by the search (scoring="accuracy")
print('RF Best Score:', rf_grid.best_score_)

In [None]:
# Hyperparameter setting that produced the best cross-validated score
print('RF Best Parameters:', rf_grid.best_params_)

In [None]:
# Predict on the held-out test split with the fitted search object and report accuracy
rf_pred = rf_grid.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)
print('Accuracy Score: ', rf_acc)

In [None]:
# Confusion matrix on the test split: rows are actual classes, columns are predicted classes.
# labels=[0, 1] pins the order (0 = existing, 1 = attrited, per the Attrition_Flag encoding)
# and guarantees a 2x2 result even if one class is absent from the predictions.
rf_conf = confusion_matrix(y_test, rf_pred, labels=[0, 1])
# Flat label lists give a plain Index; the nested lists used before built a MultiIndex.
rf_conf = pd.DataFrame(
    data=rf_conf,
    columns=['Predicted:Existing', 'Predicted:Churned'],
    index=['Actual: Existing', 'Actual: Churned'],
)
fig, ax = plt.subplots(figsize=(9, 9))
# fmt='g' prints the raw counts without scientific notation
sns.heatmap(rf_conf, annot=True, fmt='g', ax=ax)