In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import chi2, f_classif
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency,f_oneway
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


%matplotlib inline

In [None]:
# Load the BankChurners CSV from the Kaggle input directory into a DataFrame.
dataset = pd.read_csv("/kaggle/input/credit-card-customers/BankChurners.csv")

In [None]:
# Preview the first rows of the raw data.
dataset.head()

Drop the columns `CLIENTNUM`, `Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1` and `Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2`.

In [None]:
# CLIENTNUM (presumably a client identifier) and the two pre-computed
# Naive Bayes classifier columns are removed before any analysis.
columns_to_drop = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
dataset = dataset.drop(columns=columns_to_drop)

In [None]:
# Report the size of the dataset after the column drop.
n_rows, n_cols = dataset.shape
print('Rows : %d, Columns : %d' % (n_rows, n_cols))

In [None]:
# Summary statistics of the dataset's columns.
dataset.describe()

In [None]:
# Column names, dtypes and non-null counts.
dataset.info()

# **Exploratory Data Analysis**

**Numerical Variables**

In [None]:
# Every column whose dtype is not object ('O') is treated as numerical.
numerical_var = dataset.columns[dataset.dtypes != 'O'].tolist()
numerical_var

**Discrete variables**

In [None]:
# Numerical columns with fewer than 25 distinct values are treated as discrete.
unique_counts = dataset[numerical_var].nunique()
discrete_var = unique_counts[unique_counts < 25].index.tolist()
discrete_var

**Continuous variables**

In [None]:
# The remaining numerical columns (those not flagged as discrete) are continuous.
discrete_lookup = set(discrete_var)
continuous_var = [name for name in numerical_var if name not in discrete_lookup]
continuous_var

**Categorical variables**

In [None]:
# Columns stored with the object ('O') dtype are treated as categorical.
categorical_var = dataset.columns[dataset.dtypes == 'O'].tolist()
categorical_var

In [None]:
#### Share of each Attrition_Flag value, shown as a pie chart

churn_counts = dataset['Attrition_Flag'].value_counts()

plt.figure(figsize=(15,5))
plt.pie(churn_counts, labels=churn_counts.index, autopct='%.2f%%')
plt.title("Presentase Customer Churn", fontsize=15)

In [None]:
# Bars are ordered from the most to the least frequent income category.
income_order = dataset['Income_Category'].value_counts(ascending=False).index

# Short tick labels keyed by the raw category value, so that each bar always
# gets the label that belongs to it regardless of the frequency order.
# (Previously the labels were a fixed list that only matched one specific order.)
short_income_labels = {
    'Less than $40K': '< 40',
    '$40K - $60K': '40-60',
    '$60K - $80K': '60-80',
    '$80K - $120K': '80-120',
    '$120K +': '> 120',
    'Unknown': 'unknown',  # raw value assumed to be 'Unknown'; otherwise the raw label is shown
}

plt.figure(figsize=(15,5))
g = sns.countplot(data=dataset, x='Income_Category', order=income_order)
g.set_xticklabels([short_income_labels.get(category, category) for category in income_order])
plt.xlabel('Income Category (in thousands of dollar)',fontsize=15)
plt.ylabel('Total',fontsize=15)

In [None]:
# Count of each Attrition_Flag value within every income category.
dataset.groupby('Income_Category')['Attrition_Flag'].value_counts()

In [None]:
# Number of rows per income category, largest first.
dataset['Income_Category'].value_counts(ascending=False)

In [None]:
# Row counts per income category, split by Attrition_Flag.
plt.figure(figsize=(15,5))
sns.countplot(data=dataset, x='Income_Category',hue='Attrition_Flag')

In [None]:
# Pairwise plots of the dataset's variables, coloured by Attrition_Flag.
sns.pairplot(dataset,hue='Attrition_Flag')

In [None]:
# Credit_Limit against Avg_Open_To_Buy, coloured by Attrition_Flag.
sns.scatterplot(data=dataset, x='Avg_Open_To_Buy', y='Credit_Limit',hue='Attrition_Flag')

In [None]:
# Customer_Age against Months_on_book, coloured by Attrition_Flag.
sns.scatterplot(data=dataset, x='Months_on_book', y='Customer_Age',hue='Attrition_Flag')

In [None]:
# Total_Ct_Chng_Q4_Q1 against Total_Amt_Chng_Q4_Q1, coloured by Attrition_Flag.
sns.scatterplot(data=dataset, x='Total_Amt_Chng_Q4_Q1', y='Total_Ct_Chng_Q4_Q1',hue='Attrition_Flag')

In [None]:
# List the column names (handy when picking columns for the cells below).
dataset.columns

In [None]:
# Correlation matrix for each of the three column pairs plotted above,
# separated by blank lines.
column_pairs = [
    ['Avg_Open_To_Buy', 'Credit_Limit'],
    ['Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1'],
    ['Months_on_book', 'Customer_Age'],
]
for position, pair in enumerate(column_pairs):
    if position > 0:
        print('\n')
    print(dataset[pair].corr())

In [None]:
# Box plot of Avg_Open_To_Buy.
sns.boxplot(data=dataset, x='Avg_Open_To_Buy')

**Check outliers**

In [None]:
# Quartiles of Avg_Open_To_Buy and the 1.5 * IQR boundaries derived from them.
Q1, Q3 = dataset['Avg_Open_To_Buy'].quantile([0.25, 0.75])
IQR = Q3 - Q1
whisker = 1.5 * IQR
lower, upper = Q1 - whisker, Q3 + whisker

for label, value in [
    ('Q1 : ', Q1),
    ('Q3 : ', Q3),
    ('lower boundary: ', lower),
    ('upper boundary: ', upper),
]:
    print(label, value)

In [None]:
# Summary statistics again, for comparison with the boundaries printed above.
dataset.describe()

# **Preprocessing**

**Feature Engineering**

In [None]:
# Work on a copy so that `dataset` keeps the un-encoded data used by the EDA cells.
df = dataset.copy()

In [None]:
# Preview the first rows of the working copy.
df.head()

**Check missing values**

In [None]:
# Number of null values per column.
df.isnull().sum()

**Change categorical variables into numerical**

In [None]:
# Two-valued columns: drop_first=True keeps a single indicator column.
df['Attrition_Flag_code'] = pd.get_dummies(df['Attrition_Flag'], drop_first=True)
df['Gender_code'] = pd.get_dummies(df['Gender'], drop_first=True)

# Ordinal codes for education; any value not listed is encoded as 0.
education_codes = {
    'Uneducated': 1,
    'High School': 2,
    'College': 3,
    'Graduate': 4,
    'Post-Graduate': 5,
    'Doctorate': 6,
}
df['Education_Level_code'] = df['Education_Level'].map(education_codes).fillna(0).astype('int64')

# Marital status is one-hot encoded in a separate frame, next to the target,
# so that each dummy can be tested against the target later on.
marital_dummies = pd.get_dummies(df[['Marital_Status', 'Attrition_Flag_code']], columns=['Marital_Status'])

# Ordinal codes for income; any value not listed is encoded as 0.
income_codes = {
    'Less than $40K': 1,
    '$40K - $60K': 2,
    '$60K - $80K': 3,
    '$80K - $120K': 4,
    '$120K +': 5,
}
df['Income_Category_code'] = df['Income_Category'].map(income_codes).fillna(0).astype('int64')

# Ordinal codes for the card tier; any value not listed is encoded as 3.
card_codes = {'Blue': 0, 'Silver': 1, 'Gold': 2}
df['Card_Category_code'] = df['Card_Category'].map(card_codes).fillna(3).astype('int64')


In [None]:
# Remove the original text columns now that their encoded counterparts exist.
encoded_sources = [
    'Attrition_Flag',
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]
df = df.drop(columns=encoded_sources)

In [None]:
# Columns remaining after the encoding step.
df.columns

In [None]:
# Names of the encoded columns: each categorical column name with a '_code' suffix.
df_categorical = [name + '_code' for name in categorical_var]

In [None]:
# Show the list of encoded column names.
df_categorical

**Feature Selection**

**1. categorical feature selection**

In [None]:
# 'Marital_Status_code' does not exist as a column (marital status lives in the
# one-hot frame `marital_dummies`), so it is excluded from this list.
# Filtering instead of list.remove() keeps the cell safe to re-run:
# remove() raises ValueError once the entry is already gone.
df_categorical = [name for name in df_categorical if name != 'Marital_Status_code']

In [None]:
# Show the list again after removing 'Marital_Status_code'.
df_categorical

In [None]:
## chi square analysis
# Test each encoded categorical feature for independence from the target.
# A p-value <= 0.05 is recorded as 'Reject Null Hypothesis'.

# Exclude the target by name rather than by position in the list.
tested_features = [name for name in df_categorical if name != 'Attrition_Flag_code']

# Run the test exactly once per feature; index [1] of the result is the p-value.
p_values = [
    chi2_contingency(pd.crosstab(df['Attrition_Flag_code'], df[feature]))[1]
    for feature in tested_features
]
chi2_check = [
    'Reject Null Hypothesis' if p_value <= 0.05 else 'Accept Null Hypothesis'
    for p_value in p_values
]

# Building the frame from a dict keeps 'P-value' numeric instead of object dtype.
data = pd.DataFrame({
    'Column': tested_features,
    'Hypothesis': chi2_check,
    'P-value': p_values,
})


In [None]:
# Same chi-square test, applied to every marital-status dummy column.
marital = list(marital_dummies.columns)

# Exclude the target by name rather than by position in the column list.
marital_features = [name for name in marital if name != 'Attrition_Flag_code']

# Run the test exactly once per dummy; index [1] of the result is the p-value.
p_values2 = [
    chi2_contingency(
        pd.crosstab(marital_dummies['Attrition_Flag_code'], marital_dummies[feature])
    )[1]
    for feature in marital_features
]
chi2_check2 = [
    'Reject Null Hypothesis' if p_value <= 0.05 else 'Accept Null Hypothesis'
    for p_value in p_values2
]

# Building the frame from a dict keeps 'P-value' numeric instead of object dtype.
data2 = pd.DataFrame({
    'Column': marital_features,
    'Hypothesis': chi2_check2,
    'P-value': p_values2,
})


In [None]:
# Stack both chi-square result tables into one, with a fresh 0..n-1 index.
data_hypothesis = pd.concat([data,data2],ignore_index= True)

In [None]:
# Show the combined chi-square results.
data_hypothesis

In [None]:
# Keep only the features whose test rejected the null hypothesis (p-value <= 0.05).
data_hypothesis[data_hypothesis['Hypothesis'] == 'Reject Null Hypothesis']

In [None]:
# Current columns of the working frame, before the categorical feature drop.
df.columns

In [None]:
# Drop the encoded features that showed no relationship with Attrition Flag.
unrelated_features = ['Education_Level_code', 'Card_Category_code']
df = df.drop(columns=unrelated_features)

In [None]:
# Add the 'Married' dummy to the working frame as its last column.
# Plain column assignment (aligned on the index) gives the same result as the
# former pd.concat(..., axis=1) on the first run, but re-running the cell no
# longer appends a second, duplicate 'Marital_Status_Married' column.
df['Marital_Status_Married'] = marital_dummies['Marital_Status_Married']

In [None]:
# Check dtypes and non-null counts of the selected columns.
df.info()

In [None]:
# Store every encoded column as int64.
for column in (
    'Attrition_Flag_code',
    'Gender_code',
    'Marital_Status_Married',
    'Income_Category_code',
):
    df[column] = df[column].astype('int64')

**2. Numerical feature selection**

In [None]:
# Numerical columns that will be tested against the target below.
numerical_var

In [None]:
# ANOVA F-test of every numerical feature against the target.
# A p-value <= 0.05 is recorded as "Reject Null Hypothesis".
f_values = []
p_values = []
hypothesis = []

# f_classif expects a 1-D target; passing a one-column DataFrame triggers a
# conversion warning.
target = df['Attrition_Flag_code']

for feature in numerical_var:
    # One call per feature; both results are length-1 arrays because a single
    # column is passed in.
    f_stat, p_val = f_classif(df[[feature]], target)
    f_values.append(f_stat[0])
    p_values.append(p_val[0])
    hypothesis.append(
        "Reject Null Hypothesis" if p_val[0] <= 0.05 else "Accept Null Hypothesis"
    )

# Building the frame from a dict keeps the numeric columns numeric instead of object dtype.
numerical_relationship = pd.DataFrame({
    'Column': numerical_var,
    'F-Values': f_values,
    'p-values': p_values,
    'Hypothesis': hypothesis,
})

In [None]:
# Show the ANOVA results per numerical feature.
numerical_relationship

**Drop the columns for which the null hypothesis is not rejected, i.e. that show no significant relationship with the dependent variable**

In [None]:
# Numerical columns whose ANOVA test did not reject the null hypothesis.
insignificant_columns = numerical_relationship.loc[
    numerical_relationship['Hypothesis'] == 'Accept Null Hypothesis', 'Column'
].tolist()

# errors='ignore' keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=insignificant_columns, errors='ignore')

# Display the dropped columns (the bare expression that used to open this cell
# was never shown, because only a cell's last expression is displayed).
insignificant_columns

**check multicollinearity**

In [None]:
# Annotated heatmap of the pairwise correlations between the remaining columns.
plt.figure(figsize=(15,15))
sns.heatmap(df.corr(), annot= True)

The absolute correlation between every pair of independent variables is below 0.9, so there is no multicollinearity among them.

**Normalization**

In [None]:
# Separate the encoded target from the predictor columns.
y = df['Attrition_Flag_code'].values
X = df.drop(columns='Attrition_Flag_code')

In [None]:
# Scaler used to standardize the feature columns.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the full feature matrix X and transform it in one step.
scaled_data = scaler.fit_transform(X)

**Handling imbalanced data**

In [None]:
# SMOTE generates synthetic samples at random; fixing the seed makes the
# resampled data, and every metric computed from it, reproducible across runs.
oversample = SMOTE(random_state=42)

In [None]:
# Oversample the scaled features and labels with SMOTE.
# Both `scaled_data` and `y` are overwritten with the resampled arrays.
scaled_data, y = oversample.fit_resample(scaled_data,y)

**Splitting data**

In [None]:
# Split the ORIGINAL rows (X and the encoded target) before any scaling or
# oversampling. `scaled_data` / `y` from the cells above were standardized and
# SMOTE-resampled on the full dataset; splitting them would leak test
# information into training and put synthetic rows into the test set, so they
# are deliberately not used here.
y_original = df['Attrition_Flag_code'].values
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y_original,
    test_size=0.3,
    random_state=42,
    stratify=y_original,  # keep the class ratio identical in both splits
)

# Fit the scaler on the training rows only, then apply it unchanged to the test rows.
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Balance the classes in the training split only; the test split keeps its
# natural class ratio so the reported metrics reflect real data.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# **Modeling**

**1. KNN**

In [None]:
# Candidate hyper-parameter values for the KNN classifier.
n_neighbors = list(range(1, 22, 2))            # odd k from 1 to 21
weights = ['uniform', 'distance']
algorithm = ['auto']                           # other options: 'ball_tree', 'kd_tree', 'brute'
leaf_size = np.linspace(start=10, stop=100, num=10).astype(int).tolist()
p = [1, 2]

random_grid = dict(
    n_neighbors=n_neighbors,
    weights=weights,
    algorithm=algorithm,
    leaf_size=leaf_size,
    p=p,
)

# Randomized search: 25 sampled candidates, 5-fold cross-validation, fixed seed.
knn_model = KNeighborsClassifier()
knn_randomcv = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=random_grid,
    n_iter=25,
    cv=5,
    verbose=2,
    random_state=100,
    n_jobs=1,
)
knn_randomcv.fit(X_train, y_train)

# Keep the best estimator found by the search for prediction and evaluation.
best_random_grid_knn = knn_randomcv.best_estimator_
print("Best: %f using %s" % (knn_randomcv.best_score_, knn_randomcv.best_params_))


**2. Decision Tree**

In [None]:
# Decision tree with default hyper-parameters. The seed is fixed because the
# tree's tie-breaking between candidate splits is random, so without it the
# fitted tree (and its metrics) can differ from one run to the next.
dc_model = DecisionTreeClassifier(random_state=42)
dc_model.fit(X_train,y_train)

**3. Random Forest**

In [None]:
# Number of trees in the forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was removed from the candidates: for a classifier it is an alias of
# 'sqrt', it is deprecated in recent scikit-learn releases and rejected by the
# newest ones, and keeping both only made 'sqrt' twice as likely to be sampled.
max_features = ['sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}

# The forest itself is seeded so that the fitted model and its scores are
# reproducible; the search already has its own seed for candidate sampling.
rf=RandomForestClassifier(random_state=42)
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=5,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)

rf_randomcv.fit(X_train,y_train)
# Keep the best estimator found by the search for prediction and evaluation.
best_random_grid_rf = rf_randomcv.best_estimator_

In [None]:
# Predicted labels for each model, on the test split and then the training split.

# KNN
y_pred, y_pred_train = (
    best_random_grid_knn.predict(X_test),
    best_random_grid_knn.predict(X_train),
)

# Decision Tree
y_pred_dc, y_pred_dc_train = (
    dc_model.predict(X_test),
    dc_model.predict(X_train),
)

# Random Forest
y_pred_rf, y_pred_rf_train = (
    best_random_grid_rf.predict(X_test),
    best_random_grid_rf.predict(X_train),
)

# Predicted class probabilities on the test split (one column per class).
y_pred_proba = best_random_grid_knn.predict_proba(X_test)
y_pred_proba_dc = dc_model.predict_proba(X_test)
y_pred_proba_rf = best_random_grid_rf.predict_proba(X_test)

**Evaluation metrics using confusion matrix**

**Evaluation metric of KNN**

In [None]:
# KNN: classification report on the test split.
print(classification_report(y_test,y_pred))

In [None]:
# KNN: classification report on the training split.
print(classification_report(y_train,y_pred_train))

In [None]:
# KNN: confusion matrix on the training split.
# Uses the tuned estimator `best_random_grid_knn`, the same object the
# test-split confusion matrix and the predictions are based on, instead of the
# search wrapper.
plot_confusion_matrix(best_random_grid_knn, X_train, y_train)

In [None]:
# KNN: confusion matrix on the test split.
plot_confusion_matrix(best_random_grid_knn, X_test, y_test)

**Evaluation metric of Decision Tree**

In [None]:
# Decision tree: classification report on the training split.
print(classification_report(y_train,y_pred_dc_train))

In [None]:
# Decision tree: classification report on the test split.
print(classification_report(y_test,y_pred_dc))

In [None]:
# Decision tree: confusion matrix on the training split.
plot_confusion_matrix(dc_model, X_train, y_train)

In [None]:
# Decision tree: confusion matrix on the test split.
plot_confusion_matrix(dc_model, X_test, y_test)

**Evaluation metric of random forest**

In [None]:
# Random forest: classification report on the training split.
print(classification_report(y_train,y_pred_rf_train))

In [None]:
# Random forest: classification report on the test split.
print(classification_report(y_test,y_pred_rf))

In [None]:
# Random forest: confusion matrix on the training split, drawn from the fitted search object.
plot_confusion_matrix(rf_randomcv, X_train, y_train)

In [None]:
# Random forest: confusion matrix on the test split, drawn from the fitted search object.
plot_confusion_matrix(rf_randomcv, X_test, y_test)

**Evaluation using ROC AUC**

In [None]:
# Print the class counts of the raw label and of its encoded version side by
# side, so the meaning of the 0/1 codes can be read off by matching the counts.
print(dataset['Attrition_Flag'].value_counts())
print(df['Attrition_Flag_code'].value_counts())

In [None]:
def _positive_class_proba(proba):
    """Return the class-1 column of a predict_proba matrix.

    A 1-D input is returned unchanged, so re-running this cell after the
    columns have already been extracted no longer raises an IndexError.
    """
    proba = np.asarray(proba)
    return proba[:, 1] if proba.ndim == 2 else proba


## get existing customer(1) probability
y_pred_proba = _positive_class_proba(y_pred_proba)
y_pred_proba_dc = _positive_class_proba(y_pred_proba_dc)
y_pred_proba_rf = _positive_class_proba(y_pred_proba_rf)

# Area under the ROC curve for each model on the test split.
auc = roc_auc_score(y_test, y_pred_proba)
auc_dc = roc_auc_score(y_test, y_pred_proba_dc)
auc_rf = roc_auc_score(y_test, y_pred_proba_rf)

# ROC curve points (false/true positive rates per threshold) for the plot below.
fpr,tpr,threshold = roc_curve(y_test, y_pred_proba)
fpr_dc,tpr_dc,threshold_dc = roc_curve(y_test, y_pred_proba_dc)
fpr_rf,tpr_rf,threshold_rf = roc_curve(y_test, y_pred_proba_rf)

print("AUC Score KNN: ",np.round(auc,3))
print("AUC Score Decision Tree: ",np.round(auc_dc,3))
print("AUC Score Random Forest: ",np.round(auc_rf,3))


In [None]:
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0,1],[0,1],linestyle= '-')

# One dashed ROC curve per model.
roc_curves = [
    (fpr, tpr, 'orange', 'KNN'),
    (fpr_dc, tpr_dc, 'green', 'Decision Tree'),
    (fpr_rf, tpr_rf, 'blue', 'Random Forest'),
]
for curve_fpr, curve_tpr, curve_color, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, linestyle='--', color=curve_color, label=curve_label)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(bbox_to_anchor=(1.05, 1) ,loc='upper left')

Based on the models above, the random forest has the best performance, so in this case we use the random forest as the final model.