In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
import os
import missingno as msno
# List every file found under the Kaggle input directory, one full path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Read the raw customer table into `df`.
df = pd.read_csv("../input/credit-card-customers/BankChurners.csv")


# Drop the first (ID) column and the last two columns.
df = df.iloc[:, 1:-2]

# Turn the placeholder strings "Unknown" and "NaN" into real missing values so
# they can be visualised and counted.
df = df.replace(["Unknown", "NaN"], np.nan)

# Visualise where the missing values sit.
msno.matrix(df)

# Author's observation from the plot: missing values occur in the education,
# marital-status and income columns and look randomly distributed.
# Rows with two or more missing fields are dropped (the author found that no
# row has more than two). The per-row count is kept in a separate Series
# rather than as a `num_missing` column on `df`, so this bookkeeping value is
# not carried into the feature matrix used by the models below.
MAX_MISSING_PER_ROW = 1
missing_per_row = df.isnull().sum(axis=1)
df = df[missing_per_row <= MAX_MISSING_PER_ROW].copy()

# Restore the "Unknown" label for the remaining gaps: the fact that a value is
# unknown may itself carry information.
df = df.fillna("Unknown")

In [None]:
# Categorical columns with a natural order get integer codes here; the
# remaining categorical columns are one-hot encoded later.
education_dic = {'Uneducated': 0, 'High School': 1, 'Unknown': 2, 'College': 3,
                 'Graduate': 4, 'Post-Graduate': 5, 'Doctorate': 6}
income_dic = {'Less than $40K': 0, '$40K - $60K': 1, "Unknown": 2,
              '$60K - $80K': 3, '$80K - $120K': 4, '$120K +': 5}
churn_dic = {'Existing Customer': 0, 'Attrited Customer': 1}
card_dic = {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3}

ordinal_encodings = {
    "Education_Level": education_dic,
    "Income_Category": income_dic,
    "Attrition_Flag": churn_dic,
    "Card_Category": card_dic,
}

for column, mapping in ordinal_encodings.items():
    # Assign the encoded column back to `df`. Calling
    # `df[column].replace(..., inplace=True)` works on an intermediate Series:
    # it warns in recent pandas and does not update `df` at all once
    # Copy-on-Write is enabled.
    # Labels that are not in the mapping are left unchanged, as `replace` did,
    # which also makes it safe to re-run this cell on already-encoded data.
    df[column] = df[column].map(lambda label, m=mapping: m.get(label, label))

In [None]:
# The integer codes assigned to the ordered categories may not be on the right
# magnitude; they can be revisited once feature importance is known.
# One-hot encode whatever categorical columns remain, then show the new shape.
df = pd.get_dummies(data=df)
df.shape

In [None]:
#Correlation Plot
from string import ascii_letters
sns.set(style="white")
# NOTE(review): this random state is not used by anything in this cell; it
# looks like a leftover from a plotting template. Kept so that any other code
# relying on `rs` is unaffected.
rs = np.random.RandomState(22)
# Pairwise correlation between all columns of the encoded frame.
corr = df.corr()
# Mask out the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(11, 9))
# Draw the masked heatmap on that axes with square cells.
sns.heatmap(corr, mask=mask, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The full frame (target column
# included) is passed as X, so "Attrition_Flag" is still present in
# X_train / X_test after this cell.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df["Attrition_Flag"],
    test_size=0.2,
    random_state=42,
)
# Author's run: 7845 rows in train and 1962 in test.
X_train.shape, X_test.shape


## Upsampling & Model Prediction

In [None]:
from sklearn.utils import resample
# Work on a copy so the original training split is left untouched.
X = X_train.copy()

# Split the training rows by class: 0 = existing customer, 1 = attrited.
non_churn = X.loc[X["Attrition_Flag"] == 0]
churn = X.loc[X["Attrition_Flag"] == 1]

# Draw churn rows with replacement until there are as many as non-churn rows;
# the fixed random_state keeps the draw reproducible.
churn_upsampled = resample(
    churn,
    replace=True,
    n_samples=non_churn.shape[0],
    random_state=1,
)

# Balanced training set: all non-churn rows followed by the resampled churn rows.
upsampled = pd.concat([non_churn, churn_upsampled], axis=0)

# To check the new class counts (6561 per class in the author's run):
# print(upsampled["Attrition_Flag"].value_counts())

In [None]:
# Separate features from the target for the balanced training set...
y_train = upsampled["Attrition_Flag"].astype(int)
X_train = upsampled.drop(columns="Attrition_Flag")
# ...and remove the target column from the untouched test features.
y_test = y_test.astype(int)
X_test = X_test.drop(columns="Attrition_Flag")

### Logistic Model

In [None]:
#Lets do a simple logistic regression here first 
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score

from sklearn.linear_model import LogisticRegression
# Fit a plain logistic regression on the upsampled training data, then
# predict the held-out test set.
clf = LogisticRegression(solver="lbfgs", random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Model Performance
def prediction_result(y_test, y_pred):
    """Print a classification report for one set of predictions.

    Prints accuracy, recall and precision as percentages, followed by the
    confusion matrix and scikit-learn's classification report.

    Args:
        y_test: true labels.
        y_pred: predicted labels, aligned with ``y_test``.

    Returns:
        None. All results are written to stdout.
    """
    headline_metrics = (
        ("Accuracy : ", accuracy_score),
        ("Recall : ", recall_score),
        ("Precision : ", precision_score),
    )
    for label, metric in headline_metrics:
        print(label, metric(y_test, y_pred) * 100)
    for table in (confusion_matrix, classification_report):
        print(table(y_test, y_pred))


prediction_result(y_test, y_pred)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 50 trees, trained on the upsampled training data.
clf = RandomForestClassifier(n_estimators=50, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report performance on the held-out test set.
prediction_result(y_test, y_pred)

### XGBoost

In [None]:
import xgboost as xgb
# Gradient-boosted trees on the upsampled training data.
clf = xgb.XGBClassifier(
    # Was misspelled `n_estimatoryhs`, so the intended 1000 boosting rounds
    # were never applied to the model.
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.4,
    missing=-1,  # value that XGBoost treats as "missing"
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Model Performance
prediction_result(y_test, y_pred)

## SMOTE and Model Prediction

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# Fresh 80/20 split for the SMOTE experiment. The full frame is passed as X,
# so "Attrition_Flag" is still a column of X_train / X_test at this point.
X_train, X_test, y_train, y_test = train_test_split(df, df["Attrition_Flag"], test_size=0.2,
                                                    random_state=40)
print(X_train.shape, y_train.shape)
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
# Author's run: 7845 rows in train and 1962 in test.
print(X_train.shape, X_test.shape)

# Seed SMOTE so the synthetic samples, and therefore the scores below, are
# reproducible like every other step in this notebook.
oversample = SMOTE(random_state=40)
# Resample on the features only: the target is passed separately as y and
# must not also be treated as a feature by the resampler.
X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(
    X_train.drop("Attrition_Flag", axis=1), y_train
)

print("After Upsampling:-")
print(X_train_SMOTE.shape, y_train_SMOTE.shape)

# Remove the target from the test features as well.
X_test = X_test.drop("Attrition_Flag", axis=1)

In [None]:
def prediction_result(y_test, y_pred):
    """Print a classification report for one set of predictions.

    Same helper as the one defined in the Logistic Model cell; it is repeated
    here so this section can be run on its own.

    Prints accuracy, recall and precision as percentages, followed by the
    confusion matrix and scikit-learn's classification report.

    Args:
        y_test: true labels.
        y_pred: predicted labels, aligned with ``y_test``.

    Returns:
        None. All results are written to stdout.
    """
    headline_metrics = (
        ("Accuracy : ", accuracy_score),
        ("Recall : ", recall_score),
        ("Precision : ", precision_score),
    )
    for label, metric in headline_metrics:
        print(label, metric(y_test, y_pred) * 100)
    for table in (confusion_matrix, classification_report):
        print(table(y_test, y_pred))
    
# `prediction_result` prints its report and returns None, so it is called
# directly below; wrapping it in print() added a stray "None" line after
# every report.

# Logistic Regression
clf = LogisticRegression(solver = "lbfgs",random_state=42).fit(X_train_SMOTE, y_train_SMOTE)
y_pred = clf.predict(X_test)
print("LOGISTIC RESULT")
prediction_result(y_test, y_pred)

# Random Forest
clf = RandomForestClassifier(n_estimators = 50, random_state = 0).fit(X_train_SMOTE, y_train_SMOTE)
y_pred = clf.predict(X_test)
print("RANDOM FOREST RESULT")
prediction_result(y_test, y_pred)

# XGBoost
import xgboost as xgb
clf = xgb.XGBClassifier(
    # Was misspelled `n_estimatoryhs`, so the intended 1000 boosting rounds
    # were never applied to the model.
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.4,
    missing=-1,  # value that XGBoost treats as "missing"
)

clf.fit(X_train_SMOTE, y_train_SMOTE)
y_pred = clf.predict(X_test)

# Model Performance
print("XGBOOST RESULT")
prediction_result(y_test, y_pred)

### The SMOTE results are not as good as those from simple upsampling, which I did not expect.

## Upsampling with H2O AutoML

In [None]:
import h2o
# Initialise the H2O session that the AutoML cells below depend on.
h2o.init()

### I ran into an error because H2O treated the task as a regression problem. I don't know what caused it, since I had already converted the target column to "enum". I found no related information online, and the error disappeared after I restarted the kernel.

In [None]:
from h2o.automl import H2OAutoML
h2o_df = h2o.H2OFrame(df)
h2o_df["Attrition_Flag"] = h2o_df["Attrition_Flag"].asfactor()
# h2o_df.describe()
train, test = h2o_df.split_frame(ratios=[.8])

# Identify predictors and response
x = train.columns
y = "Attrition_Flag"
x.remove(y)

train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

aml = H2OAutoML(max_runtime_secs=600,
                exclude_algos=['DeepLearning'],
                seed=1,
                stopping_metric='AUC',
                sort_metric='AUC',
                balance_classes=True,
                project_name='Churn_Prediction'
)

%time aml.train(x=x, y=y, training_frame=train)

In [None]:
# View the AutoML Leaderboard
# `aml` was created with sort_metric='AUC', so that is the ranking metric used here.
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default

In [None]:
# Look the model up on this run's leaderboard. AutoML model IDs embed the
# run's timestamp, so a hard-coded ID from an earlier session does not exist
# after the notebook is re-run.
leaderboard_ids = aml.leaderboard["model_id"].as_data_frame()["model_id"].tolist()
# Prefer the best-ranked "all models" stacked ensemble; if this run produced
# none, fall back to the overall leader.
ensemble_ids = [model_id for model_id in leaderboard_ids
                if model_id.startswith("StackedEnsemble_AllModels")]
model = h2o.get_model(ensemble_ids[0]) if ensemble_ids else aml.leader
# Evaluate on the held-out H2O test split.
model.model_performance(test)

### The recall is 258/(258+36) ≈ 87.8%. It can be raised to 100% by lowering the classification threshold to 0.012851.

## Variable Importance for Churn Prediction

In [None]:
# Pick the best-ranked XGBoost model from this run's leaderboard. AutoML
# model IDs embed the run's timestamp, so a hard-coded ID from an earlier
# session does not exist after the notebook is re-run.
leaderboard_ids = aml.leaderboard["model_id"].as_data_frame()["model_id"].tolist()
xgboost_ids = [model_id for model_id in leaderboard_ids
               if model_id.startswith("XGBoost")]
if not xgboost_ids:
    raise RuntimeError(
        "No XGBoost model on the AutoML leaderboard; "
        "choose another model that supports variable importance."
    )
model2 = h2o.get_model(xgboost_ids[0])
# Plot the 20 most important features of that model.
model2.varimp_plot(num_of_features=20)