In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
# Note: the loop variables (dirname, filenames, filename) are ordinary notebook-level
# names, so after the loop they keep the values from the last iteration.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load + Process data

In [None]:
# Locate the dataset explicitly instead of relying on whatever value a previous
# directory-listing loop happened to leave behind in a loop variable: that only
# works when the listing cell was run first and the last directory visited is
# the one that contains the CSV.
DATA_FILENAME = 'BankChurners.csv'
csv_path = next(
    (
        os.path.join(root, DATA_FILENAME)
        for root, _subdirs, files in os.walk('/kaggle/input')
        if DATA_FILENAME in files
    ),
    None,
)
if csv_path is None:
    raise FileNotFoundError(
        '{} was not found anywhere under /kaggle/input'.format(DATA_FILENAME)
    )

df = pd.read_csv(csv_path)

# Drop the two "Naive_Bayes_Classifier_..." columns so they are not used as features.
df = df.drop(
    columns=[
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    ],
)


In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Columns treated as numeric features (left as-is, not one-hot encoded).
num_cols = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]

# Every remaining column except the CLIENTNUM identifier is treated as categorical.
# Column order follows df.columns.
not_categorical = {'CLIENTNUM', *num_cols}
cat_cols = [col for col in df.columns if col not in not_categorical]

In [None]:
# One-hot encode the categorical columns; each listed column is replaced by
# one indicator column per category value.
df = pd.get_dummies(data=df, columns=cat_cols)



In [None]:
# Inspect the frame's columns and dtypes after the one-hot encoding step.
df.info()

# Creating a train and test set

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Model inputs: every column except the row identifier and the two
# Attrition_Flag indicator columns (one of which is used as the target).
non_feature_cols = {
    'CLIENTNUM',
    'Attrition_Flag_Existing Customer',
    'Attrition_Flag_Attrited Customer',
}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

# Logistic Regression on all feature columns

In [None]:
# Baseline: logistic regression trained on every feature column.
target_col = 'Attrition_Flag_Existing Customer'

X = df[feature_cols]
Y = df[[target_col]]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

# C is scikit-learn's inverse regularisation strength; max_iter is set high
# to give the solver room to converge.
model = LogisticRegression(C=500, max_iter=50000)
model.fit(X_train, np.ravel(Y_train))

train_accuracy = model.score(X_train, Y_train)
test_accuracy = model.score(X_test, Y_test[target_col])

print('Logistic Regression:')
print('Traning Model accruracy: {:.2%}'.format(train_accuracy))
print('Test Model accruracy: {:.2%}'.format(test_accuracy))


# Can we improve by being more selective with features?

In [None]:
# Rank feature pairs by absolute Pearson correlation to spot redundant columns
# that could be removed.
corr_matrix = df[feature_cols].corr()

# Reshape the square matrix into one row per (col1, col2) pair.
pairwise = (
    corr_matrix
    .reset_index()
    .melt(id_vars=['index'], var_name='col2', value_name='pearsons_corr')
    .rename(columns={'index': 'col1'})
)

# Keep each unordered pair once (and drop self-correlations) by requiring col1 < col2.
pairwise = pairwise[pairwise['col1'] < pairwise['col2']]

correlations = (
    pairwise
    .assign(abs_corr=pairwise['pearsons_corr'].abs())
    .sort_values('abs_corr', ascending=False)
)

correlations.head(10)

In [None]:
# Hand-picked feature columns to leave out of the models below.
# NOTE(review): presumably one column from each highly correlated pair in the
# table above - confirm against the correlation output.
columns_to_drop = [
    'Gender_F',
    'Avg_Open_To_Buy',
    'Card_Category_Blue',
    'Customer_Age',
    'Marital_Status_Married',
]

In [None]:
# Refit the logistic regression after removing the hand-picked columns.
# The selection is built as an ordered list: indexing a DataFrame with a set is
# rejected by pandas 2.x, and set ordering of strings is not stable between
# kernel restarts, which would make the column order non-reproducible.
selected_cols = [col for col in feature_cols if col not in columns_to_drop]

X = df[selected_cols]
Y = df[['Attrition_Flag_Existing Customer']]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

model = LogisticRegression(C=500, max_iter=2000)
model.fit(X_train, np.ravel(Y_train))


print('Logistic Regression:')
print('Traning Model accruracy: {:.2%}'.format(model.score(X_train, Y_train)))
print('Test Model accruracy: {:.2%}'.format(model.score(X_test, Y_test['Attrition_Flag_Existing Customer'])))


# Random Forest 

The random forest below may be overfitting: compare its training accuracy with its test accuracy.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the reduced feature set.
# The selection is built as an ordered list: indexing a DataFrame with a set is
# rejected by pandas 2.x, and set ordering of strings is not stable between
# kernel restarts, which would make the column order non-reproducible.
selected_cols = [col for col in feature_cols if col not in columns_to_drop]

X = df[selected_cols]
Y = df[['Attrition_Flag_Existing Customer']]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

model = RandomForestClassifier(max_depth=20, random_state=42)
model.fit(X_train, np.ravel(Y_train))


print('Random Forest:')
print('Traning Model accruracy: {:.2%}'.format(model.score(X_train, Y_train)))
print('Test Model accruracy: {:.2%}'.format(model.score(X_test, Y_test['Attrition_Flag_Existing Customer'])))



In [None]:
# Let's experiment with the depth of the trees: fit one random forest per
# max_depth in 1..29 and record its accuracy on the test split.
# NOTE(review): depth is being compared on the test split itself, so the best
# score seen here is an optimistic estimate; a validation split or
# cross-validation would be needed for an unbiased choice.

# Ordered list rather than a set: pandas 2.x rejects set indexers and set
# ordering of strings is not stable between kernel restarts.
selected_cols = [col for col in feature_cols if col not in columns_to_drop]

X = df[selected_cols]
Y = df[['Attrition_Flag_Existing Customer']]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

random_forest_results = []

for i in range(1, 30):
    model = RandomForestClassifier(max_depth=i, random_state=42)
    model.fit(X_train, np.ravel(Y_train))
    score = model.score(X_test, Y_test['Attrition_Flag_Existing Customer'])
    random_forest_results.append({
        'max_depth': i,
        'accuracy': score
    })

pd.DataFrame(random_forest_results).set_index('max_depth').plot(
    title='Random forest: test accuracy by max_depth'
)
    

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the reduced feature set.
# The selection is built as an ordered list: indexing a DataFrame with a set is
# rejected by pandas 2.x, and set ordering of strings is not stable between
# kernel restarts, which would make the column order non-reproducible.
selected_cols = [col for col in feature_cols if col not in columns_to_drop]

X = df[selected_cols]
Y = df[['Attrition_Flag_Existing Customer']]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

model = DecisionTreeClassifier(max_depth=10, random_state=42)
model.fit(X_train, np.ravel(Y_train))


print('Decision Tree')
print('Traning Model accruracy: {:.2%}'.format(model.score(X_train, Y_train)))
print('Test Model accruracy: {:.2%}'.format(model.score(X_test, Y_test['Attrition_Flag_Existing Customer'])))




In [None]:
# Let's experiment with the depth of the tree: fit one decision tree per
# max_depth in 1..29 and record its accuracy on the test split.
# NOTE(review): depth is being compared on the test split itself, so the best
# score seen here is an optimistic estimate; a validation split or
# cross-validation would be needed for an unbiased choice.

# Ordered list rather than a set: pandas 2.x rejects set indexers and set
# ordering of strings is not stable between kernel restarts.
selected_cols = [col for col in feature_cols if col not in columns_to_drop]

X = df[selected_cols]
Y = df[['Attrition_Flag_Existing Customer']]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

decision_tree_results = []

for i in range(1, 30):
    model = DecisionTreeClassifier(max_depth=i, random_state=42)
    model.fit(X_train, np.ravel(Y_train))
    score = model.score(X_test, Y_test['Attrition_Flag_Existing Customer'])
    decision_tree_results.append({
        'max_depth': i,
        'accuracy': score
    })

pd.DataFrame(decision_tree_results).set_index('max_depth').plot(
    title='Decision tree: test accuracy by max_depth'
)
    

For the decision tree, test-set accuracy peaks at a max depth of around 9.