In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from warnings import simplefilter
# Ignore every FutureWarning raised from this point on.
# NOTE(review): this also hides deprecation notices from pandas and
# scikit-learn, so behaviour changes in newer releases go unannounced.
simplefilter(action='ignore', category = FutureWarning)

In [None]:
# Path of the dataset CSV inside the Kaggle input directory.
FILE_PATH = '/kaggle/input/credit-card-customers/BankChurners.csv'
# Load the CSV into the DataFrame that the following cells work on.
data = pd.read_csv(FILE_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# List every column name in the loaded frame.
data.columns

In [None]:
# Remove the two 'Naive_Bayes_Classifier_*' columns from the loaded frame.
data.drop(
    columns=[
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    ],
    inplace=True,
)

In [None]:
# Show the column names again to confirm the drop above took effect.
data.columns

In [None]:
# Print the frame summary (column dtypes and non-null counts).
data.info()

In [None]:
# Frequency of each distinct Attrition_Flag value.
at = data['Attrition_Flag'].value_counts()
# Keep the distinct labels; the encoding cell maps each label to its position in this index.
attrition = at.index
at

In [None]:
# Frequency of each distinct Gender value.
ge = data['Gender'].value_counts()
# Keep the distinct labels; the encoding cell maps each label to its position in this index.
gender = ge.index
ge

In [None]:
# Frequency of each distinct Education_Level value.
ed = data['Education_Level'].value_counts()
# Keep the distinct labels; the encoding cell maps each label to its position in this index.
education = ed.index
ed

In [None]:
# Frequency of each distinct Marital_Status value.
ma = data['Marital_Status'].value_counts()
# Keep the distinct labels; the encoding cell maps each label to its position in this index.
marital = ma.index
ma

In [None]:
# Frequency of each distinct Income_Category value.
inc = data['Income_Category'].value_counts()
# Keep the distinct labels; the encoding cell maps each label to its position in this index.
income = inc.index
inc

In [None]:
# Frequency of each distinct Card_Category value.
ca = data['Card_Category'].value_counts()
# Keep the distinct labels; the encoding cell maps each label to its position in this index.
card = ca.index
ca

In [None]:
# Manual label encoding: replace each categorical value with an integer code

In [None]:
# Integer-encode the categorical columns. Each label is replaced by its
# position in the matching value_counts() index built in the cells above.
#
# The result is assigned back to the column instead of calling
# `data[col].replace(..., inplace=True)`. That form is chained assignment:
# with pandas Copy-on-Write it only changes a temporary Series and leaves
# `data` unencoded, and the FutureWarning pandas 2.2 raises about it is
# silenced by this notebook's warning filter.
label_orders = {
    'Attrition_Flag': attrition,
    'Gender': gender,
    'Education_Level': education,
    'Marital_Status': marital,
    'Income_Category': income,
    'Card_Category': card,
}

for column, labels in label_orders.items():
    codes = {label: code for code, label in enumerate(labels)}
    # replace() leaves already-encoded values untouched, so re-running this
    # cell is harmless. infer_objects() turns the resulting object column
    # into a numeric dtype on pandas versions where replace() does not downcast.
    data[column] = data[column].replace(codes).infer_objects()

In [None]:
# Preview the frame after the manual encoding step.
data.head()

In [None]:
# Drop the CLIENTNUM column and rebind `data` to the resulting frame.
data = data.drop(columns=['CLIENTNUM'])

In [None]:
# Preview the frame now that CLIENTNUM has been dropped.
data.head()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# List the column names that remain after the drops above.
data.columns

In [None]:
# Target vector: the Attrition_Flag column of `data`.
y = data['Attrition_Flag']

In [None]:
# Predictor columns: every remaining column except the Attrition_Flag target.
features = [
    'Customer_Age',
    'Gender',
    'Dependent_count',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]
# Feature matrix restricted to the columns listed above.
X = data[features]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Training features plus the constant column added by statsmodels, used for the VIF table below.
X_incl_const = sm.add_constant(X_train)

In [None]:
# Variance inflation factor for every column of the design matrix
# (constant included), shown next to the column name.
design_matrix = X_incl_const.values
vif = []
for column_position in range(X_incl_const.shape[1]):
    vif.append(variance_inflation_factor(exog=design_matrix, exog_idx=column_position))

vif_table = pd.DataFrame({
    'coef_name': X_incl_const.columns,
    'vif': np.around(vif, 2),
})
vif_table

In [None]:
# Remove two features from both splits before modelling
# (presumably chosen from the VIF table above - confirm against its output).
#
# The frames are rebound rather than mutated with inplace=True: they come from
# train_test_split, and an in-place drop on them can raise
# SettingWithCopyWarning. errors='ignore' keeps the cell safe to re-run once
# the columns are already gone.
dropped_features = ['Total_Revolving_Bal', 'Avg_Open_To_Buy']
X_train = X_train.drop(columns=dropped_features, errors='ignore')
X_test = X_test.drop(columns=dropped_features, errors='ignore')

In [None]:
# Rebuild the design matrix (with constant) from the reduced training features.
X_incl_const = sm.add_constant(X_train)

In [None]:
# Recompute the variance inflation factors on the reduced design matrix
# and tabulate them next to the column names.
reduced_matrix = X_incl_const.values
n_columns = X_incl_const.shape[1]
vif = list(
    map(
        lambda column_position: variance_inflation_factor(exog=reduced_matrix, exog_idx=column_position),
        range(n_columns),
    )
)

pd.DataFrame({
    'coef_name': X_incl_const.columns,
    'vif': np.around(vif, 2),
})

In [None]:
# Testing 7 different algorithms:
#     - Logistic Regression
#     - Linear Discriminant Analysis
#     - K-Nearest Neighbors
#     - Classification and Regression Trees
#     - Gaussian Naive Bayes
#     - Support Vector Machines
#     - Random Forest Classifier

In [None]:
# Compare the seven candidate classifiers by mean accuracy over a shuffled,
# stratified 10-fold cross-validation on the training split.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# The tree and forest are randomised estimators. Seeding them makes the
# printed ranking reproducible across runs.
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier(random_state=1)
gnb_model = GaussianNB()
svm_model = SVC(gamma = 'auto')
rfc_model = RandomForestClassifier(random_state=1)

# (printed label, estimator) pairs, evaluated in this order.
cv_candidates = [
    ("Logistic Regression:", log_model),
    ("Linear Discriminant Analysis:", lda_model),
    ("K-Nearest Neigbors:", knn_model),
    ("Classification and Regression Trees:", cart_model),
    ("Gaussian Naive Bayes:", gnb_model),
    ("Support Vector Machines:", svm_model),
    ("Random Forest Classifier:", rfc_model),
]

for label, model in cv_candidates:
    fold_scores = cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    print(label, round(fold_scores.mean(), 3))

In [None]:
# Fit each candidate on the full training split and report its accuracy on
# the held-out test split.

# The tree and forest are randomised estimators. Seeding them makes the
# printed scores, and the model reused by the later cells, reproducible.
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier(random_state=1)
gnb_model = GaussianNB()
svm_model = SVC(gamma = 'auto')
rfc_model = RandomForestClassifier(random_state=1)

# (printed label, estimator) pairs, fitted and scored in this order.
holdout_candidates = [
    ("Logistic Regression:", log_model),
    ("Linear Discriminant Analysis:", lda_model),
    ("K-Nearest Neigbors:", knn_model),
    ("Classification and Regression Trees:", cart_model),
    ("Gaussian Naive Bayes:", gnb_model),
    ("Support Vector Machines:", svm_model),
    ("Random Forest Classifier:", rfc_model),
]

for label, model in holdout_candidates:
    model.fit(X_train, y_train)
    # Built-in round() works whether score() returns a NumPy scalar or a
    # plain Python float; the latter has no .round() method.
    print(label, round(model.score(X_test, y_test), 3))

In [None]:
# Choosing the Random Forest Classifier (RFC) because it scored the highest accuracy

In [None]:
# Refit the random forest on the training split, then predict the held-out rows.
pred = rfc_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Report test-set accuracy, the confusion matrix and the per-class
# precision / recall / F1 table for the random forest predictions.
# Built-in round() is used because accuracy_score may return a plain Python
# float, which has no .round() method.
print(round(accuracy_score(y_test, pred), 5))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

In [None]:
# The random forest model reaches about 96% accuracy and recall on the test set!