In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Read the BankChurners CSV from the Kaggle input directory and discard its
# final two columns, as advised by the dataset author.
data_path = '/kaggle/input/credit-card-customers/BankChurners.csv'
data = pd.read_csv(data_path).pipe(
    lambda frame: frame.drop(columns=frame.columns[-2:])
)
data.head()

In [None]:
# Class balance of the target: number of customers per Attrition_Flag value.
sns.countplot(data=data, x='Attrition_Flag')

In [None]:
# Customers per income category, with the bars split by attrition status.
sns.countplot(data=data, x='Income_Category', hue='Attrition_Flag')

In [None]:
# Transformer for label encoding of multiple columns at once
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in a single call.

    Every selected column is encoded independently with a fresh
    ``LabelEncoder``. No encoder is stored on the instance: ``fit`` is a
    no-op and each call to ``transform`` re-fits on the data it receives.

    Parameters
    ----------
    columns : sequence of column labels, optional
        Columns to encode. When ``None`` (the default) every column of the
        input is encoded.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """Do nothing and return ``self``; the encoders are fitted in ``transform``."""
        return self # not relevant here

    def transform(self,X):
        '''
        Return a copy of X with the columns named in self.columns replaced
        by their LabelEncoder() codes. If no columns were specified, every
        column of X is encoded. X itself is left unmodified.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.items() is the supported spelling; the former
            # iteritems() was removed in pandas 2.0 and raises AttributeError.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X,y).transform(X)

In [None]:
# Get rid of column "CLIENTNUM": every client has a unique number, so it does
# not help our model. The column is dropped by name rather than by position so
# that re-running this cell is a no-op instead of silently removing whichever
# column happens to come first (errors='ignore' covers the already-dropped case).
data = data.drop(columns=['CLIENTNUM'], errors='ignore')
data.head()

In [None]:
# Label-encode the categorical columns. The returned frame is a copy of `data`
# with only the listed columns replaced, so it still carries every other column.
categorical_data = MultiColumnLabelEncoder(columns=['Gender','Education_Level','Marital_Status','Income_Category','Card_Category']).fit_transform(data)

sc = StandardScaler()
# From column Credit_Limit to Avg_Utilization_Ratio (not including Avg_Utilization_Ratio)
to_scale = data.iloc[:,-8:-1]
# Give the scaled columns explicit string names and the original index.
# Without names they would be labelled 0..6, and the merged frame would mix
# int and str column labels, which scikit-learn >= 1.2 rejects at fit time.
# The "_scaled" suffix keeps them distinct from the unscaled originals that
# are still present in categorical_data.
continious_data = pd.DataFrame(
    sc.fit_transform(to_scale),
    columns=[f"{name}_scaled" for name in to_scale.columns],
    index=data.index,
)

# Merge the two dataframes that we preprocessed

In [None]:
# Place the label-encoded frame and the scaled frame side by side (column-wise).
data = pd.concat((categorical_data, continious_data), axis='columns')
data.head()

In [None]:
# Target: Attrition_Flag, encoded by hand (existing -> 0, attrited -> 1).
y = data['Attrition_Flag'].map({'Existing Customer':0, 'Attrited Customer':1})
# Features: every remaining column.
X = data.drop(columns='Attrition_Flag')

# 70/30 split; stratify=y keeps the class proportions of the imbalanced
# target the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape: ", part.shape)

In [None]:
# Baseline model: random forest with default hyperparameters.
# random_state is fixed (same seed as the train/test split) so that a fresh
# kernel run rebuilds the same forest and reports the same metrics.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [None]:
def report_and_confusion_matrix(y_true, y_pred):
    """Print classification metrics and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    None
        The classification report and accuracy are printed; the confusion
        matrix is drawn on a new 8x8 inch figure.
    """
    print("Model Report")
    print(classification_report(y_true, y_pred))
    score = accuracy_score(y_true, y_pred)
    print("Accuracy: "+str(score))

    _, ax = plt.subplots(figsize = (8,8))
    mtx = confusion_matrix(y_true, y_pred)
    # `linewidths` is the parameter seaborn.heatmap documents for cell borders.
    sns.heatmap(mtx, annot=True, fmt='d', linewidths=0.5, cbar=True, ax=ax)
    # Label the heatmap axes explicitly instead of relying on pyplot's
    # notion of the "current" axes.
    ax.set_ylabel("True Label")
    ax.set_xlabel("Predicted Label")

report_and_confusion_matrix(y_test, y_pred)

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

# Second model: a CatBoost classifier with default settings, trained on the
# same train split as the random forest.
cbc = CatBoostClassifier()
cbc.fit(X_train, y_train)
# NOTE: y_pred is reassigned here, replacing the random-forest predictions.
y_pred = cbc.predict(X_test)

In [None]:
# Report metrics and the confusion matrix for the current y_pred.
report_and_confusion_matrix(y_true=y_test, y_pred=y_pred)