In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve
import seaborn as sns

In [None]:
# Load the BankChurners CSV into a DataFrame (the path is relative to the notebook's working directory)
data = pd.read_csv('../input/credit-card-customers/BankChurners.csv')

In [None]:
# Check the dataset: preview the first rows
data.head()

In [None]:
# Check the data types and non-null counts of every column
data.info()

In [None]:
# Remove the columns this analysis does not use: the two precomputed
# Naive Bayes classifier columns and CLIENTNUM (presumably a client identifier).
data = data.drop(
    columns=[
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
        'CLIENTNUM',
    ]
)

In [None]:
# Check the data again: preview the first 10 rows after dropping columns
data.head(10)

In [None]:
# Check the distinct values of the target column
data['Attrition_Flag'].unique()

In [None]:
# Count how many rows fall into each target class
data['Attrition_Flag'].value_counts()

In [None]:
# Check for missing data: number of null entries per column
data.isnull().sum()

In [None]:
# Check summary statistics of the data
data.describe()

In [None]:
# Check the correlation matrix
# Select the numeric columns explicitly: the categorical columns are only
# integer-encoded in a later cell, and pandas >= 2.0 raises on non-numeric
# columns in DataFrame.corr() instead of silently dropping them.
corr_matrix = data.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(15, 10))
# Draw onto the axes created above rather than relying on the implicit current axes.
ax = sns.heatmap(corr_matrix,
                 annot=True,
                 linewidths=0.5,
                 fmt=".2f",
                 cmap="YlGnBu",
                 ax=ax);
# Widen the y-limits by half a cell on each side
# (presumably a workaround for clipped top/bottom heatmap rows -- TODO confirm still needed).
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5);

In [None]:
# Since we don't have missing data, let's integer encode categorical data
def category_mapping(df, variable):
    """Build a ``{category: integer code}`` lookup for one column.

    Codes start at 0 and follow the order in which the distinct values
    first appear in ``df[variable]``.
    """
    distinct_values = df[variable].unique()
    return dict(zip(distinct_values, range(len(distinct_values))))

def integer_encode(df, variable, category_mapping):
    """Replace the values of ``df[variable]`` with their integer codes, in place.

    ``category_mapping`` is the ``{category: code}`` lookup applied through
    ``Series.map``. The frame is modified directly and nothing is returned.
    """
    encoded_column = df[variable].map(category_mapping)
    df[variable] = encoded_column

# Target plus every categorical feature; each is integer-encoded in place, one column at a time.
categorical_columns = (
    'Attrition_Flag',
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
)
for variable in categorical_columns:
    mappings = category_mapping(data, variable)
    integer_encode(data, variable, mappings)

In [None]:
# Check the data again after integer-encoding the categorical columns
data.head()

In [None]:
# Separate the target vector (y) from the feature matrix (X), both as NumPy arrays
y = data['Attrition_Flag'].to_numpy()
X = data.drop(columns='Attrition_Flag').to_numpy()

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# The features vary in range, so standardise them.
# The scaler is fitted on the training split only and then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# The data is ready for the models: four candidates are compared to see which performs best
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("KNN", KNeighborsClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("SVC", SVC(kernel='rbf')),
])
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it on the test split.

    models: dict mapping a display name to an estimator exposing
        ``fit(X, y)`` and ``score(X, y)``; each estimator is fitted in place.
    Returns a dict mapping each name to the value returned by ``score``.
    """
    # Seed NumPy's global RNG before any model is fitted.
    np.random.seed(42)

    def _evaluate(estimator):
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _evaluate(estimator) for label, estimator in models.items()}

In [None]:
# Train every candidate model and collect its score on the test split
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

In [None]:
# Bar chart comparing the models: one bar per model, height = its score
model_compare = pd.DataFrame(model_scores, index=["accuracy"])
model_compare.transpose().plot(kind="bar");

In [None]:
# Of the 4 models, RandomForest performed the best, let's check F1 and recall of this model.
# A fresh forest is trained here; random_state is fixed so the metrics below
# are the same every time this cell is re-run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# f1_score / recall_score use their default positive label (1). Which category is
# encoded as 1 depends on first-appearance order in the encoding step.
model_f1 = f1_score(y_test, y_pred)
model_recall = recall_score(y_test, y_pred)
print(model_f1, model_recall)

In [None]:
# Classification report for the random forest predictions on the test set
print(classification_report(y_test, y_pred))

In [None]:
# Let's check the confusion matrix, followed by the overall accuracy, for the random forest
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# ROC curve of the random forest on the test set
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2;
# RocCurveDisplay.from_estimator is the replacement on newer versions.
plot_roc_curve(estimator=clf, X=X_test, y=y_test);

In [None]:
# Let's see if we can improve the model by using RandomizedSearchCV

# Search space: every combination is a candidate, of which n_iter are sampled at random.
parameters = {"n_estimators": np.arange(50, 500, 50),
              "max_depth": [None, 3, 5, 10],
              "min_samples_split": np.arange(2, 20, 2),
              "min_samples_leaf": np.arange(1, 20, 2)}

# random_state is fixed on both the search (which candidates get sampled) and
# the forest (how each candidate is fitted) so the tuning result is repeatable.
rs_cv = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                           param_distributions=parameters,
                           cv=5,
                           n_iter=25,
                           verbose=True,
                           n_jobs=-1,
                           random_state=42)

rs_cv.fit(X_train, y_train);

In [None]:
# Check the best parameter combination found by the randomized search
rs_cv.best_params_

In [None]:
# Check if the tuned search result scores strictly higher on the test set than
# the forest trained with default parameters (displays True or False)
rs_cv.score(X_test, y_test) > clf.score(X_test, y_test)

In [None]:
# Let's try XGBoost and see if it will perform better than RandomForest
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

In [None]:
# Score of the fitted XGBoost model on the test set
classifier.score(X_test, y_test)

In [None]:
# Confusion matrix and overall accuracy of the XGBoost predictions on the test set
y_predxg = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predxg)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_predxg)

In [None]:
# As seen, XGBoost is doing better than RandomForest; let's see how it does
# with 10-fold cross-validation on the training split
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean() * 100))
print("Standard Deviation: {:.2f} %".format(accuracies.std() * 100))

In [None]:
# Classification report for the XGBoost predictions on the test set
print(classification_report(y_test, y_predxg))

In [None]:
# ROC curve of the XGBoost model on the test set
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2;
# RocCurveDisplay.from_estimator is the replacement on newer versions.
plot_roc_curve(estimator=classifier, X=X_test, y=y_test);