# BankChurners: credit-card-customers 
From: https://www.kaggle.com/sakshigoyal7/credit-card-customers?select=BankChurners.csv

"A manager at the bank is disturbed with more and more customers leaving their credit card services. They would really appreciate if one could predict for them who is gonna get churned so they can proactively go to the customer to provide them better services and turn customers' decisions in the opposite direction

I got this dataset from a website with the URL as https://leaps.analyttica.com/home. I have been using this for a while to get datasets and accordingly work on them to produce fruitful results. The site explains how to solve a particular business problem.

Now, this dataset consists of 10,000 customers mentioning their age, salary, marital_status, credit card limit, credit card category, etc. There are nearly 18 features.

We have only 16.07% of customers who have churned. Thus, it's a bit difficult to train our model to predict churning customers. "


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## customers dataset 

In [None]:
# Show every column when a DataFrame is displayed. The fully qualified option
# name is used because the bare 'max_columns' pattern matches more than one
# registered option on recent pandas releases, which raises OptionError.
pd.set_option('display.max_columns', None)

In [None]:
# Load the customer table and lower-case all column names.
pdf_customers = pd.read_csv('../input/credit-card-customers/BankChurners.csv')
pdf_customers.columns = pdf_customers.columns.str.lower()

# Drop the two 'naive_bayes_classifier_*' columns.
naive_bayes_columns = [
    'naive_bayes_classifier_attrition_flag_card_category_contacts_count_12_mon_dependent_count_education_level_months_inactive_12_mon_1',
    'naive_bayes_classifier_attrition_flag_card_category_contacts_count_12_mon_dependent_count_education_level_months_inactive_12_mon_2',
]
pdf_customers = pdf_customers.drop(columns=naive_bayes_columns)

# Report the number of missing cells across the whole table, then preview it.
total_nulls = pdf_customers.isnull().sum().sum()
print("total null values: {}".format(total_nulls))
pdf_customers.head()

## Features

### customer flag



In [None]:
# Distinct labels present in the target column.
pdf_customers['attrition_flag'].unique()

In [None]:
# Encode the target as an integer: 0 for existing customers, 1 otherwise
# (i.e. churned customers). The label is compared against an explicit value
# rather than `unique()[0]`, so the encoding no longer depends on which label
# happens to appear in the first row, and the comparison is vectorised instead
# of recomputing `unique()` once per row.
# NOTE(review): assumes the non-churn label is spelled 'Existing Customer' --
# confirm against the output of `attrition_flag.unique()`.
pdf_customers['binary_flag'] = (
    pdf_customers['attrition_flag'] != 'Existing Customer'
).astype(int)

In [None]:
# Count the clients in each class and report the churn-to-loyal ratio.
is_churn = pdf_customers.binary_flag == 1
is_loyal = pdf_customers.binary_flag == 0
n_churn = pdf_customers.loc[is_churn, 'clientnum'].count()
n_loyal = pdf_customers.loc[is_loyal, 'clientnum'].count()
print('total loyal clients: {}'.format(n_loyal))
print('total churn clients: {}\n'.format(n_churn))
print('churn/loyal: {}\n'.format(n_churn / n_loyal))

# Bar chart of the number of clients per attrition label.
clients_per_label = pdf_customers.groupby('attrition_flag')['clientnum'].count()
clients_per_label.plot.bar()
plt.show()

### Other features

In [None]:
# Columns to explore: everything from the third column up to, but not
# including, the last column of the table.
n_columns = len(pdf_customers.columns)
lt_features = pdf_customers.columns[2:n_columns - 1]

In [None]:
# Position in `lt_features` of the next feature to plot; the plotting cell
# below advances it by one on every run.
i=0

In [None]:

# Histogram of the current feature, coloured by attrition label. Re-run this
# cell to step through `lt_features` one column at a time.
current_feature = lt_features[i]
sns.histplot(data=pdf_customers, x=current_feature, hue='attrition_flag', alpha=0.3)
plt.xticks(rotation=90)
i = i + 1

#### Transforming categorical features into dummy variables

In [None]:
# One-hot encode each categorical column and add the resulting 0/1 indicator
# columns to the table, one column per category value.
for col in ['gender', 'education_level', 'marital_status', 'income_category', 'card_category']:
    dummies = pd.get_dummies(pdf_customers[col]).astype(int)
    # Take the column labels from the dummies frame itself: get_dummies orders
    # its columns by sorted category, whereas `unique()` returns values in
    # order of first appearance, so pairing the two positionally would attach
    # indicators to the wrong category names.
    for category in dummies.columns:
        pdf_customers[category] = dummies[category]
    # NOTE(review): a category value shared by several of these columns
    # (presumably 'Unknown' -- verify) is overwritten by the last column that
    # contains it. A per-column prefix would avoid the collision but would
    # rename the columns selected further down.

In [None]:
# Columns kept for modelling: the raw customer attributes, the indicator
# columns named after category values, and the target as the last column.
attribute_columns = [
    'customer_age', 'dependent_count', 'months_on_book',
    'total_relationship_count', 'months_inactive_12_mon',
    'contacts_count_12_mon', 'credit_limit', 'total_revolving_bal',
    'avg_open_to_buy', 'total_amt_chng_q4_q1', 'total_trans_amt',
    'total_trans_ct', 'total_ct_chng_q4_q1', 'avg_utilization_ratio',
]
indicator_columns = [
    'M', 'F',
    'High School', 'Graduate', 'Uneducated', 'Unknown', 'College',
    'Post-Graduate', 'Doctorate',
    'Married', 'Single', 'Divorced',
    '$60K - $80K', 'Less than $40K', '$80K - $120K', '$40K - $60K', '$120K +',
    'Blue', 'Gold', 'Silver', 'Platinum',
]
pdf_features = pdf_customers[attribute_columns + indicator_columns + ['binary_flag']]

In [None]:
# Preview the first five rows of the modelling table.
pdf_features.head(n=5)

## modeling

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression

### 1. Simple train test split and model fitting

In [None]:
# Feature matrix: every column except the target, which is the last column.
X = pdf_features.drop(columns=['binary_flag'])
# Target vector: 1 marks a churned client.
y = pdf_features.binary_flag

In [None]:
# Hold out a test set (default split proportions). The split is stratified on
# the target so the minority churn class keeps the same share in both parts,
# and seeded so the metrics below are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [None]:
# Display the training target vector.
y_train

In [None]:
# Share of churned clients in the test set.
test_churn_rate = y_test.sum() / y_test.count()
test_churn_rate

In [None]:
# Size and churn share of the training sample.
n_train = y_train.count()
n_train_churn = y_train.sum()
print("size train sample: {}".format(n_train))
print("churn in train sample: {}".format(n_train_churn))
print("churn cat/ total train: {}".format(n_train_churn / n_train))

In [None]:
# Size and churn share of the test sample.
n_test = y_test.count()
n_test_churn = y_test.sum()
print("size test sample: {}".format(n_test))
print("churn in test sample: {}".format(n_test_churn))
print("churn cat/ total test: {}".format(n_test_churn / n_test))

In [None]:
# Baseline k-nearest-neighbours classifier with the library's default settings.
# NOTE(review): no feature scaling is applied before this distance-based model
# in the cells shown here -- consider standardising the features.
clf_KNN = KNeighborsClassifier()

In [None]:
# Train the baseline classifier on the untouched (imbalanced) training split.
clf_KNN.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the hold-out set.
y_pred = clf_KNN.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score

In [None]:
# Confusion matrix of the baseline model on the hold-out set.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Score the baseline model on the hold-out set. ROC AUC is computed from the
# second column of predict_proba rather than from hard labels.
proba_1 = clf_KNN.predict_proba(X_test)[:, 1]
f1_1 = f1_score(y_true=y_test, y_pred=y_pred)
precision_1 = precision_score(y_true=y_test, y_pred=y_pred)
recall_1 = recall_score(y_true=y_test, y_pred=y_pred)
roc_auc_1 = roc_auc_score(y_true=y_test, y_score=proba_1)

for template, value in [
    ("f1_score: {}", f1_1),
    ("precision_score: {}", precision_1),
    ("recall_score: {}", recall_1),
    ("roc_auc_score: {}", roc_auc_1),
]:
    print(template.format(value))

### 1.2. Undersampling 

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler


In [None]:
# Randomly undersample the training split only; the test split is left as is.
rus = RandomUnderSampler()
resampled_rus = rus.fit_resample(X=X_train, y=y_train)
X_train_rus, y_train_rus = resampled_rus

In [None]:
# Fresh KNN classifier (default settings) for the undersampled training data.
clf_KNN_rus = KNeighborsClassifier()

In [None]:
# Train on the undersampled training data.
clf_KNN_rus.fit(X=X_train_rus, y=y_train_rus)

In [None]:
# Hold-out predictions of the model trained on undersampled data.
y_pred_rus = clf_KNN_rus.predict(X=X_test)

In [None]:
# Confusion matrix of the undersampling model on the hold-out set.
confusion_matrix(y_true=y_test, y_pred=y_pred_rus)

In [None]:
# Score the undersampling model on the hold-out set. ROC AUC is computed from
# the second column of predict_proba rather than from hard labels.
proba_12 = clf_KNN_rus.predict_proba(X_test)[:, 1]
f1_12 = f1_score(y_true=y_test, y_pred=y_pred_rus)
precision_12 = precision_score(y_true=y_test, y_pred=y_pred_rus)
recall_12 = recall_score(y_true=y_test, y_pred=y_pred_rus)
roc_auc_12 = roc_auc_score(y_true=y_test, y_score=proba_12)

for template, value in [
    ("f1_score: {}", f1_12),
    ("precision_score: {}", precision_12),
    ("recall_score: {}", recall_12),
    ("roc_auc_score: {}", roc_auc_12),
]:
    print(template.format(value))

### 1.3. Oversampling

In [None]:
# Randomly oversample the training split only; the test split is left as is.
ros = RandomOverSampler()
resampled_ros = ros.fit_resample(X=X_train, y=y_train)
X_train_ros, y_train_ros = resampled_ros

In [None]:
# Fresh KNN classifier (default settings) for the oversampled training data.
clf_KNN_ros = KNeighborsClassifier()

In [None]:
# Train on the oversampled training data.
clf_KNN_ros.fit(X=X_train_ros, y=y_train_ros)

In [None]:
# Hold-out predictions of the model trained on OVERSAMPLED data. These must
# come from `clf_KNN_ros`; predicting with `clf_KNN_rus` would simply repeat
# the undersampling results under the oversampling name.
y_pred_ros = clf_KNN_ros.predict(X_test)

In [None]:
# Confusion matrix for the oversampling predictions on the hold-out set.
confusion_matrix(y_true=y_test, y_pred=y_pred_ros)

In [None]:
# Score the oversampling predictions on the hold-out set. ROC AUC is computed
# from the second column of clf_KNN_ros.predict_proba.
proba_13 = clf_KNN_ros.predict_proba(X_test)[:, 1]
f1_13 = f1_score(y_true=y_test, y_pred=y_pred_ros)
precision_13 = precision_score(y_true=y_test, y_pred=y_pred_ros)
recall_13 = recall_score(y_true=y_test, y_pred=y_pred_ros)
roc_auc_13 = roc_auc_score(y_true=y_test, y_score=proba_13)

for template, value in [
    ("f1_score: {}", f1_13),
    ("precision_score: {}", precision_13),
    ("recall_score: {}", recall_13),
    ("roc_auc_score: {}", roc_auc_13),
]:
    print(template.format(value))

In [None]:
# Print the undersampling metrics (the *_12 values) once more.
for template, value in [
    ("f1_score: {}", f1_12),
    ("precision_score: {}", precision_12),
    ("recall_score: {}", recall_12),
    ("roc_auc_score: {}", roc_auc_12),
]:
    print(template.format(value))

### 2. Applying cross-validation

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Five-fold splitter shared by the cross-validation loops.
kf = KFold(n_splits=5)

In [None]:
# Cross-validate KNN with random undersampling: for every fold, resample the
# training part only, fit a fresh classifier, and score it on the untouched
# test part. One row of metrics per fold ends up in `pdf_results_KNN_rus`.
fold_rows_rus = []
for fold, (train_i, test_i) in enumerate(kf.split(X)):
    # The splitter yields positional indices, so select with .iloc; label-based
    # selection only works while X and y carry a default 0..n-1 index.
    X_train_fold = X.iloc[train_i]
    X_test_fold = X.iloc[test_i]
    y_train_fold = y.iloc[train_i]
    y_test_fold = y.iloc[test_i]

    # Balance the classes of the training fold only.
    rus = RandomUnderSampler()
    X_train_fold_rus, y_train_fold_rus = rus.fit_resample(X_train_fold, y_train_fold)

    clf_KNN_rus = KNeighborsClassifier()
    clf_KNN_rus.fit(X_train_fold_rus, y_train_fold_rus)

    y_pred_fold_rus = clf_KNN_rus.predict(X_test_fold)

    f1_2 = f1_score(y_test_fold, y_pred_fold_rus)
    precision_2 = precision_score(y_test_fold, y_pred_fold_rus)
    recall_2 = recall_score(y_test_fold, y_pred_fold_rus)
    roc_auc_2 = roc_auc_score(y_test_fold, clf_KNN_rus.predict_proba(X_test_fold)[:, 1])

    # Keep precision and recall as well, so they are no longer computed and
    # then thrown away.
    fold_rows_rus.append({
        'fold': fold,
        'f1_score': f1_2,
        'precision_score': precision_2,
        'recall_score': recall_2,
        'roc_auc_score': roc_auc_2,
    })

# Build the results table once instead of growing it cell by cell.
pdf_results_KNN_rus = pd.DataFrame(fold_rows_rus)

In [None]:
# Average each metric over the folds. The 'fold' column is only a fold index,
# so it is left out of the average.
pdf_results_KNN_rus.drop(columns='fold').mean()

In [None]:
# Cross-validate KNN with random oversampling: for every fold, resample the
# training part only, fit a fresh classifier, and score it on the untouched
# test part. One row of metrics per fold ends up in `pdf_results_KNN_ros`.
fold_rows_ros = []
for fold, (train_i, test_i) in enumerate(kf.split(X)):
    # The splitter yields positional indices, so select with .iloc; label-based
    # selection only works while X and y carry a default 0..n-1 index.
    X_train_fold = X.iloc[train_i]
    X_test_fold = X.iloc[test_i]
    y_train_fold = y.iloc[train_i]
    y_test_fold = y.iloc[test_i]

    # Balance the classes of the training fold only.
    ros = RandomOverSampler()
    X_train_fold_ros, y_train_fold_ros = ros.fit_resample(X_train_fold, y_train_fold)

    clf_KNN_ros = KNeighborsClassifier()
    clf_KNN_ros.fit(X_train_fold_ros, y_train_fold_ros)

    y_pred_fold_ros = clf_KNN_ros.predict(X_test_fold)

    f1_21 = f1_score(y_test_fold, y_pred_fold_ros)
    precision_21 = precision_score(y_test_fold, y_pred_fold_ros)
    recall_21 = recall_score(y_test_fold, y_pred_fold_ros)
    roc_auc_21 = roc_auc_score(y_test_fold, clf_KNN_ros.predict_proba(X_test_fold)[:, 1])

    # Keep precision and recall as well, so they are no longer computed and
    # then thrown away.
    fold_rows_ros.append({
        'fold': fold,
        'f1_score': f1_21,
        'precision_score': precision_21,
        'recall_score': recall_21,
        'roc_auc_score': roc_auc_21,
    })

# Build the results table once instead of growing it cell by cell.
pdf_results_KNN_ros = pd.DataFrame(fold_rows_ros)

In [None]:
# Average each metric over the folds. The 'fold' column is only a fold index,
# so it is left out of the average.
pdf_results_KNN_ros.drop(columns='fold').mean()