In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Read the BankChurners table from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/credit-card-customers/BankChurners.csv'
data = pd.read_csv(csv_path)
data.head(5)

In [None]:
# Remove the two columns whose names start with 'Naive_Bayes_Classifier_...';
# they are not used anywhere in this analysis.
naive_bayes_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
data = data.drop(columns=naive_bayes_columns)
data.head()

In [None]:
# Number of rows (customers) in the table.
data.shape[0]

In [None]:
# Print the column dtypes and non-null counts to check for missing values.
data.info()

So there are no NaNs or empty cells.

In [None]:
# For every column print its name, its distinct values, and a blank separator line.
for col_name, col_values in data.items():
    print(col_name, col_values.unique(), '', sep='\n')

## 1 Exploring data
### 1.0 CLIENTNUM

It is the unique id of a customer. Before we delete it, let's check whether there are any duplicates.

In [None]:
# True when no CLIENTNUM value occurs more than once.
data['CLIENTNUM'].is_unique

In [None]:
# Second duplicate check: the sorted distinct ids must equal the sorted full id column.
client_ids = data['CLIENTNUM']
distinct_ids_sorted = sorted(client_ids.unique())
all_ids_sorted = sorted(client_ids)
distinct_ids_sorted == all_ids_sorted

So there aren't any. Now we can drop it.

In [None]:
# The id carries no predictive information, so remove it from the feature table.
data = data.drop(columns='CLIENTNUM')

### 1.1 Attrition_Flag

Internal event (customer activity) variable - if the account is closed then 1 else 0

Basically, this is what we need to predict.

In [None]:
# Bar chart of how many customers fall into each Attrition_Flag class.
attrition_counts = data.groupby('Attrition_Flag')['Attrition_Flag'].count()
attrition_counts.plot(kind='bar')

In [None]:
# Share of rows labelled 'Attrited Customer'.
is_attrited = data['Attrition_Flag'] == 'Attrited Customer'
int(is_attrited.sum()) / len(data)

16% of customers are attrited, so in the next steps (when we split the data into train and test parts) we need to remember that. For now I'll just change the values to 0's and 1's (note: in this notebook attrited customers are encoded as 0 and existing customers as 1).


In [None]:
# Encode the target as integers: 'Attrited Customer' -> 0, 'Existing Customer' -> 1.
# NOTE: with this encoding, scikit-learn metrics that default to pos_label=1
# (e.g. recall_score) treat *existing* customers as the positive class.
attrited_mask = data['Attrition_Flag'] == 'Attrited Customer'
existing_mask = data['Attrition_Flag'] == 'Existing Customer'
data.loc[attrited_mask, 'Attrition_Flag'] = 0
data.loc[existing_mask, 'Attrition_Flag'] = 1

# Same bar chart as before, now over the encoded labels.
encoded_counts = data.groupby(['Attrition_Flag'])['Attrition_Flag'].count()
encoded_counts.plot.bar()

### 1.2 Customer_Age

In [None]:
# Number of customers at each age, drawn as a bar chart.
age_counts = data.groupby('Customer_Age')['Customer_Age'].count()

fig, ax = plt.subplots(figsize=(10, 5))
age_counts.plot.bar(ax=ax, width=0.75, color='C4')

It looks roughly normally distributed. I wouldn't change anything here (for now).

### 1.3 Gender

In [None]:
# Number of customers per gender.
gender_counts = data.groupby('Gender')['Gender'].count()
gender_counts.plot(kind='bar', color='C2')

In [None]:
# Share of rows with Gender == 'F'.
is_female = data['Gender'] == 'F'
int(is_female.sum()) / len(data)

### 1.4 Dependent_count

Demographic variable - Number of dependents


In [None]:
# Number of customers for each number of dependents.
dependent_counts = data.groupby('Dependent_count')['Dependent_count'].count()

fig, ax = plt.subplots(figsize=(8, 4))
dependent_counts.plot.bar(ax=ax, width=0.75, color='C8')

It looks roughly bell-shaped. I wouldn't change anything here (for now).

### 1.5 Education_Level, Marital_Status, Income_Category

In [None]:
# One bar chart per categorical demographic column, each in its own figure.
categorical_columns = ('Education_Level', 'Marital_Status', 'Income_Category')

for column in categorical_columns:
    level_counts = data.groupby(column)[column].count()
    fig, ax = plt.subplots(figsize=(8, 4))
    level_counts.plot(kind='bar', ax=ax, width=0.75, color='C7')

Some columns contain the value 'Unknown'. I think we shouldn't do anything with it.

### 1.6 Card_Category
Product Variable - Type of Card (Blue, Silver, Gold, Platinum)

In [None]:
# Number of customers per card type.
card_counts = data.groupby('Card_Category')['Card_Category'].count()

fig, ax = plt.subplots(figsize=(7, 4))
card_counts.plot.bar(ax=ax, width=0.75)

Blue cards are the most popular (which is pretty obvious, they are the cheapest). But the other cards are rare:


In [None]:
# How many customers hold a Platinum card.
int((data['Card_Category'] == 'Platinum').sum())

Only 20 cards out of 10K -- a very small number.

In [None]:
# How many customers hold any card other than Blue.
int((data['Card_Category'] != 'Blue').sum())

I think we should merge all non-Blue cards into one category because there are too few of them.

In [None]:
# Keep 'Blue' as is and relabel every other card type as 'not Blue'.
card_is_blue = data['Card_Category'] == 'Blue'
data['Card_Category'] = data['Card_Category'].where(card_is_blue, 'not Blue')

In [None]:
# Card type counts again, after merging the rare types into 'not Blue'.
merged_card_counts = data.groupby('Card_Category')['Card_Category'].count()

fig, ax = plt.subplots(figsize=(7, 4))
merged_card_counts.plot.bar(ax=ax, width=0.75)

### 1.7 Months_on_book

Period of relationship with bank

In [None]:
# Number of customers for each Months_on_book value.
tenure_counts = data.groupby('Months_on_book')['Months_on_book'].count()

fig, ax = plt.subplots(figsize=(10, 5))
tenure_counts.plot.bar(ax=ax, width=0.75, color='C5')

There are many more customers with a value of 36 than with any other value. Maybe there was a discount or a prize raffle.

### 1.8 Total_Relationship_Count, Months_Inactive_12_mon, Contacts_Count_12_mon

* Total_Relationship_Count -- Total no. of products held by the customer
* Months_Inactive_12_mon -- No. of months inactive in the last 12 months
* Contacts_Count_12_mon -- No. of Contacts in the last 12 months

In [None]:
# Number of customers for each count of products held.
relationship_counts = data.groupby('Total_Relationship_Count')['Total_Relationship_Count'].count()
relationship_counts.plot(kind='bar', width=0.75, color='C5')

In [None]:
# Number of customers for each count of inactive months.
inactive_counts = data.groupby('Months_Inactive_12_mon')['Months_Inactive_12_mon'].count()
inactive_counts.plot(kind='bar', width=0.75, color='C8')

In [None]:
# How many customers have Months_Inactive_12_mon equal to 0.
int((data['Months_Inactive_12_mon'] == 0).sum())

In [None]:
# How many customers have Months_Inactive_12_mon greater than 4.
int((data['Months_Inactive_12_mon'] > 4).sum())

There are too few examples with 0 and with 5 or 6. But because it is a numerical variable (not a categorical one), we won't do anything.

In [None]:
# Number of customers for each count of contacts.
contact_counts = data.groupby('Contacts_Count_12_mon')['Contacts_Count_12_mon'].count()
contact_counts.plot(kind='bar', width=0.75, color='C9')

### 1.9 Credit_Limit, Total_Revolving_Bal, Avg_Open_To_Buy, Total_Amt_Chng_Q4_Q1, Total_Trans_Amt, Total_Trans_Ct, Total_Ct_Chng_Q4_Q1, Avg_Utilization_Ratio

In [None]:
# 50-bin histogram of every continuous column, one figure per column.
continuous_columns = [
    'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
]

for column in continuous_columns:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.hist(data[column], bins=50)
    ax.set_title(column)

## 2 Data Preparation

We should transform object values to numeric values.

In [None]:
# Turn every remaining object column into numbers.

# Binary columns: Gender (F -> 0, M -> 1) and Card_Category (Blue -> 0, not Blue -> 1).
gender_codes = {'F': 0, 'M': 1}
card_codes = {'Blue': 0, 'not Blue': 1}
data['Gender'] = data['Gender'].map(gender_codes)
data['Card_Category'] = data['Card_Category'].map(card_codes)

# Ordinal columns: the better the education / the higher the income, the higher the number.
# 'Unknown' is mapped to 0 in both.
education_dict = {'Unknown': 0, 'Uneducated': 1, 'High School': 2, 'College': 3, 'Graduate': 4, 'Post-Graduate': 5, 'Doctorate': 6}
income_category_dict = {'Unknown' : 0, 'Less than $40K' : 1, '$40K - $60K' : 2, '$60K - $80K' : 3, '$80K - $120K' : 4, '$120K +' : 5}

# .map() (unlike .replace()) turns any label that is missing from the dictionary into NaN
# instead of silently leaving the original string in place, so bad labels are detectable.
data['Education_Level'] = data['Education_Level'].map(education_dict)
data['Income_Category'] = data['Income_Category'].map(income_category_dict)

# Fail loudly if any label was not covered by its dictionary
# (this also triggers if the cell is re-run on already-encoded data).
encoded_columns = ['Gender', 'Card_Category', 'Education_Level', 'Income_Category']
unmapped_counts = data[encoded_columns].isna().sum()
assert not unmapped_counts.any(), f'unmapped category labels found:\n{unmapped_counts[unmapped_counts > 0]}'

data[['Gender', 'Card_Category', 'Attrition_Flag']] = data[['Gender', 'Card_Category', 'Attrition_Flag']].astype('int32')

# Marital_Status has no natural order, so it gets one indicator column per status.
# The 'Unknown' indicator is dropped; a row with all remaining indicators at 0 means 'Unknown'.
marital_dummies = pd.get_dummies(data['Marital_Status'], prefix='Marital_Status')
data = pd.concat([data, marital_dummies], axis=1)
data = data.drop(['Marital_Status', 'Marital_Status_Unknown'], axis = 1)

data.head()


In [None]:
# Re-check the dtypes after encoding: no object columns should remain.
data.info()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = data.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="YlGnBu", cbar=False, ax=ax)

In [None]:
# Look at the two highly correlated columns side by side.
credit_columns = ['Credit_Limit', 'Avg_Open_To_Buy']
data.loc[:, credit_columns].head()

In [None]:
# Share of rows where Credit_Limit equals Avg_Open_To_Buy.
limits_match = data['Credit_Limit'] == data['Avg_Open_To_Buy']
int(limits_match.sum()) / len(data)

In [None]:
# Share of rows where Credit_Limit is at least Avg_Open_To_Buy.
limit_covers_open = data['Credit_Limit'] >= data['Avg_Open_To_Buy']
int(limit_covers_open.sum()) / len(data)

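Avg_Open_To_Buy describes the Open to Buy Credit Line (average of the last 12 months). That is why it is less than or equal to Credit_Limit (the check above compares Credit_Limit >= Avg_Open_To_Buy), and why the correlation between them is (almost) 1.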

In [None]:
# Share of rows where Credit_Limit - Avg_Open_To_Buy equals Total_Revolving_Bal.
# The comparison uses a small absolute tolerance instead of ==, because the
# subtraction may be done in floating point and exact equality can then miss
# rows that differ only by rounding error.
implied_balance = data['Credit_Limit'] - data['Avg_Open_To_Buy']
balances_match = np.isclose(implied_balance, data['Total_Revolving_Bal'], rtol=0, atol=1e-6)
float(balances_match.mean())

so 'Credit_Limit' = 'Avg_Open_To_Buy' + 'Total_Revolving_Bal'

## 3 Predicting

First of all I want to find the most important features (to drop some of them)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the target so that
# both parts keep the same class proportions, and it uses a fixed seed so that every
# number below is reproducible (change random_state to draw a different split).
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Attrition_Flag'], axis=1),
    data['Attrition_Flag'],
    test_size=0.2,
    stratify=data['Attrition_Flag'],
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix

# Baseline: a random forest with default hyper-parameters on all features.
# The seed is fixed so the fitted forest (and its feature importances) is reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Rows are true classes, columns are predicted classes, both in label order (0, 1).
confusion_matrix(y_test, y_pred)

In [None]:
# Pair every feature name with the importance the fitted forest assigned to it.
[(feature, importance) for feature, importance in zip(X_train.columns, clf.feature_importances_)]

I think we can try to drop Gender and Card_Category. That makes sense: Gender is almost 50/50, and in Card_Category most people have the 'Blue' value.

Here we see that Marital_Status also doesn't contribute much. Let's try to drop it.

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the previous model (y_pred still holds its test-set predictions).
print('accuracy_score:', accuracy_score(y_test, y_pred))

# Features with low importance in the model fitted on all features.
col_to_drop = ['Gender', 'Card_Category', 'Marital_Status_Divorced', 'Marital_Status_Married', 'Marital_Status_Single']

X_train = X_train.drop(col_to_drop, axis=1)
X_test = X_test.drop(col_to_drop, axis=1)

# Refit on the reduced feature set. The seed is fixed so that the difference between the
# two printed accuracies is caused by the dropped columns and not by forest randomness.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print('accuracy_score after dropping Gender, Marital_Status and Card_Category:', accuracy_score(y_test, y_pred))

After redoing train_test_split a few times, dropping these columns almost always gave better results.

In [None]:
# Feature importances of the forest fitted on the reduced feature set.
[(feature, importance) for feature, importance in zip(X_train.columns, clf.feature_importances_)]

Sometimes decision trees overfit. A way to prevent that is to limit the maximum depth of the trees. Let's check whether it happens here:

In [None]:
# Sweep max_depth from 3 to 24 and record train/test accuracy for each value.
depths, test_res, train_res = [], [], []

for depth in range(3, 25):
    # Fixed seed: differences between points then come from max_depth, not from forest randomness.
    clf = RandomForestClassifier(max_depth=depth, random_state=42)
    clf.fit(X_train, y_train)

    train_res.append(accuracy_score(y_train, clf.predict(X_train)))
    test_res.append(accuracy_score(y_test, clf.predict(X_test)))
    depths.append(depth)

# A widening gap between the two curves indicates overfitting.
plt.figure(figsize = (12, 4))
plt.plot(depths, train_res, label = 'accuracy on train data')
plt.plot(depths, test_res, label = 'accuracy on test data')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend()

So it does. In the next steps I suggest setting max_depth to around 10-11. It won't make the results much worse, but the model will be faster.

Now we can check other parameters. One of them is class_weight ("how important" each class is for us):

In [None]:
# Sweep the class weights: class 0 gets weight w and class 1 gets weight 10 - w, for w = 1..9.
# Train/test accuracy is recorded for each setting.
weights_class0, test_res, train_res = [], [], []

for weight0 in range(1, 10):
    # Fixed seed: differences between points then come from class_weight, not from forest randomness.
    clf = RandomForestClassifier(max_depth=11, class_weight={0: weight0, 1: 10 - weight0}, random_state=42)
    clf.fit(X_train, y_train)

    train_res.append(accuracy_score(y_train, clf.predict(X_train)))
    test_res.append(accuracy_score(y_test, clf.predict(X_test)))
    weights_class0.append(weight0)

plt.figure(figsize = (12, 4))
plt.plot(weights_class0, train_res, label = 'accuracy on train data')
plt.plot(weights_class0, test_res, label = 'accuracy on test data')
plt.xlabel('weight of class 0 (class 1 gets 10 - x)')
plt.ylabel('accuracy')
plt.legend()

The x-axis is the weight given to class '0' (and 10 - x is the weight given to class '1'). The default is equal weights (5:5 on our graph).

Before making any decisions, let's make one more graph, but for another metric: recall (which is important for us).

In [None]:
from sklearn.metrics import recall_score

# Same class-weight sweep as for accuracy, but measuring recall of the attrited class.
# Attrited customers are encoded as 0 in this notebook, while recall_score defaults to
# pos_label=1 (existing customers), so the positive label is set explicitly.
ATTRITED_LABEL = 0

weights_class0, test_res, train_res = [], [], []

for weight0 in range(1, 10):
    # Fixed seed: differences between points then come from class_weight, not from forest randomness.
    clf = RandomForestClassifier(max_depth=11, class_weight={0: weight0, 1: 10 - weight0}, random_state=42)
    clf.fit(X_train, y_train)

    train_res.append(recall_score(y_train, clf.predict(X_train), pos_label=ATTRITED_LABEL))
    test_res.append(recall_score(y_test, clf.predict(X_test), pos_label=ATTRITED_LABEL))
    weights_class0.append(weight0)

plt.figure(figsize = (12, 4))
plt.plot(weights_class0, train_res, label = 'recall_score on train data')
plt.plot(weights_class0, test_res, label = 'recall_score on test data')
plt.xlabel('weight of class 0 = attrited (class 1 gets 10 - x)')
plt.ylabel('recall of attrited customers')
plt.legend()

Our top priority in this business problem is to identify customers who are churning. Even if we predict non-churning customers as churned (FP), it won't harm our business. But predicting churning customers as non-churning (FN) will. So recall, TP / (TP + FN), needs to be high.

So basically, we shouldn't care so much about accuracy, but we should care about recall. That's why changing class_weight is the right decision.


In [None]:
# Final model. Attrited customers are encoded as 0 in this notebook, so to catch more
# churners the higher weight goes to class 0 (9 : 1), and recall is reported for
# pos_label=0 (recall_score would otherwise measure existing customers, label 1).
clf = RandomForestClassifier(max_depth=11, class_weight={0: 9, 1: 1}, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print('accuracy_score:', accuracy_score(y_test, y_pred))
print('recall_score:', recall_score(y_test, y_pred, pos_label=0))

In [None]:
# Confusion matrix of the final model on the test set
# (rows: true class, columns: predicted class, in label order 0, 1).
confusion_matrix(y_true=y_test, y_pred=y_pred)