# This notebook analyses the dataset and visualises the variables most closely related to customer attrition, with ML predictions to support the findings!

In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mean
from sklearn.preprocessing import LabelEncoder
import sys
import os

Importing the libraries to be used.

In [None]:
def correct_indexing(dataset):
    """Give ``dataset`` a fresh 0..n-1 index named ``index``, in place.

    A helper column called ``index`` holding the row positions is inserted at
    the front of the frame and then promoted to be the frame's index, which
    discards whatever index the frame had before. Nothing is returned.
    """
    row_positions = range(len(dataset))
    dataset.insert(loc = 0, column = 'index', value = row_positions)
    dataset.set_index(keys = "index", inplace = True)
    return None

This function only resets the index of the dataset (to 0, 1, 2, …) for further use.

In [None]:
# Load the raw customer table and discard the client identifier together with
# the two 'Naive_Bayes_Classifier_*' columns, none of which are used below.
dataset = pd.read_csv("../input/credit-card-customers/BankChurners.csv")
dataset.drop(['CLIENTNUM', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'], axis = 1, inplace = True)

# Integer codes for every text-valued column. 'Unknown' is coded as -1 so the
# affected rows can be filtered out afterwards.
category_codes = {
    'Attrition_Flag' : {'Existing Customer' : 1, 'Attrited Customer' : 0},
    'Gender' : {'M' : 1, 'F' : 0},
    'Education_Level' : {'Uneducated' : 0, 'Unknown' : -1, 'High School' : 1, 'College' : 2, 'Graduate' : 3, 'Post-Graduate' : 4, 'Doctorate' : 5},
    'Marital_Status' : {'Married' : 1, 'Single' : 0, 'Divorced' : 2, 'Unknown' : -1},
    'Income_Category' : {'Unknown' : -1, 'Less than $40K' : 1, '$40K - $60K' : 2, '$60K - $80K' : 3, '$80K - $120K' : 4, '$120K +' : 5},
    'Card_Category' : {'Blue' : 0, 'Silver' : 1, 'Gold' : 2, 'Platinum' : 3},
}

# Assign the recoded column back rather than calling replace(inplace=True) on
# dataset[col]: the in-place form mutates a temporary Series and, with pandas
# Copy-on-Write, never reaches the DataFrame. infer_objects() turns a fully
# recoded object column into a proper integer column on pandas versions where
# replace() no longer downcasts by itself.
for column_name, codes in category_codes.items():
    dataset[column_name] = dataset[column_name].replace(codes).infer_objects()

# Remove every row with an 'Unknown' (-1) education, income or marital status,
# then every row that still has a missing value.
has_unknown = (dataset[['Education_Level', 'Income_Category', 'Marital_Status']] == -1).any(axis = 1)
dataset = dataset[~has_unknown].dropna()
correct_indexing(dataset)

dataset.head(5)

Here, with the help of this correlation plot, we can see that most of the data is not related to the 'Attrition_Flag' of a customer, and we can see a moderate correlation between **'Attrition_Flag'** and **'Contacts_Count_12_mon', 'Total_Revolving_Bal', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1'**.

We can now see that these variables are more closely related to 'Attrition_Flag' than 'Income_Category' is. So, moving on, we will only focus on finding the type of relationship between these variables and on supporting this point!

In [None]:
def approximator(dataset, column, approximation):
    """Snap every value of ``dataset[column]`` to a multiple of ``approximation``.

    Each value is divided by ``approximation``, rounded with the built-in
    ``round`` and multiplied back; the column is then overwritten in place
    with the results. Nothing is returned.
    """
    dataset[column] = [
        approximation * round(value / approximation)
        for value in dataset[column]
    ]
    return None

# Column -> rounding step. Every listed column is coarsened in place, in the
# order given here.
for column_name, step in (
    ('Credit_Limit', 1000),
    ('Total_Revolving_Bal', 100),
    ('Avg_Open_To_Buy', 1000),
    ('Total_Trans_Amt', 500),
    ('Total_Trans_Ct', 10),
    ('Total_Amt_Chng_Q4_Q1', 0.05),
    ('Total_Ct_Chng_Q4_Q1', 0.05),
    ('Avg_Utilization_Ratio', 0.15),
):
    approximator(dataset, column_name, step)

This function is used to approximate most of the widely scattered values. Basically, it rounds each value to the nearest multiple of the number provided to the function! This is useful for reducing scattered data to a smaller set of regular values.

In [None]:
# Pairwise correlation of all columns, drawn as a heatmap with square cells
# and the colour scale capped at 1.
matrix = dataset.corr()
sns.heatmap(data = matrix, square = True, vmax = 1)

In [None]:
def catagory_churn_unchurn(type_of_card, data):
    """Return the rows of ``data`` whose 'Card_Category' equals ``type_of_card``.

    The selected rows are re-indexed with ``correct_indexing`` before being
    returned.
    """
    holds_requested_card = data['Card_Category'] == type_of_card
    dataset_card = data[holds_requested_card]
    correct_indexing(dataset_card)
    return dataset_card

# One subset per 'Card_Category' code, built in the order 0, 1, 2, 3.
blue, silver, gold, platinum = (
    catagory_churn_unchurn(card_code, dataset) for card_code in (0, 1, 2, 3)
)

This function produces a dataset based on the type of credit card the customer has.

In [None]:
def plotter(column, Group):
    """Plot how ``Group`` is distributed across the values of ``column``.

    One stacked chart is drawn for each of the module-level frames, in this
    order: ``dataset``, ``blue``, ``silver``, ``gold``, ``platinum``. For each
    frame the cross-tabulation of ``column`` against ``Group`` is divided by
    its row totals, so every ``column`` value shows the share taken by each
    ``Group`` value. ``plt.show()`` is called once after all five charts.

    Args:
        column: Name of the column whose values form the x axis.
        Group: Name of the column whose values are stacked.
    """
    # These columns are drawn as bars; every other column is drawn as lines.
    bar_columns = ('Contacts_Count_12_mon', 'Total_Revolving_Bal', 'Total_Trans_Amt', 'Total_Trans_Ct')
    chart_kind = "bar" if column in bar_columns else "line"

    for frame in (dataset, blue, silver, gold, platinum):
        counts = pd.crosstab(frame[column], frame[Group])
        counts.div(counts.sum(1).astype(float), axis = 0).plot(kind = chart_kind, stacked = True)
    plt.show()
    return

This function plots the given `column` against `Group` for 5 datasets:
* complete dataset
* blue credit card holders
* silver credit card holders
* gold credit card holders
* platinum credit card holders

The graphs appear in this specific order.

In [None]:
# Attrition_Flag shares per Contacts_Count_12_mon value: whole dataset first, then each card type.
plotter(column = 'Contacts_Count_12_mon', Group = 'Attrition_Flag')

Contacts_Count_12_mon shows a weak negative correlation with the Attrition_Flag of the customer, BUT it is very consistent, and the relationship can be used for predicting future churning customers.

In [None]:
# Attrition_Flag shares per Total_Revolving_Bal value: whole dataset first, then each card type.
plotter(column = 'Total_Revolving_Bal', Group = 'Attrition_Flag')

Total_Revolving_Bal shows a strong positive correlation with the Attrition_Flag of the customer and is very consistent, and the relationship can be used for predicting future churning customers.

In [None]:
# Attrition_Flag shares per Total_Trans_Amt value: whole dataset first, then each card type.
plotter(column = 'Total_Trans_Amt', Group = 'Attrition_Flag')

Total_Trans_Amt shows a strong positive correlation with the Attrition_Flag of the customer, with a peak between 5000 and 10000; it is very consistent, and the relationship can be used for predicting future churning customers.

In [None]:
# Attrition_Flag shares per Total_Trans_Ct value: whole dataset first, then each card type.
plotter(column = 'Total_Trans_Ct', Group = 'Attrition_Flag')

Total_Trans_Ct shows a strong positive correlation with the Attrition_Flag of the customer and is very consistent, and the relationship can be used for predicting future churning customers.

In [None]:
# Attrition_Flag shares per Total_Amt_Chng_Q4_Q1 value: whole dataset first, then each card type.
plotter(column = 'Total_Amt_Chng_Q4_Q1', Group = 'Attrition_Flag')

Total_Amt_Chng_Q4_Q1 shows a strong positive correlation with the Attrition_Flag of the customer and is very consistent, and the relationship can be used for predicting future churning customers.

In [None]:
# Attrition_Flag shares per Total_Ct_Chng_Q4_Q1 value: whole dataset first, then each card type.
plotter(column = 'Total_Ct_Chng_Q4_Q1', Group = 'Attrition_Flag')

Total_Ct_Chng_Q4_Q1 shows a strong positive correlation with the Attrition_Flag of the customer and is very consistent, and the relationship can be used for predicting future churning customers.

In [None]:
# Share of each Income_Category within every Credit_Limit value, as stacked bars.
plot = pd.crosstab(index = dataset['Credit_Limit'], columns = dataset['Income_Category'])
row_totals = plot.sum(axis = 1).astype(float)
plot.div(row_totals, axis = 0).plot(kind = "bar", stacked = True)
plt.show()

We can see the number of customers who hold a high-end card with a low credit limit. This could become a problem in the future.
Let's investigate it further.
* Here we can see that the customers with the lowest credit limit still make up a ratio of 0.2 of the whole for high-end cards.

In [None]:
# Share of each Attrition_Flag value within every Credit_Limit value, as stacked bars.
plot = pd.crosstab(index = dataset['Credit_Limit'], columns = dataset['Attrition_Flag'])
row_totals = plot.sum(axis = 1).astype(float)
plot.div(row_totals, axis = 0).plot(kind = "bar", stacked = True)
plt.show()

Here we can see that the above plot of the complete dataset approximates the 0.2 ratio of churned customers seen in the plots above.

In [None]:
# NOTE(review): `le` is created here but never referenced in the visible part of this file.
le = LabelEncoder()

def categorisation(data_col, ds1, ds2, ds3, ds4, ds5):
    """One-hot encode ``data_col`` in each of the five frames.

    For every frame, indicator columns named ``<data_col>_<value>`` are
    appended on the right and the original ``data_col`` column is removed.
    The frames passed in are left untouched; five new frames are returned as
    a tuple, in the same order as the arguments. Each frame is encoded on its
    own, so the set of indicator columns can differ between frames.
    """
    def _one_hot(frame):
        # Build the indicator columns, then swap them in for the source column.
        indicators = pd.get_dummies(frame[data_col], prefix = data_col)
        widened = pd.concat([frame, indicators], axis = 1)
        return widened.drop(columns = [data_col])

    return tuple(_one_hot(frame) for frame in (ds1, ds2, ds3, ds4, ds5))

# One-hot encode each of these columns, one after another, in all five frames.
for encoded_column in (
    'Credit_Limit',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Total_Revolving_Bal',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Ct_Chng_Q4_Q1',
):
    dataset, blue, silver, gold, platinum = categorisation(encoded_column, dataset, blue, silver, gold, platinum)

Label encoding the 'Y' column and getting the dummies from the rest of the columns.

In [None]:
def test_train(dataset):
    """Split ``dataset`` into train/test parts and oversample the training part.

    The columns listed in ``unused_columns`` are left out of the features and
    'Attrition_Flag' is used as the target. The split is reproducible
    (``random_state = 0``); the SMOTE step has no fixed seed, so the
    oversampled training data can differ between calls.

    Args:
        dataset: Frame containing 'Attrition_Flag', the columns listed in
            ``unused_columns`` and the feature columns. It is not modified.

    Returns:
        ``(x_train_upsampled, x_test, y_train_upsampled, y_test)``.
    """
    unused_columns = ['Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Avg_Open_To_Buy', 'Avg_Utilization_Ratio']

    # Select features and target once, before splitting, instead of dropping
    # columns in place on the frames returned by train_test_split.
    features = dataset.drop(columns = unused_columns + ['Attrition_Flag'])
    target = dataset['Attrition_Flag']

    # Splitting X and y together uses the same row shuffle for both.
    # NOTE(review): train_size=0.4 trains on 40% of the rows and tests on the
    # remaining 60% - confirm this is intended and not a swapped test_size.
    x_train, x_test, y_train, y_test = train_test_split(features, target, train_size=0.4, random_state = 0)

    # Only the training part is oversampled; the test part keeps its original
    # class balance.
    oversample = SMOTE(k_neighbors = 1)
    x_train_upsampled, y_train_upsampled = oversample.fit_resample(x_train, y_train)

    return x_train_upsampled, x_test, y_train_upsampled, y_test

def classifier_models(dataset):
    """Train a random forest on ``dataset`` and return its test accuracy in percent.

    The data is prepared by ``test_train``; the forest uses 50 trees and a
    fixed ``random_state`` of 0.
    """
    features_train, features_test, labels_train, labels_test = test_train(dataset)

    forest = RandomForestClassifier(random_state = 0, n_estimators = 50)
    forest.fit(features_train, labels_train)

    predicted_labels = forest.predict(features_test)
    return accuracy_score(labels_test, predicted_labels) * 100

def model_looper():
    """Print the mean accuracy over 10 training runs for each of the five frames.

    The frames are taken from module level in the order ``dataset``, ``blue``,
    ``silver``, ``gold``, ``platinum``; the number printed in each line is the
    frame's position in that order.
    """
    for position, frame in enumerate([dataset, blue, silver, gold, platinum]):
        run_accuracies = [classifier_models(frame) for _ in range(10)]
        print ("Accuracy of model in predicting churned vs unchurned in ", position, " is :-> ", round (mean(run_accuracies), 2), "%")
    return None

# Train and evaluate on the complete dataset and on each card-type subset.
model_looper()

* Accuracy of model in predicting churned vs unchurned in  0  is :->  92.06 %
* Accuracy of model in predicting churned vs unchurned in  1  is :->  91.42 %
* Accuracy of model in predicting churned vs unchurned in  2  is :->  82.09 %
* Accuracy of model in predicting churned vs unchurned in  3  is :->  83.67 %
* Accuracy of model in predicting churned vs unchurned in  4  is :->  100.0 %

* The **'0'** means the complete **dataset**
* The **'1'** means the complete **Blue Card Owners Only**
* The **'2'** means the complete **Silver Card Owners Only**
* The **'3'** means the complete **Gold Card Owners Only**
* The **'4'** means the complete **Platinum Card Owners Only**

# The conclusion from this plot comparison is to raise the minimum credit limit of high-end credit cards, to avoid customers getting high-end cards with low credit limits

# Another thing we learned is that the 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1' data are the most accurate in predicting churned and unchurned customers in these datasets.

# The churned customers are the ones with lower income and high end card or high expenditure.

# Currently hidden churning customers are those with a lower credit limit and a high-end card.