In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import plotly.express as px
from collections import Counter
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.ensemble import RandomForestClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the customer table, then discard the two columns whose names start with
# "Naive_Bayes_Classifier"; they are not used anywhere in this notebook.
naive_bayes_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
df = pd.read_csv("../input/credit-card-customers/BankChurners.csv")
df.drop(columns=naive_bayes_columns, inplace=True)
df.head()

In [None]:
# Number of customers per attrition status.
c=df.groupby(['Attrition_Flag'])["CLIENTNUM"].agg('count')
# groupby sorts its keys, so the order of c.values follows the sorted flag
# values. Take the slice names from the index so each count keeps its own
# label instead of relying on a hard-coded order.
fig=px.pie(values=c.values,names=c.index,title='% of Attrited Customers')
fig.show()

In [None]:
def percentage_analysis_of_categoricalindicator_and_attrition(indicator):
    """Show, for each level of a categorical column, the percentage of customers per attrition status.

    Parameters
    ----------
    indicator : str
        Name of a column of the notebook-level ``df`` to group by.

    Returns
    -------
    None
        The grouped bar chart is displayed with ``fig.show()``.
    """
    # Customers per (attrition status, level) and customers per level overall.
    per_status=df.groupby(['Attrition_Flag',indicator])['CLIENTNUM'].agg("count").reset_index()
    per_level=df.groupby(indicator)['CLIENTNUM'].agg("count").reset_index()
    # Both frames carry a CLIENTNUM column, so the merge suffixes them:
    # CLIENTNUM_x is the per-status count, CLIENTNUM_y the per-level total.
    shares=pd.merge(per_status,per_level,on=indicator)
    # Scale to 0-100 so the plotted values match the "percentage" axis label.
    shares['percentage attrition']=100*shares['CLIENTNUM_x']/shares['CLIENTNUM_y']
    fig=px.bar(shares,x=indicator,y='percentage attrition',color='Attrition_Flag',barmode='group')
    fig.show()

This function explores the relationship between a categorical indicator and the attrition of credit card holders.

In [None]:
# The helper displays the chart itself and returns None, so there is nothing to bind.
percentage_analysis_of_categoricalindicator_and_attrition("Card_Category")

Little relationship is seen between attrition and card category. Platinum card holders attrited slightly more than the other card holders.

In [None]:
# Attrition share broken down by gender.
percentage_analysis_of_categoricalindicator_and_attrition(indicator="Gender")

In [None]:
# Attrition share broken down by marital status.
percentage_analysis_of_categoricalindicator_and_attrition(indicator='Marital_Status')

In [None]:
# Attrition share broken down by income category.
percentage_analysis_of_categoricalindicator_and_attrition(indicator='Income_Category')

There is no relation between the income level and attrition.

In [None]:
# Attrition share broken down by education level.
percentage_analysis_of_categoricalindicator_and_attrition(indicator="Education_Level")

**No single categorical variable on its own shows much correlation with attrition.**

In [None]:
# Transaction count against total transaction amount, coloured by attrition status.
sns.scatterplot(
    x='Total_Trans_Amt',
    y='Total_Trans_Ct',
    hue='Attrition_Flag',
    data=df,
)

Here we can clearly see that customers who have spent large amounts through the credit card are much less likely to attrite. Customers who have made very few transactions and spent only a small amount are more likely to attrite.

In [None]:
# Q4-vs-Q1 change in transaction count against the change in amount, coloured by attrition status.
sns.scatterplot(
    x='Total_Amt_Chng_Q4_Q1',
    y='Total_Ct_Chng_Q4_Q1',
    hue='Attrition_Flag',
    data=df,
)

Customers who made fewer transactions, and spent less in those transactions, in Q4 than in Q1 are more likely to attrite.

In [None]:
# One violin panel per card category: distribution of the Q4-vs-Q1 change in
# transaction count for each attrition status.
sns.catplot(
    kind='violin',
    x='Attrition_Flag',
    y='Total_Ct_Chng_Q4_Q1',
    col='Card_Category',
    data=df,
)

In [None]:
# Grade customers by average utilization ratio in five equal-width buckets.
# The lowest edge sits just below 0 so that a ratio of exactly 0 falls into
# the first bucket (pd.cut intervals are open on the left by default).
label = ['A+', 'A', 'B', 'C', 'D']
utilization_edges = [-0.001, 0.2, 0.4, 0.6, 0.8, 1]
df['Customer index'] = pd.cut(
    df['Avg_Utilization_Ratio'],
    bins=utilization_edges,
    labels=label,
    precision=2,
)

Dividing the customers by utilization ratio helps us find the more 'honest' customers, i.e. those who carry a lower revolving balance.

In [None]:
# One panel per customer-index grade: credit limit for each attrition status.
sns.catplot(
    x='Attrition_Flag',
    y='Credit_Limit',
    col='Customer index',
    data=df,
)

Credit limit and average utilization ratio are strongly correlated, as most banks use the utilization ratio to determine the customer index. A low utilization ratio means that the customer repays the amount borrowed, so the lower the utilization ratio, the better. The figure above also shows that as the customer index decreases, the credit limit goes down.

In [None]:
# Remove the columns that are not used as model inputs.
unused_columns = [
    'Customer_Age', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category',
    'Card_Category', 'Dependent_count', "Months_on_book", 'Credit_Limit', 'Avg_Open_To_Buy',
    'Contacts_Count_12_mon', 'Customer index', 'CLIENTNUM', 'Total_Revolving_Bal',
    'Months_Inactive_12_mon', 'Total_Ct_Chng_Q4_Q1',
]
# errors='ignore' lets this cell be re-run after the columns are already gone
# instead of failing with a KeyError.
df.drop(columns=unused_columns, inplace=True, errors='ignore')

# Encode the target as integers: 0 = existing customer, 1 = attrited customer.
# The guard keeps the cell re-runnable once the column is already numeric.
# map() followed by an explicit astype(int) guarantees an integer column rather
# than relying on replace() to downcast an object column, and it raises if an
# unexpected label is present (unmapped values become NaN, which cannot be cast).
if not pd.api.types.is_numeric_dtype(df['Attrition_Flag']):
    df['Attrition_Flag'] = (
        df['Attrition_Flag']
        .map({'Existing Customer': 0, 'Attrited Customer': 1})
        .astype(int)
    )

From the scatter plots and bar plots it can be seen that attrition does not depend on any of the categorical variables. Also, most numeric variables, such as Credit Limit and Revolving Balance, are already accounted for in the Avg Utilization Ratio. The customer's overall relationship with the bank also plays a vital role: if the customer also has a debit card, an account, etc. at the same bank, they are less likely to change credit cards.

In [None]:
# Positional split: the first 8000 rows train the model and the remaining rows
# are held out for testing (intended as roughly an 80/20 split).
# .iloc slices exclude the end position, so no row lands in both sets; the
# label-based .loc[:8000] includes label 8000, which .loc[8000:] also returns.
train=df.iloc[:8000]
test=df.iloc[8000:]

In [None]:
# Separate the model inputs from the Attrition_Flag target for both partitions.
X_train, y_train = train.drop(columns='Attrition_Flag'), train['Attrition_Flag']
X_test, y_test = test.drop(columns='Attrition_Flag'), test['Attrition_Flag']

In [None]:
def predict(df,model,labels,amount_cutoff=12500,threshold=0.30):
    """Predict attrition (1) or retention (0) for every row and print the scores.

    A row whose ``Total_Trans_Amt`` is at least ``amount_cutoff`` is predicted
    as 0 without consulting the model. Every other row is predicted as 1 when
    the model's probability in column 1 of ``predict_proba`` exceeds
    ``threshold``, and as 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature rows; must contain a ``Total_Trans_Amt`` column.
    model : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is taken as the probability of attrition.
    labels : array-like
        True 0/1 labels in the same row order as ``df``.
    amount_cutoff : number, default 12500
        Spending level at or above which a customer is assumed to stay.
    threshold : float, default 0.30
        Probability above which a customer is flagged as attrited.

    Returns
    -------
    pandas.Series
        0/1 predictions in row order, carrying a fresh 0..n-1 index
        (not ``df``'s index).
    """
    preds=np.zeros(len(df),dtype=np.int64)
    # Rows below the spending cutoff are the only ones the model decides on.
    needs_model=~(df['Total_Trans_Amt']>=amount_cutoff).to_numpy()
    if needs_model.any():
        # One batched call on the DataFrame slice: far fewer predict_proba
        # calls than scoring row by row, and the column names are kept.
        attrition_proba=model.predict_proba(df[needs_model])[:,1]
        preds[needs_model]=attrition_proba>threshold
    print(len(preds))
    preds=pd.Series(preds)
    # The sklearn scores are fractions in [0, 1]; scale them so that the
    # printed '%' sign is accurate.
    print("Accuracy = ",round(100*accuracy_score(labels,preds),2),'%')
    print("Precision = ",round(100*precision_score(labels,preds,pos_label=1),2),'%')
    print("Recall = ",round(100*recall_score(labels,preds,pos_label=1),2),"%")
    return preds
    

In the scatter plot it can be clearly seen that accounts with a higher number of transactions and a larger amount spent on the card are unlikely to change credit cards.

In [None]:
# Shallow random forest (100 trees, depth 4). random_state is fixed so that a
# Restart & Run All reproduces the same model and therefore the same scores.
forest=RandomForestClassifier(max_depth=4,n_estimators=100,random_state=42)
forest.fit(X_train,y_train)

In [None]:
# Score the model on the rows it was trained on.
train_preds = predict(df=X_train, model=forest, labels=y_train)

In [None]:
# Score the model on the held-out rows.
preds = predict(df=X_test, model=forest, labels=y_test)

In [None]:
# Columns of the training slice (the Attrition_Flag target is still part of `train`).
train.columns