In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading dataframe

In [None]:
# Read the raw CSV, remember its full column index, and keep only the
# first N_COLUMNS_KEPT columns for the rest of the analysis.
DATA_PATH = '/kaggle/input/credit-card-customers/BankChurners.csv'
N_COLUMNS_KEPT = 21

raw_df = pd.read_csv(DATA_PATH)
column_list = raw_df.columns
df = raw_df.loc[:, column_list[:N_COLUMNS_KEPT]]
df.head()

# Basic data analysis

**Plotting the target variable**

In [None]:
# Target class balance: pie chart of class shares on top, bar chart of raw
# counts below.
flag = df['Attrition_Flag'].unique()
# Count each class once with value_counts instead of filtering the frame per
# label; reindex keeps the counts aligned with the order of `flag`.
flag_count = df['Attrition_Flag'].value_counts().reindex(flag, fill_value=0).tolist()

fig, ax = plt.subplots(2, figsize=(10, 10))
ax[0].pie(flag_count, labels=flag, autopct='%1.1f%%', shadow=True)
ax[0].set_title('Current Situation')
# Pass the column by keyword: recent seaborn releases no longer accept the
# data vector as the first positional argument.
sns.countplot(x='Attrition_Flag', data=df, ax=ax[1])
plt.show()

This is a highly unbalanced dataset.

Variation of age groups in dataset.

In [None]:
# Age distribution: exact per-age counts on top, 5-year binned density with a
# KDE overlay below.
age_bin = list(range(20, 100, 5))
fig, ax = plt.subplots(2, figsize=(10, 10))
# Keyword `x=` is required by recent seaborn releases (positional data is no
# longer interpreted as x).
sns.countplot(x='Customer_Age', data=df, ax=ax[0])
ax[0].set_title('Actual count of age groups')
# sns.distplot is deprecated; histplot with stat='density' and kde=True is the
# documented replacement and draws the same histogram + density curve.
sns.histplot(data=df, x='Customer_Age', bins=age_bin, stat='density', kde=True, ax=ax[1])
ax[1].set_title('Variation of age group')
plt.show()


Age looks to be uniformly distributed

1- Gender plot.
2- Dependent Distribution
3- Education Level Plot
4- Income Category Plot
5- Card Category

In [None]:
# Five stacked panels: count plots for four demographic columns, then a pie
# chart with the share of each card category.
fig, ax = plt.subplots(5, figsize=(20, 20))

# (column, panel title) for the four count plots, in panel order.
count_panels = [
    ('Gender', 'Gender count plot'),
    ('Dependent_count', 'Dependent count distribution'),
    ('Education_Level', 'Education_Level level'),
    ('Income_Category', 'Income_Category plot'),
]
for axis, (column, title) in zip(ax[:4], count_panels):
    # Keyword `x=` is required by recent seaborn releases.
    sns.countplot(x=column, data=df, ax=axis)
    axis.set_title(title)

# card_list keeps first-appearance order; the counts are reindexed to match it
# so the pie labels and wedge sizes stay aligned.
card_list = df['Card_Category'].unique()
card_count = df['Card_Category'].value_counts().reindex(card_list, fill_value=0).tolist()
ax[4].pie(card_count, labels=card_list, autopct='%1.1f%%', shadow=True)
ax[4].set_title('Percentge of various cards')
plt.show()


**Let's visualize trend among people churned**

In [None]:
# One scatter panel per card category: months on book vs. credit limit, with
# existing and attrited customers drawn in different colours/markers.
# Attrition_Flag values used here: ['Existing Customer', 'Attrited Customer']
# `card_list` is the array of unique Card_Category values built in the
# demographic-plots cell.

# Size the grid from card_list instead of hard-coding 4 panels; squeeze=False
# plus ravel() keeps `ax` indexable even when there is a single category.
fig, axes_grid = plt.subplots(len(card_list), figsize=(30, 30), squeeze=False)
ax = axes_grid.ravel()

df_EC = df[df['Attrition_Flag'] == 'Existing Customer']
df_AC = df[df['Attrition_Flag'] == 'Attrited Customer']
for i, card in enumerate(card_list):
    filt_EC = (df_EC['Card_Category'] == card)
    filt_AC = (df_AC['Card_Category'] == card)
    ax[i].scatter(df_EC.loc[filt_EC, 'Months_on_book'], df_EC.loc[filt_EC, 'Credit_Limit'],
                  color='r', marker='.', label='Existing Customer')
    ax[i].scatter(df_AC.loc[filt_AC, 'Months_on_book'], df_AC.loc[filt_AC, 'Credit_Limit'],
                  color='g', marker='x', label='Churned Customer')
    ax[i].set_title('for {} card'.format(card))
    ax[i].set_xlabel('Duration in months with cutomer')
    ax[i].set_ylabel('Credit Limit')
    # A single plt.legend() after the loop only annotates the last panel;
    # attach the legend to every panel instead.
    ax[i].legend()
plt.show()

**As we know, the majority of customers hold the Blue card. Looking at the plot for the Blue card, it seems that people with a medium to low credit limit are the most likely to churn, at least for the Blue card; there is not much of a noticeable pattern for the other products.**

**Let's see if contacting customer from the bank has any pattern with churning**

In [None]:
# Number of customers per contact count in the last 12 months, split by
# attrition flag. Columns are passed by keyword because recent seaborn
# releases no longer accept the data vector positionally.
sns.countplot(x='Contacts_Count_12_mon', hue='Attrition_Flag', data=df)
plt.show()

**Customers contacted around 2-4 times churned the most; maybe more follow-ups were required.**

# Feature engineering.

Dropping the 'CLIENTNUM' column and checking for null values.

In [None]:
# Drop the CLIENTNUM column, then show the null count per column.
# errors='ignore' makes the cell safe to re-run: the in-place version raises
# KeyError on a second execution because the column is already gone.
df = df.drop(columns=['CLIENTNUM'], errors='ignore')
df.isnull().sum()

**Now we need to one-hot encode our categorical features, except 'Income_Category': this is an ordinal feature, so we need to map it to ordinal values.**

'Income_Category' column has some unknown values, we need to figure out a strategy to fill in suitable value there.


Let's see if any column correlates with income category, so that we can use it to replace the unknown values.
We replace each income range with a representative value (roughly its midpoint) and draw a heatmap. From the heatmap we can see that income category is highly negatively correlated with Avg_Utilization_Ratio, so we'll create a linear regression model to predict the unknown values.

In [None]:
# Categorical columns in df:
# ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status','Income_Category', 'Card_Category']

# Replace each income bracket with a representative value in thousands of
# dollars; 'Unknown' becomes the sentinel 0, which later cells use to find the
# rows to impute.
income_map = {'Less than $40K':20,
              '$40K - $60K':50,
              '$60K - $80K':70,
              '$80K - $120K':100,
              '$120K +':140,
              'Unknown':0}
# Only map while the column still holds the bracket labels: applying .map() to
# an already-numeric column would turn every value into NaN on a re-run.
if not pd.api.types.is_numeric_dtype(df['Income_Category']):
    df['Income_Category'] = df['Income_Category'].map(income_map)

# Correlation heatmap over the numeric columns, using only the rows whose
# income is known (non-zero).
filt = (df['Income_Category'] != 0)
num_cols = df.select_dtypes(include=['int32','int64','float64']).columns
df_num = df.loc[filt,num_cols]
corr = df_num.corr()
sns.heatmap(corr)

**Building linear regression model to predict unknown values in income category**

In [None]:
from sklearn.linear_model import LinearRegression

# Numeric predictors used to estimate the income value. The same list drives
# both fitting and prediction so the two can never disagree on column order.
num_feat = ['Customer_Age', 'Dependent_count', 'Months_on_book','Total_Relationship_Count', 'Months_Inactive_12_mon','Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal','Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt','Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

# Fit on the rows whose income is known (df_num excludes the 0 sentinel).
y = df_num['Income_Category']
X = df_num[num_feat]
lr = LinearRegression()
lr.fit(X, y)

# Rows still carrying the 'Unknown' sentinel.
filt2 = (df['Income_Category'] == 0)
if filt2.any():  # nothing to impute on a re-run; predict() rejects empty input
    # Predictions are floats; cast first so they are not written into an
    # integer column.
    df['Income_Category'] = df['Income_Category'].astype('float64')
    # One vectorised predict over all unknown rows replaces the per-row loop
    # (which also assigned a 1-element array into a scalar cell and predicted
    # from an object-dtype array without feature names).
    df.loc[filt2, 'Income_Category'] = lr.predict(df.loc[filt2, num_feat])

Finally we have populated the Unknown values in 'Income_Category' column with the help of linear regression.

**Encoding the Categorical Variables**

**We need to encode our target column 'Attrition_Flag' as well into numeric (0, 1)**

In [None]:
# Encode the target as 0 (existing customer) / 1 (attrited customer).
# Guarded so that re-running the cell does not map the already-numeric column
# to NaN.
if not pd.api.types.is_numeric_dtype(df['Attrition_Flag']):
    df['Attrition_Flag'] = df['Attrition_Flag'].map({'Existing Customer':0,'Attrited Customer':1})

y = df['Attrition_Flag']
X = df.drop(columns='Attrition_Flag')

# One-hot encode the remaining object-dtype columns (first level dropped) and
# append the dummy columns after the numeric ones.
cat_cols = X.select_dtypes(include=['object']).columns
X_cat = pd.get_dummies(X[cat_cols], drop_first=True)
X = pd.concat([X.drop(columns=cat_cols), X_cat], axis=1)
X.head()

# Balancing our dataset
**As we know, there is an imbalance of classes in our dataset; we need to balance it before moving on to build our model.**

In [None]:
# Number of rows per target class in y (0 = existing customer, 1 = attrited
# customer, per the Attrition_Flag mapping in the encoding cell).
y.value_counts()

Above result shows the high degree of imbalance, next we balance it

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE and plot the class counts before
# (top) and after (bottom) balancing.
fig, ax = plt.subplots(2, figsize=(15, 15))
print('class counts before balancing dataset')
print(y.value_counts())
# Keyword `x=` is required by recent seaborn releases.
sns.countplot(x=y, ax=ax[0])
ax[0].set_title('Before balancing')

# random_state makes the synthetic samples reproducible across runs.
# NOTE(review): X contains one-hot columns from get_dummies; plain SMOTE
# interpolates them to fractional values. Consider SMOTENC if that matters.
smote = SMOTE(sampling_strategy='minority', random_state=42)
# fit_sample was removed from imbalanced-learn; fit_resample is the supported
# name for the same operation.
X_bal, y_bal = smote.fit_resample(X, y)
print('class count after balancing the dataset')
print(y_bal.value_counts())
sns.countplot(x=y_bal, ax=ax[1])
ax[1].set_title('After balancing')
plt.show()


# Machine learning

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Cross-validate five classifiers and collect the mean accuracy, precision,
# recall and F1 of each into `performance_metrics` (one row per model).

RANDOM_STATE = 42  # fixed seed so SMOTE and the random forest are reproducible

scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1'
           }

# (row label, estimator) pairs, in the order they appear in the result table.
models = [
    ('KNN', KNeighborsClassifier(n_neighbors = 9)),
    ('LogisticRegression', LogisticRegression()),
    ('SVM', SVC()),
    ('Naive bayes', GaussianNB()),
    ('Randon Forest', RandomForestClassifier(n_estimators = 100, random_state=RANDOM_STATE)),
]

metric_rows = []
for model_name, clf in models:
    # Oversample inside each CV fold: the imblearn Pipeline applies SMOTE to
    # the training split only. Cross-validating on the pre-balanced
    # X_bal/y_bal lets synthetic samples derived from validation rows leak
    # into training and inflates every metric.
    pipeline = Pipeline([
        ('smote', SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)),
        ('clf', clf),
    ])
    scores_dict = cross_validate(pipeline, X, y, scoring=scoring, n_jobs=-1)

    row = {'Model': model_name}
    for metric in scoring:
        key = 'test_{}'.format(metric)
        row[key] = scores_dict[key].mean()
    metric_rows.append(row)

# Build the table once from the collected rows; DataFrame.append was removed
# in pandas 2.0 and growing a frame row by row is quadratic anyway.
performance_metrics = pd.DataFrame(
    metric_rows,
    columns=['Model','test_accuracy','test_precision','test_recall','test_f1'],
)
performance_metrics
