In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Happy Customer Bank is a mid-sized private bank that deals in all kinds of banking products, like Savings accounts, Current 
accounts, investment products, credit products, among other offerings.

The bank also cross-sells products to its existing customers and to do so they use different kinds of communication 
like tele-calling, e-mails, recommendations on net banking, mobile banking, etc.

In this case, the Happy Customer Bank wants to cross sell its credit cards to its existing customers. The bank has 
identified a set of customers that are eligible for taking these credit cards.

Now, the bank is looking for your help in identifying customers that could show higher intent towards a recommended 
credit card, given:

- Customer details (gender, age, region etc.)
- Details of his/her relationship with the bank (Channel_Code, Vintage, Avg_Account_Balance, etc.)

<b> Data Dictionary </b>

- ID - Unique Identifier for a row

- Gender - Gender of the Customer

- Age - Age of the Customer (in Years)

- Region_Code - Code of the Region for the customers

- Occupation - Occupation Type for the customer

- Channel_Code - Acquisition Channel Code for the Customer (Encoded)

- Vintage - Vintage for the Customer (In Months)

- Credit_Product - If the Customer has any active credit product (Home loan, Personal loan, Credit Card etc.)

- Avg_Account_Balance - Average Account Balance for the Customer in last 12 Months

- Is_Active - If the Customer is Active in last 3 Months

- Is_Lead (Target) - If the Customer is interested in the Credit Card

0 : Customer is not interested

1 : Customer is interested

#  Table of Contents


## 1. Libraries and the Dataset

*        1.1   Importing the Libraries 
*        1.2   Importing the Dataset


## 2. Data Exploration

### 2.1 Visualising Categorical Data

*       2.1.1   Lead Counts
*       2.1.2   Lead Counts and Percentages for Different Genders
*       2.1.3   Lead Counts and Percentages for Different Regions
*       2.1.4   Lead Counts and Percentages for Different Occupations
*       2.1.5   Lead Counts and Percentages for Different Channel Codes
*       2.1.6   Lead Counts and Percentages Based on Credit Product Status
*       2.1.7   Lead Counts and Percentages Based on Active Status

### 2.2 Visualising Numerical Data

*       2.2.1   Distribution of Variables
*       2.2.2   Distribution of Variables Based on Lead Counts
  

### 2.3 Correlation Analysis

*       2.3.1   Correlation Among Numerical Variables


## 3. Data Preparation

*       3.1 Feature Engineering


## 4. Training and Tuning Classification Models

*       4.1   Logistic Regression
*       4.2   Random Forest Classifier
*       4.3   XGBoost Classifier   

# 1. Libraries and the Dataset

## 1.1 Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import f_oneway
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, f_regression, RFE
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

import cufflinks as cf
import plotly
from plotly import tools
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.offline import iplot
from plotly.subplots import make_subplots
plotly.offline.init_notebook_mode(connected=True)

# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

import warnings
# NOTE(review): this silences *all* warnings for the rest of the notebook, including
# library deprecation and convergence warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

## 1.2 Importing the Dataset

In [None]:
# Location of the training CSV inside the Kaggle input directory.
TRAIN_CSV_PATH = "/kaggle/input/jobathon-may-2021-credit-card-lead-prediction/train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# (number of rows, number of columns) of the training frame.
train_data.shape

In [None]:
# Peek at the first ten rows.
train_data.head(10)

In [None]:
# Peek at the last rows.
train_data.tail()

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
train_data.info()

Only Credit_Product column has some Null values, which we will deal with later

# 2. Data Exploration

## 2.1 Visualising Categorical Data

### 2.1.1 Lead Counts

In [None]:
# Bar plot of the target distribution (non-leads vs leads).
# The counts are computed once and explicitly ordered as [0, 1], so the hard-coded
# "0"/"1" labels and hover texts always line up with the right bar; value_counts()
# alone orders by frequency, which only matches by coincidence.
lead_counts = train_data['Is_Lead'].value_counts().reindex([0, 1], fill_value=0)
lead_share = lead_counts / lead_counts.sum() * 100
non_lead_per = lead_share.loc[0]
lead_per = lead_share.loc[1]

# The `color` values are only used as category labels to give each bar its own colour.
fig = px.bar(x=["0", "1"], y=lead_counts.tolist(), color=["red", "goldenrod"],
             hover_name=['Percentage: {}%'.format(round(non_lead_per, 3)),
                         'Percentage: {}%'.format(round(lead_per, 3))])

# Narrow the bars. Assigned back to `fig` so the returned figure is not rendered a
# second time by the 'all' interactivity setting.
fig = fig.update_traces(width=0.4)

# Updating plot details
fig.update_layout(
    title_text="Counts for Leads and Non-Leads",
    title_x=0.5,
    width=800,
    height=550,
    xaxis_title="Non Leads vs Leads",
    yaxis_title="Count",
    showlegend=False
).show()

As we can see from the plot, the data is quite imbalanced. Only about 24% of the customers in the dataset were interested in buying a credit card.

### 2.1.2 Lead Counts and Percentages for Different Genders

In [None]:
# Grouped bar chart of lead / non-lead counts per gender.
# value_counts() orders by frequency, so each series is re-ordered to the fixed label
# order below; otherwise a bar could silently end up under the wrong gender label.
gender_labels = ['Male', 'Female']
gender_counts_lead = (
    train_data[train_data['Is_Lead'] == 1]['Gender'].value_counts().reindex(gender_labels, fill_value=0)
)
gender_counts_non_lead = (
    train_data[train_data['Is_Lead'] == 0]['Gender'].value_counts().reindex(gender_labels, fill_value=0)
)

fig = go.Figure(data = [
    go.Bar(name = 'Lead', x = gender_labels,
           y = gender_counts_lead.tolist(),
           marker_line=dict(color='black'),marker_line_width=1.2),
    go.Bar(name = 'Non Lead', x = gender_labels,
           y = gender_counts_non_lead.tolist(),
           marker_line=dict(color='black'),marker_line_width=1.2),
])

fig.update_layout(title = 'Lead Counts for Males and Females')

In [None]:
# Lead rate (in %) per gender from one grouped pass. Is_Lead is 0/1, so its mean is
# the share of leads; unlike value_counts(normalize=True)[1], this does not raise
# when a group contains no leads.
gender_lead_rate = train_data.groupby('Gender')['Is_Lead'].mean().mul(100).round(3)
print('While {}% of the males bought a credit card, only {}% of the females bought a credit card when offered.'.format(
    gender_lead_rate['Male'], gender_lead_rate['Female']))

### 2.1.3 Lead Counts and Percentages for Different Regions

In [None]:
# Lead / non-lead counts per region, built from a single cross-tabulation instead of
# re-filtering the frame twice per region. Reindexing keeps the regions in order of
# first appearance (as before) and fills 0 where a region lacks one of the classes,
# where the previous value_counts()[...] lookup would raise KeyError.
region_order = train_data['Region_Code'].unique()
region_counts = (
    pd.crosstab(train_data['Region_Code'], train_data['Is_Lead'])
    .reindex(index=region_order, columns=[0, 1], fill_value=0)
)
region_lead_count = region_counts[1].to_dict()
region_non_lead_count = region_counts[0].to_dict()

In [None]:
# Sanity check: both dictionaries list the regions in the same order, so the two bar
# traces plotted from them are aligned.
list(region_lead_count.keys()) == list(region_non_lead_count.keys()) 

In [None]:
# Grouped bar chart: one trace per outcome, one bar per region.
region_bar_outline = dict(marker_line=dict(color='black'), marker_line_width=1.2)
fig = go.Figure(data=[
    go.Bar(name=outcome, x=list(counts.keys()), y=list(counts.values()), **region_bar_outline)
    for outcome, counts in (('Lead', region_lead_count), ('Non Lead', region_non_lead_count))
])

fig.update_layout(title = 'Lead Counts for Different Regions')

In [None]:
# Lead percentage per region, in order of first appearance (sort=False).
# Is_Lead is 0/1, so the group mean is the lead share. This is a single grouped pass
# and yields 0 for a region without leads, where value_counts(normalize=True)[1]
# would raise KeyError.
region_lead_rate = (
    train_data.groupby('Region_Code', sort=False)['Is_Lead'].mean().mul(100).round(3)
)
region_name = region_lead_rate.index.tolist()
region_lead_per = region_lead_rate.tolist()

In [None]:
# One bar per region; bar height is the region's lead percentage.
fig = px.bar(x=region_name, y=region_lead_per, color=region_name)

# Plot details
region_pct_layout = {
    'title_text': "Lead Percentages for Each Region",
    'title_x': 0.5,
    'width': 800,
    'height': 550,
    'xaxis_title': "Region",
    'yaxis_title': "Lead Percentage",
    'showlegend': False,
}
fig.update_layout(**region_pct_layout).show()

As data shows, lead percentages for regions vary from 13% to 30%. I will classify the regions based on these percentages in the data preparation phase

### 2.1.4 Lead Counts and Percentages for Different Occupations

In [None]:
# Lead / non-lead counts per occupation from a single cross-tabulation instead of
# re-filtering the frame twice per occupation. Reindexing keeps first-appearance
# order and fills 0 where an occupation lacks one of the classes, where the previous
# value_counts()[...] lookup would raise KeyError.
occ_order = train_data['Occupation'].unique()
occ_counts = (
    pd.crosstab(train_data['Occupation'], train_data['Is_Lead'])
    .reindex(index=occ_order, columns=[0, 1], fill_value=0)
)
occ_lead_count = occ_counts[1].to_dict()
occ_non_lead_count = occ_counts[0].to_dict()

In [None]:
# Sanity check: both dictionaries list the occupations in the same order, so the two
# bar traces plotted from them are aligned.
list(occ_lead_count.keys()) == list(occ_non_lead_count.keys()) 

In [None]:
# Grouped bar chart: one trace per outcome, one bar per occupation.
occ_bar_outline = dict(marker_line=dict(color='black'), marker_line_width=1.2)
fig = go.Figure(data=[
    go.Bar(name=outcome, x=list(counts.keys()), y=list(counts.values()), **occ_bar_outline)
    for outcome, counts in (('Lead', occ_lead_count), ('Non Lead', occ_non_lead_count))
])

fig.update_layout(title = 'Lead Counts for Different Occupations')

In [None]:
# Lead percentage per occupation, in order of first appearance (sort=False).
# Is_Lead is 0/1, so the group mean is the lead share. This is a single grouped pass
# and yields 0 for an occupation without leads, where
# value_counts(normalize=True)[1] would raise KeyError.
occ_lead_rate = (
    train_data.groupby('Occupation', sort=False)['Is_Lead'].mean().mul(100).round(3)
)
occ_name = occ_lead_rate.index.tolist()
occ_lead_per = occ_lead_rate.tolist()

In [None]:
# One bar per occupation; bar height is the occupation's lead percentage.
fig = px.bar(x=occ_name, y=occ_lead_per, color=occ_name)

# Plot details
occ_pct_layout = {
    'title_text': "Lead Percentages for Each Occupation",
    'title_x': 0.5,
    'width': 800,
    'height': 550,
    'xaxis_title': "Occupation",
    'yaxis_title': "Lead Percentage",
    'showlegend': False,
}
fig.update_layout(**occ_pct_layout).show()

As the data shows, while entrepreneurs have an outstanding lead percentage of 66%, the lead percentage for the remaining occupations varies between 15% and 27%.

### 2.1.5 Lead Counts and Percentages for Different Channel Codes

In [None]:
# Number of customers per acquisition channel.
train_data['Channel_Code'].value_counts()

In [None]:
# Lead / non-lead counts per channel code from a single cross-tabulation instead of
# re-filtering the frame twice per channel. Reindexing keeps first-appearance order
# and fills 0 where a channel lacks one of the classes, where the previous
# value_counts()[...] lookup would raise KeyError.
cc_order = train_data['Channel_Code'].unique()
cc_counts = (
    pd.crosstab(train_data['Channel_Code'], train_data['Is_Lead'])
    .reindex(index=cc_order, columns=[0, 1], fill_value=0)
)
cc_lead_count = cc_counts[1].to_dict()
cc_non_lead_count = cc_counts[0].to_dict()

In [None]:
# Sanity check: both dictionaries list the channel codes in the same order, so the
# two bar traces plotted from them are aligned.
list(cc_lead_count.keys()) == list(cc_non_lead_count.keys())

In [None]:
# Grouped bar chart: one trace per outcome, one bar per channel code.
cc_bar_outline = dict(marker_line=dict(color='black'), marker_line_width=1.2)
fig = go.Figure(data=[
    go.Bar(name=outcome, x=list(counts.keys()), y=list(counts.values()), **cc_bar_outline)
    for outcome, counts in (('Lead', cc_lead_count), ('Non Lead', cc_non_lead_count))
])

fig.update_layout(title = 'Lead Counts for Different Channel Codes')

In [None]:
# Lead percentage per channel code, in order of first appearance (sort=False).
# Is_Lead is 0/1, so the group mean is the lead share. This is a single grouped pass
# and yields 0 for a channel without leads, where value_counts(normalize=True)[1]
# would raise KeyError.
cc_lead_rate = (
    train_data.groupby('Channel_Code', sort=False)['Is_Lead'].mean().mul(100).round(3)
)
cc_name = cc_lead_rate.index.tolist()
cc_lead_per = cc_lead_rate.tolist()

In [None]:
# One bar per channel code; bar height is the channel's lead percentage.
fig = px.bar(x=cc_name, y=cc_lead_per, color=cc_name)

# Plot details
cc_pct_layout = {
    'title_text': "Lead Percentages for Each Channel Code",
    'title_x': 0.5,
    'width': 800,
    'height': 550,
    'xaxis_title': "Channel Code",
    'yaxis_title': "Lead Percentage",
    'showlegend': False,
}
fig.update_layout(**cc_pct_layout).show()

While the X1 channel has a noticeably low lead percentage of 9%, the lead percentage for the other channels varies between 24% and 36%.

### 2.1.6 Lead Counts and Percentages Based on Credit Products

In [None]:
# Distinct values of Credit_Product, including the missing-value marker if present.
list(train_data['Credit_Product'].unique())

In [None]:
# Replace the NaN values with 'Na' (not available) so that they can be included in
# the analysis as a category of their own.
# The result is assigned back to the column. fillna(inplace=True) on a column
# selection is chained assignment: it emits a FutureWarning in pandas 2.x and leaves
# train_data unchanged once Copy-on-Write is enabled.
train_data['Credit_Product'] = train_data['Credit_Product'].fillna('Na')

In [None]:
# Lead / non-lead counts per credit-product status from a single cross-tabulation
# instead of re-filtering the frame twice per status. Reindexing keeps
# first-appearance order and fills 0 where a status lacks one of the classes, where
# the previous value_counts()[...] lookup would raise KeyError.
cp_order = train_data['Credit_Product'].unique()
cp_counts = (
    pd.crosstab(train_data['Credit_Product'], train_data['Is_Lead'])
    .reindex(index=cp_order, columns=[0, 1], fill_value=0)
)
cp_lead_count = cp_counts[1].to_dict()
cp_non_lead_count = cp_counts[0].to_dict()

In [None]:
# Sanity check: both dictionaries list the credit-product statuses in the same order,
# so the two bar traces plotted from them are aligned.
list(cp_lead_count.keys()) == list(cp_non_lead_count.keys()) 

In [None]:
# Grouped bar chart: one trace per outcome, one bar per credit-product status.
cp_bar_outline = dict(marker_line=dict(color='black'), marker_line_width=1.2)
fig = go.Figure(data=[
    go.Bar(name=outcome, x=list(counts.keys()), y=list(counts.values()), **cp_bar_outline)
    for outcome, counts in (('Lead', cp_lead_count), ('Non Lead', cp_non_lead_count))
])

fig.update_layout(title = 'Lead Counts Based on Credit Products')

In [None]:
# Lead percentage per credit-product status, in order of first appearance
# (sort=False). Is_Lead is 0/1, so the group mean is the lead share. This is a single
# grouped pass and yields 0 for a status without leads, where
# value_counts(normalize=True)[1] would raise KeyError.
cp_lead_rate = (
    train_data.groupby('Credit_Product', sort=False)['Is_Lead'].mean().mul(100).round(3)
)
cp_name = cp_lead_rate.index.tolist()
cp_lead_per = cp_lead_rate.tolist()

In [None]:
# One bar per credit-product status; bar height is that group's lead percentage.
fig = px.bar(x=cp_name, y=cp_lead_per, color=cp_name)

# Plot details
cp_pct_layout = {
    'title_text': "Lead Percentages Based on Credit Product",
    'title_x': 0.5,
    'width': 800,
    'height': 550,
    'xaxis_title': "Credit Product",
    'yaxis_title': "Lead Percentage",
    'showlegend': False,
}
fig.update_layout(**cp_pct_layout).show()

In [None]:
# Number of customers whose credit-product status was missing (filled with 'Na').
len(train_data[train_data['Credit_Product'] == 'Na'])

Surprisingly, the 29325 people whose credit product status is not known had an outstanding 85% lead percentage.

### 2.1.7 Lead Counts and Percentages Based on Active Status

In [None]:
# Active / non-active customer counts among leads (ordered by frequency).
train_data[train_data['Is_Lead']==1]['Is_Active'].value_counts()

In [None]:
# Active / non-active customer counts among non-leads (ordered by frequency).
train_data[train_data['Is_Lead']==0]['Is_Active'].value_counts()

In [None]:
# Grouped bar chart of lead / non-lead counts by activity status.
# value_counts() orders by frequency, so each series is re-ordered to the fixed label
# order below; otherwise a bar could silently end up under the wrong 'No'/'Yes' label.
active_labels = ['No', 'Yes']
active_counts_lead = (
    train_data[train_data['Is_Lead']==1]['Is_Active'].value_counts().reindex(active_labels, fill_value=0)
)
active_counts_non_lead = (
    train_data[train_data['Is_Lead']==0]['Is_Active'].value_counts().reindex(active_labels, fill_value=0)
)

fig = go.Figure(data = [
    go.Bar(name = 'Lead', x = active_labels,
           y = active_counts_lead.tolist(),
           marker_line=dict(color='black'),marker_line_width=1.2),
    go.Bar(name = 'Non Lead', x = active_labels,
           y = active_counts_non_lead.tolist(),
           marker_line=dict(color='black'),marker_line_width=1.2),
])

fig.update_layout(title = 'Lead Counts for Active and Not Active Customers')

In [None]:
# Share of leads (in %) among active and among non-active customers.
activity_flag = train_data['Is_Active']
active_lead_share = train_data[activity_flag == 'Yes']['Is_Lead'].value_counts(normalize = True)*100
inactive_lead_share = train_data[activity_flag == 'No']['Is_Lead'].value_counts(normalize = True)*100

active_message = 'While {}% of the active customers bought a credit card, only {}% of the non active customers bought a credit card   when offered.'
print(active_message.format(round(active_lead_share[1],3), round(inactive_lead_share[1],3)))

As data shows, active customers are more likely to buy a credit card

## 2.2 Visualising Numerical Data

### 2.2.1 Distribution of Variables

In [None]:
# Histograms of the three numerical variables on a 2x2 grid (bottom-right left empty).
# Shared thin black outline for the histogram bars.
hist_outline = dict(marker_line=dict(color='black'), marker_line_width=.2)

trace1 = go.Histogram(x=train_data['Age'], name='Age', **hist_outline)
trace2 = go.Histogram(x=train_data['Vintage'], name='Vintage', **hist_outline)
trace3 = go.Histogram(x=train_data['Avg_Account_Balance'], name='Average Account Balance',
                      **hist_outline)

# plotly.subplots.make_subplots (already imported) and add_trace(row=, col=) replace
# the deprecated plotly.tools.make_subplots / Figure.append_trace.
fig = make_subplots(rows=2, cols=2)
# Added inside a loop so the figure returned by add_trace is not rendered on every
# call by the 'all' interactivity setting.
for trace, (row, col) in zip((trace1, trace2, trace3), ((1, 1), (1, 2), (2, 1))):
    fig.add_trace(trace, row=row, col=col)
iplot(fig)

As the plots show, all numerical variables are right-skewed, and the average account balance seems to have a large number of outliers.

### 2.2.2 Distribution of Variables Based on Lead Counts

In [None]:
# Box plot of the average account balance, one box per Is_Lead value.
fig = px.box(train_data, y = 'Avg_Account_Balance', color = 'Is_Lead')
fig.show()

As the boxplots show, people with a large average account balance that we would consider "outliers" constitute a significant portion of the dataset. The outliers also do not seem to impact the lead conversion ratio, since the distributions of average account balance for people who end up buying a credit card and for those who do not are very similar.

I will take the log transformation of average account balance to obtain a more normal distribution

In [None]:
# Natural log of the balance, stored in a new column; the raw column is kept for now.
# NOTE(review): np.log yields -inf for 0 and NaN for negative balances — confirm that
# the column contains neither.
train_data['Log_Avg_Account_Balance'] = np.log(train_data['Avg_Account_Balance'])

In [None]:
# Histogram of the log-transformed balance; the figure returned by update_layout is
# what the cell displays.
fig = px.histogram(x = train_data['Log_Avg_Account_Balance'])
fig.update_layout(
xaxis_title = 'Log of Average Account Balance')

In [None]:
# Box plots of Age and Vintage, split by Is_Lead.
fig = px.box(train_data, y = ['Age', 'Vintage'], color = 'Is_Lead')
fig.show()

As box plots show, people who are older and had an account for a longer period of time are more likely to buy a credit card.

Intuitively, older people are also more likely to have higher account balances and longer vintage periods than younger people. This suggests that there could be a high correlation among these variables, which brings us to the next phase of our analysis.

## 2.3 Correlation Analysis


### 2.3.1 Correlation Among Numerical Variables

In [None]:
# Pairwise correlation of the numerical columns, rendered as a heatmap.
corr_columns = ['Age', 'Vintage', 'Avg_Account_Balance']
corr_mat = train_data[corr_columns].corr()
fig = px.imshow(
    corr_mat,
    color_continuous_scale='blues',
)
fig.show()

Age and Vintage are moderately correlated (0.63), as expected.

# 3. Data Preparation

## 3.1 Feature Engineering

In [None]:
# Remove columns that are no longer needed:
#   - ID: a row identifier with no predictive value
#   - Avg_Account_Balance: superseded by its log-transformed column
obsolete_columns = ['ID', 'Avg_Account_Balance']
train_data.drop(columns=obsolete_columns, inplace=True)

In [None]:
# Standardise the numerical columns Age and Vintage (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full frame before the train/validation
# split, so validation rows influence the scaling statistics — consider fitting on
# the training split only.
scaled_columns = ['Age', 'Vintage']
sc = StandardScaler()

train_data[scaled_columns] = sc.fit_transform(train_data[scaled_columns])

In [None]:
# Summary statistics of the per-region lead percentages.
regionLeadPer = pd.Series(region_lead_per)
regionLeadPer.describe()

In [None]:
# Map each region code to its lead percentage; the two lists are index-aligned.
region_conversion_rate = dict(zip(region_name, region_lead_per))
region_conversion_rate

In [None]:
# Bucket the regions by lead percentage:
#   above 23 -> high, above 16 (up to 23) -> moderate, otherwise -> low.
# NOTE(review): the percentages are computed from the target over the full dataset,
# i.e. before the train/validation split, so this grouping leaks target information
# into the validation rows.
regionHigh, regionMod, regionLow = [], [], []  # high / moderate / low conversion rates

for region_code, lead_percentage in region_conversion_rate.items():
    if lead_percentage > 23:
        bucket = regionHigh
    elif lead_percentage > 16:
        bucket = regionMod
    else:
        bucket = regionLow
    bucket.append(region_code)

print(len(regionHigh),len(regionMod), len(regionLow))

In [None]:
# One 0/1 indicator column per region group (high / moderate / low conversion).
region_group_members = {
    'High_Region': regionHigh,
    'Moderate_Region': regionMod,
    'Low_Region': regionLow,
}
for flag_column, member_regions in region_group_members.items():
    train_data[flag_column] = train_data['Region_Code'].isin(member_regions).astype('int64')

In [None]:
# Region_Code has been replaced by the three region-group indicator columns.
train_data.drop(columns=['Region_Code'], inplace=True)

In [None]:
# Binary-encode Gender (Male -> 1, anything else -> 0) and Is_Active (Yes -> 1,
# anything else -> 0).
is_male = train_data['Gender'] == 'Male'
is_active_yes = train_data['Is_Active'] == 'Yes'
train_data['Gender'] = is_male.astype('int64')
train_data['Is_Active'] = is_active_yes.astype('int64')

In [None]:
# One-hot encode the occupation, channel code and credit product variables.
# No prefix is passed, so the new columns are named after the raw category values;
# the feature selection further down refers to them by those names.

x1 = pd.get_dummies(train_data['Occupation'])
x2 = pd.get_dummies(train_data['Channel_Code'])
x3 = pd.get_dummies(train_data['Credit_Product'])
# Append the indicator columns next to the existing ones (column-wise concat).
train_data = pd.concat([train_data, x1, x2, x3], axis = 1)

In [None]:
# Drop the original categorical columns now that their one-hot columns exist.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0.
train_data.drop(columns=['Occupation', 'Channel_Code', 'Credit_Product'], inplace=True)

In [None]:
# Column names after encoding.
train_data.columns

In [None]:
# Splitting the prepared data into a training and a validation set.
feature_columns = [
    'Gender', 'Age', 'Vintage', 'Is_Active',
    'Log_Avg_Account_Balance', 'High_Region', 'Moderate_Region',
    'Low_Region', 'Entrepreneur', 'Other', 'Salaried', 'Self_Employed',
    'X1', 'X2', 'X3', 'X4', 'Na', 'No', 'Yes',
]
features = train_data[feature_columns]

target = train_data['Is_Lead']

# 75% / 25% split with a fixed seed. The split is stratified on the target because
# the classes are imbalanced (roughly a quarter of the rows are leads), which keeps
# the lead share the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=0, stratify=target
)

# 4. Training and Tuning Classification Models

## 4.1 Logistic Regression

In [None]:
# Baseline logistic regression.
# max_iter is raised from the default of 100: warnings are silenced globally in this
# notebook, so a solver that stops before converging would otherwise go unnoticed.
log_reg = LogisticRegression(random_state=0, max_iter=1000)
log_reg.fit(x_train, y_train)

In [None]:
# Evaluate the logistic regression on the validation set.
y_pred = log_reg.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
# Accuracy alone is misleading on an imbalanced target, so per-class precision,
# recall and F1 are reported as well.
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve of the logistic regression on the validation set.
# Column 1 of predict_proba is the predicted probability of the positive class (lead).
lead_probability = log_reg.predict_proba(x_test)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, lead_probability)
roc_auc = metrics.roc_auc_score(y_test, lead_probability)

# Axis labels and the AUC in the title let the figure stand on its own.
fig = px.line(x=fpr, y=tpr,
              labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},
              title='Logistic Regression ROC Curve (AUC = {:.3f})'.format(roc_auc))
fig.show()

## 4.2 Random Forest Classifier

In [None]:
# Random forest with 300 entropy-criterion trees and a fixed seed.
rf_settings = dict(n_estimators=300, criterion='entropy', random_state=0)
rf_class = RandomForestClassifier(**rf_settings)
rf_class.fit(x_train, y_train)

In [None]:
# Evaluate the random forest on the validation set.
y_pred = rf_class.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
# Accuracy alone is misleading on an imbalanced target, so per-class precision,
# recall and F1 are reported as well.
print(classification_report(y_test, y_pred))

In [None]:
# Print each feature name together with its random-forest importance as a tuple.
for column_name, importance in zip(x_train.columns, rf_class.feature_importances_):
    print((column_name, importance))

## 4.3 XGBoost Classifier

In [None]:
# XGBoost classifier with default hyper-parameters. The seed is set explicitly, like
# for the logistic regression and random forest models, so that the run is
# reproducible by construction.
xgb = XGBClassifier(random_state=0)
xgb.fit(x_train, y_train)

In [None]:
# Evaluate the XGBoost classifier on the validation set.
y_pred = xgb.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
# Accuracy alone is misleading on an imbalanced target, so per-class precision,
# recall and F1 are reported as well.
print(classification_report(y_test, y_pred))