# Table of Contents
   1. [Load Data](#1)
   1. [Missing Values](#2)
   1. [Detect Outliers](#3)
   1. [Data Analysis](#4)
   1. [Data Visualization](#5)
   1. [Correlation Matrix](#6)
   1. [Feature Engineering](#7)
   1. [Drop](#8)
   1. [Dummies](#9)
   1. [Train-Test Split](#10)
   1. [Scaling](#11)
   1. [Parameters](#12)
   1. [Classifiers](#13)
   1. [Modeling and Optimization](#14)
   1. [Ensemble Modeling](#15)
   1. [Prediction](#16)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id='1'></a>
# Load Data

In [None]:
# Path of the source CSV inside the Kaggle input mount.
csv_path = '/kaggle/input/bank-personal-loan-modelling/Bank_Personal_Loan_Modelling.csv'
data = pd.read_csv(csv_path)

<a id='2'></a>
# Missing Values

In [None]:
# Bar chart of per-column completeness, used to spot columns with missing values.
missingno.bar(data,figsize=(10,5),fontsize=12)

<a id='3'></a>
# Detect Outliers

In [None]:
from collections import Counter

# Features screened by the IQR rule (the 'Personal Loan' label is excluded).
# The account/online/credit-card indicator columns are deliberately left out:
# when the rarer value of an indicator covers under a quarter of the rows,
# Q1 == Q3, the IQR is 0 and every such row is flagged as an "outlier".
outlier_list = ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage']

def detect_outlier(df, feature, *, coef=1.5, max_outlier_features=2):
    """Return index labels of rows that are IQR outliers in several features.

    For every column named in ``feature`` a value counts as an outlier when
    it lies below ``Q1 - coef * IQR`` or above ``Q3 + coef * IQR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to screen.
    feature : iterable of str
        Column names to screen.
    coef : float, keyword-only
        Multiplier applied to the interquartile range (default 1.5).
    max_outlier_features : int, keyword-only
        A row is reported only when it is an outlier in MORE than this many
        features (default 2, i.e. at least three features).

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    hits = Counter()

    for f in feature:
        column = df[f]

        # lower and upper quartile in a single pass
        q1, q3 = np.percentile(column, [25, 75])

        # width of the tolerated band beyond each quartile
        margin = coef * (q3 - q1)

        is_outlier = (column < q1 - margin) | (column > q3 + margin)

        # count, per row label, in how many features the row is an outlier
        hits.update(column[is_outlier].index)

    return [label for label, count in hits.items() if count > max_outlier_features]

 
# Drop every row reported by detect_outlier, then renumber the index from 0
# so positional and label indexing agree again.
data = data.drop(detect_outlier(data,outlier_list),axis=0).reset_index(drop=True)

<a id='4'></a>
# Data Analysis

In [None]:
# Summarise column dtypes and non-null counts.
data.info()

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Remove the 'ZIP Code' and 'ID' columns before analysis.
data = data.drop(columns=['ZIP Code', 'ID'])

In [None]:
# Preview the frame again now that 'ZIP Code' and 'ID' are gone.
data.head()

In [None]:
# List the remaining column names.
data.columns

<a id='5'></a>
# Data Visualization

In [None]:
# Input features to plot (everything except the 'Personal Loan' target).
feature_list = [
    'Age',
    'Experience',
    'Income',
    'Family',
    'CCAvg',
    'Education',
    'Mortgage',
    'CreditCard',
    'Securities Account',
    'CD Account',
    'Online',
]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def draw_distribution(df,feature):
    """
    Draw the density histogram (with a KDE curve) of one column and show it.

    df      : DataFrame holding the column
    feature : name of the column to plot
    """
    if hasattr(sns, 'histplot'):
        # distplot is deprecated since seaborn 0.11; histplot with a density
        # scale and a KDE overlay is its documented replacement.
        sns.histplot(df[feature], kde=True, stat='density')
    else:
        # older seaborn releases that predate histplot
        sns.distplot(df[feature])
    plt.show()

In [None]:
# One density plot per feature. draw_distribution already calls plt.show()
# for each figure, so no second show is needed here.
for i in feature_list:
    draw_distribution(data,i)

<a id='6'></a>
# Correlation Matrix

In [None]:
# Pairwise correlations of all columns, drawn as an annotated heatmap.
corr_matrix = data.corr()

plt.figure(figsize=(15, 5))
sns.heatmap(
    corr_matrix,
    annot=True,
    linewidths=.5,
    fmt='.2f',
)
plt.show()

<a id='7'></a>
# Feature Engineering

In [None]:
# Column names available at the start of feature engineering.
data.columns

In [None]:
def catplot_func(df,x):
    """
    Bar plot of 'Personal Loan' against a categorical feature, to analyze
    the impact between feature and target.

    df : DataFrame containing column ``x`` and 'Personal Loan'
    x  : name of the categorical column shown on the x axis
    """
    # catplot is figure-level and opens its own figure; calling plt.figure()
    # first would only leave an empty extra figure in the output.
    sns.catplot(data=df,x=x,y='Personal Loan',kind='bar',height=4)
    plt.xticks(rotation = 90)
    plt.show()

In [None]:
def pointplot_func(df,x):
    """
    Point plot of 'Personal Loan' against a numerical feature, to analyze
    the impact between feature and target.

    df : DataFrame containing column ``x`` and 'Personal Loan'
    x  : name of the numerical column shown on the x axis
    """
    # wide canvas: every distinct value of x gets its own tick
    plt.figure(figsize=(30,4))
    plt.xticks(rotation=90)
    sns.pointplot(x=x,y='Personal Loan',data=df,color='darkblue')
    plt.xlabel(x,fontsize=14)
    plt.grid()
    # render explicitly, as catplot_func does
    plt.show()
    

### CD Account

In [None]:
# How often each 'CD Account' value occurs.
data['CD Account'].value_counts()

In [None]:
# 'CD Account' is not used as a model input.
data = data.drop(columns=['CD Account'])

### Securities Account

In [None]:
# How often each 'Securities Account' value occurs.
data['Securities Account'].value_counts()

In [None]:
# 'Securities Account' is not used as a model input.
data = data.drop(columns=['Securities Account'])

### Mortgage

In [None]:
"""
Mortgage Group
(0) Group 1: 0 
(1) Group 2: Other Values
"""
data['Mrt_Grp']=[0 if i==0 else 1 for i in data.Mortgage]

### Age

In [None]:
# Age group
#   0: younger than 35
#   1: 35 to 54
#   2: everything else (55 and older)
ages = data['Age']
data['Age_Grp'] = np.select([ages < 35, ages < 55], [0, 1], default=2)

In [None]:
# Bar plot of 'Personal Loan' against the age group.
catplot_func(data,'Age_Grp')

### Experience

In [None]:
"""
Experience Group
(0) 0-10
(1) 11-20
(2) 20+
"""
data['Exp_Grp'] = [0 if i <=10 else 1 if i<=20 and i>10 else 2 for i in data.Experience]

In [None]:
# Bar plot of 'Personal Loan' against the experience group.
catplot_func(data,'Exp_Grp')

### Family

In [None]:
# Bar plot of 'Personal Loan' against 'Family', used to pick the grouping below.
catplot_func(data,'Family')

In [None]:
"""
Family Group
(0): 1-2
(1): 3-4
"""
threshold = 3
data['Fml_Grp']= [0 if i <threshold else 1 for i in data.Family]

### CCAvg

In [None]:
# Point plot of 'Personal Loan' against every distinct 'CCAvg' value.
pointplot_func(data,'CCAvg')

In [None]:
"""
CCAvg Group
(0): 0-27.499
(1): 27.5+
"""

threshold = 27.5
data['CCA_Grp']=[0 if i < threshold else 1 for i in data.CCAvg]

### Education

In [None]:
# Bar plot of 'Personal Loan' against 'Education', used to pick the grouping below.
catplot_func(data,'Education')

In [None]:
"""
Education Group
(0): 1
(1): 2-3
"""
data['Edu_Grp']=[0 if i ==1 else 1 for i in data.Education]

### Online

In [None]:
# Bar plot of 'Personal Loan' against 'Online'.
catplot_func(data,'Online')

### Credit Card

In [None]:
# Bar plot of 'Personal Loan' against 'CreditCard'.
catplot_func(data,'CreditCard')

### Income

In [None]:
# Point plot of 'Personal Loan' against 'Income'. The shared helper is used so
# the x axis is labelled with the plotted column itself rather than with
# whatever value a loop variable from an earlier cell happens to hold.
pointplot_func(data,'Income')

In [None]:
# Income group
#   0: below the threshold
#   1: everything else (threshold and above)
threshold = 103
data['Inc_Grp'] = np.where(data['Income'] < threshold, 0, 1)

<a id='8'></a>
# Drop

In [None]:
# Column names after feature engineering (raw columns plus the *_Grp columns).
data.columns

In [None]:
#data.drop(['Age','Experience','Income','Family','CCAvg','Education','Mortgage'],axis=1,inplace=True)

In [None]:
# Preview the engineered frame.
data.head()

<a id='9'></a>
# Dummies

In [None]:
# One-hot encode the two three-level group columns.
dummy_columns = ['Age_Grp', 'Exp_Grp']
data = pd.get_dummies(data, columns=dummy_columns)

In [None]:
# Preview the frame with the dummy columns added.
data.head()

<a id='10'></a>
# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Final column set that goes into the split.
data.columns

In [None]:
# Last look at the frame before separating features and target.
data.head()

In [None]:
# Separate the target column from the feature matrix.
target_col = 'Personal Loan'
y = data[target_col]
X = data.drop(columns=target_col)

In [None]:
# 70/30 hold-out split with a fixed seed. Stratifying on y keeps the class
# ratio the same in both parts, in line with the StratifiedKFold used for
# cross-validation during model selection.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    stratify=y)

<a id='11'></a>
# Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn mean/variance on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

<a id='12'></a>
# Parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Logistic regression: regularisation strength and penalty type.
lr_params = {
    'C': np.logspace(-3, 3, 7),
    'penalty': ['l1', 'l2'],
    # The default 'lbfgs' solver rejects the 'l1' penalty, so those candidates
    # would fail to fit; 'liblinear' supports both penalties.
    'solver': ['liblinear'],
}

# k-nearest neighbours: neighbourhood sizes 1..49.
knn_params = {'n_neighbors': np.arange(1, 50)}

# Gaussian naive Bayes: variance smoothing from 1 down to 1e-9 (log scale).
nb_params = {'var_smoothing': np.logspace(0, -9, num=100)}

# Random forest.
rf_params = {
    'max_features': [1, 3, 10],
    'min_samples_split': [2, 3, 10],
    'min_samples_leaf': [1, 3, 10],
    'bootstrap': [False],
    'n_estimators': [100, 300],
    'criterion': ['gini'],
}

# Gradient boosting.
gb_params = {
    'learning_rate': [0.001, 0.01, 0.1, 0.05],
    # the value 100 was listed twice, which only repeated a third of the grid
    'n_estimators': [100, 500],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
}

<a id='13'></a>
# Classifiers

In [None]:
# Base estimators to tune. The stochastic ones get a fixed seed (the same
# value used for the train/test split) so repeated runs give the same models.
classifier = [RandomForestClassifier(random_state=42),
              LogisticRegression(random_state=42),
              KNeighborsClassifier(),
              GaussianNB(),
              GradientBoostingClassifier(random_state=42)
             ]

In [None]:
# Search grids, listed in the same order as the estimators in `classifier`
# (they are paired up by position during the grid search).
param = [rf_params,lr_params,knn_params,nb_params,gb_params]

<a id='14'></a>
# Modeling and Optimization

In [None]:
# Display names for the results table, in the same order as `classifier`.
ml_list = ['Random Forest','Logistic Regression','KNN','Naive Bayes','GradientBoosting']

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score


# Best cross-validated ROC-AUC and the matching refit estimator per model.
cv_results = []
best_estimators = []

# 5-fold stratified splitter shared by all searches (no shuffling, so it
# yields the same folds every time).
folds = StratifiedKFold(n_splits=5)

for estimator, grid in zip(classifier, param):
    clf = GridSearchCV(
        estimator,
        param_grid=[grid],
        cv=folds,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(X_train_scaled, y_train)

    best_score = clf.best_score_
    cv_results.append(best_score)
    best_estimators.append(clf.best_estimator_)
    print('Method: {}  Score: {} Best: {}' .format(estimator, best_score, clf.best_estimator_))

# One row per model: display name and its best CV score.
results = pd.DataFrame({'CV Means': cv_results,
                        'ML Models': ml_list})


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bars: best CV score per model. x and y are passed by keyword
# because current seaborn releases no longer accept them positionally.
g = sns.barplot(x='CV Means', y='ML Models', data=results)
g.set_title('ROC-AUC Score')
plt.show()


<a id='15'></a>
# Ensemble Modeling

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Short tags for the tuned models, in the order they were appended to
# best_estimators (random forest, logistic regression, kNN, naive Bayes,
# gradient boosting).
model_tags = ['rf', 'lr', 'knn', 'nb', 'gb']

# Soft voting over the tuned models.
voting_c = VotingClassifier(
    estimators=list(zip(model_tags, best_estimators)),
    voting='soft',
    n_jobs=-1,
)

<a id='16'></a>
# Prediction

In [None]:
# Fit the soft-voting ensemble on the scaled training split.
voting_c = voting_c.fit(X_train_scaled,y_train)

# Hold-out accuracy; scikit-learn metrics take (y_true, y_pred) in that order.
my_score = accuracy_score(y_test, voting_c.predict(X_test_scaled))
print(my_score)

# Also report ROC-AUC, the metric the grid searches optimised. The score is
# computed from the probability column of the second class (the positive
# label for a 0/1 target).
test_auc = roc_auc_score(y_test, voting_c.predict_proba(X_test_scaled)[:, 1])
print('Test ROC-AUC: {}'.format(test_auc))