In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style = 'darkgrid')
import plotly.graph_objs as go
import plotly.offline as py
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
from IPython.display import HTML, display
from IPython.core import display as ICD
from plotly.offline import init_notebook_mode, iplot
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

#init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

ModuleNotFoundError: No module named 'plotly'

In [None]:
# Load the bank customer table and give every column a compact,
# space-free name that the rest of the notebook refers to.
column_names = [
    "ID", "Age", "Experience", "Income", "ZIPCode", "Family", "CCAvg",
    "Education", "Mortgage", "PersonalLoan", "SecuritiesAccount",
    "CDAccount", "Online", "CreditCard",
]
data = pd.read_csv('Bank_Personal_Loan_Modelling.csv')
data.columns = column_names

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Preview the first rows of the renamed frame.
data.head()

In [None]:
# Count missing values per column (all zeros means no nulls anywhere).
data.isnull().sum()

In [None]:
# Summary statistics, one row per column for easier reading.
data.describe().T

In [None]:
# Number of distinct values per column (a missing value would count as one value).
data.nunique(dropna=False)

In [None]:
# How many customers have no mortgage (Mortgage below 1).
(data['Mortgage'] < 1).sum()

In [None]:
# Number of people with zero credit card spending per month.
# CCAvg is the average-spending column, so it is the one to test here;
# the CreditCard column was counted before, which does not measure spending.
(data['CCAvg'] == 0).sum()

In [None]:
# Pairwise relationships between every column except the first one (ID).
without_id = data.drop(columns=[data.columns[0]])
sns.pairplot(without_id)

In [None]:
# Pairplot on a 100-row sample, coloured by the target.
# The sample is seeded so the figure is the same on every Restart & Run All.
pairplot_sample = data.iloc[:, 1:].sample(100, random_state=100)
sns.pairplot(pairplot_sample, diag_kind='kde', hue="PersonalLoan")

In [None]:
#Observations:
# Age feature is normally distributed with majority of customers falling between 30 years and 60 years of age. We can confirm this by looking at the describe statement above, which shows mean is almost equal to median
# Experience is normally distributed with more customer having experience starting from 8 years. Here the mean is equal to median. There are negative values in the Experience. This could be a data input error as in general it is not possible to measure negative years of experience. We can delete these values, because we have 3 or 4 records from the sample.
# Income is positively skewed. Majority of the customers have income between 45K and 55K. We can confirm this by saying the mean is greater than the median
# CCAvg is also a positively skewed variable and average spending is between 0K to 10K and majority spends less than 2.5K
# Mortgage 70% of the individuals have a mortgage of less than 40K. However the max value is 635K
# The variables Family and Education are ordinal variables. The distribution of families is evenly distributed

In [None]:
#Data Cleaning:

In [None]:
# Experience contains some negative values; count how many rows are affected.
(data['Experience'] < 0).sum()

In [None]:
# Prepare the clean-up of negative Experience values.
# Reference rows used to impute replacements: every row with a valid
# (non-negative) experience. Zero years is a valid value, so it is kept.
dfExp = data.loc[data['Experience'] >= 0]
# Boolean mask of the rows that need fixing.
negExp = data.Experience < 0
column_name = 'Experience'
# IDs of the customers whose experience is negative.
mylist = data.loc[negExp, 'ID'].tolist()

In [None]:
# Tally rows with negative Experience (True) against the rest (False).
negExp.value_counts()

In [None]:
# Replace each negative Experience with the median experience of valid
# customers of the same Age and Education.
for cust_id in mylist:
    row_mask = data['ID'] == cust_id
    age = data.loc[row_mask, 'Age'].iloc[0]
    education = data.loc[row_mask, 'Education'].iloc[0]

    peers = dfExp[(dfExp.Age == age) & (dfExp.Education == education)]
    exp = peers['Experience'].median()

    # An empty peer group yields NaN, which would silently replace the
    # negative value with a missing one. Widen the peer group to the same age.
    if pd.isna(exp):
        exp = dfExp.loc[dfExp.Age == age, 'Experience'].median()
    # Still nothing to compare against: fall back to 0, the closest valid value.
    if pd.isna(exp):
        exp = 0

    data.loc[row_mask, 'Experience'] = exp

In [None]:
# Re-count negative Experience values after the clean-up (expected: 0).
(data['Experience'] < 0).sum()

In [None]:
# Summary statistics again, now that Experience has been cleaned.
data.describe().T

In [None]:
# PersonalLoan is the target: compare Income across Education levels,
# split by whether the customer took the loan.
sns.boxplot(data=data, x='Education', y='Income', hue='PersonalLoan')

In [None]:
# Observation: Customers whose education level is 1 appear to have higher incomes.
# However, customers who have taken the personal loan have the same income levels.

In [None]:
# Mortgage by Education level, split by loan status.
sns.boxplot(data=data, x="Education", y='Mortgage', hue="PersonalLoan")

In [None]:
# Inference : The customers who have personal loan have high mortgage

In [None]:
# Customer counts by SecuritiesAccount flag, split by loan status.
sns.countplot(data=data, x="SecuritiesAccount", hue="PersonalLoan")

In [None]:
# Customer counts by Family size, split by loan status.
sns.countplot(data=data, x='Family', hue='PersonalLoan')

In [None]:
# Observations - Family size does not have any impact in personal loan

In [None]:
# Age against Experience, one point per customer, coloured by Education level.
colors = {1: 'red', 2: 'yellow', 3: 'green'}
point_colors = [colors[level] for level in data['Education']]
plt.scatter(x=data['Experience'], y=data['Age'], c=point_colors)
plt.xlabel('Experience')
plt.ylabel('Age')

In [None]:
# The plot above shows that experience and age are positively correlated.

In [None]:
# Heatmap of the absolute pairwise correlations between all columns.
abs_corr = data.corr().abs()
plt.figure(figsize=(10, 8))
sns.heatmap(
    abs_corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmax=.8,
    square=True,
    linewidths=0.01,
    linecolor='white',
)
plt.title('Correlation')
plt.show()

In [None]:
# From the above correlation chart, we can observe that
# Correlation between Age and Experience is 0.99.
# Correlation between Income and CCAvg is 0.65.
# These are the pairs of features having high correlation i.e (>0.5)

In [None]:
# Customer counts by CDAccount flag, split by loan status.
sns.countplot(data=data, x='CDAccount', hue='PersonalLoan')

In [None]:
# Observation: Customers who do not have a CD account mostly do not have a loan either; this group is the majority. But almost all customers who have a CD account have a loan as well.

In [None]:
# Income by Family size, split by loan status.
sns.boxplot(data=data, x='Family', y='Income', hue='PersonalLoan')

In [None]:
# Observation - Families with income less than 100K are less likely to take loan than families with high income

In [None]:
# Distribution of each feature, one panel per column.
import matplotlib.pyplot as plt

features = data.copy(deep=True)

# (column, colour) per panel; None lets seaborn pick the next default colour.
panel_specs = [
    ("Age", "skyblue"), ("Experience", "olive"), ("Income", "gold"),
    ("Family", "teal"), ("CCAvg", None), ("Education", "red"),
    ("Mortgage", "skyblue"), ("SecuritiesAccount", "olive"), ("CDAccount", "gold"),
    ("Online", "teal"), ("CreditCard", None),
]

# The x-axis is NOT shared: the columns live on very different scales
# (years, thousands of dollars, 0/1 flags), and a shared axis squeezes
# most panels into an unreadable sliver.
f, axes = plt.subplots(4, 3, figsize=(15, 10), sharex=False)
flat_axes = axes.ravel()
for ax, (column, color) in zip(flat_axes, panel_specs):
    sns.distplot(features[column], rug=False, color=color, ax=ax)

# Hide the panels of the 4x3 grid that have no feature assigned.
for ax in flat_axes[len(panel_specs):]:
    ax.set_visible(False)
f.tight_layout()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. ID and Experience are left out of
# the modelling frame; the split is seeded for reproducibility.
model_frame = data.drop(columns=['ID', 'Experience'])
train_set, test_set = train_test_split(model_frame, test_size=0.3, random_state=100)

In [None]:
# Detach the target column from both splits; pop() removes it from the
# feature frames in place, so this cell is not re-runnable on its own.
train_labels, test_labels = train_set.pop('PersonalLoan'), test_set.pop('PersonalLoan')

In [None]:
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split.
model_gnb = GaussianNB()
model_gnb.fit(train_set, train_labels)

# Report accuracy on both splits. Previously only the training score was
# computed, although it was labelled as test performance.
print("Naive Bayes train accuracy:", model_gnb.score(train_set, train_labels))
print("Naive Bayes test accuracy:", model_gnb.score(test_set, test_labels))

# Per-class precision / recall / F1 on the held-out test split.
test_pred = model_gnb.predict(test_set)
print(metrics.classification_report(test_labels, test_pred))

In [None]:
# Naive Bayes: test accuracy, confusion matrix and 10-fold cross-validation.
from sklearn.model_selection import cross_val_score, cross_val_predict

y_predict_gnb = model_gnb.predict(test_set)
gnb_acc = metrics.accuracy_score(test_labels, y_predict_gnb)
print("Naive Bayes Accuracy is: ", gnb_acc)
# Use this cell's own predictions rather than `test_pred`, which another
# cell reassigns to a different model's output.
print(metrics.confusion_matrix(test_labels, y_predict_gnb))

# Cross-validate on the training split only, so the test split stays unseen.
scores = cross_val_score(model_gnb, train_set, train_labels, cv=10)
print("Cross-validated scores:", scores)
print("Average score:", np.average(scores))

In [None]:
# Logistic Regression: fit, test metrics and 10-fold cross-validation.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# it is kept here so the fitted model is unchanged - confirm against the
# installed version before removing it.
model_lr = LogisticRegression(solver='lbfgs' , max_iter=5000 , multi_class='multinomial')
model_lr.fit(train_set, train_labels)

# Accuracy on both splits (the training score alone was shown before,
# mislabelled as test performance).
print("Logistic Regression train accuracy:", model_lr.score(train_set, train_labels))
print("Logistic Regression test accuracy:", model_lr.score(test_set, test_labels))

test_pred = model_lr.predict(test_set)
print(metrics.classification_report(test_labels, test_pred))

# Score the logistic regression predictions; the Naive Bayes predictions
# were used here by mistake.
lr_acc = metrics.accuracy_score(test_labels, test_pred)
print("Logistic Regression Accuracy is: ", lr_acc)

print(metrics.confusion_matrix(test_labels, test_pred))

scores = cross_val_score(model_lr, train_set, train_labels, cv=10)
print("Cross-validated scores:", scores)
print("Average score:", np.average(scores))

In [None]:
# Compare the two fitted models on the held-out test split: accuracy on the
# training data says nothing about which model generalises better.
print("Logistic Regression score:" , model_lr.score(test_set , test_labels))
print("Naive Bayes score:" , model_gnb.score(test_set , test_labels))

In [None]:
# Here the Logistic Regression model seems to have the highest accuracy, so we can choose it as our final model.

In [None]:
# Compare Models: feature matrix and target for cross-validated comparison.
X = data.drop(['PersonalLoan', 'Experience', 'ID'], axis=1)
# Read the target without removing it: pop() would delete PersonalLoan from
# `data`, and later cells still select that column from `data`.
y = data['PersonalLoan']

In [None]:
from sklearn import model_selection
import matplotlib.pyplot as plt

In [None]:
# Candidate models, compared by 10-fold cross-validated accuracy.
models = [
    ('LR', LogisticRegression()),
    ('NB', GaussianNB()),
]

results = []
names = []
scoring = 'accuracy'

# One splitter shared by all models so every model sees the same folds.
# shuffle=True is required for random_state to have any effect; current
# scikit-learn raises an error when a seed is given without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=12345)

# Evaluate each model in turn.
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Boxplot of the per-fold scores of each algorithm.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Here the Logistic Regression model seems to have the highest accuracy, so we can choose it as our final model.

In [None]:
# Response variable.
y = data['PersonalLoan']

# Feature matrix: everything except the identifier, the ZIP code and the target.
X = data.drop(columns=['ID', 'ZIPCode', 'PersonalLoan'])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature; X is rebound to the scaler's transformed output.
# NOTE(review): the scaler is fitted before the train/test split, so test
# rows contribute to the scaling statistics - confirm this is acceptable.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split of the standardised features, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree: library defaults except for a depth cap of 5, which keeps
# the tree small enough to plot and read.
dt_default = DecisionTreeClassifier(max_depth=5)
dt_default.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the baseline tree on the held-out split.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict the test rows, then show per-class precision / recall / F1.
y_pred_default = dt_default.predict(X_test)
default_report = classification_report(y_test, y_pred_default)
print(default_report)

In [None]:
# GridSearchCV to find optimal max_depth
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the decision tree.
param_grid = {
    'max_depth': range(5, 15, 5),
    'min_samples_leaf': range(10, 100, 10),
    'min_samples_split': range(10, 100, 10),
    'criterion': ["entropy", "gini"]
}

n_folds = 5

# Cross-validated grid search over the grid above.
dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid = param_grid,
                          cv = n_folds, verbose = 1)

# Search on the training split only. Fitting on the full data would let the
# test rows influence the chosen hyperparameters, so the later evaluation
# on X_test would no longer be an honest hold-out estimate.
grid_search.fit(X_train, y_train)

In [None]:
# Tabulate the grid-search results and show the five best-scoring settings.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.sort_values(by='mean_test_score', ascending=False).head(5)

In [None]:
# Fit the decision tree with the best hyperparameters found by the grid
# search. They are read from the search result instead of being copied by
# hand, so this cell cannot drift out of sync when the grid or data changes.
print("Best tree hyperparameters:", grid_search.best_params_)
model = DecisionTreeClassifier(**grid_search.best_params_)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the tuned tree on the held-out split; the prediction variable
# from the baseline evaluation is reused.
y_pred_default = model.predict(X_test)
tuned_report = classification_report(y_test, y_pred_default)
print(tuned_report)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Tune k (1..49) and the neighbour weighting of a KNN classifier with
# 5-fold cross-validation, selecting on recall.
param = {
    'n_neighbors': np.arange(1, 50),
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsClassifier(n_jobs=-1)
GS = GridSearchCV(estimator=knn, param_grid=param, scoring='recall', cv=5)
GS.fit(X, y)

In [None]:
# Show the hyperparameter combination selected by the KNN grid search.
GS.best_params_

In [None]:
# Redefine the KNN model with the hyperparameters selected by the grid
# search, read directly from the search result rather than typed in by hand.
KNN = KNeighborsClassifier(**GS.best_params_)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library-default settings, used by the ensemble cells.
LR=LogisticRegression()

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with library-default settings, used by the ensemble cells.
NB=GaussianNB()

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

# Bagged versions of the base models. The wrapped estimator is passed
# positionally because its keyword was renamed between scikit-learn releases
# (`base_estimator` -> `estimator`); the positional form works with both.
LR_bag=BaggingClassifier(LR,n_estimators=25,random_state=0,n_jobs=-1)
NB_bag=BaggingClassifier(NB,n_estimators=24,random_state=0,n_jobs=-1)
KNN_bag=BaggingClassifier(KNN,n_estimators=80,random_state=0,n_jobs=-1)

# Regularised single decision tree.
DT_reg=DecisionTreeClassifier(max_depth=10,criterion='gini',min_samples_leaf=10,min_samples_split=60)
# No estimator is given here, so the library default is bagged, not DT_reg.
DT_bag=BaggingClassifier(n_estimators=10,random_state=0,n_jobs=-1)
RF=RandomForestClassifier(n_estimators=130,criterion='entropy',random_state=0,n_jobs=-1)

In [None]:
import warnings
# Suppress every warning category for the cells that follow.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc

# 5-fold cross-validated ROC AUC for the base models and their bagged versions.
kf=KFold(n_splits=5,shuffle=True,random_state=0)
for model, name in zip([LR,LR_bag,NB,NB_bag,KNN,KNN_bag,DT_reg,DT_bag,RF],
      ['LR','BaggedLR','NB','BaggedNB','KNN','BaggedKNN','DT_Reg','BaggedDT','RF']):
    roc_auc=[]
    for train,test in kf.split(X,y):
        # kf.split yields positional indices: X is indexed as an array and
        # y with .iloc, so the pairing holds for any index on y.
        Xtrain,Xtest=X[train,:],X[test,:]
        Ytrain,Ytest=y.iloc[train],y.iloc[test]
        model.fit(Xtrain,Ytrain)
        # ROC needs a continuous score. Hard 0/1 predictions collapse the
        # curve to a single operating point and understate the AUC.
        Y_score=model.predict_proba(Xtest)[:,1]
        fpr,tpr, _ = roc_curve(Ytest,Y_score)
        roc_auc.append(auc(fpr, tpr))
    # Mean AUC over the folds, followed by the sample variance across folds.
    print("AUC scores: %0.02f (+/- %0.5f) [%s]" % (np.mean(roc_auc),
                                    np.var(roc_auc,ddof=1), name ))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
# Boosting models. The wrapped estimator is passed positionally because its
# keyword was renamed between scikit-learn releases (`base_estimator` ->
# `estimator`); the positional form works with both.
LR_boost=AdaBoostClassifier(LR,n_estimators=170)
NB_boost=AdaBoostClassifier(NB,n_estimators=350)
RF_boost=AdaBoostClassifier(RF,n_estimators=100)
# No estimator is given here, so AdaBoost uses its library default.
DT_boost=AdaBoostClassifier(n_estimators=100)
GB_boost=GradientBoostingClassifier(n_estimators=100)

In [None]:
# Combined model: despite the variable name, this is a soft-voting ensemble
# (VotingClassifier with voting='soft') of the bagged LR, the boosted RF and
# the gradient-boosting model, not a stacking classifier.
stacked = VotingClassifier(estimators = [('Bagged_LR',LR_bag),('BoostedRF', RF_boost), ('GBoost', GB_boost)],voting='soft')

In [None]:
import warnings
# Additionally silence FutureWarning messages.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
from sklearn.model_selection import KFold
# Imported here so the cell does not depend on an earlier cell having run.
from sklearn.metrics import roc_curve, auc

# 5-fold cross-validated ROC AUC for the boosted models and the voting ensemble.
kf=KFold(n_splits=5,shuffle=True,random_state=0)
for model, name in zip([LR,LR_boost,NB,NB_boost,DT_boost,RF,RF_boost,GB_boost,stacked],
      ['LR','BoostedLR','NB','BoostedNB','BoostedDT','RF','BoostedRF','GradientBoost','stacked']):
    roc_auc=[]
    for train,test in kf.split(X,y):
        # kf.split yields positional indices: X is indexed as an array and
        # y with .iloc, so the pairing holds for any index on y.
        Xtrain,Xtest=X[train,:],X[test,:]
        Ytrain,Ytest=y.iloc[train],y.iloc[test]
        model.fit(Xtrain,Ytrain)
        # ROC needs a continuous score. Hard 0/1 predictions collapse the
        # curve to a single operating point and understate the AUC.
        Y_score=model.predict_proba(Xtest)[:,1]
        fpr,tpr, _ = roc_curve(Ytest,Y_score)
        roc_auc.append(auc(fpr, tpr))
    # Mean AUC over the folds, followed by the sample variance across folds.
    print("AUC scores: %0.02f (+/- %0.5f) [%s]" % (np.mean(roc_auc),
                                    np.var(roc_auc,ddof=1), name ))

In [None]:
# Inference
# From the above model, we can clearly infer Gradient Boosting followed by Boosted Random forest performs better with respect to AUC as well as Variance

In [None]:
# Overall understanding:

# Age feature is normally distributed with majority of customers falling between 30 years and 60 years of age 
# Experience is normally distributed with more customer having experience starting from 8 years 
# Income is positively skewed. Majority of the customers have income between 45K and 55K 
# CCAvg is also a positively skewed variable and average spending is between 0K to 10K and majority spends less than 2.5K
# Mortgage 70% of the individuals have a mortgage of less than 40K. However the max value is 635K
# The variables family and education are ordinal variables. The distribution of families is evenly distributed
# It seems the customers whose education level is 1 is having more income. 
# However customers who have taken the personal loan have the same income levels
# The customers who have personal loan have high mortgage
# Family size does not have any impact in personal loan
# Experience and age are positively correlated
# Correlation between Age and Experience is 0.99
# Correlation between Income and CCAvg is 0.65
# Customers who do not have CD account, do not have loan as well and almost all customers who have CD account have loan as well
# Families with income less than 100K are less likely to take loan than families with high income
# Logistic Regression has higher accuracy than other models
# The LR Model can be boosted using Gradient Boosting and Boosted Random Forest 

In [None]:
# Campaign Model for Personal Loan: 

# Target customers aged between 30-60 years with 8+ years of experience having income between 45k-55k preferably having a credit card and existing mortgage. 