In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Installs an "ignore" filter with no category or module restriction, so every
# Python warning raised after this point is hidden from the notebook output.
warnings.filterwarnings("ignore")
import pandas as pd

**Loading file...**

In [None]:
# Read the bank customer table from the Kaggle input directory and preview it.
csv_path = '/kaggle/input/personal-loan-modeling/Bank_Personal_Loan_Modelling.csv'
df = pd.read_csv(csv_path)
df.head()

# Exploratory Data Analysis

In [None]:
# (number of rows, number of columns) of the raw table.
df.shape

In [None]:
# Column names, non-null counts and dtypes of the raw table.
df.info()

The output above shows that several categorical features are stored as integers; we need to convert them to the `category` dtype.

In [None]:
# Per column: True if at least one value is missing.
df.isna().any()

**Dropping the `ID` and `ZIP Code` columns, as they are not relevant to our analysis**

In [None]:
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=['ID', 'ZIP Code'], errors='ignore')

**Dividing the columns of the dataset into numeric and categorical attributes**

In [None]:
# Numeric (continuous) attributes; every other column is treated as categorical.
# Lists are used instead of sets so that iteration order (and therefore the
# order of the plots produced from these names) is the same on every run.
cols_numeric = ['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage']
cols_categorical = [col for col in df.columns if col not in cols_numeric]
cols_categorical

In [None]:
# Convert all categorical attributes to the pandas 'category' dtype in one call,
# then re-inspect the dtypes.
df = df.astype({name: 'category' for name in cols_categorical})

df.info()

Now the categorical feature variables have the `category` dtype.

# Plotting graphs to analyse the different columns

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Univariate Analysis

**Creating a function to show the density distribution of a non-categorical column**

In [None]:
def summary_non_category(x):
    """Print a five-number summary of a numeric column and plot its density.

    Parameters
    ----------
    x : str
        Name of a column of the notebook-level DataFrame ``df``.

    Prints the minimum, Q1, median, Q3 and maximum of ``df[x]`` and then shows
    a density-scaled histogram with a KDE curve for the same column.
    """
    series = df[x]
    label = x.capitalize()
    # One quantile call instead of three separate passes over the column.
    q1, q2, q3 = series.quantile([0.25, 0.50, 0.75])
    print(f'Summary of {label} Attribute:\n'
          f'{label}(min) : {series.min()}\n'
          f'Q1                    : {q1}\n'
          f'Q2(Median)            : {q2}\n'
          f'Q3                    : {q3}\n'
          f'{label}(max) : {series.max()}')

    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # is the replacement seaborn documents for it.
    fig, ax = plt.subplots()
    sns.histplot(series, kde=True, stat='density', ax=ax)
    ax.set_title(f'{label} Density Distribution')
    plt.show()

In [None]:
# Print the summary and draw the density plot for each numeric attribute.
for numeric_col in cols_numeric:
    summary_non_category(numeric_col)

***From the plots above, the distributions of Income, CCAvg and Mortgage don't look good; we need to handle their outliers, which we will do in a later step.***

**Now creating a function to show the distribution of a categorical column**

In [None]:
def summary_category(category_column, data=None, target="Personal Loan"):
    """Plot the share of each category and its split by loan outcome.

    Parameters
    ----------
    category_column : str
        Name of the categorical column to summarise.
    data : pandas.DataFrame, optional
        Frame to read from; defaults to the notebook-level ``df``.
    target : str, optional
        Name of the 0/1 outcome column; defaults to ``"Personal Loan"``.

    Draws two panels: a pie chart with the share of rows per category, and a
    grouped bar chart with the number of rows per category where the target
    equals 1 ("Loan") and where it equals 0 ("No Loan").
    """
    frame = df if data is None else data
    column = frame[category_column]
    took_loan = frame[target] == 1
    no_loan = frame[target] == 0

    # Sorted category values; .tolist() yields plain Python values for labels.
    levels = sorted(column.dropna().unique().tolist())
    # One full-length mask per category, combined with & below. This avoids
    # chained [mask][mask] indexing, which relies on implicit index alignment.
    level_masks = [column == level for level in levels]
    count_category = [int(mask.sum()) for mask in level_masks]
    category_loan = [int((mask & took_loan).sum()) for mask in level_masks]
    category_no_loan = [int((mask & no_loan).sum()) for mask in level_masks]

    # Bars are placed by position rather than by category value, so the chart
    # also works for non-numeric categories; tick labels carry the values.
    positions = np.arange(len(levels))
    fig, (ax1, ax2) = plt.subplots(1, 2)
    ax1.pie(count_category, labels=levels, autopct='%1.1f%%')
    ax2.bar(positions - 0.2, category_loan, width=0.4, label="Loan")
    ax2.bar(positions + 0.2, category_no_loan, width=0.4, label="No Loan")
    ax2.set_xticks(positions)
    ax2.set_xticklabels(levels)
    ax2.set_xlabel(category_column)
    ax2.set_ylabel("Count")
    ax2.legend()
    fig.suptitle(category_column)
    plt.show()

In [None]:
# Draw the pie / grouped-bar summary for each categorical attribute.
for cat_col in cols_categorical:
    summary_category(cat_col)

**The classes of 'CD Account' and 'Securities Account' are heavily imbalanced. Ideally we would have more data for the minority classes of these columns for a good analysis.**

# Bivariate Analysis

In [None]:
# Features / target, then a scatter matrix restricted to the numeric columns
# with KDE curves on the diagonal.
X = df.drop(columns='Personal Loan')
y = df['Personal Loan']
data_num = df.select_dtypes(include='number')
numeric_names = data_num.columns.tolist()
sns.pairplot(X, vars=numeric_names, diag_kind='kde')

# Multivariate Analysis

In [None]:
# Feature frame (everything except the target) and the target column.
X, y = df.drop(columns='Personal Loan'), df['Personal Loan']

In [None]:
# thanks to Anirban Datta
corr = X.corr()
plt.figure(figsize=(10, 8))
g = sns.heatmap(corr, annot=True, cmap = 'summer_r', square=True, linewidth=1, cbar_kws={'fraction' : 0.02})
g.set_yticklabels(g.get_yticklabels(), rotation=0, horizontalalignment='right')
bottom, top = g.get_ylim()
g.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

We see there is a very strong positive correlation between Age and Work Experience, which is expected. There is also a positive correlation between Income and Credit Card spending.

# Data Pre-processing

In [None]:
# Distinct values present in the Experience column.
df["Experience"].unique()

We see some negative values; let's count them.

In [None]:
# Number of rows with a negative Experience value.
(df["Experience"] < 0).sum()

In [None]:
# Number of rows with a zero or positive Experience value.
(df["Experience"] >= 0).sum()

We see that the number of negative Experience values is very small compared to the non-negative ones, so we will drop those rows, since negative experience doesn't make sense.

In [None]:
# Keep only rows with non-negative experience. .copy() makes df2 an independent
# frame, so the column assignments made to it later do not write into a slice
# of df (the situation pandas reports as SettingWithCopyWarning).
df2 = df[df["Experience"] >= 0].copy()
df2.head()

In [None]:
# Summary statistics of the numeric columns, one row per column.
df2.describe().T

As we saw earlier, Mortgage, Income and CCAvg contain outliers.

Let's look at Mortgage first.

In [None]:
# Number of customers whose Mortgage value is exactly 0.
(df2["Mortgage"] == 0).sum()

As the number of customers without a mortgage is very large, we might consider treating Mortgage as a categorical variable. Let's see whether that would be a good choice.

In [None]:
# Count customers in each combination of "has a mortgage" x "took the loan".
# Each count uses one combined boolean mask; chaining [mask][mask] applies a
# full-length mask to an already filtered Series and only works through
# implicit index alignment.
no_mortgage = df2["Mortgage"] == 0
has_mortgage = df2["Mortgage"] > 0
took_loan = df2["Personal Loan"] == 1
declined_loan = df2["Personal Loan"] == 0

mortgage = {
    "Personal_loan_and_no_mortagage":    int((no_mortgage & took_loan).sum()),
    "no_Personal_loan_and_no_mortagage": int((no_mortgage & declined_loan).sum()),
    "no_Personal_loan_and_mortagage":    int((has_mortgage & declined_loan).sum()),
    "Personal_loan_and_mortagage":       int((has_mortgage & took_loan).sum()),
}
mortgage

In [None]:
# Bar chart of the four mortgage / personal-loan group counts.
xpos = np.arange(len(mortgage))
value = list(mortgage.values())
keys = list(mortgage.keys())
plt.bar(xpos, value)
# Label each bar with its group name; without the labels the ticks read 0..3
# and the groups cannot be told apart.
plt.xticks(xpos, keys, rotation=20, ha="right")
plt.ylabel("Count")
plt.title('Mortgage')
plt.show()

As we can see above, we can't neglect any of these groups, as each might affect our target variable "Personal Loan", so we can't treat Mortgage as a categorical variable.

Now scaling the non-categorical columns.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Column labels of the cleaned frame.
df2.columns


In [None]:
# Min-max scaler created with its default arguments; it is fitted in the next cell.
scale = MinMaxScaler()

In [None]:
# Min-max scale the numeric attributes.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# made further down, so test rows influence the scaling parameters.
numeric_features = ['Income', 'CCAvg', "Mortgage", "Age", "Experience"]
X = df2[numeric_features]
scaledX = scale.fit_transform(X)
# assign() returns a new frame with the scaled columns swapped in, rather than
# writing column by column into df2, which was created by filtering df.
# scaledX.T yields one 1-D array per feature, in numeric_features order.
df2 = df2.assign(**dict(zip(numeric_features, scaledX.T)))
df2.head()

In [None]:
# Density histogram with KDE curve of scaled Income (histplot replaces the
# deprecated sns.distplot).
sns.histplot(df2["Income"], kde=True, stat="density")

In [None]:
# Density histogram with KDE curve of scaled CCAvg (histplot replaces the
# deprecated sns.distplot).
sns.histplot(df2["CCAvg"], kde=True, stat="density")

As the Income and CCAvg distributions are right-skewed (they have a long right tail), we will remove the outliers at the upper end.

In [None]:
# Upper cut-off for Income: mean plus three standard deviations.
income = df2["Income"]
upper_limit_income = income.mean() + 3 * income.std()
upper_limit_income

In [None]:
# Upper cut-off for CCAvg: mean plus two standard deviations.
# NOTE(review): Income uses three standard deviations, CCAvg two - confirm this is intended.
ccavg = df2["CCAvg"]
upper_limit_ccavg = ccavg.mean() + 2 * ccavg.std()
upper_limit_ccavg

In [None]:
# Row / column count before the outlier rows are removed.
df2.shape

In [None]:
# Keep rows below both upper limits. The two conditions are combined into a
# single mask; chaining df2[mask][mask] applies a full-length mask to an
# already filtered frame and only works because pandas re-aligns it.
# .copy() makes df3 independent of df2.
keep_rows = (df2["Income"] < upper_limit_income) & (df2["CCAvg"] < upper_limit_ccavg)
df3 = df2[keep_rows].copy()
df3.shape

In [None]:
# Income density after outlier removal (histplot replaces the deprecated
# sns.distplot).
sns.histplot(df3["Income"], kde=True, stat="density")

In [None]:
# CCAvg density after outlier removal (histplot replaces the deprecated
# sns.distplot).
sns.histplot(df3["CCAvg"], kde=True, stat="density")

The graphs above look better than the previous ones.

Now that we have prepared good data, let's build the model.

# Building model

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


Feature variable X

In [None]:
# Feature matrix: every column of the filtered frame except the target.
X = df3.drop(columns=['Personal Loan'])
X.head(3)

Target Variable y

In [None]:
# Target vector: the 'Personal Loan' column of the outlier-filtered frame.
y = df3["Personal Loan"]
y.head()

**Train-test data split**

In [None]:
# Hold out 20% of the rows for the final evaluation. stratify=y keeps the
# loan / no-loan proportions the same in the train and test parts, so the
# held-out scores are not skewed by an unlucky draw of the classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

**Testing Random forest**

In [None]:
from sklearn.model_selection import ShuffleSplit          # random shuffled splits rather than ordered folds
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# The splitter is seeded, but an unseeded forest still gives different scores
# on every run; random_state makes this cell reproducible.
# NOTE(review): X, y are the full data set, including the rows held out as X_test.
cross_val_score(RandomForestClassifier(random_state=0), X, y, cv=cv)

**Testing Decision Tree**

In [None]:
from sklearn.model_selection import ShuffleSplit          # random shuffled splits rather than ordered folds
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# Seed the tree as well as the splitter so the scores are reproducible.
# NOTE(review): X, y are the full data set, including the rows held out as X_test.
cross_val_score(DecisionTreeClassifier(random_state=0), X, y, cv=cv)

**Parameter tuning using GridSearchCV**

In [None]:
# Candidate models and the hyper-parameter grid searched for each one.
# Every entry maps a display name to an unfitted estimator ('model') and the
# grid passed to GridSearchCV ('params'); an empty grid fits the defaults only.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        # Seeded so the search result is reproducible.
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        # multi_class='auto' is omitted: it is the default, and passing the
        # argument is deprecated in recent scikit-learn releases.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
    'DecisionTree': {
        # Seeded so the search result is reproducible.
        'model': DecisionTreeClassifier(random_state=0),
        'params': {
            'criterion': ["gini", "entropy"],
        },
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {},
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {},
    },
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Run a grid search per candidate model on the training split, recording the
# best cross-validated score / parameters and keeping the refitted estimator.
scores = []
best_estimators = {}
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    best_estimators[model_name] = clf.best_estimator_

# The table gets its own name rather than `df`, so the loaded data set is not
# overwritten and the earlier cells that read `df` stay re-runnable.
model_scores = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
model_scores

From the table above, we can see that the Decision Tree comes out best for our case.

In [None]:
# Take the tuned decision tree from the per-model best estimators collected
# during the grid search.
best_clf = best_estimators["DecisionTree"]

**CONFUSION Matrix**

In [None]:
from sklearn.metrics import confusion_matrix

# Predict on the held-out rows and tabulate true labels against predictions.
y_pred = best_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
import seaborn as sn
plt.figure(figsize = (10,7))
# fmt='d' prints the cell counts as plain integers; the default annotation
# format switches to scientific notation for larger counts.
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()

**Relative importance of feature variable**

In [None]:
# Thanks to Anirban Datta
best_clf.fit(X_train, y_train)

features = list(X_train.columns)
importances = best_clf.feature_importances_
indices = np.argsort(importances)

fig, ax = plt.subplots(figsize=(10, 7))
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
ax.tick_params(axis="x", labelsize=12)
ax.tick_params(axis="y", labelsize=14)
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance', fontsize = 18)
plt.show()

**Thus we conclude that Income is the key feature, followed by Education, CCAvg and Family, which seems logical too. The best-fit model for our case turns out to be the Decision Tree.**