In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set default setting of seaborn
sns.set()


In [3]:
# read the data
raw_data = pd.read_csv('personal_loan.csv')
raw_data.head()



Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [6]:
raw_data.shape

(5000, 14)

In [4]:
data = raw_data.drop(['ID','ZIP Code'],axis= 1)
data.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [5]:
data[['Experience']] = data[['Experience']].replace([-3,-2,-1],np.NaN)

In [7]:
data = data.dropna()

data.describe()



Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0
mean,45.557195,20.331043,73.81447,2.391471,1.935926,1.878941,56.634398,0.097009,0.104285,0.061035,0.597009,0.294058
std,11.320735,11.311973,46.112596,1.148444,1.747694,0.839745,101.828885,0.296,0.30566,0.239418,0.490549,0.455664
min,24.0,0.0,8.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,36.0,10.75,39.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,46.0,20.0,64.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,55.0,30.0,98.0,3.0,2.6,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,67.0,43.0,224.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# get the count of missing values
missing_values = data.isnull().sum()

# print the count of missing values
print(missing_values)

Age                   0
Experience            0
Income                0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64


In [9]:
# check correlation
data_num = data.drop(['Personal Loan','Family','Education','Online','Securities Account','CD Account','CreditCard'], axis=1)
corr = data_num.corr()
corr


Unnamed: 0,Age,Experience,Income,CCAvg,Mortgage
Age,1.0,0.994101,-0.058006,-0.050879,-0.015184
Experience,0.994101,1.0,-0.049245,-0.048939,-0.013459
Income,-0.058006,-0.049245,1.0,0.646178,0.206921
CCAvg,-0.050879,-0.048939,0.646178,1.0,0.109905
Mortgage,-0.015184,-0.013459,0.206921,0.109905,1.0


In [29]:
drop_list = ['Age']
data_cleaned = data.drop(drop_list,axis = 1 )        
data_cleaned.head()

Unnamed: 0,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1.0,49,4,1.6,1,0,0,1,0,0,0
1,19.0,34,3,1.5,1,0,0,1,0,0,0
2,15.0,11,1,1.0,1,0,0,0,0,0,0
3,9.0,100,1,2.7,2,0,0,0,0,0,0
4,8.0,45,4,1.0,2,0,0,0,0,0,1


In [38]:
y = data_cleaned['Personal Loan']
x = data_cleaned.drop(['Personal Loan'],axis = 1)

In [39]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x)

X = scaler.transform(x)
y = np.array(y)

In [40]:
x.set_index('Experience').to_csv('scale.csv')

In [41]:
# Oversample and plot imbalanced dataset with SMOTE
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot
from numpy import where


In [42]:
print(Counter(y),len(X))

Counter({0: 4468, 1: 480}) 4948


In [43]:
# transform the dataset
oversample = SMOTE()

In [44]:
X, y = oversample.fit_resample(X, y)

In [46]:
# summarize the new class distribution
counter = Counter(y)
print(counter,len(X))

Counter({0: 4468, 1: 4468}) 8936


In [48]:
from sklearn.model_selection import train_test_split
# let us now split the dataset into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=10)

# print the shape of 'x_train'
print("X_train ",X_train.shape)

# print the shape of 'x_test'
print("X_test ",X_test.shape)

# print the shape of 'y_train'
print("y_train ",y_train.shape)

# print the shape of 'y_test'
print("y_test ",y_test.shape)

X_train  (6255, 10)
X_test  (2681, 10)
y_train  (6255,)
y_test  (2681,)


In [49]:
from sklearn.naive_bayes import GaussianNB

# build the model
GNB_smote = GaussianNB()

# fit the model
GNB_smote.fit(X_train, y_train)

GaussianNB()

In [50]:
# predict the values
y_pred_smote  = GNB_smote.predict(X_test)

In [54]:
from sklearn import metrics
# compiling the required information
GNB_smote_metrics = pd.Series({'Model': "Navie bayes model",
                     'AUC Score' : metrics.roc_auc_score(y_test, y_pred_smote),
                 'Precision Score': metrics.precision_score(y_test, y_pred_smote),
                 'Recall Score': metrics.recall_score(y_test, y_pred_smote),
                 'Accuracy Score': metrics.accuracy_score(y_test, y_pred_smote),
                  'f1-score':metrics.f1_score(y_test, y_pred_smote)})



# appending our result table
result_tabulation = pd.DataFrame(GNB_smote_metrics)

# view the result table
result_tabulation

Unnamed: 0,0
Model,Navie bayes model
AUC Score,0.856118
Precision Score,0.873117
Recall Score,0.830317
Accuracy Score,0.856397
f1-score,0.851179


In [55]:
import pickle

In [56]:
pickle.dump(GNB_smote,open('Personal.pkl','wb'))