In [1]:
import pandas as pd
import numpy as np
from scipy import spatial

In [2]:
colnames = ['id','age','yoe','income','zip','family','monthlyexp','education',
           'mortagev','class','otherbankacc','certificate','internetbanking','creditcard']
dataset = pd.read_csv("../input_data/LoanDataset/data.csv", names=colnames, header=None)
dataset = dataset.drop([0])

In [3]:
dataset.head()

Unnamed: 0,id,age,yoe,income,zip,family,monthlyexp,education,mortagev,class,otherbankacc,certificate,internetbanking,creditcard
1,2701,31,5.0,39,94590,4,2.2,2.0,0,0.0,0.0,0.0,1.0,1.0
2,2716,42,18.0,54,90089,1,1.8,1.0,0,0.0,0.0,0.0,1.0,0.0
3,3359,59,35.0,40,94536,4,0.4,1.0,0,0.0,0.0,0.0,0.0,0.0
4,2492,38,14.0,80,92868,2,2.7,1.0,0,0.0,0.0,0.0,1.0,0.0
5,937,62,32.0,19,92109,1,1.5,3.0,0,0.0,1.0,0.0,0.0,0.0


In [38]:
def safe_div(x,y):
    if y == 0:
        return 0
    return x / y

In [39]:
def train_validate_test_split(dataset):
    size = len(dataset)
    tsize = int(size*0.6)
    vsize = int(size*0.8)
    training_data = dataset.iloc[:tsize].reset_index(drop=True)
    validation_data = dataset.iloc[tsize:vsize].reset_index(drop=True)
    testing_data = dataset.iloc[vsize:].reset_index(drop=True)
    return training_data,validation_data,testing_data

In [40]:
def meanstdv(col):
    mean = col.mean()
    stdv = col.std()
    return mean,stdv

In [41]:
def pdf(x,mean,stdv):
    exp = np.exp(-(np.power(x-mean,2)/(2*np.power(stdv,2))))
    return (1/(np.sqrt(2*np.pi)*stdv))*exp

In [42]:
def classprobablities(column):
    counts = column.value_counts()
    prob = np.zeros(len(column.unique()))
    for i in range(len(prob)):
        prob[i] = counts.iloc[[i]].iloc[0]/column.size
    return prob    

In [43]:
def summaries(dataset):
    summary = {}
    attributes = dataset.keys()
    for att in attributes:
        summary[att] = []
        summary[att].append([meanstdv(dataset[att])])
#     print(summary)
    return summary    

In [44]:
def conditionalprobablities(dataset):
    sets = []
    classes = dataset['class'].unique()
    for c in classes:
        sets.append([(dataset.loc[dataset['class'] == c]).drop('class',axis=1)])
    summary = []
    for s in sets:
        summary.append(summaries(s[0]))
    return summary

In [45]:
def predict(classprob,summary,classes,sample):
    l = sample.size
    pred = []
    noc = len(classes)
#     print(l,noc,classes[0])
    attr = sample.keys()
    for i in range(noc):
        csummary = summary[i]
        cprob = 1
        for j in range(l):
            tmean, tstdv = csummary[attr[j]][0][0][0],csummary[attr[j]][0][0][0]
            cprob *= pdf(sample.iloc[j],tmean,tstdv)
#             print(cprob)
        pred.append(cprob*classprob[i])
    pred = np.asarray(pred)
    return classes[np.argmax(pred)]

In [46]:
def getpredictions(classprob,summary,classes,data):
    predictions = []
    l = len(data)
    for i in range(l):
        predictions.append(predict(classprob,summary,classes,data.iloc[i]))
    return predictions

In [47]:
def sklearnnaivebayes(data):
    from sklearn import metrics
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    X = data[0].drop('class',axis=1)
    y = data[0]['class']
    model.fit(X,y)
    print("Stats on training data")
    predictions = model.predict(X)
    stats(predictions,data[0]['class'])
    print("\n\n")
    print("Stats on validation data")
    predictions = model.predict(data[1].drop('class',axis=1))
    stats(predictions,data[1]['class'])
    print("\n\n")
    print("Stats on testing data")
    predictions = model.predict(data[2].drop('class',axis=1))
    stats(predictions,data[2]['class'])

In [48]:
def stats(preds,y): 
    TP,TN,FP,FN = 0,0,0,0
    for i in range(len(preds)):
        if preds[i] == 0.0:
            if y[i] == 0:
                TN += 1
            else:
                FN += 1
        else:
            if y[i] == 0:
                FP += 1
            else:
                TP += 1
    classification_error = safe_div((FP+FN),(TP+FP+TN+FN)) 
    accuracy = safe_div((TP+TN),(TP+FP+TN+FN)) 
    recall = safe_div(TP,(TP+FN))
    precision = safe_div(TP,(TP+FP))
    f1_score = safe_div(2,(safe_div(1,precision))+safe_div(1,recall))
    #print(TP,TN,FP,FN)
    print("Classification error:",classification_error)
    print("Accuracy:",accuracy)
    print("Recall:",recall)
    print("Precision:",precision)
    print("F1 Score:",f1_score)

In [49]:
training_data,validation_data,testing_data = train_validate_test_split(dataset)
summary = conditionalprobablities(training_data)
classprob = classprobablities(training_data['class'])
print("Stats on training data")
predictions = getpredictions(classprob,summary,training_data['class'].unique(),training_data.drop('class',axis=1))
stats(predictions,training_data['class'])

# for i in range(5):
#     print('i am i',i)
#     print(predict(classprob,summary,dataset['class'].unique(),tdataset.iloc[i]))

Stats on training data
Classification error: 0.19340496480177843
Accuracy: 0.8065950351982215
Recall: 0.47876447876447875
Precision: 0.24266144814090018
F1 Score: 0.3220779220779221


In [50]:
print("Stats on validation data")
predictions = getpredictions(classprob,summary,training_data['class'].unique(),validation_data.drop('class',axis=1))
stats(predictions,validation_data['class'])

Stats on validation data
Classification error: 0.20444444444444446
Accuracy: 0.7955555555555556
Recall: 0.45263157894736844
Precision: 0.24571428571428572
F1 Score: 0.3185185185185185


In [51]:
print("Stats on testing data")
predictions = getpredictions(classprob,summary,training_data['class'].unique(),testing_data.drop('class',axis=1))
stats(predictions,testing_data['class'])

Stats on testing data
Classification error: 0.21333333333333335
Accuracy: 0.7866666666666666
Recall: 0.3375
Precision: 0.16265060240963855
F1 Score: 0.21951219512195122


In [52]:
# SkLearn library results

In [53]:
sklearnnaivebayes([training_data,validation_data,testing_data])

Stats on training data
Classification error: 0.11374583178955168
Accuracy: 0.8862541682104483
Recall: 0.5714285714285714
Precision: 0.43023255813953487
F1 Score: 0.49087893864013266



Stats on validation data
Classification error: 0.11444444444444445
Accuracy: 0.8855555555555555
Recall: 0.5263157894736842
Precision: 0.46296296296296297
F1 Score: 0.4926108374384236



Stats on testing data
Classification error: 0.10222222222222223
Accuracy: 0.8977777777777778
Recall: 0.55
Precision: 0.44
F1 Score: 0.4888888888888889


In [34]:
# Observations:
# The results produced by naive bayes is -80%, which is lower to the scikit learn results. The results can be 
# improved if the data is used with other classification algorithms or more data is available. 
# The new samples can also be add to the training data to make the model performance better. 