In [1]:
#Handling imbalanced datasets in machine learning

from __future__ import division, print_function, unicode_literals
import numpy as np
import pandas as pd
from pandas import read_csv
import math
import sklearn
from sklearn import linear_model
import random
from sklearn import cluster
from sklearn.naive_bayes import GaussianNB

In [2]:
#loading data

creditData=np.loadtxt('creditcard.csv',dtype=np.str,delimiter=',',skiprows=1)


In [3]:
print(creditData.shape)

#stripping "" from output variable
creditData[:,creditData.shape[1]-1]=np.core.defchararray.strip(creditData[:,creditData.shape[1]-1], chars='"')
#type conversion
creditData=creditData.astype(np.float)
print(creditData.dtype)

(284807, 31)
float64


In [4]:
#1. Data Level Approach
fraudRate=(np.count_nonzero(creditData[:,creditData.shape[1]-1])/creditData.shape[0])
print(fraudRate)#ratio of non fraud to fraud data

0.001727485630620034


In [5]:
#splitting between train and test set
xTrain, xTest, yTrain, yTest=sklearn.model_selection.train_test_split(creditData[:,:creditData.shape[1]-1],creditData[:,creditData.shape[1]-1],test_size=0.20)

#test contains test input and output both
test=pd.DataFrame(xTest)
test[test.shape[1]]=yTest
testF=test[test[test.shape[1]-1]==1]#test data for fraud examples
testNf=test[test[test.shape[1]-1]==0]#test data for non fraud examples


train=pd.DataFrame(xTrain)
train[train.shape[1]]=yTrain
trainF=train[train[train.shape[1]-1]==1]#train data for fraud examples
trainNf=train[train[train.shape[1]-1]==0]#train data for non fraud examples

#reducing the size of data
train=pd.DataFrame()
xTrain=pd.DataFrame()
yTrain=pd.DataFrame()
for i in range(300):
    r=random.randint(0,trainNf.shape[0])
    train=train.append(trainNf.iloc[r,:])  
for i in range(30):
    r=random.randint(0,trainF.shape[0])
    train=train.append(trainF.iloc[r,:])

xTrain=xTrain.append(train.loc[:,:train.shape[1]-2])
yTrain[train.shape[1]-1]=train.loc[:,train.shape[1]-1]

trainF=train[train[train.shape[1]-1]==1]
trainNf=train[train[train.shape[1]-1]==0]

testN=pd.DataFrame()
for i in range(testF.shape[0]):
    r=random.randint(0,testNf.shape[0])
    testN=testN.append(testNf.iloc[r,:])
testNf=testN

print(xTrain.shape,yTrain.shape,trainF.shape,trainNf.shape,testF.shape,testNf.shape)

(330, 30) (330, 1) (30, 31) (300, 31) (87, 31) (87, 31)


In [6]:
#Random undersampling
train=pd.DataFrame()
for i in range(np.count_nonzero(yTrain)):
    r=np.random.randint(0,trainNf.shape[0])
    train=train.append(trainNf.iloc[r,:])  
train=train.append(trainF)


In [7]:
#prototype based classification on original data(bad classification with eucledian distance metric)

clf=sklearn.neighbors.NearestCentroid().fit(xTrain,np.ravel(yTrain))

print(clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]))#accuracy on non fraud examples
print(clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]))#accuracy on fraud examples

#prototype based classification on random undersampled data
clf=sklearn.neighbors.NearestCentroid().fit(train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))
print(clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]))#accuracy on non fraud examples
print(clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]))#accuracy on fraud examples

0.45977011494252873
0.5172413793103449
0.45977011494252873
0.6091954022988506


In [8]:
#prototype based classification on original data(classification with minkowski distance with p=-1)

def dist(x,y):
    x=abs(x-y)
    x=x**(-0.1)
    s=np.sum(x)
    s=s**(-1/0.1)
    return s
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])/(precision+(clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]))))

#prototype based classification on random undersampled data
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])/(precision+(clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]))))


#Naive bayes classifier

clf=GaussianNB().fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])/(precision+(clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]))))

#naive bayes classification on random undersampled data
clf=GaussianNB().fit(train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])/(precision+(clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]))))


#Decision tree classifier

clf = sklearn.tree.DecisionTreeClassifier().fit(xTrain,yTrain)

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


#decision tree classification on random undersampled data
clf=sklearn.tree.DecisionTreeClassifier().fit(train.loc[:,:29],train.loc[:,30])
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

clf = sklearn.tree.DecisionTreeClassifier(class_weight={1:10}).fit(xTrain,yTrain)

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


# K- nearest neighbor

clf = sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#K nearest neighbor classification on random undersampled data
clf=sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precision
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


#logistic regression on original data
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)
clf=linear_model.LogisticRegression().fit(xTrain,np.ravel(yTrain))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#logistic regression on random undersampled data
clf=linear_model.LogisticRegression().fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print(TN,TP,FP)
print("precision ",precision)#precision
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

clf=linear_model.LogisticRegression(class_weight={1:10}).fit(xTrain,np.ravel(yTrain))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))




#SVM on original data
clf=sklearn.svm.SVC(kernel='linear').fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#SVM on random undersampled data
clf=sklearn.svm.SVC(kernel='linear').fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

clf=sklearn.svm.SVC(kernel='linear',class_weight={1:10}).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


#Random Oversampling

train=pd.DataFrame()
for i in range(yTrain.shape[0]-2*np.count_nonzero(yTrain)):
    r=np.random.randint(0,trainF.shape[0])
    train=train.append(trainF.iloc[r,:])  
train=train.append(trainNf)


#prototype based classification on original data(classification with minkowski distance)


def dist(x,y):
    x=abs(x-y)
    x=x**(-0.1)
    s=np.sum(x)
    s=s**(-1/0.1)
    return s
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#prototype based classification on random undersampled data
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


#Naive bayes classifier

clf=GaussianNB().fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#naive bayes classification on random undersampled data
clf=GaussianNB().fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


#Decision tree classifier

clf = sklearn.tree.DecisionTreeClassifier().fit(xTrain,yTrain)

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#decision tree classification on random undersampled data
clf=sklearn.tree.DecisionTreeClassifier().fit(train.loc[:,:29],train.loc[:,30])
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


# K- nearest neighbor

clf = sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#K nearest neighbor classification on random undersampled data
clf=sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


#logistic regression on original data
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)
clf=linear_model.LogisticRegression().fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#logistic regression on random undersampled data
clf=linear_model.LogisticRegression().fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))



#SVM on original data
clf=sklearn.svm.SVC(kernel='linear').fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))
               
#SVM on random undersampled data
clf=sklearn.svm.SVC(kernel='linear').fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


#k means on non fraud data with number of centres=number of fraud data
clf=cluster.KMeans(n_clusters=trainF.shape[0]).fit(trainNf.loc[:,:29])
train=pd.DataFrame(clf.cluster_centers_)
train[30]=np.zeros(train.shape[0])
train=train.append(trainF)
print(train.shape)

#prototype based classification on original data(classification with minkowski distance)


def dist(x,y):
    x=abs(x-y)
    x=x**(-0.1)
    s=np.sum(x)
    s=s**(-1/0.1)
    return s
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#prototype based classification on random undersampled data
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  1.0
recall  0.7816091954022989
precision  1.0
F1 score  0.8774193548387098
accuracy on non fraud samples  1.0
recall  0.7701149425287356
precision  1.0
F1 score  0.8701298701298702




In [39]:
#Naive bayes classifier

clf=GaussianNB().fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#naive bayes classification on random undersampled data
clf=GaussianNB().fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.6
precision  0.9692307692307692
F1 score  0.7411764705882353
accuracy on non fraud samples  0.9619047619047619
recall  0.7047619047619048
precision  0.9487179487179487
F1 score  0.8087431693989072


In [40]:
#Decision tree classifier

clf = sklearn.tree.DecisionTreeClassifier().fit(xTrain,yTrain)

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#decision tree classification on random undersampled data
clf=sklearn.tree.DecisionTreeClassifier().fit(train.loc[:,:29],train.loc[:,30])
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.7904761904761904
precision  0.9764705882352941
F1 score  0.8736842105263157
accuracy on non fraud samples  0.8571428571428571
recall  0.8952380952380953
precision  0.8623853211009175
F1 score  0.8785046728971964


In [41]:
# K- nearest neighbor

clf = sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#K nearest neighbor classification on random undersampled data
clf=sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


  


accuracy on non fraud samples  0.9523809523809523
recall  0.5904761904761905
precision  0.9253731343283582
F1 score  0.7209302325581395
accuracy on non fraud samples  0.9428571428571428
recall  0.819047619047619
precision  0.9347826086956522
F1 score  0.8730964467005077


In [42]:
#logistic regression on original data
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)
clf=linear_model.LogisticRegression().fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#logistic regression on random undersampled data
clf=linear_model.LogisticRegression().fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.7904761904761904
precision  0.9764705882352941
F1 score  0.8736842105263157
accuracy on non fraud samples  0.9523809523809523
recall  0.9047619047619048
precision  0.95
F1 score  0.9268292682926829


In [43]:
#SVM on original data
clf=sklearn.svm.SVC(kernel='linear').fit(xTrain,np.ravel(yTrain))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#SVM on random undersampled data
clf=sklearn.svm.SVC(kernel='linear').fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.3619047619047619
precision  0.95
F1 score  0.5241379310344827
accuracy on non fraud samples  0.9904761904761905
recall  0.5333333333333333
precision  0.9824561403508771
F1 score  0.691358024691358


In [44]:
#cluster based oversampling
dataCluster=pd.DataFrame()
dataCluster=dataCluster.append(trainNf)
#making clusters
clf1=cluster.KMeans(n_clusters=20).fit(trainNf.loc[:,:29])
clusterTotal=pd.DataFrame(clf1.predict(trainNf.loc[:,:29]))
dataCluster[31]=np.array(clusterTotal)
clusterTotal1=clusterTotal.groupby([0]).size()
clusterMax1=clusterTotal1.max()

clf2=cluster.KMeans(n_clusters=2).fit(trainF.loc[:,:29])
clusterTotal=pd.DataFrame(clf2.predict(trainF.loc[:,:29]))
clusterTotal2=clusterTotal.groupby([0]).size()
clusterMax2=20*clusterMax1//2

#oversampling data of each cluster

dataCluster=dataCluster.sort_values(by=30)
train=pd.DataFrame()
train=train.append(trainNf)
currind=0
for i in range(20):
    for j in (clusterMax1-clusterTotal1):
        r=np.random.randint(0,clusterTotal1[i])
        train=train.append(dataCluster.iloc[r+currind,:31])
    currind=currind+clusterTotal1[i]
#print(train.head)

dataCluster=pd.DataFrame(trainF)
dataCluster=dataCluster.sort_values(by=30)
train=train.append(trainF)
currind=0
for i in range(2):
    for j in range(clusterMax2):
        r=np.random.randint(0,clusterTotal2[i])
        train=train.append(dataCluster.iloc[r+currind,:31])
    currind=currind+clusterTotal2[i]
print(train.groupby([30]).size())

30
0.0    700
1.0    470
dtype: int64


In [45]:
#prototype based classification on original data(classification with minkowski distance)


def dist(x,y):
    x=abs(x-y)
    x=x**(-0.1)
    s=np.sum(x)
    s=s**(-1/0.1)
    return s
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#prototype based classification on random undersampled data
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9904761904761905
recall  0.6
precision  0.984375
F1 score  0.7455621301775147
accuracy on non fraud samples  0.9904761904761905
recall  0.5714285714285714
precision  0.9836065573770492
F1 score  0.7228915662650602




In [46]:
#Naive bayes classifier

clf=GaussianNB().fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#naive bayes classification on random undersampled data
clf=GaussianNB().fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.6
precision  0.9692307692307692
F1 score  0.7411764705882353
accuracy on non fraud samples  0.9809523809523809
recall  0.6571428571428571
precision  0.971830985915493
F1 score  0.7840909090909092


In [47]:
#Decision tree classifier

clf = sklearn.tree.DecisionTreeClassifier().fit(xTrain,yTrain)

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#decision tree classification on random undersampled data
clf=sklearn.tree.DecisionTreeClassifier().fit(train.loc[:,:29],train.loc[:,30])
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9714285714285714
recall  0.8
precision  0.9655172413793104
F1 score  0.8750000000000001
accuracy on non fraud samples  0.9809523809523809
recall  0.7714285714285715
precision  0.9759036144578314
F1 score  0.8617021276595745


In [48]:
# K- nearest neighbor

clf = sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


#K nearest neighbor classification on random undersampled data
clf=sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


  


accuracy on non fraud samples  0.9523809523809523
recall  0.5904761904761905
precision  0.9253731343283582
F1 score  0.7209302325581395
accuracy on non fraud samples  0.9428571428571428
recall  0.7904761904761904
precision  0.9325842696629213
F1 score  0.8556701030927835


In [49]:
#logistic regression on original data
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None) 
clf=linear_model.LogisticRegression().fit(xTrain,np.ravel(yTrain))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#logistic regression on random undersampled data 
clf=linear_model.LogisticRegression().fit(train.loc[:,:29],np.ravel(train.loc[:,30]) )
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.7904761904761904
precision  0.9764705882352941
F1 score  0.8736842105263157
accuracy on non fraud samples  0.9904761904761905
recall  0.7619047619047619
precision  0.9876543209876543
F1 score  0.8602150537634409


In [50]:
#SVM

#SVM on original data
clf=sklearn.svm.SVC(kernel='linear').fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#SVM on random undersampled data
clf=sklearn.svm.SVC(kernel='linear').fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.3619047619047619
precision  0.95
F1 score  0.5241379310344827
accuracy on non fraud samples  0.9904761904761905
recall  0.6857142857142857
precision  0.9863013698630136
F1 score  0.8089887640449438


In [51]:
#smote
train=pd.DataFrame(trainF)
print(train.shape)
#print(trainF.iloc[0,:29])
for i in range(yTrain.shape[0]-2*np.count_nonzero(yTrain)):
    j=np.random.randint(0,trainF.shape[0])
    neigh=sklearn.neighbors.KNeighborsClassifier(n_neighbors=1).fit(trainF.loc[:,:29],trainF.loc[:,30])
    neighbour=neigh.kneighbors(np.reshape(np.array(trainF.iloc[j,:30]),(1,-1)), 2, False)
    p=np.random.random()
    newPoint=(trainF.iloc[j,:30])+p*(trainF.iloc[neighbour[0][1],:30]-trainF.iloc[j,:30])
    newPoint=newPoint.append(pd.Series(1))
    newPoint=newPoint.reset_index(drop=True)
    train=train.append(newPoint,ignore_index=True)
print(train.shape)
train=train.append(trainNf)
print(train.shape)

(30, 31)
(300, 31)
(600, 31)


In [52]:
#prototype based classification on original data(classification with minkowski distance)


def dist(x,y):
    x=abs(x-y)
    x=x**(-0.1)
    s=np.sum(x)
    s=s**(-1/0.1)
    return s
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#prototype based classification on random undersampled data
clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9904761904761905
recall  0.6
precision  0.984375
F1 score  0.7455621301775147
accuracy on non fraud samples  0.9904761904761905
recall  0.4666666666666667
precision  0.98
F1 score  0.632258064516129




In [53]:
#Naive bayes classifier

clf=GaussianNB().fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#naive bayes classification on random undersampled data
clf=GaussianNB().fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.6
precision  0.9692307692307692
F1 score  0.7411764705882353
accuracy on non fraud samples  0.9809523809523809
recall  0.6571428571428571
precision  0.971830985915493
F1 score  0.7840909090909092


In [54]:
#Decision tree classifier

clf = sklearn.tree.DecisionTreeClassifier().fit(xTrain,yTrain)

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#decision tree classification on random undersampled data
clf=sklearn.tree.DecisionTreeClassifier().fit(train.loc[:,:29],train.loc[:,30])
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9714285714285714
recall  0.7619047619047619
precision  0.963855421686747
F1 score  0.8510638297872339
accuracy on non fraud samples  0.9714285714285714
recall  0.8
precision  0.9655172413793104
F1 score  0.8750000000000001


In [55]:
# K- nearest neighbor

clf = sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#K nearest neighbor classification on random undersampled data
clf=sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


  


accuracy on non fraud samples  0.9523809523809523
recall  0.5904761904761905
precision  0.9253731343283582
F1 score  0.7209302325581395
accuracy on non fraud samples  0.9523809523809523
recall  0.8
precision  0.9438202247191011
F1 score  0.8659793814432991


In [56]:
#logistic regression on original data
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)
clf=linear_model.LogisticRegression().fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#logistic regression on random undersampled data
clf=linear_model.LogisticRegression().fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.7904761904761904
precision  0.9764705882352941
F1 score  0.8736842105263157
accuracy on non fraud samples  0.9714285714285714
recall  0.8476190476190476
precision  0.967391304347826
F1 score  0.9035532994923857


In [57]:
#SVM on original data
clf=sklearn.svm.SVC(kernel='linear').fit(xTrain,np.ravel(yTrain))

print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))

#SVM on random undersampled data
clf=sklearn.svm.SVC(kernel='linear').fit(train.loc[:,:29],np.ravel(train.loc[:,30]))
print("accuracy on non fraud samples ",clf.score(testNf.loc[:,:29],testNf.loc[:,30]))#accuracy on non fraud examples
print("recall ", clf.score(testF.loc[:,:29],testF.loc[:,30]))#accuracy on fraud examples(recall)
TP=clf.score(testF.loc[:,:29],testF.loc[:,30])*(testF.shape[0])
TN=clf.score(testNf.loc[:,:29],testNf.loc[:,30])*(testNf.shape[0])
FP=(testNf.shape[0])-TN
precision=(TP)/(TP+FP)
print("precision ",precision)#precisio
print("F1 score ",2*precision*clf.score(testF.loc[:,:29],testF.loc[:,30])/(precision+(clf.score(testF.loc[:,:29],testF.loc[:,30]))))


accuracy on non fraud samples  0.9809523809523809
recall  0.3619047619047619
precision  0.95
F1 score  0.5241379310344827
accuracy on non fraud samples  0.9809523809523809
recall  0.7428571428571429
precision  0.975
F1 score  0.8432432432432432
