In [1]:
#Handling imbalanced datasets in machine learning

from __future__ import division, print_function, unicode_literals
import math
import random

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import read_csv
import sklearn
import sklearn.model_selection
import sklearn.neighbors
import sklearn.svm
import sklearn.tree
from sklearn import cluster
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.naive_bayes import GaussianNB

In [2]:
#loading data
# Read every field as text (the header row is skipped); numeric conversion
# happens in the next cell. The builtin `str` replaces the removed `np.str` alias.
creditData=np.loadtxt('creditcard.csv',dtype=str,delimiter=',',skiprows=1)

In [3]:
print(creditData.shape)

#stripping "" from output variable
# The last column is the class label; any surrounding double quotes are removed
# so the whole array can be converted to float.
creditData[:,-1]=np.char.strip(creditData[:,-1], chars='"')
#type conversion
# builtin `float` replaces the removed `np.float` alias (still yields float64)
creditData=creditData.astype(float)
print(creditData.dtype)

(284807, 31)
float64


In [4]:
#1. Data Level Approach
# Fraction of rows whose label (last column) is non-zero, i.e. the share of
# fraud samples in the whole data set.
fraudRate=np.count_nonzero(creditData[:,-1])/len(creditData)
print(fraudRate)#fraction of fraud samples among all samples

0.001727485630620034


In [None]:
#splitting between train and test set
# Seed NumPy's global RNG so that the split below (random_state is left unset,
# so it draws from that RNG) and the np.random sampling in this notebook are
# repeatable across runs.
np.random.seed(42)
# Stratify on the label so the rare fraud class keeps the same proportion in
# both the train and the test part.
xTrain, xTest, yTrain, yTest=sklearn.model_selection.train_test_split(
    creditData[:,:-1],creditData[:,-1],test_size=0.20,stratify=creditData[:,-1])

#test contains test input and output both
test=pd.DataFrame(xTest)
test[test.shape[1]]=yTest  # label is stored as the new last column
testF=test[test[test.shape[1]-1]==1]#test data for fraud examples
testNf=test[test[test.shape[1]-1]==0]#test data for non fraud examples


#train contains train input and output both
train=pd.DataFrame(xTrain)
train[train.shape[1]]=yTrain  # label is stored as the new last column
trainF=train[train[train.shape[1]-1]==1]#train data for fraud examples
trainNf=train[train[train.shape[1]-1]==0]#train data for non fraud examples

#reducing the size of data
# Draw 30000 non-fraud and 30 fraud training rows (with replacement) by
# positional index in one step each, then stack them once. DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame every time.
nfPicks=np.random.randint(0,trainNf.shape[0],size=30000)
fPicks=np.random.randint(0,trainF.shape[0],size=30)
train=pd.concat([trainNf.iloc[nfPicks,:],trainF.iloc[fPicks,:]])

# features are every column but the last; the label stays a one-column frame
xTrain=train.loc[:,:train.shape[1]-2]
yTrain=train.loc[:,[train.shape[1]-1]]

trainF=train[train[train.shape[1]-1]==1]
trainNf=train[train[train.shape[1]-1]==0]

# keep as many non-fraud test rows (sampled with replacement) as there are
# fraud test rows, so both test subsets have the same size
testPicks=np.random.randint(0,testNf.shape[0],size=testF.shape[0])
testNf=testNf.iloc[testPicks,:]
print(xTrain.shape,yTrain.shape,trainF.shape,trainNf.shape,testF.shape,testNf.shape)

#tsne
# Optional 2-D t-SNE scatter plot of the reduced training set, coloured by
# label (blue = 0, red = 1). It is switched off by default; set RUN_TSNE to
# True to compute the embedding (Xnew) and draw the plot.
RUN_TSNE=False
if RUN_TSNE:
    Xnew=TSNE(n_components=2).fit_transform(xTrain)
    #print("K means for t-SNE")

    #kmeans = KMeans(n_clusters=2).fit(Xnew)
    #y=kmeans.predict(Xnew)
    fig = plt.figure(figsize=(10,10))
    #matplotlib.pyplot.scatter(Xnew[:,0], Xnew[:,1], c=y, cmap='tab10')
    colors = ['blue','red']
    # builtin `int` replaces the removed `np.int` alias
    matplotlib.pyplot.scatter(Xnew[:,0], Xnew[:,1], c=np.ravel(yTrain.astype(int)), cmap=matplotlib.colors.ListedColormap(colors))
    #plt.title('Clustering of points after running the kmeans algorithm on tSNE redued data')
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.show()

In [None]:
# Xnew only exists when the t-SNE embedding has actually been computed, so its
# shape is reported only in that case instead of raising a NameError.
if 'Xnew' in globals():
    print(Xnew.shape,yTrain.shape)
else:
    print(yTrain.shape)
# builtin `int` replaces the removed `np.int` alias
print(np.ravel(yTrain.astype(int)))

In [None]:
#Random undersampling
# Pick as many non-fraud rows (with replacement) as there are fraud labels in
# yTrain, then stack them on top of all fraud rows. A single positional lookup
# plus pd.concat replaces the removed DataFrame.append row-by-row loop.
nFraud=np.count_nonzero(yTrain)
nfPicks=np.random.randint(0,trainNf.shape[0],size=nFraud)
train=pd.concat([trainNf.iloc[nfPicks,:],trainF])


In [None]:
#prototype based classification (bad classification with euclidean distance metric)
# first on the original (reduced) data, then on the random undersampled data
fitSets=(
    (xTrain,np.ravel(yTrain)),
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),
)
for xFit,yFit in fitSets:
    clf=sklearn.neighbors.NearestCentroid().fit(xFit,yFit)
    nfFeatures=testNf.loc[:,:testNf.shape[1]-2]
    nfLabels=testNf.loc[:,testNf.shape[1]-1]
    print(clf.score(nfFeatures,nfLabels))#accuracy on non fraud examples
    fFeatures=testF.loc[:,:testF.shape[1]-2]
    fLabels=testF.loc[:,testF.shape[1]-1]
    print(clf.score(fFeatures,fLabels))#accuracy on fraud examples

In [None]:
#prototype based classification on original data(classification with minkowski-style distance with p=-0.1)

def dist(x,y,p=-0.1):
    """Minkowski-style distance (sum(|x_i-y_i|**p))**(1/p) between two vectors.

    Parameters
    ----------
    x, y : array-like of equal length
        The two points to compare.
    p : float, optional
        Exponent of the distance; must be non-zero. Defaults to -0.1.

    Returns
    -------
    float
        The distance. For a negative ``p`` the result is 0.0 as soon as any
        coordinate of ``x`` and ``y`` is equal, so this is not a true metric.
    """
    diff=np.abs(np.asarray(x,dtype=float)-np.asarray(y,dtype=float))
    # An equal coordinate gives 0**p = inf for negative p; that is the intended
    # limit (the final result collapses to 0), so silence the warning.
    with np.errstate(divide='ignore'):
        s=np.sum(diff**p)
    return s**(1.0/p)
# Nearest-centroid classification with the custom `dist` metric, fitted on the
# original (reduced) data and on the random undersampled data. Every model is
# scored once per test subset and the metrics are derived from those two scores.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("random undersampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#Naive bayes classifier
# Fitted on the original (reduced) data and on the random undersampled data.
# Every model is scored once per test subset and the metrics are derived from
# those two scores.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("random undersampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=GaussianNB().fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#Decision tree classifier
# Three runs: original (reduced) data, random undersampled data, and original
# data with the fraud class weighted 10x. Every model is scored once per test
# subset and the metrics are derived from those two scores.
runs=[("original data",sklearn.tree.DecisionTreeClassifier(),xTrain,np.ravel(yTrain)),
      ("random undersampled data",sklearn.tree.DecisionTreeClassifier(),train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),
      ("original data, class_weight={1:10}",sklearn.tree.DecisionTreeClassifier(class_weight={1:10}),xTrain,np.ravel(yTrain))]
for runName,model,xFit,yFit in runs:
    clf=model.fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
# K- nearest neighbor
# Fitted on the original (reduced) data and on the random undersampled data,
# using the custom `dist` callable defined in an earlier cell. That callable
# returns 0 for distinct points sharing any coordinate, so it is not a true
# metric; brute-force search is requested explicitly rather than letting
# algorithm='auto' choose a tree index that assumes one.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("random undersampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=sklearn.neighbors.KNeighborsClassifier(metric=dist,algorithm='brute').fit(xFit,yFit)
    # each subset is scored once; prediction with a Python callable metric is slow
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#logistic regression
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)
# Three runs: original (reduced) data, random undersampled data, and original
# data with the fraud class weighted 10x. Every model is scored once per test
# subset and the metrics are derived from those two scores.
runs=[("original data",linear_model.LogisticRegression(),xTrain,np.ravel(yTrain)),
      ("random undersampled data",linear_model.LogisticRegression(),train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),
      ("original data, class_weight={1:10}",linear_model.LogisticRegression(class_weight={1:10}),xTrain,np.ravel(yTrain))]
for runName,model,xFit,yFit in runs:
    clf=model.fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print(TN,TP,FP)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#SVM (linear kernel)
# Three runs: original (reduced) data, random undersampled data, and original
# data with the fraud class weighted 10x (that last model has no max_iter cap).
# Every model is scored once per test subset and the metrics are derived from
# those two scores.
runs=[("original data",sklearn.svm.SVC(kernel='linear',cache_size=4096,max_iter=200,verbose=True),xTrain,np.ravel(yTrain)),
      ("random undersampled data",sklearn.svm.SVC(kernel='linear',cache_size=4096,max_iter=200,verbose=True),train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),
      ("original data, class_weight={1:10}",sklearn.svm.SVC(kernel='linear',class_weight={1:10},cache_size=4096,verbose=True),xTrain,np.ravel(yTrain))]
for runName,model,xFit,yFit in runs:
    clf=model.fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)


In [None]:
#Random Oversampling
# Number of extra fraud rows needed: total - 2*fraud = non-fraud - fraud.
# Those duplicates (drawn with replacement) are stacked with the original
# fraud rows and all non-fraud rows, so both classes end up the same size and
# no original fraud example is lost. A single positional lookup plus pd.concat
# replaces the removed DataFrame.append row-by-row loop.
nExtra=max(0,yTrain.shape[0]-2*np.count_nonzero(yTrain))
fraudPicks=np.random.randint(0,trainF.shape[0],size=nExtra)
train=pd.concat([trainF.iloc[fraudPicks,:],trainF,trainNf])

In [None]:
#prototype based classification on original data(classification with minkowski distance)


def dist(x,y,p=-0.1):
    """Minkowski-style distance (sum(|x_i-y_i|**p))**(1/p) between two vectors.

    Parameters
    ----------
    x, y : array-like of equal length
        The two points to compare.
    p : float, optional
        Exponent of the distance; must be non-zero. Defaults to -0.1.

    Returns
    -------
    float
        The distance. For a negative ``p`` the result is 0.0 as soon as any
        coordinate of ``x`` and ``y`` is equal, so this is not a true metric.
    """
    diff=np.abs(np.asarray(x,dtype=float)-np.asarray(y,dtype=float))
    # An equal coordinate gives 0**p = inf for negative p; that is the intended
    # limit (the final result collapses to 0), so silence the warning.
    with np.errstate(divide='ignore'):
        s=np.sum(diff**p)
    return s**(1.0/p)
# Nearest-centroid classification with the custom `dist` metric, fitted on the
# original (reduced) data and on the random oversampled data. Every model is
# scored once per test subset and the metrics are derived from those two scores.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("random oversampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#Naive bayes classifier
# Fitted on the original (reduced) data and on the random oversampled data.
# Every model is scored once per test subset and the metrics are derived from
# those two scores.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("random oversampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=GaussianNB().fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#Decision tree classifier
# Fitted on the original (reduced) data and on the random oversampled data.
# Every model is scored once per test subset and the metrics are derived from
# those two scores.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("random oversampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=sklearn.tree.DecisionTreeClassifier().fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
# K- nearest neighbor
# Fitted on the original (reduced) data and on the random oversampled data,
# using the custom `dist` callable defined in an earlier cell. That callable
# returns 0 for distinct points sharing any coordinate, so it is not a true
# metric; brute-force search is requested explicitly rather than letting
# algorithm='auto' choose a tree index that assumes one.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("random oversampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=sklearn.neighbors.KNeighborsClassifier(metric=dist,algorithm='brute').fit(xFit,yFit)
    # each subset is scored once; prediction with a Python callable metric is slow
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#logistic regression
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)
# Fitted on the original (reduced) data and on the random oversampled data.
# Every model is scored once per test subset and the metrics are derived from
# those two scores.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("random oversampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=linear_model.LogisticRegression().fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#SVM (linear kernel)
# Fitted on the original (reduced) data and on the random oversampled data.
# Every model is scored once per test subset and the metrics are derived from
# those two scores.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("random oversampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=sklearn.svm.SVC(kernel='linear',cache_size=4096,max_iter=200,verbose=True).fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#k means on non fraud data with number of centres=number of fraud data
# The cluster centres stand in for the non-fraud class. They only hold the
# feature columns, so the label is added as a NEW last column (all zeros =
# non fraud) rather than written over the last feature column; that keeps the
# layout identical to trainF before the two are stacked.
clf=cluster.KMeans(n_clusters=trainF.shape[0]).fit(trainNf.loc[:,:trainNf.shape[1]-2])
centroids=pd.DataFrame(clf.cluster_centers_)
centroids[centroids.shape[1]]=np.zeros(centroids.shape[0])
# pd.concat replaces the removed DataFrame.append
train=pd.concat([centroids,trainF])
print(train.shape)

In [None]:
#prototype based classification on original data(classification with minkowski distance)


def dist(x,y,p=-0.1):
    """Minkowski-style distance (sum(|x_i-y_i|**p))**(1/p) between two vectors.

    Parameters
    ----------
    x, y : array-like of equal length
        The two points to compare.
    p : float, optional
        Exponent of the distance; must be non-zero. Defaults to -0.1.

    Returns
    -------
    float
        The distance. For a negative ``p`` the result is 0.0 as soon as any
        coordinate of ``x`` and ``y`` is equal, so this is not a true metric.
    """
    diff=np.abs(np.asarray(x,dtype=float)-np.asarray(y,dtype=float))
    # An equal coordinate gives 0**p = inf for negative p; that is the intended
    # limit (the final result collapses to 0), so silence the warning.
    with np.errstate(divide='ignore'):
        s=np.sum(diff**p)
    return s**(1.0/p)
# Nearest-centroid classification with the custom `dist` metric, fitted on the
# original (reduced) data and on the k-means centroid undersampled data. Every
# model is scored once per test subset and the metrics are derived from those
# two scores.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("k-means centroid undersampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)

In [None]:
#Naive bayes classifier
# Fitted on the original (reduced) data and on the k-means centroid
# undersampled data. Every model is scored once per test subset and the
# metrics are derived from those two scores.
runs=[("original data",xTrain,np.ravel(yTrain)),
      ("k-means centroid undersampled data",train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1]))]
for runName,xFit,yFit in runs:
    clf=GaussianNB().fit(xFit,yFit)
    specificity=clf.score(testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1])#accuracy on non fraud examples
    recall=clf.score(testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1])#accuracy on fraud examples(recall)
    TP=recall*(testF.shape[0])
    TN=specificity*(testNf.shape[0])
    FP=(testNf.shape[0])-TN
    # fall back to 0.0 when nothing is predicted as fraud (TP+FP == 0)
    precision=TP/(TP+FP) if (TP+FP)>0 else 0.0
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print(runName)
    print("accuracy on non fraud samples ",specificity)
    print("recall ",recall)
    print("precision ",precision)#precision
    print("F1 score ",f1)


In [None]:
#Decision tree classifier

# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.tree is loaded.
import sklearn.tree

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
# Labels are flattened with np.ravel, as in the other classifier cells.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#resampled train set ("random undersampled data" per the original comment)
)
for xFit,yFit in trainingSets:
    clf=sklearn.tree.DecisionTreeClassifier().fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
# K- nearest neighbor

# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.neighbors is loaded.
import sklearn.neighbors

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#resampled train set ("random undersampled data" per the original comment)
)
for xFit,yFit in trainingSets:
    clf=sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(xFit,yFit)
    # Score each class subset once: with the Python-level `dist` metric every
    # clf.score() call is a full, slow neighbour search over the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#logistic regression on original data
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)
# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#resampled train set ("random undersampled data" per the original comment)
)
for xFit,yFit in trainingSets:
    clf=linear_model.LogisticRegression().fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#SVM on original data
# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.svm is loaded.
import sklearn.svm

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#resampled train set ("random undersampled data" per the original comment)
)
for xFit,yFit in trainingSets:
    clf=sklearn.svm.SVC(kernel='linear',cache_size=4096,max_iter=200,verbose=True).fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#cluster based oversampling
# Cluster-based oversampling: rebuilds `train` from trainNf and trainF.
#  * every non fraud cluster is topped up, by sampling its own rows with
#    replacement, to the size of the largest non fraud cluster;
#  * every fraud cluster receives clusterMax2 extra rows sampled from itself.
# Rows are selected through the k-means label of each row, so a draw for
# cluster i can only return members of cluster i.
nfClusterCount=20
fClusterCount=2
clusterCol=trainNf.shape[1]#label of the extra column that stores the cluster id
nfFeatures=trainNf.loc[:,:trainNf.shape[1]-2]
fFeatures=trainF.loc[:,:trainF.shape[1]-2]

#making clusters
clf1=cluster.KMeans(n_clusters=nfClusterCount).fit(nfFeatures)
nfLabels=clf1.predict(nfFeatures)
dataCluster=trainNf.copy()
dataCluster[clusterCol]=nfLabels
clusterTotal=pd.DataFrame(nfLabels)
clusterTotal1=clusterTotal.groupby([0]).size()#rows per non fraud cluster
clusterMax1=clusterTotal1.max()

clf2=cluster.KMeans(n_clusters=fClusterCount).fit(fFeatures)
fLabels=clf2.predict(fFeatures)
clusterTotal=pd.DataFrame(fLabels)
clusterTotal2=clusterTotal.groupby([0]).size()#rows per fraud cluster
# Extra rows drawn per fraud cluster: the non fraud target total
# (nfClusterCount*clusterMax1) shared between the fraud clusters.
clusterMax2=nfClusterCount*clusterMax1//fClusterCount

#oversampling data of each cluster
# Collect the pieces and concatenate once; growing a DataFrame one row at a
# time copies the whole frame on every append.
parts=[trainNf]
for i in range(nfClusterCount):
    members=np.flatnonzero(nfLabels==i)#positions of cluster i inside trainNf
    deficit=clusterMax1-members.size
    if members.size and deficit>0:
        picks=members[np.random.randint(0,members.size,size=deficit)]
        parts.append(trainNf.iloc[picks])

dataCluster=trainF.copy()
dataCluster[clusterCol]=fLabels
parts.append(trainF)
for i in range(fClusterCount):
    members=np.flatnonzero(fLabels==i)#positions of cluster i inside trainF
    if members.size:
        picks=members[np.random.randint(0,members.size,size=clusterMax2)]
        parts.append(trainF.iloc[picks])

train=pd.concat(parts)
print(train.groupby([train.shape[1]-1]).size())

In [None]:
#prototype based classification on original data(classification with minkowski distance)


def dist(x,y):
    """Minkowski-form dissimilarity with exponent p = -0.1.

    Returns (sum_i |x_i - y_i| ** p) ** (1 / p) with p = -0.1.

    x, y: numeric arrays that support element-wise subtraction.

    Note: for NumPy float inputs a coordinate where x and y coincide gives
    0 ** -0.1 = inf, which drives the returned value to 0.0.
    """
    p=-0.1
    powered=abs(x-y)**p
    return np.sum(powered)**(1/p)
# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.neighbors is loaded.
import sklearn.neighbors

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#cluster based oversampled data
)
for xFit,yFit in trainingSets:
    clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#Naive bayes classifier

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#cluster based oversampled data
)
for xFit,yFit in trainingSets:
    clf=GaussianNB().fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#Decision tree classifier

# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.tree is loaded.
import sklearn.tree

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
# Labels are flattened with np.ravel, as in the other classifier cells.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#cluster based oversampled data
)
for xFit,yFit in trainingSets:
    clf=sklearn.tree.DecisionTreeClassifier().fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
# K- nearest neighbor

# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.neighbors is loaded.
import sklearn.neighbors

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#cluster based oversampled data
)
for xFit,yFit in trainingSets:
    clf=sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(xFit,yFit)
    # Score each class subset once: with the Python-level `dist` metric every
    # clf.score() call is a full, slow neighbour search over the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#logistic regression on original data
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None) 
# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#cluster based oversampled data
)
for xFit,yFit in trainingSets:
    clf=linear_model.LogisticRegression().fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#SVM

#SVM on original data
# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.svm is loaded.
import sklearn.svm

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#cluster based oversampled data
)
for xFit,yFit in trainingSets:
    clf=sklearn.svm.SVC(kernel='linear',cache_size=4096,max_iter=200,verbose=True).fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#smote
# SMOTE-style oversampling of the fraud rows. Each synthetic row lies on the
# segment between a randomly chosen fraud row and that row's nearest other
# fraud row: new = base + p*(neighbour - base), with p uniform in [0, 1).
# Rebuilds `train` as trainF + synthetic fraud rows + trainNf.
import sklearn.neighbors#the notebook's import cell only has `import sklearn`

train=pd.DataFrame(trainF)
print(train.shape)

labelCol=trainF.shape[1]-1#position of the class label; the columns before it are features
fraudFeatures=trainF.iloc[:,:labelCol].values
# Number of synthetic rows, same expression as the original loop bound
# (clamped at 0, where the original range() would simply have been empty).
nSynthetic=max(0,yTrain.shape[0]-2*np.count_nonzero(yTrain))

# The neighbour model does not change between samples, so fit it once and
# look up every fraud row's neighbour in a single query. Column 0 of the
# result is the closest row to the query (normally the row itself),
# column 1 the next closest.
neigh=sklearn.neighbors.KNeighborsClassifier(n_neighbors=1).fit(fraudFeatures,trainF.iloc[:,labelCol])
nearest=neigh.kneighbors(fraudFeatures,n_neighbors=2,return_distance=False)[:,1]

base=np.random.randint(0,trainF.shape[0],size=nSynthetic)#positions of the base fraud rows
p=np.random.random(size=(nSynthetic,1))#one interpolation factor per synthetic row
synthetic=fraudFeatures[base]+p*(fraudFeatures[nearest[base]]-fraudFeatures[base])
# Append the fraud label (1) and reuse trainF's column labels so the frames line up.
synthetic=pd.DataFrame(np.hstack([synthetic,np.ones((nSynthetic,1))]),columns=trainF.columns)

# One concatenation instead of appending row by row (which copies the frame each time).
train=pd.concat([train,synthetic],ignore_index=True)
print(train.shape)
train=pd.concat([train,trainNf])
print(train.shape)

In [None]:
#prototype based classification on original data(classification with minkowski distance)


def dist(x,y):
    """Minkowski-form dissimilarity with exponent p = -0.1.

    Returns (sum_i |x_i - y_i| ** p) ** (1 / p) with p = -0.1.

    x, y: numeric arrays that support element-wise subtraction.

    Note: for NumPy float inputs a coordinate where x and y coincide gives
    0 ** -0.1 = inf, which drives the returned value to 0.0.
    """
    p=-0.1
    powered=abs(x-y)**p
    return np.sum(powered)**(1/p)
# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.neighbors is loaded.
import sklearn.neighbors

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#SMOTE oversampled data
)
for xFit,yFit in trainingSets:
    clf=sklearn.neighbors.NearestCentroid(metric=dist).fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#Naive bayes classifier

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#SMOTE oversampled data
)
for xFit,yFit in trainingSets:
    clf=GaussianNB().fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#Decision tree classifier

# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.tree is loaded.
import sklearn.tree

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
# Labels are flattened with np.ravel, as in the other classifier cells.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#SMOTE oversampled data
)
for xFit,yFit in trainingSets:
    clf=sklearn.tree.DecisionTreeClassifier().fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
# K- nearest neighbor

# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.neighbors is loaded.
import sklearn.neighbors

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#SMOTE oversampled data
)
for xFit,yFit in trainingSets:
    clf=sklearn.neighbors.KNeighborsClassifier(metric=dist).fit(xFit,yFit)
    # Score each class subset once: with the Python-level `dist` metric every
    # clf.score() call is a full, slow neighbour search over the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#logistic regression on original data
#class sklearn.linear_model.LogisticRegression(penalty=’l2’, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver=’warn’, max_iter=100, multi_class=’warn’, verbose=0, warm_start=False, n_jobs=None)
# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#SMOTE oversampled data
)
for xFit,yFit in trainingSets:
    clf=linear_model.LogisticRegression().fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)


In [None]:
#SVM on original data
# Explicit submodule import: the notebook's import cell only has `import sklearn`,
# which does not by itself guarantee that sklearn.svm is loaded.
import sklearn.svm

# Held-out rows split by true class: columns up to label shape[1]-2 are the
# features, column shape[1]-1 is the class label.
xTestNf,yTestNf=testNf.loc[:,:testNf.shape[1]-2],testNf.loc[:,testNf.shape[1]-1]
xTestF,yTestF=testF.loc[:,:testF.shape[1]-2],testF.loc[:,testF.shape[1]-1]

# Fit and report once per training set; both runs print the same four metrics.
trainingSets=(
    (xTrain,np.ravel(yTrain)),#original data
    (train.loc[:,:train.shape[1]-2],np.ravel(train.loc[:,train.shape[1]-1])),#SMOTE oversampled data
)
for xFit,yFit in trainingSets:
    clf=sklearn.svm.SVC(kernel='linear',cache_size=4096,max_iter=200,verbose=True).fit(xFit,yFit)
    # Score each class subset once; every clf.score() call re-predicts the subset.
    nfAccuracy=clf.score(xTestNf,yTestNf)#accuracy on non fraud examples
    recall=clf.score(xTestF,yTestF)#accuracy on fraud examples(recall)
    print("accuracy on non fraud samples ",nfAccuracy)
    print("recall ",recall)
    TP=recall*(testF.shape[0])#fraud rows predicted as fraud
    TN=nfAccuracy*(testNf.shape[0])#non fraud rows predicted as non fraud
    FP=(testNf.shape[0])-TN#non fraud rows predicted as fraud
    # Precision is undefined when nothing is predicted as fraud; report 0
    # instead of dividing by zero. Same guard for F1.
    precision=(TP)/(TP+FP) if (TP+FP)>0 else 0.0
    print("precision ",precision)
    f1=2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
    print("F1 score ",f1)
