In [5]:
import numpy as np
import pandas as pd
from pandas import read_csv
import math
from sklearn import mixture
from sklearn import cluster
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
import scipy.stats
import matplotlib.pyplot as plt
import sklearn

### Import the data and split it into training and test sets

In [None]:
# Fixed seed so the train/test split and the resampling below are reproducible
# under Restart & Run All.
RANDOM_SEED = 42
# Number of rows drawn (with replacement) for the reduced training set.
N_TRAIN_LEGIT = 30000
N_TRAIN_FRAUD = 3000

# Load everything as strings first: the label column is wrapped in double
# quotes in the CSV, so the quotes are stripped before the float conversion.
# (`np.str` / `np.float` were removed from NumPy; the builtins are equivalent.)
creditData = np.loadtxt('creditcard.csv', dtype=str, delimiter=',', skiprows=1)
print(creditData.shape)
labelCol = creditData.shape[1] - 1  # the class label is the last column
creditData[:, labelCol] = np.char.strip(creditData[:, labelCol], chars='"')
creditData = creditData.astype(float)

fraud_bool, counts = np.unique(creditData[:, labelCol], return_counts=True)
print("Fraudulent transaction percentage: {}%".format(counts[1] / (counts[0] + counts[1]) * 100))

# Split into train and test sets. Stratifying keeps the (very small) fraud
# ratio the same in both parts, so neither ends up without fraud examples.
allX = creditData[:, :labelCol]
allY = creditData[:, labelCol]
xTrain, xTest, yTrain, yTest = train_test_split(
    allX, allY, test_size=0.20, stratify=allY, random_state=RANDOM_SEED)

# `test` holds the test inputs and the label (last column) together.
test = pd.DataFrame(xTest)
test[labelCol] = yTest
testF = test[test[labelCol] == 1]   # test data for fraud examples
testNf = test[test[labelCol] == 0]  # test data for non-fraud examples

# Same layout for the full training split.
trainAll = pd.DataFrame(xTrain)
trainAll[labelCol] = yTrain

# Reduce the size of the training data: draw rows with replacement from each
# class. `DataFrame.sample` replaces the former row-by-row `append` loop, which
# was quadratic and relies on an API removed in pandas 2.0.
rng = np.random.RandomState(RANDOM_SEED)
train = pd.concat([
    trainAll[trainAll[labelCol] == 0].sample(n=N_TRAIN_LEGIT, replace=True, random_state=rng),
    trainAll[trainAll[labelCol] == 1].sample(n=N_TRAIN_FRAUD, replace=True, random_state=rng),
])

# Feature columns carry the integer labels 0 .. labelCol-1 (label slices are inclusive).
xTrain = train.loc[:, :labelCol - 1]
yTrain = train[[labelCol]]

trainF = train[train[labelCol] == 1]
trainNf = train[train[labelCol] == 0]

# Balanced view of the test set: as many non-fraud rows as there are fraud rows.
testN = testNf.sample(n=testF.shape[0], replace=True, random_state=rng)
testNf = testN
print(xTrain.shape, yTrain.shape, trainF.shape, trainNf.shape, testF.shape, testNf.shape)

# Names used by the model cells below. They are plain NumPy arrays (1-D labels)
# so that scikit-learn and the array code in the EasyEnsemble cell accept them.
# `testX` was previously never defined (a duplicated `trainY` line stood in its place).
trainX = xTrain.to_numpy()
trainY = yTrain[labelCol].to_numpy()
testX = xTest
testY = yTest

(284807, 31)


In [None]:
# Optional exploratory analysis, currently disabled: re-reads the CSV with
# pandas, shows a correlation matrix of its columns as a heat map, and prints
# summary statistics. Uncomment the lines below to run it.
# creditDataDF = pd.read_csv('creditcard.csv')
# plt.figure(figsize = [7,7])
# plt.matshow(creditDataDF.corr(), fignum = 1)
# plt.colorbar()
# plt.show()

# creditDataDF.describe()

### Bagging
An ensemble of individual decision trees, each trained on a bootstrap sample of the training data.

In [None]:
# Each of the 25 bagged estimators is fit on a bootstrap sample holding 5% of
# the training rows. The fraction replaces the former absolute count (14240),
# which was 5% of the full data set rather than of the resampled training set.
# The deprecated `base_estimator=None` argument is dropped: it was the default
# and the keyword no longer exists in current scikit-learn.
BAG_SAMPLE_FRACTION = 0.05
bagModel = BaggingClassifier(
    n_estimators=25,
    max_samples=BAG_SAMPLE_FRACTION,
    bootstrap=True,
    random_state=42,  # fixed seed for reproducible results
).fit(trainX, np.ravel(trainY))  # ravel: accept labels given as an (n, 1) column
predictedY = bagModel.predict(testX)

# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted.
confustion_matrix = confusion_matrix(testY, predictedY, labels=[0, 1])
tn, fp, fn, tp = confustion_matrix.ravel()
print(confustion_matrix)
print("Accuracy of fradulent transaction detection {}".format(tp / (fn + tp)))
print("Accuracy of legitimate transaction detection {}".format(tn / (tn + fp)))
print("Precision: {}".format(tp / (tp + fp)))
print("Recall: {}".format(tp / (tp + fn)))

### Random forest classifier

In [None]:
# Random forest with default hyper-parameters, trained on the resampled
# training set and evaluated on the full test split.
rfcModel = RandomForestClassifier(
    random_state=42,  # fixed seed for reproducible results
).fit(trainX, np.ravel(trainY))  # ravel: accept labels given as an (n, 1) column
predictedY = rfcModel.predict(testX)

# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted.
confustion_matrix = confusion_matrix(testY, predictedY, labels=[0, 1])
tn, fp, fn, tp = confustion_matrix.ravel()
print(confustion_matrix)
print("Accuracy of fradulent transaction detection {}".format(tp / (fn + tp)))
print("Accuracy of legitimate transaction detection {}".format(tn / (tn + fp)))
print("Precision: {}".format(tp / (tp + fp)))
print("Recall: {}".format(tp / (tp + fn)))

### EasyEnsemble
An ensemble of decision trees, each trained on a balanced training set: an independently drawn sample of the legitimate transactions combined with the same set of fraudulent training transactions.

In [None]:
# Work on plain NumPy arrays so the cell behaves the same whether trainX /
# trainY arrive as DataFrames or ndarrays (DataFrames have no `.reshape`).
trainFeatures = np.asarray(trainX)
trainLabels = np.asarray(trainY).ravel()
nFeatures = trainFeatures.shape[1]

# Features and label side by side; the label is the last column.
trainData = np.hstack((trainFeatures, trainLabels.reshape(-1, 1)))
legitimateTrans = trainData[trainLabels == 0]
fraudulentTrans = trainData[trainLabels == 1]
fraudX = fraudulentTrans[:, :nFeatures]

num_DT = 8
rng = np.random.RandomState(42)  # fixed seed for reproducible results

# Every tree sees all fraudulent rows plus an equally sized random draw (with
# replacement) of legitimate rows; the trees' 0/1 votes are summed up.
running_prediction = np.zeros(testX.shape[0])
for i in range(num_DT):
    sampleIdx = rng.choice(legitimateTrans.shape[0], fraudulentTrans.shape[0])
    balanced = np.vstack((legitimateTrans[sampleIdx], fraudulentTrans))
    clf = tree.DecisionTreeClassifier(random_state=rng).fit(
        balanced[:, :nFeatures], balanced[:, nFeatures])
    running_prediction = running_prediction + clf.predict(testX)

# Majority vote: fraud when more than half of the trees say so. The previous
# `(votes / num_DT).astype(np.int)` truncated the average, so it only predicted
# fraud when *all* trees agreed (and `np.int` no longer exists in NumPy).
predictedY = (running_prediction > num_DT / 2).astype(int)

# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted.
confustion_matrix = confusion_matrix(testY, predictedY, labels=[0, 1])
tn, fp, fn, tp = confustion_matrix.ravel()
print(confustion_matrix)
print("Accuracy of fradulent transaction detection {}".format(tp / (fn + tp)))
print("Accuracy of legitimate transaction detection {}".format(tn / (tn + fp)))
print("Precision: {}".format(tp / (tp + fp)))
print("Recall: {}".format(tp / (tp + fn)))

### Gradient boosting (try XGBoost instead)

In [None]:
# Gradient-boosted trees with a reduced learning rate, trained on the
# resampled training set and evaluated on the full test split.
boostModel = GradientBoostingClassifier(
    learning_rate=0.03,
    random_state=42,  # fixed seed for reproducible results
).fit(trainX, np.ravel(trainY))  # ravel: accept labels given as an (n, 1) column
predictedY = boostModel.predict(testX)

# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted.
confustion_matrix = confusion_matrix(testY, predictedY, labels=[0, 1])
tn, fp, fn, tp = confustion_matrix.ravel()
print(confustion_matrix)
print("Accuracy of fradulent transaction detection {}".format(tp / (fn + tp)))
print("Accuracy of legitimate transaction detection {}".format(tn / (tn + fp)))
print("Precision: {}".format(tp / (tp + fp)))
print("Recall: {}".format(tp / (tp + fn)))

### (Generative model of legitimate transactions?)