In [5]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np 

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.svm import SVC
import tensorflow as tf
import matplotlib.pyplot as plt

s3 = boto3.resource('s3')
bucket_name = "rachaeld-data445"
bucket = s3.Bucket(bucket_name)

file_key = 'train.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the data-file
project = pd.read_csv(file_content_stream)
project.head()



Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemsVoids,scansWithoutRegistration,quanitityModification,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [6]:
# defining the input and target variables
X = project[['trustLevel', 'scannedLineItemsPerSecond','totalScanTimeInSeconds', 'lineItemVoidsPerPosition', 'valuePerSecond']]
Y = project['fraud']

# splitting the data
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = .2)

In [3]:
# min-max transformation
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [8]:
#lists for results 
md1results = list()
md2results = list()
md3results = list()
md4results = list()

#running each model 1000 times 
for i in range (0,1000):
    #splitting
    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = .2)
    
    # min-max transformation
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    
### model 1 ###
    # defining the model
    md1 = tf.keras.models.Sequential([
        tf.keras.layers.Dense(4, input_dim = 5, activation= 'tanh'),
        tf.keras.layers.Dense(2, activation = 'softmax')
    ])
    md1.compile(optimizer='sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    md1.fit(X_train, tf.keras.utils.to_categorical(Y_train, num_classes = 2), epochs = 100, batch_size= 500, verbose = 0)
    # predicting on test
    md1pred = md1.predict(X_test)[:, 1]
    # cut off at 15%
    md1labels = np.where(md1pred < .15, 0, 1)
    #computing the recall score
    md1recall = recall_score(Y_test, md1labels)
    md1results.append(md1recall)

### model 2 ###
    # defining the model
    md2 = tf.keras.models.Sequential([
        tf.keras.layers.Dense(4, input_dim = 5, activation= 'relu'),
        tf.keras.layers.Dense(2, activation = 'softmax')
    ])
    md2.compile(optimizer='sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    md2.fit(X_train, tf.keras.utils.to_categorical(Y_train, num_classes = 2), epochs = 100, batch_size= 500, verbose = 0)
    #predicting on test
    md2pred = md2.predict(X_test)[:, 1]
    #cut off at 15%
    md2labels = np.where(md2pred < .15, 0, 1)
    #computing the recall score
    md2recall = recall_score(Y_test, md2labels)
    md2results.append(md2recall)

### model 3 ###
    #First SVC Model 
    svm_model3 = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)
    #predicting on test
    svm3_pred = svm_model3.predict_proba(X_test)[:,1]
    #cut off at 15%
    svm3_labels = np.where(svm3_pred < .15, 0, 1)
    #computing the recall score
    svm3_recall = recall_score(Y_test, svm3_labels)
    md3results.append(svm3_recall)
    
### model 4 ###
    #Second SVC Model 
    svm_model4 = SVC(kernel = 'poly', probability = True).fit(X_train, Y_train)
    #predicting on test
    svm4_pred = svm_model4.predict_proba(X_test)[:,1]
    #cut off at 15%
    svm4_labels = np.where(svm4_pred < .15, 0, 1)
    #computing the recall score
    svm4_recall = recall_score(Y_test, svm4_labels)
    md4results.append(svm4_recall)
    
print('The average recall for model 1 is', np.mean(md1results))
print('The average recall for model 2 is',np.mean(md2results))
print('The average recall for model 3 is',np.mean(md3results))
print('The average recall for model 4 is',np.mean(md4results))    
    

The average recall for model 1 is 0.058155047249588324
The average recall for model 2 is 0.24635956572861936
The average recall for model 3 is 0.4256745299890442
The average recall for model 4 is 0.28560009996982605
