In [1]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import tree


##defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'rachaeld-data445'
bucket = s3.Bucket(bucket_name)

#defining the csv file
file_key = 'train.csv'
file_key2 = 'test.csv'

##train
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

##test
bucket_object2 = bucket.Object(file_key2)
file_object2 = bucket_object2.get()
file_content_stream2 = file_object2.get('Body')

train = pd.read_csv(file_content_stream)
test = pd.read_csv(file_content_stream2)

In [2]:
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemsVoids,scansWithoutRegistration,quanitityModification,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [3]:
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4.0,467.0,88.48,4,8,4,0.014989,0.189465,0.571429
1,3.0,1004.0,58.99,7,6,1,0.026892,0.058755,0.259259
2,1.0,162.0,14.0,4,5,4,0.006173,0.08642,4.0
3,5.0,532.0,84.79,9,3,4,0.026316,0.15938,0.642857
4,5.0,890.0,42.16,4,0,0,0.021348,0.047371,0.210526


In [4]:
## engineering variables from homework 4
##boxcox
transformed_trustLevel= boxcox(train['trustLevel'])
train['newtrustLevel'] = transformed_trustLevel[0]
transformed_trustLevel= boxcox(test['trustLevel'])
test['newtrustLevel'] = transformed_trustLevel[0]

##1/x
train['1/scannedLineItemsPerSecond']= 1/(train['scannedLineItemsPerSecond'])
test['1/scannedLineItemsPerSecond']= 1/(test['scannedLineItemsPerSecond'])

##^2
train['lineItemVoidsPerPosition_squared'] = train['lineItemVoidsPerPosition']**2
test['lineItemVoidsPerPosition_squared'] = test['lineItemVoidsPerPosition']**2

##^3
train['valuePerSecond_cubed'] = train['valuePerSecond'] **3
test['valuePerSecond_cubed'] = test['valuePerSecond'] **3

##natural log
train['Naturallog_grandTotal'] = np.log(train['grandTotal'])
test['Naturallog_grandTotal'] = np.log(test['grandTotal'])

##log base 10
train['log10_totalScanTimeInSeconds'] = np.log10(train['totalScanTimeInSeconds'])
test['log10_totalScanTimeInSeconds'] = np.log10(test['totalScanTimeInSeconds'])

## >5
train['0_1_scansWithoutRegistration'] = np.where(train['scansWithoutRegistration'] < 5, 0, 1) 
test['0_1_scansWithoutRegistration'] = np.where(test['scansWithoutRegistration'] < 5, 0, 1) 

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
##engineering interactions based on feature importance from homework 4
train['interaction_1'] = train['newtrustLevel'] * train['trustLevel']
test['interaction_1'] = test['newtrustLevel'] * test['trustLevel']

train['interaction_2'] = train['newtrustLevel'] * train['1/scannedLineItemsPerSecond']
test['interaction_2'] = test['newtrustLevel'] * test['1/scannedLineItemsPerSecond']

train['interaction_3'] = train['trustLevel'] * train['1/scannedLineItemsPerSecond']
test['interaction_3'] = test['trustLevel'] * test['1/scannedLineItemsPerSecond']

In [6]:
## final interactions from homework 4 
##interaction 1 <= .862 for all
train['tree_interaction1'] = np.where(train['interaction_1'] <= 0.862, 1, 0)

## interaction 1 --> interaction 1 --> totalScanTimeInSeconds 
train['tree_interaction_2'] = np.where(((train['interaction_1'] >= .862) & 
                                     (train['interaction_1'] <= 3.204) & 
                                     (train['totalScanTimeInSeconds'] <= 1298)), 1, 0)

## interaction 1 (.862) --> 1/scannedLineItemsPerSecond (86.763) --> log10_totalScanTimeInSeconds (3.022)
train['tree_interaction_3'] = np.where(((train['interaction_1'] >= .862) & 
                                     (train['1/scannedLineItemsPerSecond'] <= 86.763) & 
                                     (train['log10_totalScanTimeInSeconds'] <= 3.022)), 1, 0)

## same things for test
test['tree_interaction1'] = np.where(test['interaction_1'] <= 0.862, 1, 0)

test['tree_interaction_2'] = np.where(((test['interaction_1'] >= .862) & 
                                     (test['interaction_1'] <= 3.204) & 
                                     (test['totalScanTimeInSeconds'] <= 1298)), 1, 0)

test['tree_interaction_3'] = np.where(((test['interaction_1'] >= .862) & 
                                     (test['1/scannedLineItemsPerSecond'] <= 86.763) & 
                                     (test['log10_totalScanTimeInSeconds'] <= 3.022)), 1, 0)

In [7]:
logit = list()
rf = list()
ada = list()

X = train.drop(columns = ['fraud'])
Y = train['fraud']

for i in range (0,2):
    ##splitting the data 
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)
    
## LOGISTIC REGRESSION ##
    Logit_rfe = RFE(estimator = LogisticRegression(), n_features_to_select = 5).fit(X_train, Y_train)
    logit.append(Logit_rfe.support_)
## RANDOM FOREST ##
    RF_rfe = RFE(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3), n_features_to_select = 5).fit(X_train, Y_train)
    rf.append(RF_rfe.support_)
## ADABOOST ##
    ADA_rfe = RFE(estimator = AdaBoostClassifier(base_estimator= DecisionTreeClassifier(max_depth=3), n_estimators=500, learning_rate=.01), n_features_to_select = 5).fit(X_train, Y_train)
    ada.append(ADA_rfe.support_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [10]:
rf

[array([False,  True, False, False, False, False,  True, False, False,
         True, False, False, False, False, False, False, False,  True,
         True, False, False, False]),
 array([ True, False, False, False, False, False,  True, False, False,
        False, False, False, False, False,  True, False,  True, False,
         True, False, False, False])]