In [1]:
pip install optuna 

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [34]:
import warnings
warnings.simplefilter(action= 'ignore', category=FutureWarning)

import boto3
import pandas as pd
import numpy as np

from tqdm import tqdm
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
import optuna

from cost_function import cost_function, cost_function_cutoff 
##defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'rachaeld-data445'
bucket = s3.Bucket(bucket_name)

#defining the csv file
file_key = 'train.csv'
file_key2 = 'test.csv'

##train
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

##test
bucket_object2 = bucket.Object(file_key2)
file_object2 = bucket_object2.get()
file_content_stream2 = file_object2.get('Body')

train = pd.read_csv(file_content_stream)
test = pd.read_csv(file_content_stream2)

In [47]:
test.rename(columns={'lineItemVoids':'lineItemsVoids'}, inplace=True)

In [35]:
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemsVoids,scansWithoutRegistration,quanitityModification,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [48]:
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemsVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,newtrustLevel,...,valuePerSecond_cubed,Naturallog_grandTotal,log10_totalScanTimeInSeconds,0_1_scansWithoutRegistration,interaction_1,interaction_2,interaction_3,tree_interaction1,tree_interaction_2,tree_interaction_3
0,4.0,467.0,88.48,4,8,4,0.014989,0.189465,0.571429,2.354358,...,0.006801,4.482777,2.669317,1,9.417431,157.069293,266.857149,0,0,1
1,3.0,1004.0,58.99,7,6,1,0.026892,0.058755,0.259259,1.660831,...,0.000203,4.077368,3.001734,1,4.982493,61.758306,111.555557,0,0,1
2,1.0,162.0,14.0,4,5,4,0.006173,0.08642,4.0,0.0,...,0.000645,2.639057,2.209515,1,0.0,0.0,161.999987,1,0,0
3,5.0,532.0,84.79,9,3,4,0.026316,0.15938,0.642857,2.998293,...,0.004049,4.440178,2.725912,0,14.991465,113.935136,190.000003,0,0,1
4,5.0,890.0,42.16,4,0,0,0.021348,0.047371,0.210526,2.998293,...,0.000106,3.741472,2.94939,0,14.991465,140.446353,234.210522,0,0,1


## Exercise 2

In [38]:
## engineering variables from homework 4
##boxcox
transformed_trustLevel= boxcox(train['trustLevel'])
train['newtrustLevel'] = transformed_trustLevel[0]
transformed_trustLevel= boxcox(test['trustLevel'])
test['newtrustLevel'] = transformed_trustLevel[0]

##1/x
train['1/scannedLineItemsPerSecond']= 1/(train['scannedLineItemsPerSecond'])
test['1/scannedLineItemsPerSecond']= 1/(test['scannedLineItemsPerSecond'])

##^2
train['lineItemVoidsPerPosition_squared'] = train['lineItemVoidsPerPosition']**2
test['lineItemVoidsPerPosition_squared'] = test['lineItemVoidsPerPosition']**2

##^3
train['valuePerSecond_cubed'] = train['valuePerSecond'] **3
test['valuePerSecond_cubed'] = test['valuePerSecond'] **3

##natural log
train['Naturallog_grandTotal'] = np.log(train['grandTotal'])
test['Naturallog_grandTotal'] = np.log(test['grandTotal'])

##log base 10
train['log10_totalScanTimeInSeconds'] = np.log10(train['totalScanTimeInSeconds'])
test['log10_totalScanTimeInSeconds'] = np.log10(test['totalScanTimeInSeconds'])

## >5
train['0_1_scansWithoutRegistration'] = np.where(train['scansWithoutRegistration'] < 5, 0, 1) 
test['0_1_scansWithoutRegistration'] = np.where(test['scansWithoutRegistration'] < 5, 0, 1) 

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [39]:
##engineering interactions based on feature importance from homework 4
train['interaction_1'] = train['newtrustLevel'] * train['trustLevel']
test['interaction_1'] = test['newtrustLevel'] * test['trustLevel']

train['interaction_2'] = train['newtrustLevel'] * train['1/scannedLineItemsPerSecond']
test['interaction_2'] = test['newtrustLevel'] * test['1/scannedLineItemsPerSecond']

train['interaction_3'] = train['trustLevel'] * train['1/scannedLineItemsPerSecond']
test['interaction_3'] = test['trustLevel'] * test['1/scannedLineItemsPerSecond']

In [40]:
## final interactions from homework 4 
##interaction 1 <= .862 for all
train['tree_interaction1'] = np.where(train['interaction_1'] <= 0.862, 1, 0)

## interaction 1 --> interaction 1 --> totalScanTimeInSeconds 
train['tree_interaction_2'] = np.where(((train['interaction_1'] >= .862) & 
                                     (train['interaction_1'] <= 3.204) & 
                                     (train['totalScanTimeInSeconds'] <= 1298)), 1, 0)

## interaction 1 (.862) --> 1/scannedLineItemsPerSecond (86.763) --> log10_totalScanTimeInSeconds (3.022)
train['tree_interaction_3'] = np.where(((train['interaction_1'] >= .862) & 
                                     (train['1/scannedLineItemsPerSecond'] <= 86.763) & 
                                     (train['log10_totalScanTimeInSeconds'] <= 3.022)), 1, 0)

## same things for test
test['tree_interaction1'] = np.where(test['interaction_1'] <= 0.862, 1, 0)

test['tree_interaction_2'] = np.where(((test['interaction_1'] >= .862) & 
                                     (test['interaction_1'] <= 3.204) & 
                                     (test['totalScanTimeInSeconds'] <= 1298)), 1, 0)

test['tree_interaction_3'] = np.where(((test['interaction_1'] >= .862) & 
                                     (test['1/scannedLineItemsPerSecond'] <= 86.763) & 
                                     (test['log10_totalScanTimeInSeconds'] <= 3.022)), 1, 0)

In [41]:
# TOP 7 
# log10_totalScanTimeInSeconds
# interaction_3
# totalScanTimeInSeconds
# tree_interaction_2
# interaction_1
# lineItemVoidsPerPosition_squared
# lineItemsVoids

X_train_5 = train[['log10_totalScanTimeInSeconds', 'interaction_3', 'totalScanTimeInSeconds', 'tree_interaction_2', 'interaction_1']]

X_train_6 = train[['log10_totalScanTimeInSeconds', 'interaction_3', 'totalScanTimeInSeconds',
                   'tree_interaction_2', 'interaction_1', 'lineItemVoidsPerPosition_squared']]

X_train_7 = train[['log10_totalScanTimeInSeconds', 'interaction_3', 'totalScanTimeInSeconds',
                   'tree_interaction_2', 'interaction_1', 'lineItemVoidsPerPosition_squared', 'lineItemsVoids']]

Y_train = train[['fraud']]

(85 points) Using the train data-frame (including the top 7 features from homework assignment
5), do the following:
    
(i) Consider a model to predict fraud. Then, do the following:

    • With the top 5 important features and using the GridSearchCV function with cv = 3,
    run a hyper-parameter tuning procedure on the model. Please see page 4 of
    DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.

    • With the top 6 important features and using the GridSearchCV function with cv = 3,
    run a hyper-parameter tuning procedure on the model. Please see page 4 of
    DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.

    • With the top 7 important features and using the GridSearchCV function with cv = 3,
    run a hyper-parameter tuning procedure on the model. Please see page 4 of
    DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.
    
From above three scenarios, identify the best model; that is, the model model (input features
and hyper-parameters) that has the best performance.

In [9]:
#defining scorer
my_scorer = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

In [13]:
## Gradient Boosting with GridSearchCV
GB_param_grid = {'n_estimators': [100, 300],
                  'min_samples_split': [10, 15],
                  'min_samples_leaf': [5, 7],
                  'max_depth': [3, 5, 7],
                  'learning_rate': [0.001, 0.01, 0.1]}

##top 5 variables
GB_grid_search_1 = GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = GB_param_grid, cv = 3, 
                                scoring = my_scorer).fit(X_train_5, Y_train)
print('The best params with Gradient Boosting and GridSearchCV for the top 5 variables is:', GB_grid_search_1.best_params_ )

##top 6 variables
GB_grid_search_2 = GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = GB_param_grid, cv = 3, 
                                scoring = my_scorer).fit(X_train_6, Y_train)
print('The best params with Gradient Boosting and GridSearchCV for the top 6 variables is:', GB_grid_search_2.best_params_ )

##top 7 variables
GB_grid_search_3 = GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = GB_param_grid, cv = 3, 
                                scoring = my_scorer).fit(X_train_7, Y_train)
print('The best params with Gradient Boosting and GridSearchCV for the top 7 variables is:', GB_grid_search_3.best_params_ )

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

The best params with Gradient Boosting and GridSearchCV for the top 5 variables is: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 10, 'n_estimators': 300}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

The best params with Gradient Boosting and GridSearchCV for the top 6 variables is: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 10, 'n_estimators': 300}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

The best params with Gradient Boosting and GridSearchCV for the top 7 variables is: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 7, 'min_samples_split': 15, 'n_estimators': 100}


In [14]:
GB_best_score_1 = GB_grid_search_1.best_score_
print('The best params with Gradient Boosting and GridSearchCV for the top 5 variables is:', GB_grid_search_1.best_params_ )
print('The best score for the top 5 variables is:', GB_best_score_1)

GB_best_score_2 = GB_grid_search_2.best_score_
print('The best params with Gradient Boosting and GridSearchCV for the top 6 variables is:', GB_grid_search_2.best_params_ )
print('The best score for the top 6 variables is:', GB_best_score_2)

GB_best_score_3 = GB_grid_search_3.best_score_
print('The best params with Gradient Boosting and GridSearchCV for the top 7 variables is:', GB_grid_search_3.best_params_ )
print('The best score for the top 7 variables is:', GB_best_score_3)


The best params with Gradient Boosting and GridSearchCV for the top 5 variables is: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 10, 'n_estimators': 300}
The best score for the top 5 variables is: -15.0
The best params with Gradient Boosting and GridSearchCV for the top 6 variables is: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 10, 'n_estimators': 300}
The best score for the top 6 variables is: -11.666666666666666
The best params with Gradient Boosting and GridSearchCV for the top 7 variables is: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 7, 'min_samples_split': 15, 'n_estimators': 100}
The best score for the top 7 variables is: 11.666666666666666


(ii) Consider a model different from part (i) to predict fraud. Then, do the following:

    • With the top 5 important features and using the RandomizedSearchCV function with cv
    = 3 and n iter = 30, run a hyper-parameter tuning procedure on the model. Please
    see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model
    should be evaluated.
    
    • With the top 6 important features and using the RandomizedSearchCV function with cv
    = 3 and n iter = 30, run a hyper-parameter tuning procedure on the model. Please
    see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model
    should be evaluated.
    
    • With the top 7 important features and using the RandomizedSearchCV function with cv
    = 3 and n iter = 30, run a hyper-parameter tuning procedure on the model. Please
    see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model
    should be evaluated.


In [15]:
### ADABOOST
ADA_param_grid = {'n_estimators': [100, 300],
                  'estimator__min_samples_split': [10, 15],
                  'estimator__min_samples_leaf': [5, 7],
                  'estimator__max_depth': [3, 5, 7],
                  'learning_rate': [0.001, 0.01, 0.1]}

##top 5 variables
ADA_random_search_1 = RandomizedSearchCV(estimator = AdaBoostClassifier(estimator = DecisionTreeClassifier()), 
                                             param_distributions = ADA_param_grid, cv = 3, scoring = my_scorer,
                                             n_jobs = -1, n_iter = 30).fit(X_train_5, Y_train)
print('The best params with ADAboost and RandomizedSearchCV for the top 5 variables is:', ADA_random_search_1.best_params_ )

##top 6 variables
ADA_random_search_2 = RandomizedSearchCV(estimator = AdaBoostClassifier(estimator = DecisionTreeClassifier()), 
                                             param_distributions = ADA_param_grid, cv = 3, scoring = my_scorer,
                                             n_jobs = -1, n_iter = 30).fit(X_train_6, Y_train)
print('The best params with ADAboost and RandomizedSearchCV for the top 6 variables is:', ADA_random_search_2.best_params_ )

##top 7 variables
ADA_random_search_3 = RandomizedSearchCV(estimator = AdaBoostClassifier(estimator = DecisionTreeClassifier()), 
                                             param_distributions = ADA_param_grid, cv = 3, scoring = my_scorer,
                                             n_jobs = -1, n_iter = 30).fit(X_train_7, Y_train)
print('The best params with ADAboost and RandomizedSearchCV for the top 7 variables is:', ADA_random_search_3.best_params_ )

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

The best params with ADAboost and RandomizedSearchCV for the top 5 variables is: {'n_estimators': 100, 'learning_rate': 0.1, 'estimator__min_samples_split': 15, 'estimator__min_samples_leaf': 7, 'estimator__max_depth': 3}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

The best params with ADAboost and RandomizedSearchCV for the top 6 variables is: {'n_estimators': 100, 'learning_rate': 0.01, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 7, 'estimator__max_depth': 5}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

The best params with ADAboost and RandomizedSearchCV for the top 7 variables is: {'n_estimators': 100, 'learning_rate': 0.1, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 5, 'estimator__max_depth': 3}


In [16]:
ADA_best_score_1 = ADA_random_search_1.best_score_
print('The best params with ADAboost and RandomizedSearchCV for the top 5 variables is:', ADA_random_search_1.best_params_ )
print('The best score is:', ADA_best_score_1)

ADA_best_score_2 = ADA_random_search_2.best_score_
print('The best params with ADAboost and RandomizedSearchCV for the top 6 variables is:', ADA_random_search_2.best_params_ )
print('The best score is:', ADA_best_score_2)

ADA_best_score_3 = ADA_random_search_3.best_score_
print('The best params with ADAboost and RandomizedSearchCV for the top 7 variables is:', ADA_random_search_3.best_params_ )
print('The best score is:', ADA_best_score_3)


The best params with ADAboost and RandomizedSearchCV for the top 5 variables is: {'n_estimators': 100, 'learning_rate': 0.1, 'estimator__min_samples_split': 15, 'estimator__min_samples_leaf': 7, 'estimator__max_depth': 3}
The best score is: -8.333333333333334
The best params with ADAboost and RandomizedSearchCV for the top 6 variables is: {'n_estimators': 100, 'learning_rate': 0.01, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 7, 'estimator__max_depth': 5}
The best score is: -13.333333333333334
The best params with ADAboost and RandomizedSearchCV for the top 7 variables is: {'n_estimators': 100, 'learning_rate': 0.1, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 5, 'estimator__max_depth': 3}
The best score is: 3.3333333333333335


(iii) Consider a model different from parts (i) & (ii) to predict fraud. Then, do the following:
    
    • With the top 5 important features and using the Optuna framework using 3 folds and
    N TRIALS = 30, run a hyper-parameter tuning procedure on the model. Please see
    page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should
    be evaluated.
    
    • With the top 6 important features and using the Optuna framework using 3 folds and
    N TRIALS = 30, run a hyper-parameter tuning procedure on the model. Please see
    page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should
    be evaluated.
    
    • With the top 7 important features and using the Optuna framework using 3 folds and
    N TRIALS = 30, run a hyper-parameter tuning procedure on the model. Please see
    page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should
    be evaluated.


In [17]:
## Random Forest and Optuna 

##Top 5 variables
class Objective:
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                        min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                        min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                        max_depth = trial.suggest_int('max_depth', 2, 10)
                        )
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X_train_5, Y_train):
            X_train_1, X_valid_1 = X_train_5.iloc[train_idx], X_train_5.iloc[valid_idx]
            Y_train_1, Y_valid_1 = Y_train.iloc[train_idx], Y_train.iloc[valid_idx]
            
            RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
            pred_valid = RF_md.predict_proba(X_valid_1)[:, 1]
            score = cost_function(Y_valid_1, pred_valid)
            scores.append(score)
            
        return np.mean(scores)

In [18]:
SEED = 42
N_TRIALS = 30

study = optuna.create_study(direction = 'maximize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-31 00:25:33,697][0m A new study created in memory with name: no-name-94f6d889-8fe4-4c59-8bcd-342bcca89f99[0m
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
[32m[I 2023-03-31 00:25:40,121][0m Trial 0 finished with value: -21.666666666666668 and parameters: {'n_estimators': 1075, 'min_samples_split': 23, 'min_samples_leaf': 6, 'max_depth': 5}. Best is trial 0 with value: -21.666666666666668.[0m
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
[32m[I 2023-03-31 00:25:43,575][0m Trial 1 finished with value: -71.66666666666667 and parameters: {'n_estimators': 691, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 2}. Best is trial 0 with value: 

In [19]:
##Top 6 variables
class Objective:
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                        min_samples_split = trial.suggest_int('min_samples_split', 6, 30),
                        min_samples_leaf = trial.suggest_int('min_samples_leaf', 6, 30),
                        max_depth = trial.suggest_int('max_depth', 2, 10)
                        )
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X_train_6, Y_train):
            X_train_1, X_valid_1 = X_train_6.iloc[train_idx], X_train_6.iloc[valid_idx]
            Y_train_1, Y_valid_1 = Y_train.iloc[train_idx], Y_train.iloc[valid_idx]
            
            RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
            pred_valid = RF_md.predict_proba(X_valid_1)[:, 1]
            score = cost_function(Y_valid_1, pred_valid)
            scores.append(score)
            
        return np.mean(scores)

In [20]:
SEED = 42
N_TRIALS = 30

study2 = optuna.create_study(direction = 'maximize')
study2.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-31 00:29:04,686][0m A new study created in memory with name: no-name-8a5c7d96-834f-4786-aea1-6799b1462de4[0m
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
[32m[I 2023-03-31 00:29:05,677][0m Trial 0 finished with value: -40.0 and parameters: {'n_estimators': 115, 'min_samples_split': 11, 'min_samples_leaf': 8, 'max_depth': 5}. Best is trial 0 with value: -40.0.[0m
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
[32m[I 2023-03-31 00:29:11,493][0m Trial 1 finished with value: -36.666666666666664 and parameters: {'n_estimators': 1027, 'min_samples_split': 29, 'min_samples_leaf': 24, 'max_depth': 8}. Best is trial 1 with value: -36.666666666666664.[0m
 

In [21]:
##Top 7 variables
class Objective:
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                        min_samples_split = trial.suggest_int('min_samples_split', 7, 30),
                        min_samples_leaf = trial.suggest_int('min_samples_leaf', 7, 30),
                        max_depth = trial.suggest_int('max_depth', 2, 10)
                        )
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X_train_7, Y_train):
            X_train_1, X_valid_1 = X_train_7.iloc[train_idx], X_train_7.iloc[valid_idx]
            Y_train_1, Y_valid_1 = Y_train.iloc[train_idx], Y_train.iloc[valid_idx]
            
            RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
            pred_valid = RF_md.predict_proba(X_valid_1)[:, 1]
            score = cost_function(Y_valid_1, pred_valid)
            scores.append(score)
            
        return np.mean(scores)

In [23]:
SEED = 42
N_TRIALS = 30

study3 = optuna.create_study(direction = 'maximize')
study3.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-31 00:43:56,644][0m A new study created in memory with name: no-name-74d4ad7b-aa99-4b65-bdf1-37830722f104[0m
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
[32m[I 2023-03-31 00:44:01,384][0m Trial 0 finished with value: -51.666666666666664 and parameters: {'n_estimators': 748, 'min_samples_split': 14, 'min_samples_leaf': 30, 'max_depth': 9}. Best is trial 0 with value: -51.666666666666664.[0m
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
  RF_md = RandomForestClassifier(**params).fit(X_train_1, Y_train_1)
[32m[I 2023-03-31 00:44:04,464][0m Trial 1 finished with value: -43.333333333333336 and parameters: {'n_estimators': 563, 'min_samples_split': 11, 'min_samples_leaf': 9, 'max_depth': 3}. Best is trial 1 with value:

In [24]:
print('The best parameters for the top 5 variables is:', study.best_trial.params)
print('The best parameters for the top 6 variables is:', study2.best_trial.params)
print('The best parameters for the top 7 variables is:', study3.best_trial.params)

'''
Trial 29 finished with value: -60.0 and parameters: {'n_estimators': 709, 'min_samples_split': 18, 'min_samples_leaf': 5, 'max_depth': 3}. Best is trial 22 with value: -20.0.
Trial 29 finished with value: -31.666666666666668 and parameters: {'n_estimators': 873, 'min_samples_split': 8, 'min_samples_leaf': 8, 'max_depth': 4}. Best is trial 20 with value: -15.0.
Trial 29 finished with value: -50.0 and parameters: {'n_estimators': 734, 'min_samples_split': 14, 'min_samples_leaf': 29, 'max_depth': 9}. Best is trial 23 with value: -11.666666666666666.
'''

The best parameters for the top 5 variables is: {'n_estimators': 1376, 'min_samples_split': 17, 'min_samples_leaf': 10, 'max_depth': 8}
The best parameters for the top 6 variables is: {'n_estimators': 1779, 'min_samples_split': 14, 'min_samples_leaf': 6, 'max_depth': 7}
The best parameters for the top 7 variables is: {'n_estimators': 1101, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_depth': 8}


## Exercise 3

(70 points) Using the train data-frame and the models from exercise 2, split the train data-frame
into two data-frames: training (80%) and validation (20%) taking into account the proportions
of 0s and 1s. Then, do the following:

In [51]:
X = train[['log10_totalScanTimeInSeconds', 'interaction_3', 'totalScanTimeInSeconds',
                   'tree_interaction_2', 'interaction_1', 'lineItemVoidsPerPosition_squared', 'lineItemsVoids']]
Y = train['fraud']

X_training, X_valid, Y_training, Y_valid = train_test_split(X, Y, test_size = .2, stratify = Y)

In [53]:
test1 = test[['log10_totalScanTimeInSeconds', 'interaction_3', 'totalScanTimeInSeconds',
                   'tree_interaction_2', 'interaction_1', 'lineItemVoidsPerPosition_squared', 'lineItemsVoids']]

(i) Consider the best model from exercise 2(i). Build that model on the training data-frame.
After that, predict the likelihood of fraud on the validation and test data-frames.

In [60]:
'''
The best params with Gradient Boosting and GridSearchCV for the top 7 variables is: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 7, 'min_samples_split': 15, 'n_estimators': 100}
The best score for the top 7 variables is: 11.666666666666666
'''

##building the model
GB_md = GradientBoostingClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.1,
                                   min_samples_leaf = 7, min_samples_split = 15).fit(X_training, Y_training)
##predicting on validation and test
GB_val_pred = GB_md.predict_proba(X_valid)[:, 1]
GB_test_pred = GB_md.predict_proba(test1)[:, 1]

##computing cost
print('Cost function score of GB model:',cost_function(Y_valid, GB_val_pred))
print('Cutoff value for GB model:', cost_function_cutoff(Y_valid, GB_val_pred))

Cost function score of GB model: 35.0
Cutoff value for GB model: 0.6


(ii) Consider the best model from exercise 2(ii). Build that model on the training data-frame.
After that, predict the likelihood of fraud on the validation and test data-frames

In [61]:
'''
The best params with ADAboost and RandomizedSearchCV for the top 7 variables is: {'n_estimators': 100, 'learning_rate': 0.1, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 5, 'estimator__max_depth': 3}
The best score is: 3.3333333333333335
'''
##building the model
ADA_md = AdaBoostClassifier(estimator = DecisionTreeClassifier(max_depth = 3, min_samples_leaf = 5, min_samples_split = 10),
                        learning_rate = 0.1, n_estimators = 100).fit(X_training, Y_training)
##predicting on validation and test
ADA_val_pred = ADA_md.predict_proba(X_valid)[:, 1]
ADA_test_pred = ADA_md.predict_proba(test1)[:, 1]

##computing cost
print('Cost function score of ADA model:',cost_function(Y_valid, ADA_val_pred))
print('Cutoff value for ADA model:', cost_function_cutoff(Y_valid, ADA_val_pred))


Cost function score of ADA model: 20.0
Cutoff value for ADA model: 0.63


(iii) Consider the best model from exercise 2(iii). Build that model on the training data-frame.
After that, predict the likelihood of fraud on the validation and test data-frames.

In [64]:
'''
Best is trial 23 with value: -11.666666666666666
The best parameters for the top 7 variables is: {'n_estimators': 1101, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_depth': 8}
'''

##building the model
RF_md = RandomForestClassifier(max_depth = 8, min_samples_leaf = 7, min_samples_split = 9, n_estimators = 1101).fit(X_training, Y_training)

##predicting on validation and test
RF_val_pred = RF_md.predict_proba(X_valid)[:, 1]
RF_test_pred = RF_md.predict_proba(test1)[:, 1]

##computing cost
print('Cost function score of RF model:',cost_function(Y_valid, RF_val_pred))
print('Cutoff value for RF model:',cost_function_cutoff(Y_valid, RF_val_pred))

Cost function score of RF model: -10.0
Cutoff value for RF model: 0.44


Using the prediction on the validation data-frame as inputs from parts (i)-(ii)-(iii) and the
actual fraud values from the validation data-frame as the target variable, build a meta-learner
to predict fraud. Make sure to tune the hyper-parameters of the meta-learner keeping in mind how
the results are going to be evaluated. For more info, see page 4 of DATA-MINING-CUP-2019-task.pdf
file. Finally, use the best meta-learner to predict the likelihood of fraud in the test data-frame.
Submit the likelihoods in a csv file. Also submit the associated cut-off value

In [70]:
X_ensemble = pd.concat([pd.DataFrame(GB_val_pred),pd.DataFrame(ADA_val_pred), pd.DataFrame(RF_val_pred)], axis = 1)
X_testensemble = pd.concat([pd.DataFrame(GB_test_pred),pd.DataFrame(ADA_test_pred), pd.DataFrame(RF_test_pred)], axis = 1)

X = X_ensemble
Y = Y_valid

class Objective:
    def __init__(self, seed):
        self.seed = seed

    def __call__(self, trial):
        ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
        min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
        max_depth = trial.suggest_int('max_depth', 2, 10)
        )
        scores = []
        ## Running cross validation
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X, Y):
            
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train , Y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]
            RF_md = RandomForestClassifier(**params).fit(X_train, Y_train)
            preds_valid = RF_md.predict_proba(X_valid)[:, 1]
            score = cost_function(Y_valid, preds_valid)
            scores.append(score)
        return np.mean(scores)

In [71]:
#defining seed and number of trials
SEED = 42
N_TRIALS = 30

#execute an optimization
study = optuna.create_study(direction = 'maximize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-31 02:41:06,071][0m A new study created in memory with name: no-name-66df7cad-4405-4a24-abb0-5ede53859546[0m
[32m[I 2023-03-31 02:41:13,629][0m Trial 0 finished with value: 11.666666666666666 and parameters: {'n_estimators': 1689, 'min_samples_split': 11, 'min_samples_leaf': 8, 'max_depth': 10}. Best is trial 0 with value: 11.666666666666666.[0m
[32m[I 2023-03-31 02:41:16,777][0m Trial 1 finished with value: -35.0 and parameters: {'n_estimators': 736, 'min_samples_split': 18, 'min_samples_leaf': 28, 'max_depth': 7}. Best is trial 0 with value: 11.666666666666666.[0m
[32m[I 2023-03-31 02:41:24,302][0m Trial 2 finished with value: -3.3333333333333335 and parameters: {'n_estimators': 1788, 'min_samples_split': 26, 'min_samples_leaf': 14, 'max_depth': 9}. Best is trial 0 with value: 11.666666666666666.[0m
[32m[I 2023-03-31 02:41:24,825][0m Trial 3 finished with value: 11.666666666666666 and parameters: {'n_estimators': 101, 'min_samples_split': 29, 'min_samples