In [100]:
import boto3
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
import datetime
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns

In [101]:
# get data
role = get_execution_role()
bucket='aws-ml-anomalydetection'
data_key = 'PS_20174392719_1491204439457_log.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)
data = pd.read_csv(data_location)

In [102]:
# check if the bucket exists
prefix = 'sagemaker/aws-ml-anomalydetection'
try:
    boto3.Session().client('s3').head_bucket(Bucket=bucket)
except botocore.exceptions.ParamValidationError as e:
    print('Hey! You either forgot to specify your S3 bucket'
          ' or you gave your bucket an invalid name!')
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == '403':
        print("Hey! You don't have permission to access the bucket, {}.".format(bucket))
    elif e.response['Error']['Code'] == '404':
        print("Hey! Your bucket, {}, doesn't exist!".format(bucket))
    else:
        raise
else:
    print('Training input/output will be stored in: s3://{}/{}'.format(bucket, prefix))

Training input/output will be stored in: s3://aws-ml-anomalydetection/sagemaker/aws-ml-anomalydetection


In [103]:
# add row id column
data['rowId'] = np.arange(len(data))
# force everything to factors...
data.type = pd.Categorical(data.type)
data.nameOrig = pd.Categorical(data.nameOrig)
data.nameDest = pd.Categorical(data.nameDest)
data["typeF"] = data.type.cat.codes
data["nameOrigF"] = data.nameOrig.cat.codes
data["nameDestF"] = data.nameDest.cat.codes

In [104]:
# feature engineering: hour-of-day and day-of-week
data["hourOfDay"] = data.step % 24
data["dayOfWeek"] = (round((data.step / 24) + 0.5)) % 7

In [118]:
# origData is the original data + new features
# data is reset to be ready for training
origData = data.copy()
data = origData[["isFraud","typeF","amount",
                 "oldbalanceOrg","newbalanceOrig","oldbalanceDest","newbalanceDest"]]
print(data.head())

### training with names resulted in very low f1 values: 0.019...
### removing them and trying again

   isFraud  typeF    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0        0      3   9839.64       170136.0       160296.36             0.0   
1        0      3   1864.28        21249.0        19384.72             0.0   
2        1      4    181.00          181.0            0.00             0.0   
3        1      1    181.00          181.0            0.00         21182.0   
4        0      3  11668.14        41554.0        29885.86             0.0   

   newbalanceDest  
0             0.0  
1             0.0  
2             0.0  
3             0.0  
4             0.0  


In [119]:
# will give a dataTrain, dataTest, dataVal
from sklearn.model_selection import train_test_split
dataTrainZ, dataVal = train_test_split(data, test_size=0.2)
dataTrain, dataTest = train_test_split(dataTrainZ, test_size=0.2)
dataTrainX = dataTrain.loc[:, dataTrain.columns != 'isFraud']
dataTrainY = dataTrain.isFraud

In [120]:
pd.concat([dataTrain['isFraud'], dataTrain.drop(['isFraud'], axis=1)], axis=1).to_csv('dataTrain.csv', index=False, header=False)
pd.concat([dataVal['isFraud'], dataVal.drop(['isFraud'], axis=1)], axis=1).to_csv('dataVal.csv', index=False, header=False)
pd.concat([dataTest['isFraud'], dataTest.drop(['isFraud'], axis=1)], axis=1).to_csv('dataTest.csv', index=False, header=False)

import os
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'data/train/train.csv')).upload_file('dataTrain.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'data/test/test.csv')).upload_file('dataTest.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'data/val/val.csv')).upload_file('dataVal.csv')

In [121]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
objective_metric_name = 'test:f1'
hyperparameter_ranges = {'num_trees': IntegerParameter(50, 1000),
                         'num_samples_per_tree': IntegerParameter(1, 2048)}

In [123]:
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
region = boto3.Session().region_name

container = get_image_uri(region, 'randomcutforest', repo_version='latest')
sess = sagemaker.Session()
rcf = sagemaker.estimator.Estimator(container,
                                    role=role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess,
                                    train_volume_size=20,
                                    train_max_run=360000,
                                    hyperparameters={"feature_dim":6})
tuner = HyperparameterTuner(rcf,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=20,
                            max_parallel_jobs=5)

In [124]:
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/data/train/train.csv'.format(bucket, prefix), 
                                    content_type='text/csv', distribution='ShardedByS3Key')
s3_input_test  = sagemaker.s3_input(s3_data='s3://{}/{}/data/test/test.csv'.format(bucket, prefix), 
                                    content_type='text/csv', distribution='FullyReplicated')
tuner.fit({'train': s3_input_train, 'test': s3_input_test}, include_cls_metadata=False)

INFO:sagemaker:Creating hyperparameter tuning job with name: randomcutforest-190214-0359


In [126]:
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

'Completed'

{'feature_dim': 6}


In [127]:
rcf_inference = rcf.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

ValueError: Estimator is not associated with a training job