In [1]:
import boto3
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
import datetime
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns

In [2]:
# SageMaker execution role; the training jobs further down run under it.
role = get_execution_role()

# S3 coordinates of the transaction log.
bucket = 'aws-ml-anomalydetection'
data_key = 'PS_20174392719_1491204439457_log.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

# Read the CSV at that S3 URI into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_location)

In [3]:
# check if the bucket exists
# The except clauses below name botocore.exceptions, and no earlier cell imports
# botocore, so import it here; otherwise any failure surfaces as a NameError
# instead of the intended message.
import botocore.exceptions

# Key prefix under which training input/output is stored in the bucket.
prefix = 'sagemaker/aws-ml-anomalydetection'
try:
    boto3.Session().client('s3').head_bucket(Bucket=bucket)
except botocore.exceptions.ParamValidationError:
    print('Hey! You either forgot to specify your S3 bucket'
          ' or you gave your bucket an invalid name!')
except botocore.exceptions.ClientError as e:
    # Report the two expected failures; re-raise anything else untouched.
    if e.response['Error']['Code'] == '403':
        print("Hey! You don't have permission to access the bucket, {}.".format(bucket))
    elif e.response['Error']['Code'] == '404':
        print("Hey! Your bucket, {}, doesn't exist!".format(bucket))
    else:
        raise
else:
    print('Training input/output will be stored in: s3://{}/{}'.format(bucket, prefix))

Training input/output will be stored in: s3://aws-ml-anomalydetection/sagemaker/aws-ml-anomalydetection


In [4]:
# Give every row a 0-based positional id.
data['rowId'] = np.arange(len(data))

# Turn the string columns into categoricals first, then store each one's
# integer category codes next to it in a "<column>F" column.
factor_columns = ('type', 'nameOrig', 'nameDest')
for column in factor_columns:
    data[column] = pd.Categorical(data[column])
for column in factor_columns:
    data[column + 'F'] = data[column].cat.codes

In [5]:
# feature engineering: hour-of-day and day-of-week
# `step` is used as an hour counter here: 24 consecutive steps make one day.
data["hourOfDay"] = data.step % 24
# Integer-divide so the day rolls over exactly where hourOfDay wraps to 0.
# The earlier round(step / 24 + 0.5) rounded half-to-even, which put steps
# 48, 96, ... (hourOfDay == 0) into the preceding day. The +1 keeps the day
# numbering unchanged for every other step.
data["dayOfWeek"] = ((data.step // 24) + 1) % 7

In [6]:
# List the labels of the `type` categorical; a label's position in this index
# is the integer code stored in `typeF`.
data.type.cat.categories

Index(['CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER'], dtype='object')

In [8]:
# based on exploratory analysis, let's train two models, one on TRANSFERS, one on CASH_OUTs
# for now, just do the TRANSFER model...
# Select by label rather than by the integer codes (4 and 1): the codes depend
# on the alphabetical order of whatever labels are present in the file and
# would silently shift if that set changed.
dataTRANSFER = data[data.type == 'TRANSFER']
dataCASH_OUT = data[data.type == 'CASH_OUT']

In [9]:
# (rows, columns) of the TRANSFER subset.
dataTRANSFER.shape

(532909, 17)

In [10]:
# (rows, columns) of the CASH_OUT subset.
dataCASH_OUT.shape

(2237500, 17)

In [11]:
# origData is the original data + new features
# data is reset to be ready for training
# NOTE: this cell rebinds dataTRANSFER / dataCASH_OUT, so re-running it without
# re-running the subset-selection cell would copy the already-reduced frames.
origDataTRANSFER = dataTRANSFER.copy()
origDataCASH_OUT = dataCASH_OUT.copy()

# Label first, then the numeric columns the models are trained on.
modelColumns = ["isFraud","amount","oldbalanceOrg","newbalanceOrig","oldbalanceDest","newbalanceDest"]
dataTRANSFER = origDataTRANSFER[modelColumns]
# Fixed copy-paste bug: this used to select from origDataTRANSFER, so the
# CASH_OUT model was silently trained on TRANSFER rows.
dataCASH_OUT = origDataCASH_OUT[modelColumns]

In [19]:
# will give train, test, val for TRANSFER and for CASH_OUT
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All reproduces the same partitions.
SPLIT_SEED = 42


def split_train_test_val(frame, seed=SPLIT_SEED):
    """Partition ``frame`` into validation, train and test sets.

    20% of the rows are held out as the validation set; the remaining pool is
    split again 80/20 into train and test.

    Returns:
        (pool, val, train, test), where ``pool`` is train and test combined.
    """
    pool, val = train_test_split(frame, test_size=0.2, random_state=seed)
    train, test = train_test_split(pool, test_size=0.2, random_state=seed)
    return pool, val, train, test


(dataTrainZTRANSFER, dataValTRANSFER,
 dataTrainTRANSFER, dataTestTRANSFER) = split_train_test_val(dataTRANSFER)
# Features are every column except the label.
dataTrainXTRANSFER = dataTrainTRANSFER.loc[:, dataTrainTRANSFER.columns != 'isFraud']
dataTrainYTRANSFER = dataTrainTRANSFER.isFraud

(dataTrainZCASH_OUT, dataValCASH_OUT,
 dataTrainCASH_OUT, dataTestCASH_OUT) = split_train_test_val(dataCASH_OUT)
dataTrainXCASH_OUT = dataTrainCASH_OUT.loc[:, dataTrainCASH_OUT.columns != 'isFraud']
dataTrainYCASH_OUT = dataTrainCASH_OUT.isFraud

In [20]:
# (rows, feature columns) of the TRANSFER training features.
dataTrainXTRANSFER.shape

(341061, 5)

In [21]:
# Number of TRANSFER training labels; should match the feature row count.
dataTrainYTRANSFER.shape

(341061,)

In [22]:
import botocore
import sagemaker
import sys

from sagemaker import RandomCutForest

session = sagemaker.Session()


def _make_rcf():
    """Build a RandomCutForest estimator with the settings both models share."""
    # specify general training job information
    return RandomCutForest(role=role,
                           train_instance_count=1,
                           train_instance_type='ml.m4.xlarge',
                           data_location='s3://{}/{}/'.format(bucket, prefix),
                           output_path='s3://{}/{}/output'.format(bucket, prefix),
                           num_samples_per_tree=512,
                           num_trees=50)


def _make_record_set(estimator, features, labels):
    """Convert a feature frame and label series into a 'train' channel record set."""
    # .values replaces DataFrame.as_matrix(), which is deprecated and removed
    # in pandas 1.0; both return the same NumPy array.
    return estimator.record_set(train=features.values,
                                labels=labels.values,
                                channel='train')


rcfTRANSFER = _make_rcf()
trainRecordSetTRANSFER = _make_record_set(rcfTRANSFER, dataTrainXTRANSFER, dataTrainYTRANSFER)

rcfCASH_OUT = _make_rcf()
trainRecordSetCASH_OUT = _make_record_set(rcfCASH_OUT, dataTrainXCASH_OUT, dataTrainYCASH_OUT)


In [23]:
# Train the TRANSFER model first, then the CASH_OUT model, streaming job logs.
for estimator, record_set in ((rcfTRANSFER, trainRecordSetTRANSFER),
                              (rcfCASH_OUT, trainRecordSetCASH_OUT)):
    estimator.fit(record_set, logs=True)

INFO:sagemaker:Creating training-job with name: randomcutforest-2019-02-14-16-08-00-084


2019-02-14 16:08:00 Starting - Starting the training job...
2019-02-14 16:08:01 Starting - Launching requested ML instances......
2019-02-14 16:09:09 Starting - Preparing the instances for training......
2019-02-14 16:10:29 Downloading - Downloading input data...
2019-02-14 16:10:54 Training - Downloading the training image..
[31mDocker entrypoint called with argument(s): train[0m
[31m[02/14/2019 16:11:09 INFO 139805685851968] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_ftp_port': 8999, u'num_samples_per_tree': 256, u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'_kvstore': u'dist_async', u'force_dense': u'true', u'epochs': 1, u'num_trees': 100, u'eval_metrics': [u'accuracy', u'precision_recall_fscore'], u'_num_kv_servers': u'auto', u'mini_batch_size': 1000}[0m
[31m[02/14/2019 16:11:09 INFO 139805685851968] Reading provided configuration from /opt/ml/input/config/hyperparam


2019-02-14 16:11:19 Uploading - Uploading generated training model
2019-02-14 16:11:19 Completed - Training job completed


INFO:sagemaker:Creating training-job with name: randomcutforest-2019-02-14-16-11-41-904


Billable seconds: 51
2019-02-14 16:11:42 Starting - Starting the training job...
2019-02-14 16:11:43 Starting - Launching requested ML instances......
2019-02-14 16:12:47 Starting - Preparing the instances for training......
2019-02-14 16:13:47 Downloading - Downloading input data...
2019-02-14 16:14:15 Training - Downloading the training image..
[31mDocker entrypoint called with argument(s): train[0m
[31m[02/14/2019 16:14:48 INFO 140113010726720] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_ftp_port': 8999, u'num_samples_per_tree': 256, u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'_kvstore': u'dist_async', u'force_dense': u'true', u'epochs': 1, u'num_trees': 100, u'eval_metrics': [u'accuracy', u'precision_recall_fscore'], u'_num_kv_servers': u'auto', u'mini_batch_size': 1000}[0m
[31m[02/14/2019 16:14:48 INFO 140113010726720] Reading provided configuration from /opt/ml/in


2019-02-14 16:14:57 Uploading - Uploading generated training model
2019-02-14 16:14:57 Completed - Training job completed
Billable seconds: 70


In [24]:
# Show the name of the most recent training job of the TRANSFER estimator.
print('Training job name: {}'.format(rcfTRANSFER.latest_training_job.job_name))

Training job name: randomcutforest-2019-02-14-16-08-00-084


In [25]:
# Show the name of the most recent training job of the CASH_OUT estimator.
print('Training job name: {}'.format(rcfCASH_OUT.latest_training_job.job_name))

Training job name: randomcutforest-2019-02-14-16-11-41-904


In [26]:
# Both models are hosted with the same endpoint configuration.
# NOTE(review): no cell in view deletes these endpoints - confirm they are
# cleaned up elsewhere.
hosting_config = dict(initial_instance_count=1, instance_type='ml.m4.xlarge')
rcf_inferenceTRANSFER = rcfTRANSFER.deploy(**hosting_config)
rcf_inferenceCASH_OUT = rcfCASH_OUT.deploy(**hosting_config)

INFO:sagemaker:Creating model with name: randomcutforest-2019-02-14-16-17-30-544
INFO:sagemaker:Creating endpoint with name randomcutforest-2019-02-14-16-08-00-084


---------------------------------------------------------------------------------------!

INFO:sagemaker:Creating model with name: randomcutforest-2019-02-14-16-24-51-741
INFO:sagemaker:Creating endpoint with name randomcutforest-2019-02-14-16-11-41-904


---------------------------------------------------------------------------!

In [27]:
# Show the endpoint name of the deployed TRANSFER predictor.
print('Endpoint name: {}'.format(rcf_inferenceTRANSFER.endpoint))

Endpoint name: randomcutforest-2019-02-14-16-08-00-084


In [28]:
# Show the endpoint name of the deployed CASH_OUT predictor.
print('Endpoint name: {}'.format(rcf_inferenceCASH_OUT.endpoint))

Endpoint name: randomcutforest-2019-02-14-16-11-41-904


In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Send requests as CSV and parse JSON responses, on both deployed predictors.
for predictor in (rcf_inferenceTRANSFER, rcf_inferenceCASH_OUT):
    predictor.content_type = 'text/csv'
    predictor.serializer = csv_serializer
    predictor.accept = 'application/json'
    predictor.deserializer = json_deserializer

# `rcf_inference` was never defined (only the TRANSFER / CASH_OUT predictors
# exist), so this cell raised NameError. The evaluation cells below look at the
# TRANSFER model, so bind the name they use to that predictor.
rcf_inference = rcf_inferenceTRANSFER

In [None]:
# `dataTest` was never defined; the evaluation below is for the TRANSFER model,
# so use its held-out test split. Work on a copy so that adding a score column
# later does not write into dataTestTRANSFER.
dataTest = dataTestTRANSFER.copy()
# Features are every column except the label.
dataTestX = dataTest.loc[:, dataTest.columns != 'isFraud']
# .values replaces DataFrame.as_matrix(), which is deprecated and removed in
# pandas 1.0; both return the same NumPy array.
dataTestXM = dataTestX.values
dataTestY = dataTest.isFraud
# Score the whole test matrix in a single endpoint call.
results = rcf_inference.predict(dataTestXM)

In [None]:
# (rows, feature columns) of the matrix sent to the endpoint.
dataTestXM.shape

In [None]:
# (rows, columns) of the test frame, label column included.
dataTest.shape

In [None]:
# Collect the 'score' field of each entry in the endpoint response.
scores = []
for entry in results['scores']:
    scores.append(entry['score'])

# Attach the scores to the test frame, aligned on its index, and preview it.
score_column = pd.Series(scores, index=dataTest.index)
dataTest['score'] = score_column
dataTest.head()

In [None]:
# Window of test rows to draw; use (0, len(dataTest)) to plot every row.
start, end = 0, 100
dataTest_subset = dataTest.iloc[start:end]
score_window = dataTest_subset['score']

# Two y-axes over a shared x-axis: fraud label on the left, score on the right.
# NOTE(review): x values are the frame's row labels, which presumably come out
# of the random split unsorted - confirm the line plot reads as intended.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

# Left axis: ground-truth fraud flag.
ax1.plot(dataTest_subset['isFraud'], color='C0', alpha=0.8)
ax1.grid(which='major', axis='both')
ax1.set_ylabel('is Fraud', color='C0')
ax1.tick_params('y', colors='C0')
ax1.set_ylim(0, 1)

# Right axis: anomaly score, scaled to the range seen in this window.
ax2.plot(score_window, color='C1')
ax2.set_ylabel('Anomaly Score', color='C1')
ax2.tick_params('y', colors='C1')
ax2.set_ylim(min(score_window), max(score_window))

fig.set_figwidth(10)

In [None]:
# (Removed a leftover interactive `?plt` help lookup: it is not valid Python
# outside IPython and opens the pager on every Run All.)

In [None]:
# Flag rows whose anomaly score lies more than 20 standard deviations above
# the mean score of the test set.
test_scores = dataTest['score']
score_mean, score_std = test_scores.mean(), test_scores.std()
score_cutoff = score_mean + 20 * score_std

anomalies = dataTest.loc[test_scores > score_cutoff]
anomalies

In [None]:
###
### CONCLUSION: pretty bad model, even training on just the TRANSFER subset.
###