In [3]:
import io
import os
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 

import boto3
import sagemaker
from sagemaker import get_execution_role

%matplotlib inline

!mkdir data

In [4]:
# sagemaker session, role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 bucket name
bucket = sagemaker_session.default_bucket()



In [396]:
s3 = boto3.resource('s3')
b = s3.Bucket('sagemaker-mlai-harvesting')

b.download_file( 'data/MLAI_ParsedDataSet.tsv', 'data/data.tsv')
b.download_file( "data/MinimalLogs/Minimal_May01.rpt", 'data/may1.tsv')
b.download_file( "data/MinimalLogs/Minimal_May02.rpt", 'data/may2.tsv')
b.download_file( "data/MinimalLogs/Minimal_May03.rpt", 'data/may3.tsv')
b.download_file( "data/MinimalLogs/Minimal_OnlyLT.rpt", 'data/lt-only.tsv')


# !head data/data.tsv

In [408]:
may1 = pd.read_csv('data/may1.tsv',sep='\t')
may2 = pd.read_csv('data/may2.tsv',sep='\t')
may3 = pd.read_csv('data/may3.tsv',sep='\t')
lt = pd.read_csv('data/lt-only.tsv',sep='\t')

bad_col='BadActor'
sess_col='SessionNo'
txn_col='Act'

txn= may1.append([may2, may3, lt])
txn[txn[bad_col]==1]


Unnamed: 0,SessionNo,LogTime,CustID,GroupID,ProfID,Act,BadActor
7704,-40132942,2019-05-01 19:18:52.000,s8873650,main,ehost,111,1
7707,-1,2019-05-01 19:18:53.000,s8873650,main,ehost,201,1
7863,-40132942,2019-05-01 19:19:22.000,s8873650,main,ehost,121,1
8391,-1,2019-05-01 19:20:29.000,s8875270,main,ehost,201,1
8396,1731108217,2019-05-01 19:20:29.000,s8875270,main,ehost,111,1
8722,1731108217,2019-05-01 19:21:07.000,s8875270,main,ehost,121,1
8963,401087102,2019-05-01 19:21:51.000,s8875834,main,ehost,111,1
8968,-1,2019-05-01 19:21:52.000,s8875834,main,ehost,201,1
9356,401087102,2019-05-01 19:22:47.000,s8875834,main,ehost,121,1
9690,401087102,2019-05-01 19:23:39.000,s8875834,main,ehost,124,1


In [435]:
txns = pd.DataFrame(np.sort(txn['Act'].unique()))
lt_txns = pd.DataFrame(np.sort(lt['Act'].unique()))

# txns[txns.isin(lt_txns).all(1)], txns[~txns.isin(lt_txns).all(1)],lt_txns[~lt_txns.isin(txns).all(1)]


In [436]:
def flatten_txns( txn_log ):
    txn_narrow = txn_log[[sess_col, txn_col,bad_col]]
    txn_pivot = pd.pivot_table(txn_narrow, index=[sess_col,bad_col], columns = [txn_col],aggfunc=[np.size]).fillna(0)
    txn_pivot.columns = txn_pivot.columns.droplevel(0)
    txn_flat = txn_pivot.rename_axis(None, axis=1).reset_index()
    return txn_flat

In [437]:
flatten_txns( txn ).head(10)

Unnamed: 0,SessionNo,BadActor,111,112,114,115,116,117,118,119,...,403,404,406,407,410,411,511,513,601,607
0,-2147481927,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2147360137,1,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2147317281,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2147002735,0,3.0,0.0,0.0,6.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2146953899,0,0.0,3.0,0.0,60.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-2146926264,0,3.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,-2146915841,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-2146723372,0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,-2146089473,0,3.0,0.0,0.0,3.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-2145757832,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [438]:
flat = flatten_txns( txn )

In [439]:
# def func_split( flat ):
#     flat_x = shuffle.drop('Bad_Actor', axis=1)
#     flat_y = shuffle['Bad_Actor']
#     return flat_x, flat_y

def split_frame( df, train_frac ):
    l = len(df)
    test_frac = (1-train_frac)/2
    tr = int(train_frac * l)
    te = int(tr + test_frac * l)
    
    train = df[:tr]
    test = df[tr:te]
    val = df[te:]
    return [train, test, val]

In [440]:
sets= split_frame(np.arange(10),.4)
df = []
for i in sets:
    np.random.shuffle(i)
    df.append( i)
    
df

[array([1, 3, 0, 2]), array([4, 5, 6]), array([9, 8, 7])]

In [441]:
def train_split( flat, bad_split=.8 ):
    bad = flat[flat[bad_col]==1]
    good = flat[flat[bad_col]==0]
    
    bads = split_frame(bad, bad_split)
    goods = split_frame(good, bad_split)
    
    dfs = []
    for i in range(3):
        df = bads[i].append(goods[i]).drop(sess_col,axis=1).sample(frac=1)
        dfs.append( df )
    
    return dfs
    


# Split the data and upload to S3
Break the set into train, test, and validation collections and output CSV's.
For Sagemaker, leave out row indices and column headers.

In [442]:
dfs = train_split(flat, .8)

!mkdir out

s3_client = boto3.client('s3')
bucket = "sagemaker-mlai-harvesting"

for i, df in enumerate(dfs):
    files = ["train","test","validate"]
    file = "out/{}.csv".format(files[i])
    df.to_csv(path_or_buf= file, header=False, index=False  )

    print("Uploading {} to {}".format(file, bucket))

    response = s3_client.upload_file(file, bucket, file)
    print(response)
    
    
    


mkdir: cannot create directory ‘out’: File exists
Uploading out/train.csv to sagemaker-mlai-harvesting
None
Uploading out/test.csv to sagemaker-mlai-harvesting
None
Uploading out/validate.csv to sagemaker-mlai-harvesting
None


In [455]:
s3.Bucket("sagemaker-mlai-harvesting").get_available_subresources()

['Acl',
 'Cors',
 'Lifecycle',
 'LifecycleConfiguration',
 'Logging',
 'Notification',
 'Object',
 'Policy',
 'RequestPayment',
 'Tagging',
 'Versioning',
 'Website']

# Prepare and train a model

In [472]:
%%time
region = 'us-east-1'
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(region, 'xgboost')


from time import gmtime, strftime

job_name = 'harvesting-xgboost-binary-classification' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)

#Ensure that the training and validation data folders generated above are reflected in the "InputDataConfig" parameter below.

create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": os.path.join("s3://",bucket, "out", "xgb-class") 
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "silent":"0",
        "objective":"binary:logistic",
        "num_round":"50"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": os.path.join( "s3://", bucket, "out" ), 
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": os.path.join( "s3://", bucket, "out" ),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None"
        }
    ]
}


client = boto3.client('sagemaker', region_name=region)
client.create_training_job(**create_training_params)

import time

status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)
while status !='Completed' and status!='Failed':
    time.sleep(60)
    status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
    print(status)

Training job harvesting-xgboost-binary-classification2019-06-12-17-23-43
InProgress
InProgress
InProgress
Completed
CPU times: user 163 ms, sys: 0 ns, total: 163 ms
Wall time: 3min


In [474]:
%%time
import boto3
from time import gmtime, strftime

model_name="harvesting-xgboost-binary-cl-2019-06-12-17-23-43"+ '-model'
print(model_name)

info = client.describe_training_job(TrainingJobName=job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

primary_container = {
    'Image': container,
    'ModelDataUrl': model_data
}

create_model_response = client.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

harvesting-xgboost-binary-cl-2019-06-12-17-23-43-model
s3://sagemaker-mlai-harvesting/out/xgb-class/harvesting-xgboost-binary-classification2019-06-12-17-23-43/output/model.tar.gz
arn:aws:sagemaker:us-east-1:872344130825:model/harvesting-xgboost-binary-cl-2019-06-12-17-23-43-model
CPU times: user 15 ms, sys: 4.03 ms, total: 19.1 ms
Wall time: 335 ms


In [475]:
from time import gmtime, strftime

endpoint_config_name = 'Harvest-XGBoostEndpointConfig-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)
create_endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType':'ml.m4.xlarge',
        'InitialVariantWeight':1,
        'InitialInstanceCount':1,
        'ModelName':model_name,
        'VariantName':'AllTraffic'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])


Harvest-XGBoostEndpointConfig-2019-06-12-17-30-04
Endpoint Config Arn: arn:aws:sagemaker:us-east-1:872344130825:endpoint-config/harvest-xgboostendpointconfig-2019-06-12-17-30-04


In [476]:
%%time
import time

endpoint_name = 'Harvest-XGBoostEndpoint-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_name)
create_endpoint_response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)
print(create_endpoint_response['EndpointArn'])

resp = client.describe_endpoint(EndpointName=endpoint_name)
status = resp['EndpointStatus']
print("Status: " + status)

while status=='Creating':
    time.sleep(60)
    resp = client.describe_endpoint(EndpointName=endpoint_name)
    status = resp['EndpointStatus']
    print("Status: " + status)

print("Arn: " + resp['EndpointArn'])
print("Status: " + status)



Harvest-XGBoostEndpoint-2019-06-12-17-30-34
arn:aws:sagemaker:us-east-1:872344130825:endpoint/harvest-xgboostendpoint-2019-06-12-17-30-34
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-east-1:872344130825:endpoint/harvest-xgboostendpoint-2019-06-12-17-30-34
Status: InService
CPU times: user 168 ms, sys: 5.78 ms, total: 174 ms
Wall time: 9min 1s


In [488]:
runtime_client = boto3.client('runtime.sagemaker', region_name=region)



In [557]:
import json
from itertools import islice
import math
import struct

!head -1000 out/test.csv > out/single-test.csv

file_name = 'out/single-test.csv' #customize to your test file

In [558]:
csv = pd.read_csv(file_name, header=None)
csv.columns
label = csv[0]
csv = csv.drop(0,axis=1)

single = "out/single.csv"

csv.to_csv(path_or_buf=single, header=False, index=False)

with open(single, 'r') as f:
    payload = f.read().strip()

In [561]:
pd.DataFrame(label)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [579]:
response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='text/csv', 
                                   Body=payload)
result = response['Body'].read()
result = result.decode("utf-8")
result = result.split(',')
result = [round(float(i)) for i in result]
# pd.DataFrame(label)

In [597]:
comp = pd.concat( [label, pd.DataFrame(result)], axis = 1)
comp.columns =["label",'prediction']

label_positive = comp['label'] == 1
predict_positive = comp['prediction'] == 1


In [605]:
tp = len( comp[label_positive & predict_positive])
fp = len( comp[~label_positive & predict_positive])
tn = len( comp[~label_positive & ~predict_positive])
fn = len( comp[label_positive & ~predict_positive])


m = len(comp)

accuracy = (tp+tn)/m
precision = tp/(tp+fp)
recall = tp/(tp+fn)

print("accuracy: {} precision: {} recall {}".format(accuracy, precision,recall))

accuracy: 0.968 precision: 1.0 recall 0.8391959798994975


In [604]:
tp,fp,tn,fn

(167, 0, 801, 32)