In [3]:
import io
import os
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 

import boto3
import sagemaker
from sagemaker import get_execution_role

%matplotlib inline

!mkdir data

In [4]:
# sagemaker session, role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 bucket name
bucket = sagemaker_session.default_bucket()



In [396]:
s3 = boto3.resource('s3')
b = s3.Bucket('sagemaker-mlai-harvesting')

b.download_file( 'data/MLAI_ParsedDataSet.tsv', 'data/data.tsv')
b.download_file( "data/MinimalLogs/Minimal_May01.rpt", 'data/may1.tsv')
b.download_file( "data/MinimalLogs/Minimal_May02.rpt", 'data/may2.tsv')
b.download_file( "data/MinimalLogs/Minimal_May03.rpt", 'data/may3.tsv')
b.download_file( "data/MinimalLogs/Minimal_OnlyLT.rpt", 'data/lt-only.tsv')


# !head data/data.tsv

In [408]:
may1 = pd.read_csv('data/may1.tsv',sep='\t')
may2 = pd.read_csv('data/may2.tsv',sep='\t')
may3 = pd.read_csv('data/may3.tsv',sep='\t')
lt = pd.read_csv('data/lt-only.tsv',sep='\t')

bad_col='BadActor'
sess_col='SessionNo'
txn_col='Act'

txn= may1.append([may2, may3, lt])
txn[txn[bad_col]==1]


Unnamed: 0,SessionNo,LogTime,CustID,GroupID,ProfID,Act,BadActor
7704,-40132942,2019-05-01 19:18:52.000,s8873650,main,ehost,111,1
7707,-1,2019-05-01 19:18:53.000,s8873650,main,ehost,201,1
7863,-40132942,2019-05-01 19:19:22.000,s8873650,main,ehost,121,1
8391,-1,2019-05-01 19:20:29.000,s8875270,main,ehost,201,1
8396,1731108217,2019-05-01 19:20:29.000,s8875270,main,ehost,111,1
8722,1731108217,2019-05-01 19:21:07.000,s8875270,main,ehost,121,1
8963,401087102,2019-05-01 19:21:51.000,s8875834,main,ehost,111,1
8968,-1,2019-05-01 19:21:52.000,s8875834,main,ehost,201,1
9356,401087102,2019-05-01 19:22:47.000,s8875834,main,ehost,121,1
9690,401087102,2019-05-01 19:23:39.000,s8875834,main,ehost,124,1


In [404]:
txns = pd.DataFrame(np.sort(txn['Act'].unique()))
lt_txns = pd.DataFrame(np.sort(lt['Act'].unique()))

# txns[txns.isin(lt_txns).all(1)], txns[~txns.isin(lt_txns).all(1)],lt_txns[~lt_txns.isin(txns).all(1)]


In [411]:
def flatten_txns( txn_log ):
    txn_narrow = txn_log[[sess_col, txn_col,bad_col]]
    txn_pivot = pd.pivot_table(txn_narrow, index=[sess_col,bad_col], columns = [txn_col],aggfunc=[np.size]).fillna(0)
    txn_pivot.columns = txn_pivot.columns.droplevel(0)
    txn_flat = txn_pivot.rename_axis(None, axis=1).reset_index()
    return txn_flat

In [412]:
flatten_txns( txn ).head(10)

Unnamed: 0,SessionNo,BadActor,111,112,114,115,116,117,118,119,...,403,404,406,407,410,411,511,513,601,607
0,-2147481927,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2147360137,1,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2147317281,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2147002735,0,3.0,0.0,0.0,6.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2146953899,0,0.0,3.0,0.0,60.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-2146926264,0,3.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,-2146915841,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-2146723372,0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,-2146089473,0,3.0,0.0,0.0,3.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-2145757832,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [413]:
flat = flatten_txns( txn )

In [414]:
# def func_split( flat ):
#     flat_x = shuffle.drop('Bad_Actor', axis=1)
#     flat_y = shuffle['Bad_Actor']
#     return flat_x, flat_y

def split_frame( df, train_frac ):
    l = len(df)
    test_frac = (1-train_frac)/2
    tr = int(train_frac * l)
    te = int(tr + test_frac * l)
    
    train = df[:tr]
    test = df[tr:te]
    val = df[te:]
    return [train, test, val]

In [415]:
sets= split_frame(np.arange(10),.4)
df = []
for i in sets:
    np.random.shuffle(i)
    df.append( i)
    
df

[array([3, 1, 2, 0]), array([5, 4, 6]), array([9, 7, 8])]

In [416]:
def train_split( flat, bad_split=.8 ):
    bad = flat[flat['Bad_Actor']==1]
    good = flat[flat['Bad_Actor']==0]
    
    bads = split_frame(bad, bad_split)
    goods = split_frame(good, bad_split)
    
    dfs = []
    for i in range(3):
        df = bads[i].append(goods[i]).drop('SessionNo',axis=1).sample(frac=1)
        dfs.append( df )
    
    return dfs
    


# Split the data and upload to S3
Break the set into train, test, and validation collections and output CSV's.
For Sagemaker, leave out row indices and column headers.

In [420]:
dfs = train_split(flat, .8)

!mkdir out

s3_client = boto3.client('s3')
bucket = "sagemaker-mlai-harvesting"

for i, df in enumerate(dfs):
    files = ["train","test","validate"]
    file = "out/{}.csv".format(files[i])
    df.to_csv(path_or_buf= file, header=False, index=False  )

    print("Uploading {} to {}".format(file, bucket))

    response = s3_client.upload_file(file, bucket, file)
    print(response)
    
    
    


mkdir: cannot create directory ‘out’: File exists
Uploading out/train.csv to sagemaker-mlai-harvesting
None
Uploading out/test.csv to sagemaker-mlai-harvesting
None
Uploading out/validate.csv to sagemaker-mlai-harvesting
None


# Prepare and train a model

In [362]:
bucket

'sagemaker-us-east-1-872344130825'