In [1]:
# Location of the input data on S3; the cells below combine these as
# s3://<data_bucket>/<subfolder>/<dataset>.
data_bucket = 'ml-automation'
# Key prefix inside the bucket; processed splits and model output are also
# written under this prefix by later cells.
subfolder = 'katana'
# File name of the pre-processed invoice CSV to load.
dataset = 'invoice_data_prog_processed.csv'

In [2]:
import pandas as pd
import boto3
import sagemaker
import s3fs
from sklearn.model_selection import train_test_split

# Execution role reported by the SageMaker SDK; it is passed to the Estimator
# when the training job is configured further down.
role = sagemaker.get_execution_role()
# Authenticated (anon=False) S3 filesystem handle, used later to write the
# processed CSV splits.
s3 = s3fs.S3FileSystem(anon=False)

In [3]:
# Read the pre-processed invoice dataset directly from S3 and preview it.
dataset_uri = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(dataset_uri)
df.head()

Unnamed: 0,invoice_risk_decision,customer,payment_due_date,payment_date,grand_total
0,0,id_24,2,2,64.25
1,0,id_11,3,3,50.34
2,0,id_29,4,4,40.03
3,0,id_28,4,2,94.86
4,1,id_13,2,8,65.15


In [4]:
# Report the dataset size and the class balance of the first column
# (invoice_risk_decision, which is used as the label further down).
# The f-prefix is required: without it the placeholder is printed verbatim
# instead of the row count.
print(f'Number of rows in dataset: {df.shape[0]}')
print(df[df.columns[0]].value_counts())

Number of rows in dataset: {df.shape[0]}
0    171
1    135
Name: invoice_risk_decision, dtype: int64


In [5]:
# One-hot encode the categorical columns (in the preview this expands
# `customer` into customer_id_* indicator columns); numeric columns are kept.
encoded_data = pd.get_dummies(data=df)
encoded_data.head()

Unnamed: 0,invoice_risk_decision,payment_due_date,payment_date,grand_total,customer_id_11,customer_id_12,customer_id_13,customer_id_14,customer_id_15,customer_id_18,...,customer_id_45,customer_id_46,customer_id_47,customer_id_48,customer_id_49,customer_id_50,customer_id_6,customer_id_7,customer_id_8,customer_id_9
0,0,2,2,64.25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,3,3,50.34,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,4,40.03,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,4,2,94.86,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2,8,65.15,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Absolute correlation of every column with the label.
label_corr = encoded_data.corr()['invoice_risk_decision'].abs()
# Keep only the columns whose absolute correlation with the label exceeds 0.1.
columns = label_corr.index[label_corr > .1]
corrs = label_corr.loc[columns]
corrs

invoice_risk_decision    1.000000
payment_due_date         0.349879
payment_date             0.432499
grand_total              0.182643
customer_id_11           0.166688
customer_id_12           0.111988
customer_id_19           0.110278
customer_id_27           0.102258
customer_id_4            0.130518
customer_id_44           0.114517
customer_id_50           0.179509
customer_id_8            0.131556
Name: invoice_risk_decision, dtype: float64

In [7]:
# Stratified split: 70% training, 30% held out. The full frame (label column
# included) is passed as X because the splits are exported with the label in
# place; the label-only outputs of train_test_split are discarded.
y = encoded_data['invoice_risk_decision']
train_df, test_and_val_data, _, _ = train_test_split(
    encoded_data, y, test_size=0.3, stratify=y, random_state=0)

# Split the held-out 30% again, stratified: about two thirds validation,
# one third test.
y = test_and_val_data['invoice_risk_decision']
val_df, test_df, _, _ = train_test_split(
    test_and_val_data, y, test_size=0.333, stratify=y, random_state=0)

print(train_df.shape, val_df.shape, test_df.shape)

# Per-split size and class balance. The f-prefix is required so the row
# count is interpolated rather than printed as a literal placeholder.
for split_name, split_df in (('Train', train_df),
                             ('Validate', val_df),
                             ('Test', test_df)):
    print()
    print(f'Number of rows in {split_name} dataset: {split_df.shape[0]}')
    print(split_df['invoice_risk_decision'].value_counts())

(214, 44) (61, 44) (31, 44)

Number of rows in Train dataset: {train_df.shape[0]}
0    120
1     94
Name: invoice_risk_decision, dtype: int64

Number of rows in Validate dataset: {val_df.shape[0]}
0    34
1    27
Name: invoice_risk_decision, dtype: int64

Number of rows in Test dataset: {test_df.shape[0]}
0    17
1    14
Name: invoice_risk_decision, dtype: int64


In [8]:
# Serialise each split to CSV bytes. Train and validation are written without
# a header row; the test split keeps its header.
train_data = train_df.to_csv(header=False, index=False).encode()
val_data = val_df.to_csv(header=False, index=False).encode()
test_data = test_df.to_csv(header=True, index=False).encode()

# All processed files live under the same S3 prefix.
processed_prefix = f'{data_bucket}/{subfolder}/processed'

# Upload the three payloads through the s3fs handle.
for file_name, payload in (('train.csv', train_data),
                           ('val.csv', val_data),
                           ('test.csv', test_data)):
    with s3.open(f'{processed_prefix}/{file_name}', 'wb') as f:
        f.write(payload)

# Input channel descriptors pointing the training job at the uploaded CSVs.
train_input = sagemaker.s3_input(
    s3_data=f's3://{processed_prefix}/train.csv', content_type='csv')
val_input = sagemaker.s3_input(
    s3_data=f's3://{processed_prefix}/val.csv', content_type='csv')

In [9]:
sess = sagemaker.Session()

# XGBoost training image per AWS region; only these four regions are mapped.
containers = {
    'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
    'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
    'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
    'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest',
}

# Select the image matching the region of the default boto3 session.
region = boto3.Session().region_name
estimator = sagemaker.estimator.Estimator(
    containers[region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    output_path=f's3://{data_bucket}/{subfolder}/output',
    sagemaker_session=sess,
)

# Binary classifier scored by AUC, capped at 100 rounds with early stopping
# set to 10 rounds.
hyperparameters = {
    'max_depth': 3,
    'subsample': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'num_round': 100,
    'early_stopping_rounds': 10,
}
estimator.set_hyperparameters(**hyperparameters)

# Launch the training job with the train and validation channels.
channels = {'train': train_input, 'validation': val_input}
estimator.fit(channels)

INFO:sagemaker:Creating training-job with name: xgboost-2018-11-29-20-57-33-284


2018-11-29 20:57:33 Starting - Starting the training job...
2018-11-29 20:57:39 Starting - Launching requested ML instances......
2018-11-29 20:58:46 Starting - Preparing the instances for training......
2018-11-29 20:59:50 Downloading - Downloading input data..

2018-11-29 21:00:22 Training - Training image download completed. Training in progress.
2018-11-29 21:00:22 Uploading - Uploading generated training model
2018-11-29 21:00:22 Completed - Training job completed
[31mArguments: train[0m
[31m[2018-11-29:21:00:17:INFO] Running standalone xgboost training.[0m
[31m[2018-11-29:21:00:17:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 154.28mb[0m
[31m[2018-11-29:21:00:17:INFO] Determined delimiter of CSV input is ','[0m
[31m[21:00:17] S3DistributionType set as FullyReplicated[0m
[31m[21:00:17] 214x43 matrix with 9202 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2018-11-29:21:00:17:INF