# Model creation and evaluation
The next step is to create a few models, tune their hyperparameters, and compare them using the F1 score. The first model, XGBoost, serves as the benchmark.

To build the XGBoost model, I'll use SageMaker's XGBoost API.

In [3]:
import os
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role

# The current execution role is required when creating the model, because the
# training and inference code will need to access the model artifacts.
role = get_execution_role()

# Keep a handle on the current SageMaker session; it is reused for uploads,
# bucket lookup and the estimator below.
session = sagemaker.Session()

# S3 prefix (the "folder") under which this project's files are stored.
prefix = 'covid19-classifier'

# URI of the XGBoost container image for the region this session runs in.
region = session.boto_region_name
container = get_image_uri(region, 'xgboost')


	get_image_uri(region, 'xgboost', '1.0-1').


In [16]:
import pandas as pd

# The test CSV has no header row: column 0 holds the label and the remaining
# columns hold the features.
test_df = pd.read_csv('data/test.csv', encoding='latin2', header=None)
test_y = test_df.iloc[:, 0]
test_x = test_df.iloc[:, 1:]

# Persist the features on their own (no index, no header) so the file can be
# uploaded and scored without the label column.
test_x.to_csv('data/test_x.csv', index=False, header=False)

# Number of positive (label == 1) rows in the test set.
print((test_y == 1).sum())

14459


In [30]:
data_dir = 'data'


def _upload_to_s3(filename):
    """Upload `filename` from `data_dir` under the project prefix and return
    whatever `session.upload_data` returns for it."""
    local_path = os.path.join(data_dir, filename)
    return session.upload_data(local_path, key_prefix=prefix)


# Same four files, uploaded in the same order as before.
test_location = _upload_to_s3('test_x.csv')
test_2020_location = _upload_to_s3('x_test_2020.csv')
val_location = _upload_to_s3('val.csv')
train_location = _upload_to_s3('train.csv')

In [11]:
# S3 location passed to the estimator as its output path.
model_output_path = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

xgb = sagemaker.estimator.Estimator(
    container,                            # container image to train with
    role,                                 # current IAM role
    train_instance_count=1,               # number of compute instances
    train_instance_type='ml.m4.xlarge',   # kind of compute instance
    output_path=model_output_path,
    sagemaker_session=session,
)

# The label is binary, hence the 'binary:logistic' objective.
xgb_hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=500,
)
xgb.set_hyperparameters(**xgb_hyperparameters)



In [18]:
# Wrap the uploaded train / validation locations as CSV inputs for training.
s3_input_train, s3_input_validation = (
    sagemaker.s3_input(s3_data=s3_uri, content_type='csv')
    for s3_uri in (train_location, val_location)
)



In [13]:
# Fit the estimator, passing the train and validation inputs by channel name.
training_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb.fit(training_channels)

2020-07-27 22:47:33 Starting - Starting the training job...
2020-07-27 22:47:36 Starting - Launching requested ML instances......
2020-07-27 22:48:53 Starting - Preparing the instances for training......
2020-07-27 22:49:56 Downloading - Downloading input data
2020-07-27 22:49:56 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-07-27:22:50:15:INFO] Running standalone xgboost training.[0m
[34m[2020-07-27:22:50:15:INFO] File size need to be processed in the node: 60.81mb. Available memory size in the node: 8490.77mb[0m
[34m[2020-07-27:22:50:15:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:50:15] S3DistributionType set as FullyReplicated[0m
[34m[22:50:16] 281088x32 matrix with 8994816 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-07-27:22:50:16:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:50:16] S3DistributionType set as FullyReplicated[0m
[34m[22:50:16] 70272x32 matrix 

In [46]:
def test_and_print_metrics(xgb_object, test_location, output_name, gt):
    xgb_transformer = xgb_object.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')    
    xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
    xgb_transformer.wait()
    !aws s3 cp --recursive $xgb_transformer.output_path $data_dir
    predictions = pd.read_csv(os.path.join('data', '{}.out'.format(output_name)), header=None)
    predictions = [round(num) for num in predictions.squeeze().values]
    print_metrics(predictions, gt)
    

In [47]:
def print_metrics(preds, gt):
    """Print F1, accuracy, precision and recall of `preds` against `gt`.

    Parameters
    ----------
    preds
        Predicted class labels.
    gt
        Ground-truth class labels, in the same order as `preds`.

    Returns
    -------
    None
        The four scores are only printed, one per line.
    """
    from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score

    report = (
        ("F1: ", f1_score),
        ("acc: ", accuracy_score),
        ("prec: ", precision_score),
        ("recall: ", recall_score),
    )
    for label, metric in report:
        # Ground truth goes first, predictions second.
        print(label, metric(gt, preds))

In [48]:
# Score the held-out test features and compare against their labels.
test_and_print_metrics(
    xgb_object=xgb,
    test_location=test_location,
    output_name='test_x.csv',
    gt=test_y,
)



.......

KeyboardInterrupt: 

In [None]:
# NOTE(review): test_and_print_metrics is defined with a required fourth
# argument `gt` (ground-truth labels), which is not supplied here, so this call
# raises TypeError as written. Pass the labels for the 2020 test set as `gt`.
test_and_print_metrics(xgb, test_2020_location, 'x_test_2020.csv')