# Model creation and evaluation
The next step was to create a few models, hypertune them and compare them using the F1 metric. The first model will be the benchmark model, which is the XGBoost.

To construct the XGBoost, I'll use the SageMaker's XGBoost API.

In [72]:
import os
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role

# Our current execution role is require when creating the model as the training
# and inference code will need to access the model artifacts.
role = get_execution_role()
session = sagemaker.Session() # Store the current SageMaker session
# S3 prefix (which folder will we use)
prefix = 'covid19-classifier'
container = get_image_uri(session.boto_region_name, 'xgboost')


	get_image_uri(region, 'xgboost', '1.0-1').


In [73]:
import pandas as pd
test_df = pd.read_csv('data/test.csv', encoding='latin2')
test_x = test_df.iloc[:, 1:]
test_x.to_csv('data/test_x.csv', index=False)
test_y = test_df.iloc[:,0]
print(len(test_y[test_y == 1]) )

12998


In [96]:
data_dir = 'data'

test_location = session.upload_data(os.path.join(data_dir, 'x_2020.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'val.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [75]:
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)

# TODO: Set the XGBoost hyperparameters in the xgb object. Don't forget that in this case we have a binary
#       label so we should be using the 'binary:logistic' objective.

# Solution:
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)



In [76]:
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')



In [77]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2020-07-24 01:43:26 Starting - Starting the training job...
2020-07-24 01:43:29 Starting - Launching requested ML instances......
2020-07-24 01:44:44 Starting - Preparing the instances for training......
2020-07-24 01:45:40 Downloading - Downloading input data...
2020-07-24 01:46:14 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-07-24:01:46:39:INFO] Running standalone xgboost training.[0m
[34m[2020-07-24:01:46:39:INFO] File size need to be processed in the node: 64.15mb. Available memory size in the node: 8508.95mb[0m
[34m[2020-07-24:01:46:39:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:46:39] S3DistributionType set as FullyReplicated[0m
[34m[01:46:40] 262998x37 matrix with 9730926 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-07-24:01:46:40:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:46:40] S3DistributionType set as FullyReplicated[0m
[34m[01:46:40] 65750x37 matr

In [78]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')




In [None]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

............

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir


In [None]:
predictions = pd.read_csv(os.path.join('data', 'x_2020.csv.out'), header=None)
predictions = [round(num) for num in predictions.squeeze().values]
print(predictions)

In [None]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score
import numpy as np
index = total_Y.astype(int) == 0
reverse_Y = pd.Series([ int(not bool(i)) for i in total_Y.astype('int')])
print(predictions[predictions==0])
print("F1: ", f1_score(reverse_Y[index], predictions[1:][index]))
print("acc: ",accuracy_score(reverse_Y[index], predictions[1:][index]))
print("prec: ", precision_score(reverse_Y[index], predictions[1:][index]))
print("recall: ", recall_score(reverse_Y[index], predictions[1:][index]))


In [86]:
df_2020 = pd.read_csv('data/2020.csv',sep=';', encoding='latin2')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
def categorize_and_fill(df_in):
    # feature_list is the list of selected features
    df_out_X = df_in[feature_list + ['CLASSI_FIN']]
    
    
    # converting categorical into numeric
    df_out_X.loc[:,'SG_UF_NOT'] = df_out_X['SG_UF_NOT'].astype('category').cat.codes
        
    df_out_X.loc[:,'CS_SEXO'] = df_out_X['CS_SEXO'].astype('category').cat.codes
    
    # the age in older dataframes has two formats
    # one where the value of 'NU_IDADE_N' is the category of if
    # as in years, months or days, and the other where this definition
    # is in another column, 'TP_IDADE'
    
    # to tackle that, I did it like this
    
    df_out_X.loc[pd.notnull(df_out_X['TP_IDADE']) & (df_out_X['TP_IDADE'] != 3), ['NU_IDADE_N']] = 0
    df_out_X.loc[(df_out_X['NU_IDADE_N'] > 4000), ['NU_IDADE_N']] = df_out_X[
        ((df_out_X['NU_IDADE_N'] > 4000))]['NU_IDADE_N'] % 1000 
    df_out_X.loc[df_out_X['NU_IDADE_N'] > 1000, ['NU_IDADE_N']] = 0

    # removing the 'TP_IDADE' column, that represents the age kind
    df_out_X = df_out_X.drop(labels='TP_IDADE', axis=1)
    
    # filling the null values with the 'ignored' token
    df_out_X = df_out_X.fillna(9)
    
    # since the classifier will only classify if it's covid or not,
    # it's actually a binary classifier, so there are only two classes,
    # is covid and is not covid
    
    return df_out_X


In [80]:
feature_list = ['SG_UF_NOT',
                'CS_SEXO',
                'TP_IDADE',
                'NU_IDADE_N',
                'CS_RACA',
                'SURTO_SG',
                'NOSOCOMIAL',
                'AVE_SUINO',
                'FEBRE',
                'TOSSE',
                'GARGANTA',
                'DISPNEIA',
                'DESC_RESP',
                'SATURACAO',
                'DIARREIA',
                'VOMITO',
                'OUTRO_SIN',
                'PUERPERA',
                'CARDIOPATI',
                'HEMATOLOGI',
                'SIND_DOWN',
                'HEPATICA',
                'ASMA',
                'DIABETES',
                'NEUROLOGIC',
                'PNEUMOPATI',
                'RENAL',
                'OBESIDADE',
                'VACINA',
                'ANTIVIRAL',
                'TP_ANTIVIR',
                'HOSPITAL',
                'UTI',
                'SUPORT_VEN',
                'RAIOX_RES',
                'AMOSTRA',
                'TP_AMOSTRA',
                'EVOLUCAO'
               ]

In [87]:
df_2020 = categorize_and_fill(df_2020)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [88]:

df_2020 = df_2020[(df_2020['CLASSI_FIN'] == 4) | (df_2020['CLASSI_FIN'] == 5)]
df_2020.loc[df_2020['CLASSI_FIN'] == 4, 'CLASSI_FIN'] = 0
df_2020.loc[df_2020['CLASSI_FIN'] == 5, 'CLASSI_FIN'] = 1

In [90]:
print(df_2020[df_2020['CLASSI_FIN'] == 1])

        SG_UF_NOT  CS_SEXO  NU_IDADE_N  CS_RACA  SURTO_SG  NOSOCOMIAL  \
2207           25        0         0.0      1.0       2.0         2.0   
3408           25        2        74.0      1.0       2.0         2.0   
3449           25        0        49.0      4.0       2.0         2.0   
3580            6        0        52.0      4.0       2.0         2.0   
3653           17        2         1.0      9.0       2.0         2.0   
...           ...      ...         ...      ...       ...         ...   
436505         10        2        87.0      4.0       2.0         2.0   
436506         15        0        68.0      1.0       9.0         9.0   
436507         15        0        71.0      4.0       2.0         2.0   
436512          0        0        45.0      4.0       9.0         9.0   
436522          1        0        76.0      4.0       1.0         2.0   

        AVE_SUINO  FEBRE  TOSSE  GARGANTA  ...  ANTIVIRAL  TP_ANTIVIR  \
2207          2.0    1.0    1.0       2.0  ...    

In [93]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
total_X, total_Y = df_2020[[x for x in feature_list if x != 'TP_IDADE']].values, df_2020['CLASSI_FIN'].values
total_X, total_Y = shuffle(total_X, total_Y)
total_X = MinMaxScaler().fit_transform(total_X)
print(total_X)

[[0.61538462 1.         0.24264706 ... 0.         0.         1.        ]
 [0.69230769 1.         0.57352941 ... 0.125      1.         0.        ]
 [0.69230769 1.         0.15441176 ... 0.         0.         0.        ]
 ...
 [0.96153846 1.         0.53676471 ... 0.         0.         0.        ]
 [0.96153846 1.         0.26470588 ... 0.         0.         0.        ]
 [0.23076923 1.         0.02941176 ... 0.         0.         0.        ]]


In [94]:
pd.DataFrame(total_X).to_csv('data/x_2020.csv', index=False)

In [61]:
print(len(test_y[test_y==1])) 
len(test_y[test_y==0])

13023


8894