In [1]:
import os
import numpy as np
import pandas as pd
import sagemaker
import argparse
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# SageMaker session; used below to resolve the default bucket and to upload data.
sagemaker_session = sagemaker.Session()

# Execution role attached to this notebook, and the session's default S3 bucket.
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

In [2]:
! pip install --upgrade datasets

Collecting datasets
  Downloading datasets-1.5.0-py3-none-any.whl (192 kB)
[K     |████████████████████████████████| 192 kB 17.4 MB/s eta 0:00:01
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 1.3.0
    Uninstalling datasets-1.3.0:
      Successfully uninstalled datasets-1.3.0
Successfully installed datasets-1.5.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


# Download Adverse Drug Reaction data from HuggingFace 

https://huggingface.co/datasets/ade_corpus_v2 

In [11]:
# Load the classification configuration of the ADE corpus.
dataset = load_dataset("ade_corpus_v2", "Ade_corpus_v2_classification")
train_split = dataset['train']
df_context = train_split['text']
df_label = train_split['label']

# Build the frame directly under the column names used by the rest of the notebook.
df = pd.DataFrame({'CONTENT': df_context, 'WORKFLOW_CLASSIFICATION': df_label})

# Replace the integer labels with class names:
# 1 -> 'Adverse Event (AE)', anything else -> 'non ae'.
is_adverse_event = df['WORKFLOW_CLASSIFICATION'] == 1
df['WORKFLOW_CLASSIFICATION'] = np.where(is_adverse_event, 'Adverse Event (AE)', 'non ae')

# Create train and validation datasets (80/20, stratified on the class name,
# fixed seed so the split is repeatable).
train, valid = train_test_split(
    df,
    test_size=0.20,
    shuffle=True,
    random_state=2678,
    stratify=df[['WORKFLOW_CLASSIFICATION']],
)


Reusing dataset ade_corpus_v2 (/home/ec2-user/.cache/huggingface/datasets/ade_corpus_v2/Ade_corpus_v2_classification/1.0.0/df238bf01b826b881a7cfc4a778a953409c9c4075eb3e4274e7a83f74379ab32)


In [12]:
# Save train and valid datasets.
# The target folder may not exist yet (e.g. on a fresh checkout); create it first so
# that to_csv does not fail. exist_ok=True keeps the cell safe to re-run.
os.makedirs("./data", exist_ok=True)

# NOTE: the DataFrame index is written as well (to_csv default), so it shows up as an
# unnamed leading column when these files are read back with pd.read_csv.
train.to_csv("./data/train.csv")
valid.to_csv("./data/valid.csv")


# Process raw data and upload it to S3 for model training

In [13]:
def load_data(data_dir):
    """Read a labelled CSV file and append an integer ``label`` column.

    Args:
        data_dir: Path of the CSV file to read (handed straight to
            ``pd.read_csv``). The file must contain a
            ``WORKFLOW_CLASSIFICATION`` column.

    Returns:
        The loaded DataFrame with an additional ``label`` column holding 1 for
        'Adverse Event (AE)' and 0 for 'non ae'.

    Raises:
        KeyError: If a ``WORKFLOW_CLASSIFICATION`` value is not one of the two
            known class names.
    """
    class_to_id = {'Adverse Event (AE)': 1, 'non ae': 0}

    frame = pd.read_csv(data_dir)
    class_names = frame["WORKFLOW_CLASSIFICATION"]
    # Look each name up through the dict's __getitem__ (not by passing the dict
    # itself to map) so an unexpected class name raises instead of becoming NaN.
    frame["label"] = class_names.map(class_to_id.__getitem__)
    return frame

In [16]:
# Re-load the saved splits; load_data adds the numeric `label` column.
df_train = load_data("./data/train.csv")
df_valid = load_data("./data/valid.csv")

local_data_dir = "./data/data_model"
# makedirs(exist_ok=True) instead of mkdir: re-running this cell must not fail with
# FileExistsError once the folder exists, and missing parent folders are created too.
os.makedirs(local_data_dir, exist_ok=True)

# Write the model-ready copies; index=False keeps the row index out of these files.
df_train.to_csv(os.path.join(local_data_dir, "train.csv"), index=False)
df_valid.to_csv(os.path.join(local_data_dir, "valid.csv"), index=False)

In [39]:
# Number of rows in the train and validation splits, as a (train, valid) tuple.
tuple(len(split) for split in (df_train, df_valid))

(18812, 4704)

## Upload to S3 for SageMaker model training 

In [4]:
# Local folder (on the SageMaker notebook instance) holding the model-ready CSVs.
data_dir = local_data_dir

# S3 key prefix under which that folder is uploaded.
task_name = 'AE_bert/data'
s3_prefix = 'HF_models/' + task_name

# Upload the folder to S3 and report the resulting location.
upload_kwargs = dict(path=data_dir, bucket=bucket, key_prefix=s3_prefix)
inputs_data = sagemaker_session.upload_data(**upload_kwargs)
print('input spec (in this case, just an S3 path): {}'.format(inputs_data))

input spec (in this case, just an S3 path): s3://sagemaker-us-east-1-649363377072/HF_models/AE_bert/data


# SageMaker model training

In [9]:
from sagemaker.pytorch import PyTorch

In [13]:
# Hyperparameters handed to the training job (passed to the estimator below).
hyperparameters = dict(
    epochs=4,
    train_batch_size=64,
    max_seq_length=128,
    learning_rate=5e-5,
    model_name='distilbert-base-uncased',
    text_column='CONTENT',
    label_column='label',
)

In [14]:
# Amazon SageMaker PyTorch framework estimator.
# Instance type for the training job (the original author also noted 'local' as an option).
train_instance_type = 'ml.p3.2xlarge'

# Collect the estimator settings first, then build the estimator from them.
estimator_config = dict(
    entry_point='hf_train_deploy.py',
    source_dir='src',
    role=role,
    framework_version='1.4.0',
    py_version='py3',
    instance_count=1,
    instance_type=train_instance_type,
    hyperparameters=hyperparameters,
)
bert_estimator = PyTorch(**estimator_config)

In [15]:
# Launch the training job; the uploaded S3 data is passed as the 'training' channel.
training_channels = {'training': inputs_data}
bert_estimator.fit(training_channels)

2021-03-08 14:56:50 Starting - Starting the training job...ProfilerReport-1615215410: InProgress
...
2021-03-08 14:57:49 Starting - Launching requested ML instances.........
2021-03-08 14:59:10 Starting - Preparing the instances for training......
2021-03-08 15:00:20 Downloading - Downloading input data...
2021-03-08 15:00:51 Training - Downloading the training image.........
2021-03-08 15:02:12 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-03-08 15:02:08,237 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-03-08 15:02:08,260 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-03-08 15:02:11,275 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-03-08 15:02:11,592 sagemaker-containers INFO

In [16]:
# Display the S3 URI of the model artifact (model.tar.gz) produced by the training job.
bert_estimator.model_data

's3://sagemaker-us-east-1-649363377072/pytorch-training-2021-03-08-14-56-50-390/output/model.tar.gz'

# SageMaker Endpoint Deploy

In [17]:
import sagemaker
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

In [18]:
# Resolve the execution role again (it is also set in the first cell).
# NOTE(review): presumably repeated so the deployment section can be run without
# re-running the training cells above -- confirm.
role = sagemaker.get_execution_role()

In [20]:
# Inference code folder and the S3 location of the trained model artifact.
src_dir = 'src'
model_data = "s3://sagemaker-us-east-1-649363377072/pytorch-training-2021-03-08-14-56-50-390/output/model.tar.gz"

# Collect the model settings first, then wrap the artifact in a PyTorchModel.
model_config = dict(
    model_data=model_data,
    role=role,
    framework_version="1.4.0",
    source_dir=src_dir,
    py_version="py3",
    entry_point="hf_train_deploy.py",
)
pytorch_model = PyTorchModel(**model_config)

In [21]:
# Endpoint settings: a single ml.m5.large instance, JSON in and JSON out.
deploy_config = dict(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name='HF-BERT-AE-model',
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)
predictor = pytorch_model.deploy(**deploy_config)

---------------!

# Inference: invoke SageMaker Endpoint

In [22]:
import boto3
import json
import time
import numpy as np
import pandas as pd

In [23]:
# boto3 runtime client used to call the endpoint directly.
runtime = boto3.client('runtime.sagemaker')

# Same endpoint name as the one given to deploy() above.
endpoint_name = 'HF-BERT-AE-model'

In [32]:
# Example sentence to classify.
query = 'This entity is probably related to a combination of high doses of corticosteroids, vecuronium administration and metabolic abnormalities associated with respiratory failure.'


# Send the sentence to the endpoint as a JSON-encoded string.
response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                   ContentType='application/json',
                                   Body=json.dumps(query))

# Parse the response with json.loads instead of eval: eval would execute whatever
# expression the service returned, whereas json.loads only decodes data.
# json.loads accepts the raw bytes returned by Body.read().
# NOTE(review): assumes the endpoint answers with JSON (the predictor above was
# configured with JSONDeserializer) -- confirm against the inference script.
prob = json.loads(response['Body'].read())

In [37]:
# Minimum score required to report an adverse event.
threshold = 0.6

# The second element of the response is compared against the threshold.
prd_prob = prob[1]
if prd_prob >= threshold:
    pred_label = "Adverse Event (AE)"
else:
    pred_label = "non ae"

In [38]:
# Display the predicted class name together with the score it was derived from.
pred_label, prd_prob

('Adverse Event (AE)', 0.9993265867233276)