In [1]:
account_id_map = {
    'us-east-1': '785573368785',
    'us-east-2': '007439368137',
    'us-west-1': '710691900526',
    'us-west-2': '301217895009',
    'eu-west-1': '802834080501',
    'eu-west-2': '205493899709',
    'eu-west-3': '254080097072',
    'eu-north-1': '601324751636',
    'eu-south-1': '966458181534',
    'eu-central-1': '746233611703',
    'ap-east-1': '110948597952',
    'ap-south-1': '763008648453',
    'ap-northeast-1': '941853720454',
    'ap-northeast-2': '151534178276',
    'ap-southeast-1': '324986816169',
    'ap-southeast-2': '355873309152',
    'cn-northwest-1': '474822919863',
    'cn-north-1': '472730292857',
    'sa-east-1': '756306329178',
    'ca-central-1': '464438896020',
    'me-south-1': '836785723513',
    'af-south-1': '774647643957'
}

In [3]:
# need torch 1.3.1 for elastic inference
!python -m pip install --upgrade pip
!pip install torch --quiet
!pip install transformers --quiet

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com, https://pypi.ngc.nvidia.com


In [4]:
import os
import numpy as np
import pandas as pd
import sagemaker

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
model_prefix = "sagemaker/nlp-data-drift-bert-model"

role = sagemaker.get_execution_role()

In [5]:
!mkdir -p nlp_drift

In [6]:
if not os.path.exists("./nlp_drift/cola_public_1.1.zip"):
    !curl -o ./nlp_drift/cola_public_1.1.zip https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
if not os.path.exists("./nlp_drift/cola_public/"):
    !cd nlp_drift && unzip cola_public_1.1.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  249k  100  249k    0     0  7473k      0 --:--:-- --:--:-- --:--:-- 7555k
Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  


In [7]:
df = pd.read_csv(
    "./nlp_drift/cola_public/raw/in_domain_train.tsv",
    sep="\t",
    header=None,
    usecols=[1, 3],
    names=["label", "sentence"],
)
sentences = df.sentence.values
labels = df.label.values

In [8]:
df.head()

Unnamed: 0,label,sentence
0,1,"Our friends won't buy this analysis, let alone..."
1,1,One more pseudo generalization and I'm giving up.
2,1,One more pseudo generalization or I'm giving up.
3,1,"The more we study verbs, the crazier they get."
4,1,Day by day the facts are getting murkier.


In [9]:
print(sentences[20:25])
print(labels[20:25])

['The professor talked us.' 'We yelled ourselves hoarse.'
 'We yelled ourselves.' 'We yelled Harry hoarse.'
 'Harry coughed himself into a fit.']
[0 1 0 0 1]


In [10]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df)
train.to_csv("./nlp_drift/cola_public/train.csv", index=False)
test.to_csv("./nlp_drift/cola_public/test.csv", index=False)

In [12]:
inputs_train = sagemaker_session.upload_data("./nlp_drift/cola_public/train.csv", bucket=bucket, key_prefix=model_prefix)
inputs_test = sagemaker_session.upload_data("./nlp_drift/cola_public/test.csv", bucket=bucket, key_prefix=model_prefix)


In [14]:
!mkdir -p nlp_drift/code

In [24]:
%%writefile nlp_drift/code/requirements.txt
tqdm
requests==2.22.0
regex
sentencepiece
sacremoses
transformers==2.3.0

Overwriting nlp_drift/code/requirements.txt


In [16]:
%%writefile nlp_drift/code/train_deploy.py
import argparse
import json
import logging
import os
import sys

import numpy as np
import pandas as pd
import torch
import transformers
import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import AdamW, BertForSequenceClassification, BertTokenizer

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

MAX_LEN = 64  # this is the max length of the sentence

print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def _get_train_data_loader(batch_size, training_dir, is_distributed):
    logger.info("Get train data loader")

    dataset = pd.read_csv(os.path.join(training_dir, "train.csv"))
    sentences = dataset.sentence.values
    labels = dataset.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids.append(encoded_sent)

    # pad shorter sentences
    input_ids_padded = []
    for i in input_ids:
        while len(i) < MAX_LEN:
            i.append(0)
        input_ids_padded.append(i)
    input_ids = input_ids_padded

    # mask; 0: added, 1: otherwise
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)

    # convert to PyTorch data types.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(labels)
    train_masks = torch.tensor(attention_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    if is_distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    else:
        train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return train_dataloader


def _get_test_data_loader(test_batch_size, training_dir):
    dataset = pd.read_csv(os.path.join(training_dir, "test.csv"))
    sentences = dataset.sentence.values
    labels = dataset.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        input_ids.append(encoded_sent)

    # pad shorter sentences
    input_ids_padded = []
    for i in input_ids:
        while len(i) < MAX_LEN:
            i.append(0)
        input_ids_padded.append(i)
    input_ids = input_ids_padded

    # mask; 0: added, 1: otherwise
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)

    # convert to PyTorch data types.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(labels)
    train_masks = torch.tensor(attention_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=test_batch_size)

    return train_dataloader


def train(args):
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("NLP_DRIFT:Distributed training - %s", is_distributed)
    use_cuda = args.num_gpus > 0
    logger.debug("NLP_DRIFT:Number of gpus available - %d", args.num_gpus)
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ["WORLD_SIZE"] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        os.environ["RANK"] = str(host_rank)
        dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
        logger.info(
            "NLP_DRIFT:Initialized the distributed environment: '%s' backend on %d nodes. "
            "NLP_DRIFT:Current host rank is %d. Number of gpus: %d",
            args.backend, dist.get_world_size(),
            dist.get_rank(), args.num_gpus
        )

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed)
    test_loader = _get_test_data_loader(args.test_batch_size, args.test)

    logger.debug(
        "NLP_DRIFT:Processes {}/{} ({:.0f}%) of train data".format(
            len(train_loader.sampler),
            len(train_loader.dataset),
            100.0 * len(train_loader.sampler) / len(train_loader.dataset),
        )
    )

    logger.debug(
        "NLP_DRIFT:Processes {}/{} ({:.0f}%) of test data".format(
            len(test_loader.sampler),
            len(test_loader.dataset),
            100.0 * len(test_loader.sampler) / len(test_loader.dataset),
        )
    )

    logger.info("NLP_DRIFT:Starting BertForSequenceClassification\n")
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=args.num_labels,  # The number of output labels--2 for binary classification.
        output_attentions=False,  # Whether the model returns attentions weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
    )

    model = model.to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # single-machine multi-gpu case or single-machine or multi-machine cpu case
        model = torch.nn.DataParallel(model)
    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8,  # args.adam_epsilon - default is 1e-8.
    )

    logger.info("NLP_DRIFT:End of defining BertForSequenceClassification\n")
    for epoch in range(1, args.epochs + 1):
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_loader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]

            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            if step % args.log_interval == 0:
                logger.info(
                    "NLP_DRIFT:Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}".format(
                        epoch,
                        step * len(batch[0]),
                        len(train_loader.sampler),
                        100.0 * step / len(train_loader),
                        loss.item(),
                    )
                )

        logger.info("NLP_DRIFT:Average training loss: %f\n", total_loss / len(train_loader))

        test(model, test_loader, device)

    logger.info("NLP_DRIFT:Saving tuned model.")
    model_2_save = model.module if hasattr(model, "module") else model
    model_2_save.save_pretrained(save_directory=args.model_dir)


def test(model, test_loader, device):
    model.eval()
    _, eval_accuracy = 0, 0

    with torch.no_grad():
        for batch in test_loader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to("cpu").numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy

    logger.info("NLP_DRIFT:Test set: Accuracy: %f\n", tmp_eval_accuracy)


def model_fn(model_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("NLP_DRIFT:================ objects in model_dir ===================")
    print(os.listdir(model_dir))
    model = BertForSequenceClassification.from_pretrained(model_dir)
    print("NLP_DRIFT:================ model loaded ===========================")
    return model.to(device)




def input_fn(request_body, request_content_type):
    """An input_fn that loads a pickled tensor"""
    if request_content_type == "application/json":
        data = json.loads(request_body)
        print("NLP_DRIFT:================ input sentences ===============")
        print(data)
        
        if isinstance(data, str):
            data = [data]
        elif isinstance(data, list) and len(data) > 0 and isinstance(data[0], str):
            pass
        else:
            raise ValueError("Unsupported input type. Input type can be a string or an non-empty list. \
                             I got {}".format(data))
                       
        #encoded = [tokenizer.encode(x, add_special_tokens=True) for x in data]
        #encoded = tokenizer(data, add_special_tokens=True) 
        
        # for backward compatibility use the following way to encode 
        # https://github.com/huggingface/transformers/issues/5580
        input_ids = [tokenizer.encode(x, add_special_tokens=True) for x in data]
        
        print("NLP_DRIFT:================ encoded sentences ==============")
        print(input_ids)

        # pad shorter sentence
        padded =  torch.zeros(len(input_ids), MAX_LEN) 
        for i, p in enumerate(input_ids):
            padded[i, :len(p)] = torch.tensor(p)
     
        # create mask
        mask = (padded != 0)
        
        print("NLP_DRIFT:================= padded input and attention mask ================")
        print(padded, '\n', mask)

        return padded.long(), mask.long()
    raise ValueError("Unsupported content type: {}".format(request_content_type))
    

def predict_fn(input_data, model):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    input_id, input_mask = input_data
    input_id = input_id.to(device)
    input_mask = input_mask.to(device)
    print("NLP_DRIFT:============== encoded data =================")
    print(input_id, input_mask)
    with torch.no_grad():
        y = model(input_id, attention_mask=input_mask)[0]
        print("NLP_DRIFT:=============== inference result =================")
        print(y)
    return y

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Data and model checkpoints directories
    parser.add_argument(
        "--num_labels", type=int, default=2, metavar="N", help="input batch size for training (default: 64)"
    )

    parser.add_argument(
        "--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)"
    )
    parser.add_argument(
        "--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)"
    )
    parser.add_argument("--epochs", type=int, default=2, metavar="N", help="number of epochs to train (default: 10)")
    parser.add_argument("--lr", type=float, default=0.01, metavar="LR", help="learning rate (default: 0.01)")
    parser.add_argument("--momentum", type=float, default=0.5, metavar="M", help="SGD momentum (default: 0.5)")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=50,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default=None,
        help="backend for distributed training (tcp, gloo on cpu and gloo, nccl on gpu)",
    )

    # Container environment
    parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
    parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--data-dir", type=str, default=os.environ["SM_CHANNEL_TRAINING"])
    parser.add_argument("--test", type=str, default=os.environ["SM_CHANNEL_TESTING"])
    parser.add_argument("--num-gpus", type=int, default=os.environ["SM_NUM_GPUS"])

    train(parser.parse_args())


Overwriting nlp_drift/code/train_deploy.py


In [25]:
from sagemaker.pytorch import PyTorch

# place to save model artifact
output_path = f"s3://{bucket}/{model_prefix}"

estimator = PyTorch(
    entry_point="train_deploy.py",
    source_dir="nlp_drift/code",
    role=role,
    framework_version="1.3.1",
    py_version="py3",
    instance_count=1,  # this script only support distributed training for GPU instances.
    instance_type="ml.p3.2xlarge",
    output_path=output_path,
    hyperparameters={
        "epochs": 1,
        "num_labels": 2,
        "backend": "gloo",
    },
    disable_profiler=True, # disable debugger
)
estimator.fit({"training": inputs_train, "testing": inputs_test})

2022-09-18 17:44:47 Starting - Starting the training job......
2022-09-18 17:45:25 Starting - Preparing the instances for training......
2022-09-18 17:46:42 Downloading - Downloading input data...
2022-09-18 17:47:17 Training - Downloading the training image............
2022-09-18 17:49:03 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-09-18 17:49:06,792 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-09-18 17:49:06,829 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-09-18 17:49:06,830 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-09-18 17:49:07,254 sagemaker-containers INFO     Module default_user_module_name does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m202

In [26]:
from sagemaker.model_monitor import DataCaptureConfig

#s3_capture_upload_path = f's3://{sagemaker_session.default_bucket()}/{s3_prefix}/endpoint/data_capture'
prefix = "sagemaker/CustomModelMonitor"
data_capture_prefix = "{}/datacapture".format(prefix)
s3_capture_upload_path = "s3://{}/{}".format(bucket, data_capture_prefix)

print(s3_capture_upload_path)

data_capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,
    destination_s3_uri=s3_capture_upload_path
)
endpoint_name='nlp-data-drift-bert-endpoint'

s3://sagemaker-us-east-1-622343165275/sagemaker/CustomModelMonitor/datacapture


In [27]:
endpoint_name='nlp-data-drift-bert-endpoint'
predictor = estimator.deploy(endpoint_name=endpoint_name,
                             initial_instance_count=1, 
                             instance_type="ml.m4.xlarge",
                             data_capture_config=data_capture_config)
print(endpoint_name)
print(predictor)

-----------------!nlp-data-drift-bert-endpoint
<sagemaker.pytorch.model.PyTorchPredictor object at 0x7f223e93f910>


In [28]:
endpoint_name = predictor.endpoint_name
print(endpoint_name)

nlp-data-drift-bert-endpoint


In [29]:
predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

In [30]:
# batch inference 

print("Sending test traffic to the endpoint {}. \nPlease wait...".format(endpoint_name))

result = predictor.predict([
    "CLI to download the zip file", 
    "Thanks so much for driving me home",
    "construct the sub-embeddings and corresponding baselines",
    "our Bert model and interpret what the model",
    "Bert models using Captum library",
    "case study we focus on a fine-tuned Question Answering model on SQUAD datase",
    "we pretrain the model, we can load ",
    "need to define baselines / references, nu",
    "defines numericalized special tokens ",
    "Thanks so much for cooking dinner. I really appreciate it",
    "let's define the ground truth for prediction's start and en",
    "pre-computation of embeddings for the second option is necessary because",
    "to summarize attributions for each word token in the sequence.",
    "Nice to meet you, Sergio. So, where are you from"
])

print("Done!")

Sending test traffic to the endpoint nlp-data-drift-bert-endpoint. 
Please wait...
Done!


In [None]:
print("Predicted class: ", np.argmax(result, axis=1))

#### View Captured Data

In [33]:
#Note: It takes a few minutes for the capture data to appear in S3

import boto3

s3_client = boto3.Session().client('s3')

current_endpoint_capture_prefix = "{}/{}".format(data_capture_prefix, endpoint_name)
result = s3_client.list_objects(Bucket=bucket, Prefix=current_endpoint_capture_prefix)
capture_files = [capture_file.get("Key") for capture_file in result.get("Contents")]
print("Found Capture Files:")
print("\n ".join(capture_files))

Found Capture Files:
sagemaker/CustomModelMonitor/datacapture/nlp-data-drift-bert-endpoint/AllTraffic/2022/09/18/18/05-50-842-b30c8534-a9ba-4981-a29e-24b270114782.jsonl


In [34]:
def get_obj_body(obj_key):
    return s3_client.get_object(Bucket=bucket, Key=obj_key).get('Body').read().decode("utf-8")

capture_file = get_obj_body(capture_files[-1])
print(capture_file[:2000])

{"captureData":{"endpointInput":{"observedContentType":"application/json","mode":"INPUT","data":"[\"CLI to download the zip file\", \"Thanks so much for driving me home\", \"construct the sub-embeddings and corresponding baselines\", \"our Bert model and interpret what the model\", \"Bert models using Captum library\", \"case study we focus on a fine-tuned Question Answering model on SQUAD datase\", \"we pretrain the model, we can load \", \"need to define baselines / references, nu\", \"defines numericalized special tokens \", \"Thanks so much for cooking dinner. I really appreciate it\", \"let's define the ground truth for prediction's start and en\", \"pre-computation of embeddings for the second option is necessary because\", \"to summarize attributions for each word token in the sequence.\", \"Nice to meet you, Sergio. So, where are you from\"]","encoding":"JSON"},"endpointOutput":{"observedContentType":"application/json","mode":"OUTPUT","data":"[[0.6717401742935181, -0.8138699531

In [None]:
import json

print(json.dumps(json.loads(capture_file.split('\n')[0]), indent=2))

In [104]:
sentences

array(["Our friends won't buy this analysis, let alone the next one we propose.",
       "One more pseudo generalization and I'm giving up.",
       "One more pseudo generalization or I'm giving up.", ...,
       'It is easy to slay the Gorgon.',
       'I had the strangest feeling that I knew you.',
       'What all did you get for Christmas?'], dtype=object)

#### Create the base line

In [35]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
sentence_embeddings = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    with torch.no_grad():
        outputs = model(encoded_dict['input_ids'], encoded_dict['attention_mask'])
        hidden_states = outputs[2]
        token_vecs = hidden_states[-2][0]
        sentence_embedding = torch.mean(token_vecs, dim=0)
        sentence_embeddings.append(sentence_embedding)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

In [36]:
sentence_embeddings_list = []

for i in sentence_embeddings:
    sentence_embeddings_list.append(i.numpy())

#### Save sentence as a npy file

In [37]:
np.save('nlp_drift/embeddings.npy', sentence_embeddings_list)

In [38]:
sentence_embeddings_list[0]

array([ 7.41459668e-01, -9.18637812e-02,  5.19062281e-01,  4.00024682e-01,
        8.66519213e-02, -3.25883031e-01, -1.24016702e-01,  7.44163468e-02,
        7.17647374e-01, -2.21915811e-01,  1.27307832e-01,  2.71159589e-01,
       -2.26745367e-01,  3.91900063e-01, -1.14975072e-01, -1.54685378e-01,
        5.66386878e-01,  1.60334148e-02,  5.11665940e-02, -8.28866363e-02,
       -2.42578909e-02, -5.17321490e-02, -4.64935780e-01, -4.92869020e-02,
        3.78277272e-01, -2.42931932e-01,  1.66612789e-02,  4.27489057e-02,
       -4.87603247e-01,  3.37470293e-01,  1.70969591e-01, -4.77717727e-01,
       -3.13293129e-01, -5.75514138e-03, -1.34845823e-01,  2.92395443e-01,
       -2.04022259e-01,  4.02555794e-01, -5.67179203e-01,  1.42273039e-01,
       -3.88651490e-01, -3.48770648e-01,  1.54632971e-01, -4.36050072e-02,
       -4.35701072e-01, -2.95430809e-01,  3.90959084e-01,  2.03553438e-01,
       -1.27090558e-01, -5.96862435e-01, -1.63994581e-01,  2.14198038e-01,
       -1.01401545e-02,  

In [39]:
!aws s3 cp nlp_drift/embeddings.npy s3://{bucket}/{model_prefix}/embeddings/

upload: nlp_drift/embeddings.npy to s3://sagemaker-us-east-1-622343165275/sagemaker/nlp-data-drift-bert-model/embeddings/embeddings.npy


#### Dockerfile create

In [75]:
%%writefile docker-nlp/Dockerfile
FROM python:3.7-slim-buster

RUN pip3 install sagemaker
RUN pip3 install scipy
RUN pip3 install transformers
RUN pip3 install torch
RUN pip3 install s3fs

ENV PYTHONUNBUFFERED=TRUE

ADD evaluation.py /

ENTRYPOINT ["python3", "/evaluation.py"]


Overwriting docker-nlp/Dockerfile


In [76]:
%%writefile nlp_drift/code/evaluate.py
"""Custom Model Monitoring script for Detecting Data Drift in NLP using SageMaker Model Monitor
"""

# Python Built-Ins:
from collections import defaultdict
import datetime
import json
import os
import traceback
from types import SimpleNamespace

# External Dependencies:
import numpy as np
import boto3
from scipy.spatial.distance import cosine
from transformers import BertTokenizer, BertModel
import torch


def get_environment():
    """Load configuration variables for SM Model Monitoring job

    See https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-byoc-contract-inputs.html
    """
    print(f"nlp-drift::get_environment()::")
    try:
        with open("/opt/ml/config/processingjobconfig.json", "r") as conffile:
            defaults = json.loads(conffile.read())["Environment"]
    except Exception as e:
        traceback.print_exc()
        print("Unable to read environment vars from SM processing config file")
        defaults = {}

    return SimpleNamespace(
        dataset_format=os.environ.get("dataset_format", defaults.get("dataset_format")),
        dataset_source=os.environ.get(
            "dataset_source",
            defaults.get("dataset_source", "/opt/ml/processing/input/endpoint"),
        ),
        end_time=os.environ.get("end_time", defaults.get("end_time")),
        output_path=os.environ.get(
            "output_path",
            defaults.get("output_path", "/opt/ml/processing/resultdata"),
        ),
        publish_cloudwatch_metrics=os.environ.get(
            "publish_cloudwatch_metrics",
            defaults.get("publish_cloudwatch_metrics", "Enabled"),
        ),
        sagemaker_endpoint_name=os.environ.get(
            "sagemaker_endpoint_name",
            defaults.get("sagemaker_endpoint_name"),
        ),
        sagemaker_monitoring_schedule_name=os.environ.get(
            "sagemaker_monitoring_schedule_name",
            defaults.get("sagemaker_monitoring_schedule_name"),
        ),
        start_time=os.environ.get(
            "start_time", 
            defaults.get("start_time")),
        max_ratio_threshold=float(os.environ.get(
            "THRESHOLD", 
             defaults.get("THRESHOLD", "nan"))),
        bucket=os.environ.get(
            "bucket",
            defaults.get("bucket", "None")),
    )


def download_embeddings_file():
    
    env = get_environment()
    print(f"nlp-drift::Starting s3fs: download")
    
    from s3fs.core import S3FileSystem
    s3 = S3FileSystem()
    
    key = 'sagemaker/nlp-data-drift-bert-model/embeddings/embeddings.npy'
    bucket = env.bucket
    print(f"nlp-drift::S3 bucket name is={bucket}")

    return np.load(s3.open('{}/{}'.format(bucket, key)))
    
if __name__=="__main__":

    env = get_environment()
    print(f"nlp-drift::Starting evaluation with config\n{env}")

    print(f"nlp-drift::Downloading Embedding File")
    
    #download BERT embedding file used for fine-tuning BertForSequenceClassification
    embedding_list = download_embeddings_file()
    
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Load pre-trained model (weights)
    model = BertModel.from_pretrained('bert-base-uncased',
                                      output_hidden_states = True, # Whether the model returns all hidden-states.
                                      )

    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    
    # Tokenize all of the sentences and map the tokens to thier word IDs.
    sent_cosine_dict = {}
    violations = []
    
    total_record_count = 0  # Including error predictions that we can't read the response for
    error_record_count = 0
    counts = defaultdict(int)  # dict defaulting to 0 when unseen keys are requested
    print(f"nlp-drift::counts={counts}::")
    
    for path, directories, filenames in os.walk(env.dataset_source):
        for filename in filter(lambda f: f.lower().endswith(".jsonl"), filenames):
            print(f"nlp-drift::starting:DRIFT:Analysis:filename={filename}:")
            
            with open(os.path.join(path, filename), "r") as file:
                for entry in file:
                    total_record_count += 1
                    try:
                        response = json.loads(json.loads(entry)["captureData"]["endpointInput"]["data"])
                    except:
                        continue
                
                    for record in response:
                        encoded_dict = tokenizer.encode_plus(
                            record, 
                            add_special_tokens = True,
                            max_length = 64,
                            padding= True,
                            return_attention_mask = True,
                            return_tensors = 'pt',
                            truncation=True,
                            )

                        with torch.no_grad():
                            outputs = model(encoded_dict['input_ids'], encoded_dict['attention_mask'])
                            hidden_states = outputs[2]
                            token_vecs = hidden_states[-2][0]
                            input_sentence_embedding = torch.mean(token_vecs, dim=0)
                        
                        cosine_score = 0
                        
                        for embed_item in embedding_list:
                            cosine_score += (1 - cosine(input_sentence_embedding, embed_item))
                            print(f"nlp-drift::cos:={cosine_score}")
                            
                        cosine_score_avg = cosine_score/(len(embedding_list))
                        if cosine_score_avg < env.max_ratio_threshold:
                            error_record_count += 1
                            sent_cosine_dict[record] = cosine_score_avg
                            violations.append({
                                    "sentence": record,
                                    "avg_cosine_score": cosine_score_avg,
                                    "feature_name": "sent_cosine_score",
                                    "constraint_check_type": "baseline_drift_check",
                                    "endpoint_name" : env.sagemaker_endpoint_name,
                                    "monitoring_schedule_name": env.sagemaker_monitoring_schedule_name
                                })
        
    print(f"nlp-drift::Checking for constraint violations...")
    print(f"nlp-drift::Violations: {violations if len(violations) else 'None'}")

    print(f"nlp-drift::Writing violations file...")
    with open(os.path.join(env.output_path, "constraints_violations.json"), "w") as outfile:
        outfile.write(json.dumps(
            { "violations": violations },
            indent=4,
        ))
    
    print(f"nlp-drift::Writing overall status output...")
    with open("/opt/ml/output/message", "w") as outfile:
        if len(violations):
            msg = ''
            for v in violations:
                msg += f"CompletedWithViolations: {v['sentence']}"
                msg +="\n"
        else:
            msg = "Completed: Job completed successfully with no violations."
        outfile.write(msg)
        print(msg)

    if True:
    #if env.publish_cloudwatch_metrics:
        print(f"nlp-drift::Writing CloudWatch metrics...")
        with open("/opt/ml/output/metrics/cloudwatch/cloudwatch_metrics.jsonl", "a+") as outfile:
            # One metric per line (JSONLines list of dictionaries)
            # Remember these metrics are aggregated in graphs, so we report them as statistics on our dataset
            outfile.write(json.dumps(
            { "violations": violations },
            indent=4,
            ))
    print(f"nlp-drift::Done")

Overwriting nlp_drift/code/evaluate.py


#### Build Custom Container for Monitoring

In [91]:
import boto3

account_id = boto3.client('sts').get_caller_identity().get('Account')
ecr_repository = 'nlp-data-drift-bert-v1'
tag = ':latest'

region = boto3.session.Session().region_name

sm = boto3.client('sagemaker')

uri_suffix = 'amazonaws.com'
if region in ['cn-north-1', 'cn-northwest-1']:
    uri_suffix = 'amazonaws.com.cn'
processing_repository_uri = f'{account_id}.dkr.ecr.{region}.{uri_suffix}/{ecr_repository + tag}'

In [92]:
ecr_repository

'nlp-data-drift-bert-v1'

In [93]:
print(ecr_repository + tag)
print(processing_repository_uri)

nlp-data-drift-bert-v1:latest
622343165275.dkr.ecr.us-east-1.amazonaws.com/nlp-data-drift-bert-v1:latest


In [100]:
%%sh

# The name of our algorithm
algorithm_name=nlp-data-drift-bert-test

cd docker-nlp

echo "Repository name is $algorithm_name"

account=$(aws sts get-caller-identity --query Account --output text)
echo "account got =$account"
# Get the region defined in the current configuration (default to us-east-1 if none defined)
region=$(aws configure get region)
region=${region:-us-east-1}
echo "region got=$region "

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly
$(aws ecr get-login --region ${region} --no-include-email)

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -q -t ${algorithm_name} .

echo "Docker image created repo:name:or:algorithm_name=$algorithm_name fullName= $fullname"
docker tag ${algorithm_name} ${fullname}

echo "Docker push full name image=$fullname"
docker push ${fullname}

Repository name is nlp-data-drift-bert-test
account got =622343165275
region got=us-east-1 
Login Succeeded
sha256:9af1451c3dcc51944a619729e081c6966a38ed7edb5f09c7206768b7f5ce0fe0
Docker image created repo:name:or:algorithm_name=nlp-data-drift-bert-test fullName= 622343165275.dkr.ecr.us-east-1.amazonaws.com/nlp-data-drift-bert-test:latest
Docker push full name image=622343165275.dkr.ecr.us-east-1.amazonaws.com/nlp-data-drift-bert-test:latest
The push refers to repository [622343165275.dkr.ecr.us-east-1.amazonaws.com/nlp-data-drift-bert-test]
d8e3719b5b21: Preparing
6fd29d78e330: Preparing
60d5590dfd0f: Preparing
ea77d2057171: Preparing
fea0a1339bc6: Preparing
bfb7ec3efbd0: Preparing
7aa05e0971fb: Preparing
578ed91ab344: Preparing
eaa28ace589d: Preparing
2012c49cb260: Preparing
926aa14921f2: Preparing
e06e631d87d6: Preparing
bfb7ec3efbd0: Waiting
7aa05e0971fb: Waiting
eaa28ace589d: Waiting
926aa14921f2: Waiting
e06e631d87d6: Waiting
ea77d2057171: Retrying in 5 seconds
60d5590dfd0f: Retr

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

EOF


CalledProcessError: Command 'b'\n# The name of our algorithm\nalgorithm_name=nlp-data-drift-bert-test\n\ncd docker-nlp\n\necho "Repository name is $algorithm_name"\n\naccount=$(aws sts get-caller-identity --query Account --output text)\necho "account got =$account"\n# Get the region defined in the current configuration (default to us-east-1 if none defined)\nregion=$(aws configure get region)\nregion=${region:-us-east-1}\necho "region got=$region "\n\nfullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"\n\n# If the repository doesn\'t exist in ECR, create it.\naws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1\n\nif [ $? -ne 0 ]\nthen\n    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null\nfi\n\n# Get the login command from ECR and execute it directly\n$(aws ecr get-login --region ${region} --no-include-email)\n\n# Build the docker image locally with the image name and then push it to ECR\n# with the full name.\n\ndocker build -q -t ${algorithm_name} .\n\necho "Docker image created repo:name:or:algorithm_name=$algorithm_name fullName= $fullname"\ndocker tag ${algorithm_name} ${fullname}\n\necho "Docker push full name image=$fullname"\ndocker push ${fullname}\n'' returned non-zero exit status 1.

#### Now create the monitoring schedule

In [81]:
from sagemaker.model_monitor import ModelMonitor

monitor = ModelMonitor(
    base_job_name='nlp-data-drift-bert-v1',
    role=role,
    image_uri=processing_repository_uri,
    instance_count=1,
    instance_type='ml.m5.large',
    env={ 'THRESHOLD':'0.5', 'bucket': bucket },
)

In [None]:
from sagemaker.model_monitor import CronExpressionGenerator, MonitoringOutput
from sagemaker.processing import ProcessingInput, ProcessingOutput

destination = f's3://{sagemaker_session.default_bucket()}/{prefix}/{endpoint_name}/monitoring_schedule'

processing_output = ProcessingOutput(
    output_name='result',
    source='/opt/ml/processing/resultdata',
    destination=destination,
)
output = MonitoringOutput(source=processing_output.source, destination=processing_output.destination)

monitor.create_monitoring_schedule(
    monitor_schedule_name='nlp-data-drift-bert-schedule',
    output=output,
    endpoint_input=predictor.endpoint_name,
    schedule_cron_expression=CronExpressionGenerator.hourly(),
)

In [None]:
monitor.describe_schedule()

In [None]:
jobs = monitor.list_executions()
jobs

In [None]:
if len(jobs) > 0:
    last_execution_desc = monitor.list_executions()[-1].describe()
    print(last_execution_desc)
    print(f'\nExit Message: {last_execution_desc.get("ExitMessage", "None")}')
else:
    print("""No processing job has been executed yet. 
    This means that one hour has not passed yet. 
    You can go to the next code cell and run the processing job manually""")

In [83]:
processing_repository_uri

'622343165275.dkr.ecr.us-east-1.amazonaws.com/nlp-data-drift-bert-v1:latest'

In [87]:
#image_uri_processing = sagemaker.image_uris.retrieve('processing_repository_uri', 'us-east-1')
container = "{}.dkr.ecr.{}.amazonaws.com/nlp-data-drift-bert-v1:latest".format(
    account_id, region
)
container

'622343165275.dkr.ecr.us-east-1.amazonaws.com/nlp-data-drift-bert-v1:latest'

In [101]:
container = "{}.dkr.ecr.{}.amazonaws.com/nlp-data-drift-bert-test:latest".format(
    account_id, region
)
container

'622343165275.dkr.ecr.us-east-1.amazonaws.com/nlp-data-drift-bert-test:latest'

### MANUALLY run the PROCESSING JOB

In [None]:
from sagemaker.processing import Processor
from sagemaker.model_monitor import CronExpressionGenerator, MonitoringOutput
from sagemaker.processing import ProcessingInput, ProcessingOutput

destination = f's3://{sagemaker_session.default_bucket()}/{prefix}/{endpoint_name}/monitoring_schedule'
processor = Processor(
    base_job_name='nlp-data-drift-bert-v1',
    role=role,
    image_uri=container, #processing_repository_uri,
    instance_count=1,
    instance_type='ml.m5.large',
    env={ 'THRESHOLD':'0.5','bucket': bucket },
)
    
processor.run(
    [ProcessingInput(
        input_name='endpointdata',
        source = "s3://{}/{}/{}".format(bucket, data_capture_prefix,endpoint_name),
        #source=f's3://{sagemaker_session.default_bucket()}/{s3_prefix}/endpoint/data_capture',
        destination = '/opt/ml/processing/input/endpoint',
    )],
    [ProcessingOutput(
        output_name='result',
        source='/opt/ml/processing/resultdata',
        destination=destination,
    )],
)


Job Name:  nlp-data-drift-bert-v1-2022-09-19-06-36-20-256
Inputs:  [{'InputName': 'endpointdata', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-622343165275/sagemaker/CustomModelMonitor/datacapture/nlp-data-drift-bert-endpoint', 'LocalPath': '/opt/ml/processing/input/endpoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-622343165275/sagemaker/CustomModelMonitor/nlp-data-drift-bert-endpoint/monitoring_schedule', 'LocalPath': '/opt/ml/processing/resultdata', 'S3UploadMode': 'EndOfJob'}}]
..........

#### Clean up Optional

In [None]:
#Delete the monitoring schedule
monitor.delete_monitoring_schedule()

In [None]:
#Delete endpoint
sm.delete_endpoint(EndpointName=endpoint_name)