In [None]:
# AWS account ID per region. Further down this mapping is indexed by the
# current region to build the ECR URI of the `sagemaker-tritonserver` image.
account_id_map = {
    'us-east-1': '785573368785',
    'us-east-2': '007439368137',
    'us-west-1': '710691900526',
    'us-west-2': '301217895009',
    'eu-west-1': '802834080501',
    'eu-west-2': '205493899709',
    'eu-west-3': '254080097072',
    'eu-north-1': '601324751636',
    'eu-south-1': '966458181534',
    'eu-central-1': '746233611703',
    'ap-east-1': '110948597952',
    'ap-south-1': '763008648453',
    'ap-northeast-1': '941853720454',
    'ap-northeast-2': '151534178276',
    'ap-southeast-1': '324986816169',
    'ap-southeast-2': '355873309152',
    'cn-northwest-1': '474822919863',
    'cn-north-1': '472730292857',
    'sa-east-1': '756306329178',
    'ca-central-1': '464438896020',
    'me-south-1': '836785723513',
    'af-south-1': '774647643957'
}

In [None]:
# `%pip` installs into the environment of the running kernel; the extra is
# quoted so the shell cannot treat the square brackets as a glob pattern.
%pip install "transformers[torch]"

In [None]:
# `%pip` targets the running kernel's environment (a bare `!pip` may resolve
# to a different interpreter); extras are quoted to avoid shell globbing.
%pip install nvidia-pyindex
%pip install "tritonclient[http]"

%pip install -qU pip awscli boto3 sagemaker transformers


In [None]:
# `-U` already means `--upgrade`, so the flag is given only once.
%pip install -qU pip awscli boto3 sagemaker transformers

In [None]:
import sagemaker
import boto3

# SageMaker session, execution role and default bucket reused by later cells.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
# Region comes from the ambient boto3 configuration.
region = boto3.Session().region_name

# Low-level service clients pinned to that region.
sm_client = boto3.Session().client(service_name="sagemaker", region_name=region)
s3_client = boto3.Session().client(service_name="s3", region_name=region)

In [None]:
import pandas as pd
# Load the raw, tab-separated reviews file; the first row is the header.
df = pd.read_csv(
    filepath_or_buffer='./data/raw/amazon_raw_1000.tsv',
    delimiter="\t",
    header=0,
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # "warn" keeps the old behaviour: skip malformed rows and warn about them.
    on_bad_lines="warn",
)
print(df.columns)

# Keep the column index around so it can be inspected in a later cell.
review_columns = df.columns
df.head()

In [None]:
def cast_object_to_string(data_frame):
    """Convert every `object`-dtype column to the pandas `string` dtype.

    The frame is modified in place and also returned, so the call can be used
    either as a statement or as an expression.

    Args:
        data_frame: the DataFrame whose object columns should be converted.

    Returns:
        The same DataFrame instance that was passed in.
    """
    # Collect the labels first; converting one column never changes the dtype
    # of another, so this selects the same columns as checking inside the loop.
    object_labels = [name for name in data_frame.columns if data_frame.dtypes[name] == "object"]
    for name in object_labels:
        # Go through plain `str` first so mixed Python objects become text.
        as_text = data_frame[name].astype("str")
        data_frame[name] = as_text.astype("string")
    return data_frame

df = cast_object_to_string(df)
df.dtypes

In [None]:

class InputFeatures:
    """Container for the BERT feature vectors of a single review.

    Attributes:
        input_ids: vocabulary ids of the (padded) token sequence.
        input_mask: attention mask aligned with `input_ids`.
        segment_ids: segment ids aligned with `input_ids`.
        label_id: class index of the label.
        review_id: identifier of the source review.
        date: date value carried along with the record.
        label: original label value.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
        # Model inputs.
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        # Target.
        self.label_id = label_id
        # Bookkeeping fields that travel with the record.
        self.review_id = review_id
        self.date = date
        self.label = label
        
def convert_input(tokenizer, the_input, max_seq_length):
    """Verbose conversion of one input into `InputFeatures`.

    Prints the WordPiece tokens and every resulting feature, which makes it
    suitable for inspecting a handful of examples rather than bulk processing.
    Relies on the module-level `label_map` and `InputFeatures`.

    Note: a later cell defines another `convert_input(the_input, max_seq_length)`;
    once that cell has run, it replaces this definition.

    Args:
        tokenizer: Transformers tokenizer providing `tokenize` and `encode_plus`.
        the_input: object exposing `text`, `label`, `review_id` and `date`.
        max_seq_length: length every sequence is padded or truncated to.

    Returns:
        The populated `InputFeatures` instance.
    """
    # The Transformers tokenizer lower-cases (for uncased models), tokenizes
    # and splits into WordPieces. This pass exists only to display the tokens;
    # the model inputs come from `encode_plus` below.
    tokens = ['[CLS]'] + tokenizer.tokenize(the_input.text) + ['[SEP]']
    print("**{} tokens**\n{}\n".format(len(tokens), tokens))

    encoded = tokenizer.encode_plus(
        the_input.text,
        padding='max_length',
        max_length=max_seq_length,
        truncation=True
    )

    features = InputFeatures(
        # Vocabulary ids of the tokens, padded up to `max_seq_length`.
        input_ids=encoded["input_ids"],
        # 1 for real tokens, 0 for padding.
        input_mask=encoded["attention_mask"],
        # Always 0 for single-sequence tasks such as text classification.
        segment_ids=[0] * max_seq_length,
        # Class index for the row's `star_rating`.
        label_id=label_map[the_input.label],
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
    )

    # Dump every field, one block per feature.
    for template, value in (
        ("**input_ids**\n{}\n", features.input_ids),
        ("**input_mask**\n{}\n", features.input_mask),
        ("**segment_ids**\n{}\n", features.segment_ids),
        ("**label_id**\n{}\n", features.label_id),
        ("**review_id**\n{}\n", features.review_id),
        ("**date**\n{}\n", features.date),
        ("**label**\n{}\n", features.label),
    ):
        print(template.format(value))

    return features

# We'll need to transform our data into a format that BERT understands.
# - `text` is the text we want to classify, which in this case is the `review_body` column of our DataFrame.
# - `label` is the star_rating label (1, 2, 3, 4, 5) for our training input data
def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
    """Write `inputs` to a TFRecord file and return the per-row feature records.

    Each input is converted with the module-level
    `convert_input(the_input, max_seq_length)` and serialized as a
    `tf.train.Example` holding `input_ids`, `input_mask`, `segment_ids` and
    `label_ids`.

    Args:
        inputs: sized iterable of input objects exposing `review_id` and `date`
            (plus whatever `convert_input` reads).
        output_file: path of the TFRecord file to create.
        max_seq_length: sequence length forwarded to `convert_input`.

    Returns:
        A list with one dict per input containing `input_ids`, `input_mask`,
        `segment_ids`, `label_id`, `review_id`, `date` and `label`.
    """

    def _int64_feature(values):
        # Wrap a sequence of ints in the protobuf type TFRecords expect.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    records = []
    total = len(inputs)

    # The context manager closes the writer even when a conversion fails
    # part-way through, so the file handle is never leaked.
    with tf.io.TFRecordWriter(output_file) as tf_record_writer:
        for input_idx, the_input in enumerate(inputs):
            if input_idx % 10000 == 0:
                print("Writing input {} of {}\n".format(input_idx, total))

            features = convert_input(the_input, max_seq_length)

            # A plain dict preserves insertion order, so no `collections`
            # import is needed to keep the feature order stable.
            all_features = {
                "input_ids": _int64_feature(features.input_ids),
                "input_mask": _int64_feature(features.input_mask),
                "segment_ids": _int64_feature(features.segment_ids),
                "label_ids": _int64_feature([features.label_id]),
            }

            tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
            tf_record_writer.write(tf_record.SerializeToString())

            # Record with all features, intended for the feature store.
            records.append(
                {
                    "input_ids": features.input_ids,
                    "input_mask": features.input_mask,
                    "segment_ids": features.segment_ids,
                    "label_id": features.label_id,
                    "review_id": the_input.review_id,
                    "date": the_input.date,
                    "label": features.label,
                }
            )

    return records

In [None]:
review_columns

In [None]:
# Imported here because this cell is the first one to use the tokenizer; on a
# fresh kernel no earlier cell has brought `DistilBertTokenizer` into scope.
from transformers import DistilBertTokenizer

print("role {}".format(role))

print("The DEFAULT BUCKET is {}".format(bucket))
#############################


# Tokenizer used by `convert_input` below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Column names of the raw reviews frame.
REVIEW_BODY_COLUMN = "review_body"
REVIEW_ID_COLUMN = "review_id"
DATE_COLUMN = 'review_date'

LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]

# Map each star rating to a zero-based class index (1 -> 0, ..., 5 -> 4).
label_map = {}
for (i, label) in enumerate(LABEL_VALUES):
    label_map[label] = i




def convert_input(the_input, max_seq_length):
    """Convert one input into padded/truncated BERT `InputFeatures`.

    Relies on the module-level `tokenizer`, `label_map` and `InputFeatures`.

    Args:
        the_input: object exposing `text`, `label`, `review_id` and `date`.
        max_seq_length: length every sequence is padded or truncated to.

    Returns:
        The populated `InputFeatures` instance.

    Raises:
        KeyError: if `the_input.label` is not a key of `label_map`.
    """
    # The Transformers tokenizer does all of the BERT preprocessing for us:
    #
    # 1. Lowercase the text (for an uncased model)
    # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
    # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
    # 4. Map words to indexes using the vocab file that BERT provides
    # 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
    # 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))
    #
    # A single `encode_plus` call covers all of it, so no separate
    # `tokenize` pass is made.
    encode_plus_tokens = tokenizer.encode_plus(
        the_input.text,
        padding='max_length',
        max_length=max_seq_length,
        truncation=True
    )

    # The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
    input_ids = encode_plus_tokens["input_ids"]

    # Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.
    input_mask = encode_plus_tokens["attention_mask"]

    # Segment ids are always 0 for single-sequence tasks such as text classification.  1 is used for two-sequence tasks such as question/answer and next sentence prediction.
    segment_ids = [0] * max_seq_length

    # Label for each training row (`star_rating` 1 through 5)
    label_id = label_map[the_input.label]

    features = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
    )

    return features


def list_arg(raw_value):
    """argparse `type=` helper: turn a comma-separated value into a list of strings.

    The value is converted with `str()` first, so non-string inputs are
    accepted as well. An empty string yields `[""]`.
    """
    as_text = str(raw_value)
    parts = as_text.split(",")
    return parts

class Input:
    """One training/test example for sequence classification."""

    def __init__(self, text, review_id, date, label=None):
        """Build an example.

        Args:
          text: string. The untokenized text of the sequence to classify.
          review_id: identifier of the review the text belongs to.
          date: date value stored alongside the example.
          label: (Optional) label of the example. Expected for train and dev
            examples, may be omitted for test examples.
        """
        # What gets classified, and its target.
        self.text = text
        # Metadata carried through unchanged.
        self.review_id = review_id
        self.date = date
        self.label = label


def _transform_tsv_to_tfrecord(df, file, max_seq_length):
    """Split `df` into train/validation/test sets and write each as a TFRecord file.

    The frame is split with stratification on `star_rating`, every row is
    wrapped in an `Input`, and each split is written through
    `transform_inputs_to_tfrecord` into `<args.output_data>/bert/<split>/`.
    Nothing is returned.

    NOTE(review): the body reads a global `args` (split percentages,
    `output_data`, `current_host`) and uses `Path`, `train_test_split` and
    `datetime`; none of these are defined or imported in the visible notebook,
    so they must be provided before this function can run.

    Args:
        df: reviews DataFrame containing the label, body, id and date columns.
        file: path of the source file; only used for logging and to derive the
            output file name.
        max_seq_length: sequence length forwarded to the feature conversion.
    """
    print("file {}".format(file))
    print("max_seq_length {}".format(max_seq_length))


    # Strips up to two suffixes from the file name (e.g. "name.tsv.gz" -> "name").
    filename_without_extension = Path(Path(file).stem).stem


    print("Shape of dataframe {}".format(df.shape))


    print("train split percentage {}".format(args.train_split_percentage))
    print("validation split percentage {}".format(args.validation_split_percentage))
    print("test split percentage {}".format(args.test_split_percentage))

    # Everything that is not training data is held out for validation + test.
    holdout_percentage = 1.00 - args.train_split_percentage
    print("holdout percentage {}".format(holdout_percentage))

    df_train, df_holdout = train_test_split(df, test_size=holdout_percentage, stratify=df["star_rating"])

    # Share of the holdout set that goes to test; the rest becomes validation.
    test_holdout_percentage = args.test_split_percentage / holdout_percentage

    print("test holdout percentage {}".format(test_holdout_percentage))

    df_validation, df_test = train_test_split(
        df_holdout, test_size=test_holdout_percentage, stratify=df_holdout["star_rating"])

    df_train = df_train.reset_index(drop=True)
    df_validation = df_validation.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    print("Shape of train dataframe {}".format(df_train.shape))
    print("Shape of validation dataframe {}".format(df_validation.shape))
    print("Shape of test dataframe {}".format(df_test.shape))

    timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
    print(timestamp)

    # Wrap every row in an `Input`; train and validation take the date from the row.
    train_inputs = df_train.apply(
        lambda x: Input(
            label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=x[DATE_COLUMN]
        ),
        axis=1,
    )

    validation_inputs = df_validation.apply(
        lambda x: Input(
            label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=x[DATE_COLUMN]
        ),
        axis=1,
    )

    # NOTE(review): unlike train/validation, the test inputs use the current
    # `timestamp` as their date instead of the row's date column - confirm
    # whether this difference is intended.
    test_inputs = df_test.apply(
        lambda x: Input(
            label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp
        ),
        axis=1,
    )

    # Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):
    #
    #
    # 1. Lowercase our text (if we're using a BERT lowercase model)
    # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
    # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
    # 4. Map our words to indexes using a vocab file that BERT provides
    # 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
    # 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))
    #
    # We don't have to worry about these details.  The Transformers tokenizer does this for us.
    #
    # Output directories, one per split.
    train_data = "{}/bert/train".format(args.output_data)
    validation_data = "{}/bert/validation".format(args.output_data)
    test_data = "{}/bert/test".format(args.output_data)

    # Convert our train and validation features to InputFeatures (.tfrecord protobuf) that works with BERT and TensorFlow.
    # The file name combines the current host and the source file name.
    train_records = transform_inputs_to_tfrecord(
        train_inputs,
        "{}/part-{}-{}.tfrecord".format(train_data, args.current_host, filename_without_extension),
        max_seq_length,
    )

    validation_records = transform_inputs_to_tfrecord(
        validation_inputs,
        "{}/part-{}-{}.tfrecord".format(validation_data, args.current_host, filename_without_extension),
        max_seq_length,
    )

    test_records = transform_inputs_to_tfrecord(
        test_inputs,
        "{}/part-{}-{}.tfrecord".format(test_data, args.current_host, filename_without_extension),
        max_seq_length,
    )

    # Only the train records are turned into a DataFrame; the frame is neither
    # returned nor stored, and `.head()` inside a function displays nothing.
    df_train_records = pd.DataFrame.from_dict(train_records)
    df_train_records["split_type"] = "train"
    df_train_records.head()

    #df_train_records.

    #df_validation_records = pd.DataFrame.from_dict(validation_records)
    #df_validation_records["split_type"] = "validation"
    #df_validation_records.head()

    #df_test_records = pd.DataFrame.from_dict(test_records)
    #df_test_records["split_type"] = "test"
    #df_test_records.head()

    print('...features ingested!')






**Test how to create a DistilBERT TorchScript model**

In [None]:
from transformers import DistilBertTokenizer
import torch

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Sample review used to inspect the WordPiece tokenization. (An earlier
# placeholder sentence was assigned to `text` and immediately overwritten.)
text = "i needed an antivirus application and know the quality of Norton products. This was a no brainer for me and i am glad it was so simple to get."
tokens = tokenizer.tokenize(text)
print(f"BERT:Tokenized:Text:Tokens={tokens}:::")



In [None]:
MAX_SEQ_LENGTH = 64
# `pad_to_max_length=True` is deprecated; use the same explicit padding and
# truncation arguments as the other `encode_plus` calls in this notebook so
# the sample is always exactly MAX_SEQ_LENGTH ids long.
encode_plus_tokens = tokenizer.encode_plus(
    text,
    padding="max_length",
    max_length=MAX_SEQ_LENGTH,
    truncation=True,
)
encode_plus_tokens

In [None]:
import sagemaker
import boto3

# SageMaker session, execution role and default bucket for the cells below.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
region = boto3.Session().region_name

sm_client = boto3.Session().client(service_name="sagemaker", region_name=region)
s3_client = boto3.Session().client(service_name="s3", region_name=region)

In [None]:
# HTTPS URL of the gzipped reviews TSV.
# NOTE(review): the next cell uploads a local, uncompressed `.tsv` of the same
# name; no download/decompress step for this URL is visible - confirm how the
# local file is expected to be produced.
s3_public_path_tsv="https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_v1_00.tsv.gz"
s3_public_path_tsv

In [None]:
# Upload the raw reviews to the session's default bucket instead of a
# hard-coded, account-specific bucket, so the notebook runs in any account
# or region.
s3_data_path_reviews = sagemaker.s3.S3Uploader().upload(
    local_path="./data/raw/amazon_reviews_us_Video_v1_00.tsv",
    desired_s3_uri=f"s3://{bucket}/bloom/data/amazon_reviews",
    sagemaker_session=sess,
)
s3_data_path_reviews

In [None]:
# Alias under the name the processing-job cell passes as its input source.
raw_input_data_s3_uri = s3_data_path_reviews
raw_input_data_s3_uri

In [None]:
!pygmentize bert-gptj/preprocess-scikit-text-to-bert-feature-store.py

**Test Tokenizers**

In [None]:
import time
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)
from sagemaker.feature_store.feature_group import FeatureGroup

featurestore_runtime = boto3.Session().client(service_name="sagemaker-featurestore-runtime", region_name=region)

# Second-resolution timestamp that makes the names below unique per run.
timestamp = int(time.time())
feature_store_offline_prefix = "reviews-feature-store-" + str(timestamp)
print(feature_store_offline_prefix)

feature_group_name = "reviews-feature-group-" + str(timestamp)
print(feature_group_name)

# One definition per field of the records built during preprocessing.
feature_definitions = [
    FeatureDefinition(feature_name=field_name, feature_type=field_type)
    for field_name, field_type in (
        ("input_ids", FeatureTypeEnum.STRING),
        ("input_mask", FeatureTypeEnum.STRING),
        ("segment_ids", FeatureTypeEnum.STRING),
        ("label_id", FeatureTypeEnum.INTEGRAL),
        ("review_id", FeatureTypeEnum.STRING),
        ("date", FeatureTypeEnum.STRING),
        ("label", FeatureTypeEnum.INTEGRAL),
    )
]

feature_group = FeatureGroup(name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sess)

print(feature_group)


In [None]:
# Settings for the preprocessing job.
processing_instance_type = "ml.c5.2xlarge"
# NOTE(review): the processor below is created with a literal instance count
# of 1, so this value is currently not applied.
processing_instance_count = 2
# Split fractions forwarded to the preprocessing script as CLI arguments.
train_split_percentage = 0.90
validation_split_percentage = 0.05
test_split_percentage = 0.05
balance_dataset = True
# Sequence length forwarded to the preprocessing script.
max_seq_length = 64

### Run the processing job

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Scikit-learn processing container that will run the preprocessing script.
# The trailing comments keep the previously tried values for reference.
processor = SKLearnProcessor(
    framework_version='0.20.0',#"0.23-1",
    role=role,
    instance_type=processing_instance_type,
    instance_count=1, #processing_instance_count,
    # Makes the region available to the script through the environment.
    env={"AWS_DEFAULT_REGION": region},
    # Upper bound on the job's runtime: 7200 s = 2 hours.
    max_runtime_in_seconds=7200,
)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Launch the preprocessing job and block until it finishes, streaming logs.
processor.run(
    code="./bert-gptj/preprocess-scikit-text-to-bert-feature-store.py",
    inputs=[
        ProcessingInput(
            input_name="raw-input-data",
            source=raw_input_data_s3_uri,
            destination="/opt/ml/processing/input/data/",
            s3_data_distribution_type="ShardedByS3Key",
        )
    ],
    # One output channel per split, uploaded when the job ends.
    outputs=[
        ProcessingOutput(output_name=channel_name, s3_upload_mode="EndOfJob", source=container_path)
        for channel_name, container_path in (
            ("bert-train", "/opt/ml/processing/output/bert/train"),
            ("bert-validation", "/opt/ml/processing/output/bert/validation"),
            ("bert-test", "/opt/ml/processing/output/bert/test"),
        )
    ],
    # Flatten (flag, value) pairs into the CLI argument list; every value is
    # passed as a string.
    arguments=[
        token
        for flag, value in (
            ("--train-split-percentage", train_split_percentage),
            ("--validation-split-percentage", validation_split_percentage),
            ("--test-split-percentage", test_split_percentage),
            ("--max-seq-length", max_seq_length),
            ("--balance-dataset", balance_dataset),
            ("--feature-store-offline-prefix", feature_store_offline_prefix),
            ("--feature-group-name", feature_group_name),
        )
        for token in (flag, str(value))
    ],
    logs=True,
    wait=True,
)

In [None]:
# Name of the most recent job started through `processor`.
scikit_processing_job_name = processor.jobs[-1].describe()["ProcessingJobName"]
print(scikit_processing_job_name)

In [None]:
# Re-attach to the processing job by name so it can be inspected and waited on.
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    processing_job_name=scikit_processing_job_name, sagemaker_session=sess
)

# Description as of now, i.e. before waiting for the job below.
processing_job_description = running_processor.describe()

print(processing_job_description)
running_processor.wait(logs=True)

In [None]:
processing_job_description = running_processor.describe()

output_config = processing_job_description["ProcessingOutputConfig"]

# Index the job outputs by name. Looking the three expected outputs up
# explicitly means a missing output raises a KeyError right here instead of
# leaving a variable undefined or silently holding a value from an earlier run.
s3_uri_by_output_name = {
    job_output["OutputName"]: job_output["S3Output"]["S3Uri"] for job_output in output_config["Outputs"]
}

processed_train_data_s3_uri = s3_uri_by_output_name["bert-train"]
processed_validation_data_s3_uri = s3_uri_by_output_name["bert-validation"]
processed_test_data_s3_uri = s3_uri_by_output_name["bert-test"]

print(processed_train_data_s3_uri)
print(processed_validation_data_s3_uri)
print(processed_test_data_s3_uri)

In [None]:
!aws s3 ls $processed_train_data_s3_uri/
!aws s3 ls $processed_validation_data_s3_uri/
!aws s3 ls $processed_test_data_s3_uri/

In [None]:
# The URIs are S3 prefixes, so `--recursive` is required to copy their
# contents. Each split goes into its own directory, matching the paths the
# training cells glob for `*.tfrecord` files.
!aws s3 cp --recursive $processed_train_data_s3_uri/ ./data/processed/bert-train/
!aws s3 cp --recursive $processed_validation_data_s3_uri/ ./data/processed/bert-validation/
!aws s3 cp --recursive $processed_test_data_s3_uri/ ./data/processed/bert-test/


### Train the model locally with TensorFlow/Keras

**Note**: Amazon SageMaker expects the model tarball file to have a top level directory with the same name as the model defined in the `config.pbtxt`. Below is the sample model directory structure

```
bert-uc
├── 1
│   └── model.pt
└── config.pbtxt
```

**Test inputs must be generated with the same tokenizer the model uses (BERT uncased)**

In [None]:
import time
import random
import pandas as pd
from glob import glob
import argparse
import json
import subprocess
import sys
import os
import tensorflow as tf
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import DistilBertConfig

In [None]:
# Length of the `input_ids` / `input_mask` vectors parsed from each TFRecord.
# NOTE(review): presumably must equal the value the TFRecords were written with - confirm.
max_seq_length=64

In [None]:
def select_data_and_label_from_record(record):
    """Split a parsed record into the `(inputs, label)` pair used for training.

    Args:
        record: mapping with at least `input_ids`, `input_mask` and `label_ids`.

    Returns:
        A tuple `(x, y)` where `x` is a dict holding `input_ids` and
        `input_mask`, and `y` is the record's `label_ids`. Any other key of
        the record (e.g. `segment_ids`) is left out.
    """
    model_inputs = {key: record[key] for key in ("input_ids", "input_mask")}
    return model_inputs, record["label_ids"]



In [None]:
def file_based_input_dataset_builder(
    channel, input_filenames, pipe_mode, is_training, drop_remainder, batch_size=8
):
    """Build a batched `tf.data` dataset of parsed BERT TFRecords.

    Relies on the module-level `max_seq_length` for the feature shapes.

    Args:
        channel: SageMaker channel name; only used when `pipe_mode` is true.
        input_filenames: TFRecord file paths; only used when `pipe_mode` is false.
        pipe_mode: read from a SageMaker pipe-mode channel instead of files.
        is_training: when true, the batches are shuffled.
        drop_remainder: whether to drop a final batch smaller than `batch_size`.
        batch_size: number of records per batch (defaults to the previously
            hard-coded value of 8).

    Returns:
        A dataset yielding dicts with `input_ids`, `input_mask` and `label_ids`.
    """
    if pipe_mode:
        print("***** Using pipe_mode with channel {}".format(channel))
        from sagemaker_tensorflow import PipeModeDataset

        dataset = PipeModeDataset(channel=channel, record_format="TFRecord")
    else:
        print("***** Using input_filenames {}".format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(100)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record):
        """Decodes a serialized record into a dict of tensors."""
        return tf.io.parse_single_example(record, name_to_features)

    # `tf.data.experimental.map_and_batch` is deprecated; `map` followed by
    # `batch` is the documented replacement.
    dataset = dataset.map(_decode_record, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

    # NOTE: the previous bare `dataset.cache()` call had no effect because its
    # return value was discarded. It is intentionally not turned into an
    # assignment: caching at this point would materialize the 100x-repeated,
    # batched data.

    if is_training:
        # Applied after batching, so whole batches are shuffled.
        dataset = dataset.shuffle(seed=42, buffer_size=10, reshuffle_each_iteration=True)

    return dataset



In [None]:
# ---- Train: shuffled batches read from the local TFRecord files ----
train_data = "./data/processed/bert-train"
train_data_filenames = glob("{}/*.tfrecord".format(train_data))
print("train_data_filenames {}".format(train_data_filenames))

train_dataset = file_based_input_dataset_builder(
    channel="train",
    input_filenames=train_data_filenames,
    pipe_mode=False,
    is_training=True,
    drop_remainder=False,
)
train_dataset = train_dataset.map(select_data_and_label_from_record)

# ---- Validation: same pipeline without shuffling ----
validation_data = "./data/processed/bert-validation"
validation_data_filenames = glob("{}/*.tfrecord".format(validation_data))
print("validation_data_filenames {}".format(validation_data_filenames))

validation_dataset = file_based_input_dataset_builder(
    channel="validation",
    input_filenames=validation_data_filenames,
    pipe_mode=False,
    is_training=False,
    drop_remainder=False,
)
validation_dataset = validation_dataset.map(select_data_and_label_from_record)

# ---- Test: same pipeline without shuffling ----
test_data = "./data/processed/bert-test"
test_data_filenames = glob("{}/*.tfrecord".format(test_data))
print(test_data_filenames)

test_dataset = file_based_input_dataset_builder(
    channel="test",
    input_filenames=test_data_filenames,
    pipe_mode=False,
    is_training=False,
    drop_remainder=False,
)
test_dataset = test_dataset.map(select_data_and_label_from_record)

In [None]:
# Small values so the local smoke-test run finishes quickly.
epochs = 1
steps_per_epoch = 10
validation_steps = 10
test_steps = 10
# When True, the first layers of the model are excluded from training.
freeze_bert_layer = True
# Adam optimizer settings.
learning_rate = 3e-5
epsilon = 1e-08

In [None]:
# Star ratings the classifier predicts; the index in this list is the class id.
CLASSES = [1, 2, 3, 4, 5]

config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(CLASSES),
    # class id -> star rating, i.e. {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}
    id2label=dict(enumerate(CLASSES)),
    # star rating -> class id, i.e. {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
    label2id={rating: class_id for class_id, rating in enumerate(CLASSES)},
)
print(config)

In [None]:
# Pre-trained DistilBERT; only its `distilbert` main layer is used below,
# with a custom classification head stacked on top.
transformer_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)

# Two inputs whose names match the keys produced by `select_data_and_label_from_record`.
input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype="int32")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_mask", dtype="int32")

# First element of the DistilBERT output, fed to the recurrent head.
embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(
    embedding_layer
)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation="relu")(X)
X = tf.keras.layers.Dropout(0.2)(X)
# Softmax output: one probability per class.
X = tf.keras.layers.Dense(len(CLASSES), activation="softmax")(X)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=X)

# Freeze (or unfreeze) the first three layers of the model.
# NOTE(review): presumably the two input layers and the DistilBERT layer - confirm.
for layer in model.layers[:3]:
    layer.trainable = not freeze_bert_layer

In [None]:
# The model's final Dense layer already applies a softmax, so the loss receives
# probabilities rather than logits. `from_logits=True` would apply a second
# softmax inside the loss and distort training.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.summary()

In [None]:
# TensorBoard logging is the only callback used for fit/evaluate.
log_dir = "./tmp/tensorboard/"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
callbacks = [tensorboard_callback]

In [None]:
# Train for `epochs` epochs of `steps_per_epoch` batches each, validating on
# `validation_steps` batches after every epoch.
# NOTE(review): `shuffle=True` may have no effect when the input is a
# `tf.data.Dataset` - confirm against the Keras `Model.fit` documentation.
history = model.fit(
    train_dataset,
    shuffle=True,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_data=validation_dataset,
    validation_steps=validation_steps,
    callbacks=callbacks,
)

In [None]:
print("Trained model {}".format(model))

In [None]:
# Evaluate on `test_steps` batches of the test set and print the result of
# `model.evaluate` (loss and the compiled metrics).
test_history = model.evaluate(test_dataset, steps=test_steps, callbacks=callbacks)
print(test_history)

## Predict with the Model
Use the trained model to predict the star rating of a sample review.

In [None]:
import pandas as pd
import numpy as np

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

sample_review_body = "This product is terrible."

# Encode the sample the same way as the training data (padded/truncated to
# `max_seq_length`), returning TensorFlow tensors.
encode_plus_tokens = tokenizer.encode_plus(
    sample_review_body, padding='max_length', max_length=max_seq_length, truncation=True, return_tensors="tf"
)

# The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
input_ids = encode_plus_tokens["input_ids"]

# Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.
input_mask = encode_plus_tokens["attention_mask"]

outputs = model.predict(x=(input_ids, input_mask))

# For every output row: the arg-max class id mapped back to its star rating,
# plus the highest score in that row.
prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in outputs]

print("")
print('Predicted star_rating "{}" for review_body "{}"'.format(prediction[0]["label"], sample_review_body))

### Create the BERT Model in Torch Script mode -- .pt model
Use the pre-trained model and set the `torchscript` flag when loading it.

In [None]:
from transformers import BertModel, BertTokenizer, BertConfig
import torch


# If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)

# Change to eva lmodel
model.eval()

# run a dummy prediction of tokens by tensors
output = model(tokens_tensor)
print(len(output), type(output), type(output[0]))

# Creating the trace
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
torch.jit.save(traced_model, "./triton-serve/bert-uc/1/model.pt")

### Create the BERT Model in Torch Script using dummy inputs -- .pt model
Create using the dummy inputs

In [None]:
# Trace on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))


# NOTE(review): this loads `bert-large-uncased` and saves the trace to the same
# `bert-uc/1/model.pt` path the previous cell wrote - confirm the overwrite is
# intended.
model = BertModel.from_pretrained("bert-large-uncased", torchscript=True)

bs = 1
seq_len = 512
# Example inputs of shape (bs, seq_len): random ids in [0, 1000) and an
# all-zero integer tensor.
# NOTE(review): the second tensor is presumably the attention mask - confirm.
dummy_inputs = [
    torch.randint(1000, (bs, seq_len)).to(device),
    torch.zeros(bs, seq_len, dtype=torch.int).to(device),
]
model = model.eval()
model.to(device)

traced_model = torch.jit.trace(model, dummy_inputs)
torch.jit.save(traced_model, "./triton-serve/bert-uc/1/model.pt")

print("Saved {}".format(traced_model))

**Test a prediction with the traced model (it needs both the token IDs and the attention mask)**

In [None]:
# Expects `tokens_tensor` and `segments_tensors` to already exist in the kernel;
# the latter is passed as the attention mask.
output = traced_model(input_ids=tokens_tensor, attention_mask=segments_tensors)
print(len(output), type(output), type(output[0]))

In [None]:
output

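### Upload the model.tar.gz after it has been created correctly

Because the same model tarball is shared between bloom and bert-uc, first remove any existing `model.tar.gz` in the `triton-serve` directory, then rebuild it from inside that directory. Replace `<model-dir>` with the model directory to package (for example `bert-uc`); the original command omitted this argument, and `tar` cannot create an archive without one:

```
rm -f model.tar.gz
tar --exclude=".git" --exclude=".gitattributes" --exclude="model.tar.gz" --exclude="*.bin" -zcvf model.tar.gz <model-dir>
```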

**Upload the model.tar.gz to S3 location**

In [None]:
import sagemaker
from sagemaker import get_execution_role, Session, image_uris
from sagemaker.utils import name_from_base
import boto3
# Region, execution role and the clients used by the deployment cells:
# `sm_client` for the SageMaker control plane, `runtime_sm_client` for the
# SageMaker runtime.
region = boto3.Session().region_name
role = get_execution_role()
sm_client = boto3.client(service_name="sagemaker")
runtime_sm_client = boto3.client("sagemaker-runtime")


In [None]:
# Upload the packaged model with the notebook's SageMaker session (`sess`); the
# previously referenced name `session` was never defined. The destination uses
# the session's default bucket instead of a hard-coded, account-specific one.
s3_model_path_triton = sagemaker.s3.S3Uploader().upload(
    local_path="./triton-serve/model.tar.gz",
    desired_s3_uri=f"s3://{bucket}/bloom/triton_models/bert-uc",
    sagemaker_session=sess,
)
# Parent prefix that holds all Triton model archives.
s3_mme_model_path = f"s3://{bucket}/bloom/triton_models/"
print(s3_model_path_triton)
print(s3_mme_model_path)

#### Start Single Model Triton for starting

**Triton Image download and sagemaker variables**

In [None]:
from sagemaker import get_execution_role, Session, image_uris
import boto3
from sagemaker.utils import name_from_base

# Execution role, region and the two SageMaker clients used by the cells below.
role = get_execution_role()
region = boto3.Session().region_name
sm_client = boto3.client("sagemaker")
runtime_sm_client = boto3.client(service_name="sagemaker-runtime")

# Regions starting with "cn-" use a different domain suffix for the registry host.
if region.startswith("cn-"):
    base = "amazonaws.com.cn"
else:
    base = "amazonaws.com"

# The registry account differs per region and is looked up in account_id_map.
uri_fields = {"account_id": account_id_map[region], "region": region, "base": base}
triton_image_uri = "{account_id}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:22.07-py3".format(
    **uri_fields
)
print(triton_image_uri)

**Model creation**

In [None]:
# One generated name is reused for the model, the endpoint config and the endpoint.
endpoint_name_p5 = name_from_base("p5-bert-uc-")
print(endpoint_name_p5)

# Environment variables handed to the Triton container.
# (SAGEMAKER_PROGRAM / SAGEMAKER_SUBMIT_DIRECTORY are intentionally not set.)
triton_env_p5 = {
    'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'bert-uc',
    "SAGEMAKER_TRITON_BATCH_SIZE": "16",
    "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000",
    "SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE": "16777216",  # alternative tried: "16777216000"
    "SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576",
}

# Container definition: Triton image plus the archive uploaded to S3.
container_p5 = {
    'Image': triton_image_uri,
    'ModelDataUrl': s3_model_path_triton,
    'Environment': triton_env_p5,
}

create_model_response = sm_client.create_model(
    PrimaryContainer=container_p5,
    ExecutionRoleArn=role,
    ModelName=endpoint_name_p5,
)
print(create_model_response)

**Endpoint config**

In [None]:
# Single production variant that receives all traffic on one GPU instance.
all_traffic_variant_p5 = {
    "InstanceType": "ml.g5.8xlarge",  # alternative: "ml.g4dn.xlarge"
    "InitialVariantWeight": 1,
    "InitialInstanceCount": 1,
    "ModelName": endpoint_name_p5,
    "VariantName": "AllTraffic",
}

create_endpoint_config_response = sm_client.create_endpoint_config(
    ProductionVariants=[all_traffic_variant_p5],
    EndpointConfigName=endpoint_name_p5,
)

endpoint_config_arn_p5 = create_endpoint_config_response["EndpointConfigArn"]
print("Endpoint Config Arn: " + endpoint_config_arn_p5)


**Endpoint**

In [None]:
# Start creating the endpoint from the config of the same name (asynchronous on the service side).
create_endpoint_response = sm_client.create_endpoint(
    EndpointConfigName=endpoint_name_p5,
    EndpointName=endpoint_name_p5,
)

endpoint_arn_p5 = create_endpoint_response["EndpointArn"]
print("Endpoint Arn: " + endpoint_arn_p5)


In [None]:
import time

# Read the current endpoint status once, then poll every 60 seconds
# for as long as the endpoint is still reported as "Creating".
resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
status = resp["EndpointStatus"]
print("SINGLE:Model:endpoint:Triton:Status: " + status)

while True:
    if status != "Creating":
        break
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
    status = resp["EndpointStatus"]
    print("Single:model:triton:Status: " + status)

# Final state (whatever it is) and the endpoint ARN.
print("Arn: " + resp["EndpointArn"])
print("Single:model:triton:Status: " + status)

**Now Invoke The endpoint**

In [None]:
import tritonclient.http as httpclient
from transformers import BertTokenizer
import numpy as np


def tokenize_text(text, enc, max_length=512):
    """Tokenize ``text`` into fixed-length model inputs.

    Args:
        text: Input string to encode.
        enc: Tokenizer-like callable. It is invoked as
            ``enc(text, padding="max_length", truncation=True, max_length=max_length)``
            and must return a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length the tokenizer is asked to pad and truncate to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    #enc = BertTokenizer.from_pretrained("bert-base-uncased")
    print(f"Tokenize:text:why??::max_length={max_length}::Tokenizer={enc}")
    # Padding alone does not shorten long inputs; truncation=True keeps the result
    # at max_length so it matches the fixed tensor shape sent to the endpoint.
    encoded_text = enc(text, padding="max_length", truncation=True, max_length=max_length)
    return encoded_text["input_ids"], encoded_text["attention_mask"]


def _get_sample_tokenized_text_binary(text, input_names, output_names, enc, max_length=512):
    """Build a Triton HTTP inference request (binary tensor encoding) for ``text``.

    Args:
        text: Input string to tokenize.
        input_names: Sequence whose first two items name the token-id and
            attention-mask input tensors, in that order.
        output_names: Sequence whose first two items name the requested outputs.
        enc: Tokenizer passed through to ``tokenize_text``.
        max_length: Sequence length used both for tokenization and for the
            declared ``[1, max_length]`` tensor shape.

    Returns:
        Tuple ``(request_body, header_length)`` as produced by
        ``InferenceServerClient.generate_request_body``.
    """
    # Forward max_length so the tokenized data matches the declared tensor shape
    # (previously the tokenizer default was always used, regardless of max_length).
    indexed_tokens, attention_mask = tokenize_text(text, enc, max_length=max_length)

    inputs = []
    tensor_specs = ((input_names[0], indexed_tokens), (input_names[1], attention_mask))
    for tensor_name, values in tensor_specs:
        infer_input = httpclient.InferInput(tensor_name, [1, max_length], "INT32")
        # Add the leading batch dimension: (max_length,) -> (1, max_length).
        batch = np.expand_dims(np.array(values, dtype=np.int32), axis=0)
        infer_input.set_data_from_numpy(batch, binary_data=True)
        inputs.append(infer_input)

    outputs = [
        httpclient.InferRequestedOutput(output_names[0], binary_data=True),
        httpclient.InferRequestedOutput(output_names[1], binary_data=True),
    ]

    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )
    return request_body, header_length


def get_sample_tokenized_text_binary_pt(text, enc, max_length=512):
    """Build the binary request using the tensor names of the PyTorch model variant."""
    pt_input_names = ["INPUT__0", "INPUT__1"]
    pt_output_names = ["OUTPUT__0", "1634__1"]
    request = _get_sample_tokenized_text_binary(text, pt_input_names, pt_output_names, enc, max_length)
    return request


def get_sample_tokenized_text_binary_trt(text, enc, max_length=512):
    """Build the binary request using the tensor names of the TensorRT model variant.

    Args:
        text: Input string to tokenize.
        enc: Tokenizer passed through to the shared request builder.
        max_length: Sequence length for tokenization and tensor shape. It was
            previously read from an undefined name; it is now an explicit
            parameter with the same default as the PyTorch helper.

    Returns:
        Tuple ``(request_body, header_length)`` from the shared request builder.
    """
    return _get_sample_tokenized_text_binary(
        text, ["token_ids", "attn_mask"], ["output", "1634"], enc, max_length
    )

In [None]:
%%time

import json

max_seq_length=512
text_triton = "Triton Inference Server provides a cloud and edge inferencing solution optimized for both CPUs and GPUs."
print(f"Leverage the Tokenizer={enc}::max_seq_length={max_seq_length}:: create above when creating the model ")
input_ids, attention_mask = tokenize_text(text_triton, enc, max_length=max_seq_length)

# Both input tensors share shape and datatype, so build them from (name, data) pairs.
payload = {
    "inputs": [
        {"name": tensor_name, "shape": [1, max_seq_length], "datatype": "INT32", "data": tensor_data}
        for tensor_name, tensor_data in (("INPUT__0", input_ids), ("INPUT__1", attention_mask))
    ]
}

# Send the JSON-encoded request to the single-model endpoint.
request_body = json.dumps(payload)
response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5,
    ContentType="application/octet-stream",
    Body=request_body,
)

response_text = response["Body"].read().decode("utf8")
print(json.loads(response_text))

In [None]:
# Display the generated name of the single-model endpoint.
endpoint_name_p5

In [None]:
%%time
import json

max_seq_length=512
text_triton = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """


print(f"Leverage the Tokenizer={enc}::max_seq_length={max_seq_length}:: create above when creating the model ")
input_ids, attention_mask = tokenize_text(text_triton, enc, max_length=max_seq_length)

# Both input tensors share shape and datatype, so build them from (name, data) pairs.
payload = {
    "inputs": [
        {"name": tensor_name, "shape": [1, max_seq_length], "datatype": "INT32", "data": tensor_data}
        for tensor_name, tensor_data in (("INPUT__0", input_ids), ("INPUT__1", attention_mask))
    ]
}

# Send the JSON-encoded request and parse the JSON response into a dict.
request_body = json.dumps(payload)
response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5,
    ContentType="application/octet-stream",
    Body=request_body,
)
response_text = response["Body"].read().decode("utf8")
output_dict = json.loads(response_text)

# The tensor values are under output_dict['outputs'][i]['data'] for i in (0, 1).
output_dict.keys()

#enc.decode(output_dict['outputs'][0]['data'], skip_special_tokens=True)

In [None]:
input_ids
attention_mask

# Write every token id on its own line.
with open(r'./temp-bloom/input_ids.txt', 'w') as ids_file:
    ids_file.writelines("%s\n" % token_id for token_id in input_ids)
    print('Done input_ids')

# Write every attention-mask value on its own line.
with open(r'./temp-bloom/attention_mask.txt', 'w') as mask_file:
    mask_file.writelines("%s\n" % mask_value for mask_value in attention_mask)
    print('Done attention_mask')


### Clean up

In [None]:
# Delete the endpoint, then its endpoint config, then the model.
# All three were created under the shared name held in endpoint_name_p5.
sm_client.delete_endpoint(EndpointName=endpoint_name_p5)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name_p5)
sm_client.delete_model(ModelName=endpoint_name_p5)

In [None]:
# general imports
import boto3
import json
import os
import re
import copy
import time
from time import gmtime, strftime
import numpy as np
import datetime
import pprint
import pandas as pd

# sagemaker
import sagemaker
from sagemaker import get_execution_role

# triton
import tritonclient.http as httpclient

# transformers
from transformers import BertTokenizer

# custom CloudWatch
#from cloudwatch import get_endpoint_metrics


In [None]:
# Run generate_models.sh inside the NVIDIA PyTorch 21.08 container with all GPUs exposed;
# the local ./workspace directory is mounted at /workspace and the container is removed on exit.
!docker run --gpus=all --rm -it  -v `pwd`/workspace:/workspace nvcr.io/nvidia/pytorch:21.08-py3 /bin/bash generate_models.sh

## START MME for triton 

**Upload first**

### Upload multiple copies for MME

In [None]:
# Upload the same archive 99 times, under .../bert-uc/model-1 through .../bert-uc/model-99.
# After the loop, s3_model_path_triton_mme holds the location of the last upload only.
for ii in range(1, 100):
    s3_model_path_triton_mme = sagemaker.s3.S3Uploader().upload(
        sagemaker_session=session,
        desired_s3_uri=f"s3://sagemaker-us-east-1-622343165275/bloom/triton_models/bert-uc/model-{ii}",
        local_path="./triton-serve/model.tar.gz",
    )

# Common prefix of all uploaded copies; used as the multi-model root.
s3_model_path_mme='s3://sagemaker-us-east-1-622343165275/bloom/triton_models/bert-uc'

print("MULTIPLE:Uplodas:")
for uploaded_location in (s3_model_path_triton_mme, s3_model_path_mme):
    print(uploaded_location)

In [None]:
# Display the in-memory `model` object (its repr).
model

In [None]:
# Display the S3 prefix that holds the multi-model copies.
s3_model_path_mme

**Create the model**

In [None]:
# One generated name is reused for the MME model, endpoint config and endpoint.
endpoint_name_p5_mme = name_from_base("p5-bert-uc-mme")
print(endpoint_name_p5_mme)

# Environment variables handed to the Triton container.
# (SAGEMAKER_PROGRAM / SAGEMAKER_SUBMIT_DIRECTORY are intentionally not set.)
triton_env_p5_mme = {
    'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'model-1',
    "SAGEMAKER_TRITON_BATCH_SIZE": "16",
    "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000",
    "SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE": "16777216",  # alternative tried: "16777216000"
    "SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576",
}

# Multi-model container: ModelDataUrl is the S3 prefix holding the model copies.
container_p5_mme = {
    'Image': triton_image_uri,
    'ModelDataUrl': s3_model_path_mme,
    'Mode': 'MultiModel',
    'Environment': triton_env_p5_mme,
}

create_model_response_mme = sm_client.create_model(
    PrimaryContainer=container_p5_mme,
    ExecutionRoleArn=role,
    ModelName=endpoint_name_p5_mme,
)
print(create_model_response_mme)

**Create the Endpoint config**

In [None]:
# Single production variant that receives all traffic on one GPU instance.
all_traffic_variant_mme = {
    "InstanceType": "ml.g4dn.xlarge",  # alternative: "ml.g5.8xlarge"
    "InitialVariantWeight": 1,
    "InitialInstanceCount": 1,
    "ModelName": endpoint_name_p5_mme,
    "VariantName": "AllTraffic",
}

create_endpoint_config_response_mme = sm_client.create_endpoint_config(
    ProductionVariants=[all_traffic_variant_mme],
    EndpointConfigName=endpoint_name_p5_mme,
)

endpoint_config_arn_mme = create_endpoint_config_response_mme["EndpointConfigArn"]
print("Endpoint Config Arn: " + endpoint_config_arn_mme)

**Create the endpoint**

In [None]:
# Start creating the multi-model endpoint from the config of the same name.
create_endpoint_response_mme = sm_client.create_endpoint(
    EndpointConfigName=endpoint_name_p5_mme,
    EndpointName=endpoint_name_p5_mme,
)

endpoint_arn_mme = create_endpoint_response_mme["EndpointArn"]
print("Endpoint Arn: " + endpoint_arn_mme)


In [None]:
import time

# Read the current endpoint status once, then poll every 60 seconds
# for as long as the endpoint is still reported as "Creating".
resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5_mme)
status = resp["EndpointStatus"]
print("MME:Model:endpoint:Triton:Status: " + status)

while True:
    if status != "Creating":
        break
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5_mme)
    status = resp["EndpointStatus"]
    print("MME:model:triton:Status: " + status)

# Final state (whatever it is) and the endpoint ARN.
print("Arn: " + resp["EndpointArn"])
print("MME:model:triton:Status: " + status)

**Test the end point**

In [None]:
%%time

import json
# Tokenize a sample sentence to the fixed sequence length used by the request below.
max_seq_length=512
text_triton = "Triton Inference Server provides a cloud and edge inferencing solution optimized for both CPUs and GPUs."
print(f"Leverage the Tokenizer={enc}::max_seq_length={max_seq_length}:: create above when creating the model ")
input_ids, attention_mask = tokenize_text(text_triton, enc, max_length=max_seq_length)

# JSON inference request with two INT32 tensors declared as [1, max_seq_length].
payload = {
    "inputs": [
        {"name": "INPUT__0", "shape": [1, max_seq_length], "datatype": "INT32", "data": input_ids},
        {"name": "INPUT__1", "shape": [1, max_seq_length], "datatype": "INT32", "data": attention_mask},
    ]
}

# TargetModel selects which uploaded copy serves this request (here the model-9 archive).
# NOTE(review): presumably the path is resolved relative to the model's S3 prefix -- confirm.
response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5_mme, ContentType="application/octet-stream", Body=json.dumps(payload), TargetModel  = "/model-9/model.tar.gz"
)

output_dict = json.loads(response["Body"].read().decode("utf8"))

# The tensor values are under output_dict['outputs'][i]['data'] for i in (0, 1).
output_dict.keys()

# NOTE(review): decoding the first output as token ids looks like a leftover from a
# text-generation notebook; the equivalent line in the single-model cell is commented out.
# For a BERT encoder the output is presumably not token ids -- confirm before relying on this.
enc.decode(output_dict['outputs'][0]['data'], skip_special_tokens=True)

In [None]:
# Display the generated name of the multi-model endpoint.
endpoint_name_p5_mme

In [None]:
# Re-send the current `payload` to the model-9 copy, this time with ContentType "text/json"
# instead of "application/octet-stream" as used in the earlier invocation cells.
response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5_mme, ContentType="text/json", Body=json.dumps(payload), TargetModel  = "/model-9/model.tar.gz"
)
output_dict = json.loads(response["Body"].read().decode("utf8"))

# The tensor values are under output_dict['outputs'][i]['data'] for i in (0, 1).
output_dict.keys()

**Set up the payload in S3 to be used for inference load testing**

In [None]:
max_seq_length=512
text_triton = """
                Create payload JSON and upload it on S3. 
                This will be used by Inference Recommender to run the load test.
              """

input_ids, attention_mask = tokenize_text(text_triton, enc, max_length=max_seq_length)

payload = {
    "inputs": [
        {"name": "INPUT__0", "shape": [1, max_seq_length], "datatype": "INT32", "data": input_ids},
        {"name": "INPUT__1", "shape": [1, max_seq_length], "datatype": "INT32", "data": attention_mask},
    ]
}

print(f"Sample payload to be used with Inference Recommender")
print(payload)

payload_location = "./sample-payload/"
!mkdir -p $payload_location

payload_archive_name = "payload.tar.gz"

with open(payload_location + "request.json", "w") as f:
    json.dump(payload, f)


!cd ./sample-payload/ && tar czvf ../payload.tar.gz *

print(f"payload.tar.gz created at {payload_location}/{payload_archive_name}")

**Upload sample payload to S3**

In [None]:
# Upload the payload archive from the working directory to the test-data prefix.
s3_sample_data_path_triton = sagemaker.s3.S3Uploader().upload(
    sagemaker_session=session,
    desired_s3_uri="s3://sagemaker-us-east-1-622343165275/bloom/triton_test_data",
    local_path=f"{payload_archive_name}",
)
# Show the S3 location returned by the upload.
s3_sample_data_path_triton

## Inference Load test set up
### DOES NOT WORK FOR MME -- SO SKIP this section

In [None]:
# Metadata describing the model to the model registry / Inference Recommender.
ml_domain = "NATURAL_LANGUAGE_PROCESSING"
ml_task = "FILL_MASK"
ml_framework = "PYTORCH"
# NOTE(review): confirm this matches the PyTorch version actually used for tracing.
framework_version = "1.6.0"
# NOTE(review): the model traced earlier in this notebook is "bert-large-uncased";
# confirm that "bert-base-uncased" is the intended nearest-model name.
model_tested = "bert-base-uncased"

In [None]:
# UTC timestamp shared by all resource names created for this benchmark run.
ts = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Each resource name is a fixed prefix followed by the timestamp.
sm_model_name, model_package_group_name, advanced_job = (
    name_prefix + ts
    for name_prefix in (
        "pt-triton-benchmark-model-",
        "pt-triton-benchmark-model-group-",
        "pt-triton-benchmark-advanced-job-",
    )
)

print(
    f"SageMaker Model Name: {sm_model_name}",
    f"SageMaker Mode Package Name: {model_package_group_name}",
    f"SageMaker Advanced Job Name: {advanced_job}",
    sep="\n",
)

In [None]:
# Display the S3 prefix that the container definition in the next cell points at.
s3_model_path_mme

In [None]:
# Container definition used for the model package (Inference Recommender).
# NOTE(review): 'Mode': 'MultiModel' is disabled here while ModelDataUrl still points at the
# multi-model S3 prefix rather than a single archive; the notebook marks this whole section
# as not working for MME -- confirm the intended ModelDataUrl before reusing this cell.
container_infrec_mme = {
    'Image': triton_image_uri,
    "NearestModelName": model_tested, #'model-1',
    "Framework": ml_framework,
    'ModelDataUrl': s3_model_path_mme,
    #'Mode':'MultiModel',
    'Environment': {
        #'SAGEMAKER_PROGRAM' : 'inference.py',
        #'SAGEMAKER_SUBMIT_DIRECTORY' : 'code',
        'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'model-1',
        "SAGEMAKER_TRITON_BATCH_SIZE": "16",
        "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000",
        "SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" : "16777216", #"16777216000",
        "SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576"
    },
}


In [None]:
# Create the model package group that will hold the benchmark model version.
package_group_request = {
    "ModelPackageGroupName": str(model_package_group_name),
    "ModelPackageGroupDescription": "BERT large uncased Model group for Triton Serving",
}
model_pacakge_group_response = sm_client.create_model_package_group(**package_group_request)
print(f"Model Registry package group: {model_pacakge_group_response}")


In [None]:
# Register a model package version in the group, including the sample payload
# location and the container definition used by Inference Recommender.
model_package_version_response = sm_client.create_model_package(
    ModelPackageGroupName=str(model_package_group_name),
    ModelPackageDescription="BERT large uncased Model group for Triton Serving",
    Domain=ml_domain,
    Task=ml_task,
    SamplePayloadUrl=s3_sample_data_path_triton,
    InferenceSpecification={
        "Containers": [container_infrec_mme],
        # Each supported instance type is listed once (the duplicate entry was removed).
        # NOTE(review): add further instance types here if more than one was intended.
        "SupportedRealtimeInferenceInstanceTypes": [
            "ml.g4dn.4xlarge",
        ],
        "SupportedContentTypes": ["application/octet-stream"],
        "SupportedResponseMIMETypes": ["application/json"],
    },
)
# Show the API response (contains the ModelPackageArn used by the recommender job).
model_package_version_response

In [None]:
# Start an "Advanced" Inference Recommender job that load-tests the registered model
# package on the listed instance types.
advanced_response = sm_client.create_inference_recommendations_job(
    JobName=advanced_job,
    JobDescription="nlp triton Inference Advanced Recommender Job",
    JobType="Advanced",
    RoleArn=role,
    InputConfig={
        "ModelPackageVersionArn": model_package_version_response["ModelPackageArn"],
        "JobDurationInSeconds": 7200,
        "EndpointConfigurations": [
            #{"InstanceType": "ml.p3.8xlarge"},
            #{"InstanceType": "ml.p3.2xlarge"},
            {"InstanceType": "ml.p2.16xlarge"},
            {"InstanceType": "ml.g4dn.xlarge"},
            {"InstanceType": "ml.g4dn.8xlarge"},
            {"InstanceType": "ml.g4dn.4xlarge"},
            {"InstanceType": "ml.g4dn.2xlarge"},
            {"InstanceType": "ml.g4dn.12xlarge"},
        ],
        "TrafficPattern": {
            "TrafficType": "PHASES",
            "Phases": [
                {
                    "InitialNumberOfUsers": 2,
                    "SpawnRate": 3,
                    "DurationInSeconds": 900,
                },  # single phase: starts with 2 users, spawn rate 3, lasting 900 s (15 minutes)
            ],  # only one phase is configured; add further dicts here for additional phases
        },
        # Six endpoint configurations are listed above; at most 10 tests, 5 in parallel.
        "ResourceLimit": {"MaxNumberOfTests": 10, "MaxParallelOfTests": 5},
    },
    StoppingConditions={
        "MaxInvocations": 30000,
        "ModelLatencyThresholds": [{"Percentile": "P95", "ValueInMilliseconds": 500}],
    },
)

print(advanced_response)


In [None]:
%%time

# Poll the recommender job every 5 minutes until it reaches a terminal state.
ended = False
while not ended:
    inference_recommender_job = sm_client.describe_inference_recommendations_job(
        JobName=str(advanced_job)
    )
    if inference_recommender_job["Status"] in ["COMPLETED", "STOPPED", "FAILED"]:
        print(f"Inference recommender job status: {inference_recommender_job['Status']} ")
        ended = True
    else:
        print("Inference recommender job in progress")
        time.sleep(300)

# Report the outcome. The failure text is returned under "FailureReason"; .get() is used
# so that a missing field cannot turn the report itself into an exception.
if inference_recommender_job["Status"] == "FAILED":
    print("Inference recommender job failed ")
    print("Failed Reason: {}".format(inference_recommender_job.get("FailureReason", "not reported")))
elif inference_recommender_job["Status"] == "STOPPED":
    # A stopped job did not run to completion, so do not report it as completed.
    print("Inference recommender job stopped")
else:
    print("Inference recommender job completed")

## Clean up

In [None]:
# Delete the multi-model endpoint, then its endpoint config, then the model.
# All three were created under the shared name held in endpoint_name_p5_mme.
sm_client.delete_endpoint(EndpointName=endpoint_name_p5_mme)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name_p5_mme)
sm_client.delete_model(ModelName=endpoint_name_p5_mme)