# Sentiment analysis on product reviews

In [1]:
import sagemaker

In [2]:
# One SageMaker session is shared by the whole notebook; the default bucket
# and the key prefix below are handed to the preprocessing job as arguments.
session = sagemaker.Session()
prefix = 'hugging-face/sentiment-analysis'
bucket = session.default_bucket()

In [3]:
# IAM role passed to the processing job, the training job and the deployed model.
# NOTE: the account ID in this ARN is masked with X's and must be filled in before running.
role = "arn:aws:iam::XXXXXXXXXX:role/Sagemaker-FullAccess"

## Preprocessing

In [4]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Processor that runs the preprocessing script on a single ml.m5.xlarge instance
# using the scikit-learn 0.23-1 framework version.
sklearn_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
)

In [5]:
%%time

from sagemaker.processing import ProcessingInput, ProcessingOutput

# Run the preprocessing script. Each (output name, container path) pair below
# becomes one ProcessingOutput, in the same order as declared.
sklearn_processor.run(
    code='preprocessing_hf.py',
    outputs=[
        ProcessingOutput(source=container_path, output_name=channel)
        for channel, container_path in (
            ('training', '/opt/ml/processing/output/training'),
            ('validation', '/opt/ml/processing/output/validation'),
        )
    ],
    # Command-line arguments forwarded to preprocessing_hf.py
    arguments=[
        "--threshold", "4",
        "--s3-bucket", bucket,
        "--s3-prefix", prefix,
    ],
)


Job Name:  sagemaker-scikit-learn-2023-03-18-15-21-18-761
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-312202024311/sagemaker-scikit-learn-2023-03-18-15-21-18-761/input/code/preprocessing_hf.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'training', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-312202024311/sagemaker-scikit-learn-2023-03-18-15-21-18-761/output/training', 'LocalPath': '/opt/ml/processing/output/training', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-312202024311/sagemaker-scikit-learn-2023-03-18-15-21-18-761/output/validation', 'LocalPath': '/opt/ml/processing/output/validation', 'S3UploadMode': 'EndOfJob'}}]
.............................[34mCollecting trans

[34mDataset generated_reviews_enth downloaded and prepared to /root/.cache/huggingface/datasets/generated_reviews_enth/generated_reviews_enth/1.0.0/48de9722233d125bf2408e04a2efe5281d2de21331da10b4237750dd77ee3a04. Subsequent calls will reuse this data.[0m
[34mtrain dataset shape: (141369, 3)[0m
[34mvalidation dataset shape: (15708, 3)[0m
[34m#015  0%|          | 0/141369 [00:00<?, ?ex/s]#015  1%|          | 1457/141369 [00:00<00:09, 14567.51ex/s]#015  2%|▏         | 2867/141369 [00:00<00:09, 14422.76ex/s]#015  3%|▎         | 4240/141369 [00:00<00:09, 14205.12ex/s]#015  4%|▍         | 5702/141369 [00:00<00:09, 14324.57ex/s]#015  5%|▌         | 7111/141369 [00:00<00:09, 14250.61ex/s]#015  6%|▌         | 8571/141369 [00:00<00:09, 14353.29ex/s]#015  7%|▋         | 9911/141369 [00:00<00:09, 14051.34ex/s]#015  8%|▊         | 11318/141369 [00:00<00:09, 14056.05ex/s]#015  9%|▉         | 12781/141369 [00:00<00:09, 14221.90ex/s]#015 10%|█         | 14179/141369 [00:01<00:08, 14146.43ex/s]

[34m#015  0%|          | 0/1 [00:00<?, ?ba/s]#015100%|██████████| 1/1 [00:36<00:00, 36.84s/ba]#015100%|██████████| 1/1 [00:36<00:00, 36.84s/ba][0m
[34m#015  0%|          | 0/1 [00:00<?, ?ba/s]#015100%|██████████| 1/1 [00:03<00:00,  3.07s/ba]#015100%|██████████| 1/1 [00:03<00:00,  3.07s/ba][0m

CPU times: user 2.46 s, sys: 270 ms, total: 2.73 s
Wall time: 6min 43s


In [46]:
# Show the full description of the most recent job launched by this processor.
sklearn_processor.jobs[-1].describe()

{'ProcessingInputs': [{'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-312202024311/sagemaker-scikit-learn-2023-03-18-15-21-18-761/input/code/preprocessing_hf.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'training',
    'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-312202024311/sagemaker-scikit-learn-2023-03-18-15-21-18-761/output/training',
     'LocalPath': '/opt/ml/processing/output/training',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False},
   {'OutputName': 'validation',
    'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-312202024311/sagemaker-scikit-learn-2023-03-18-15-21-18-761/output/validation',
     'LocalPath': '/opt/ml/processing/output/validation',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False}]},


In [47]:
# Fetch the job description once (it is reused further down) and list the
# S3 URI of every output the job declared.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description['ProcessingOutputConfig']
for s3_uri in (entry['S3Output']['S3Uri'] for entry in output_config['Outputs']):
    print(s3_uri)

s3://sagemaker-us-east-1-312202024311/sagemaker-scikit-learn-2023-03-18-15-21-18-761/output/training
s3://sagemaker-us-east-1-312202024311/sagemaker-scikit-learn-2023-03-18-15-21-18-761/output/validation


In [43]:
# Status, name and resources of the preprocessing job, one value per line.
print(
    preprocessing_job_description['ProcessingJobStatus'],
    preprocessing_job_description['ProcessingJobName'],
    preprocessing_job_description['ProcessingResources'],
    sep='\n',
)

Completed
sagemaker-scikit-learn-2023-03-18-15-21-18-761
{'ClusterConfig': {'InstanceCount': 1, 'InstanceType': 'ml.m5.xlarge', 'VolumeSizeInGB': 30}}


In [8]:
# Resolve the training/validation S3 locations from the processing job itself
# instead of pasting URIs by hand: hand-pasted URIs embed one specific job
# timestamp (and here a masked bucket name), so they go stale on every re-run.
# Outputs are looked up by the names given to ProcessingOutput ('training', 'validation').
processing_output_uris = {
    output['OutputName']: output['S3Output']['S3Uri']
    for output in sklearn_processor.jobs[-1].describe()['ProcessingOutputConfig']['Outputs']
}
train_data_path = processing_output_uris['training']
valid_data_path = processing_output_uris['validation']

In [9]:
# Disabled alternative: upload local ./training/ and ./validation/ folders to the
# bucket and use the returned locations as the data paths
# (presumably for data preprocessed locally - confirm before enabling).
# s3_prefix = 'hugging-face/sentiment-analysis'
# train_data_path = session.upload_data(path='./training/', bucket=bucket, key_prefix=s3_prefix+'/training')
# valid_data_path = session.upload_data(path='./validation/', bucket=bucket, key_prefix=s3_prefix+'/validation')

# Fine-tuning & starting SageMaker Training Job

## Fine-tune the Hugging Face model on SageMaker

In [22]:
# Hyperparameters passed to the fine-tuning script via the estimator.
hyperparameters = dict(
    epochs=1,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

In [23]:
from sagemaker.huggingface import HuggingFace

# Estimator that fine-tunes the model with train_hf.py on one ml.p3.2xlarge instance.
huggingface_estimator = HuggingFace(
    # Fine-tuning script and its hyperparameters
    entry_point='train_hf.py',
    hyperparameters=hyperparameters,
    # Container versions
    transformers_version='4.6.1',
    pytorch_version='1.7.1',
    py_version='py36',
    # Infrastructure
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    # Spot-instance settings (disabled):
    #     checkpoint_s3_uri=f's3://{bucket}/{prefix}/checkpoints',
    #     use_spot_instances=True,
    #     # max_wait should be equal to or greater than max_run in seconds
    #     max_wait=3600,
    #     max_run=3000,
)

In [24]:
# Launch the training job; the dict keys ('train', 'valid') name the input
# channels and the values are the S3 locations of the preprocessed data.
huggingface_estimator.fit({'train': train_data_path, 'valid': valid_data_path})

2023-03-18 17:09:52 Starting - Starting the training job...ProfilerReport-1679159391: InProgress
...
2023-03-18 17:10:56 Starting - Preparing the instances for training...
2023-03-18 17:11:23 Downloading - Downloading input data...
2023-03-18 17:11:58 Training - Downloading the training image..
2023-03-18 17:27:40 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-03-18 17:14:26,567 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-03-18 17:14:26,598 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-03-18 17:14:26,601 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-03-18 17:14:26,860 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additi

[34m{'loss': 0.244, 'learning_rate': 3.302399275690358e-05, 'epoch': 0.34}[0m


ClientError: An error occurred (InvalidSignatureException) when calling the DescribeLogStreams operation: Signature expired: 20230318T173231Z is now earlier than 20230318T174337Z (20230318T174837Z - 5 min.)

In [30]:
# S3 URI of the model artifact (model.tar.gz) written by the training job.
huggingface_estimator.model_data

's3://sagemaker-us-east-1-312202024311/huggingface-pytorch-training-2023-03-18-17-09-49-922/output/model.tar.gz'

In [76]:
# Base job name of the estimator; it shows up as the prefix of the training job name.
huggingface_estimator.base_job_name

'huggingface-pytorch-training'

In [82]:
# Show the full description of the most recent training job started by the estimator.
# (Fixes a stray "traning" prefix that made the estimator name undefined -> NameError.)
huggingface_estimator.jobs[-1].describe()

{'TrainingJobName': 'huggingface-pytorch-training-2023-03-18-17-09-49-922',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:312202024311:training-job/huggingface-pytorch-training-2023-03-18-17-09-49-922',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-us-east-1-312202024311/huggingface-pytorch-training-2023-03-18-17-09-49-922/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'epochs': '1',
  'model_name': '"distilbert-base-uncased"',
  'sagemaker_container_log_level': '20',
  'sagemaker_job_name': '"huggingface-pytorch-training-2023-03-18-17-09-49-922"',
  'sagemaker_program': '"train_hf.py"',
  'sagemaker_region': '"us-east-1"',
  'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-312202024311/huggingface-pytorch-training-2023-03-18-17-09-49-922/source/sourcedir.tar.gz"',
  'train_batch_size': '32'},
 'AlgorithmSpecification': {'TrainingImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch

In [86]:
# Summarise the latest training job: status, container image, resources and
# training time in seconds, one value per line.
training_job_description = huggingface_estimator.jobs[-1].describe()
print(
    training_job_description['TrainingJobStatus'],
    training_job_description['AlgorithmSpecification']['TrainingImage'],
    training_job_description['ResourceConfig'],
    training_job_description['TrainingTimeInSeconds'],
    sep='\n',
)

Completed
763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.7.1-transformers4.6.1-gpu-py36-cu110-ubuntu18.04
{'InstanceType': 'ml.p3.2xlarge', 'InstanceCount': 1, 'VolumeSizeInGB': 30}
2799


# Deploy with the Hugging Face container

In [31]:
# Deploy the fine-tuned model straight from the estimator onto a single
# ml.m5.xlarge instance and keep the returned predictor.
huggingface_predictor = huggingface_estimator.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

----!

In [32]:
# Positive sample review; the payload uses the "inputs" key.
test_data = dict(inputs="Brilliant phone allaround, I'm extremely happy with it.")

In [33]:
# Send the sample payload to the Hugging Face endpoint and show the raw response.
prediction = huggingface_predictor.predict(test_data)
print(prediction)

[{'label': 'LABEL_1', 'score': 0.9276689291000366}]


In [34]:
# Negative sample review; the payload uses the "inputs" key.
test_data = dict(inputs="Dissapointed with the product, have to return it now!")

In [35]:
# Send the current sample payload to the Hugging Face endpoint and show the raw response.
prediction = huggingface_predictor.predict(test_data)
print(prediction)

[{'label': 'LABEL_0', 'score': 0.999360203742981}]


In [36]:
# Delete the Hugging Face endpoint now that the sample predictions are done.
huggingface_predictor.delete_endpoint()

# Deploy with the PyTorch container (TorchServe)

In [61]:
from sagemaker.pytorch import PyTorchModel 
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

In [62]:
class SentimentAnalysis(Predictor):
    """Predictor that exchanges JSON with the sentiment-analysis endpoint.

    Requests are serialized with ``JSONSerializer`` and responses are decoded
    with ``JSONDeserializer``.

    Args:
        endpoint_name: Name of the deployed endpoint to invoke.
        sagemaker_session: Session forwarded to ``Predictor``. Optional; when
            omitted, ``None`` is forwarded, as the base class allows, so the
            predictor can also be built from an endpoint name alone.
    """

    def __init__(self, endpoint_name, sagemaker_session=None):
        super().__init__(endpoint_name,
                         sagemaker_session=sagemaker_session,
                         serializer=JSONSerializer(),
                         deserializer=JSONDeserializer())

In [63]:
# Wrap the trained artifact in a PyTorch serving model: inference code comes
# from src/torchserve-predictor.py and predictions go through SentimentAnalysis.
model = PyTorchModel(
    # Inference code
    source_dir='src',
    entry_point='torchserve-predictor.py',
    # Trained artifact and permissions
    model_data=huggingface_estimator.model_data,
    role=role,
    # Container versions
    framework_version='1.7.1',
    py_version='py36',
    # Client-side predictor class returned by deploy()
    predictor_cls=SentimentAnalysis,
)

In [64]:
# Deploy the PyTorch model onto a single ml.m5.xlarge instance and keep the
# returned predictor.
pytorch_predictor = model.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

------!

In [65]:
# Positive sample review; this payload uses the 'text' key.
test_data = dict(text="Brilliant phone allaround, I'm extremely happy with it.")

In [66]:
# Name of the endpoint behind the PyTorch predictor.
pytorch_predictor.endpoint_name

'pytorch-inference-2023-03-18-19-20-37-442'

In [67]:
# Send the sample payload to the PyTorch endpoint and show the decoded response.
prediction = pytorch_predictor.predict(test_data)
print(prediction)

positive


In [68]:
# Negative sample review; this payload uses the 'text' key.
test_data = dict(text="Dissapointed with the product, have to return it now!")

In [69]:
# Send the current sample payload to the PyTorch endpoint and show the decoded response.
prediction = pytorch_predictor.predict(test_data)
print(prediction)

negative


In [74]:
# List the monitors attached to this endpoint (none are set up in this notebook).
pytorch_predictor.list_monitors()

No monitors found for endpoint. endpoint: pytorch-inference-2023-03-18-19-20-37-442


[]

In [75]:
# Delete the PyTorch endpoint now that the sample predictions are done.
pytorch_predictor.delete_endpoint()