In [10]:
import os
import sys
import pandas as pd
import text_classification as tc
import nltk
import re
import json
import boto3
import sagemaker
from sagemaker.serverless import ServerlessInferenceConfig

In [13]:
sess = sagemaker.Session()

role = sagemaker.get_execution_role()
print("role : ", role)

bucket = sess.default_bucket()
print("bucket : ", bucket)

tc._s3_bucket_name = bucket

role :  arn:aws:iam::768516036628:role/SageMaker-MLEngineer
bucket :  sagemaker-ap-south-1-768516036628


In [14]:
s3_train_filePath = 's3://{}/{}{}'.format(tc._s3_bucket_name, tc._s3_dataset_dir, tc._train_dataset_filePath)
print("s3_train_filePath : ", s3_train_filePath)
train_df = pd.read_csv(s3_train_filePath, header=None)
train_df.head(5)

s3_train_filePath :  s3://sagemaker-ap-south-1-768516036628/002_dataset/dbpedia_csv/train.csv


Unnamed: 0,0,1,2
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [15]:
s3_test_filePath = 's3://{}/{}{}'.format(tc._s3_bucket_name, tc._s3_dataset_dir, tc._test_dataset_filePath)
print("s3_test_filePath : ", s3_test_filePath)
test_df = pd.read_csv(s3_test_filePath, header=None)
test_df.head(5)

s3_test_filePath :  s3://sagemaker-ap-south-1-768516036628/002_dataset/dbpedia_csv/test.csv


Unnamed: 0,0,1,2
0,1,TY KU,TY KU /taɪkuː/ is an American alcoholic bever...
1,1,Odd Lot Entertainment,OddLot Entertainment founded in 2001 by longt...
2,1,Henkel,Henkel AG & Company KGaA operates worldwide w...
3,1,GOAT Store,The GOAT Store (Games Of All Type Store) LLC ...
4,1,RagWing Aircraft Designs,RagWing Aircraft Designs (also called the Rag...


In [16]:
s3_output = 's3://{}/{}'.format(tc._s3_bucket_name, tc._s3_output_dir)
s3_output

's3://sagemaker-ap-south-1-768516036628/004_output/'

In [17]:
region_name = boto3.Session().region_name
print("region_name : ", region_name)

region_name :  ap-south-1


In [18]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


Using SageMaker BlazingText container: 991648021394.dkr.ecr.ap-south-1.amazonaws.com/blazingtext:1 (ap-south-1)


In [19]:
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output,
                                         sagemaker_session=sess)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [20]:
bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=2,
                            learning_rate=0.05,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

In [25]:
s3_train_data = 's3://{}/{}{}'.format(tc._s3_bucket_name, tc._s3_preprocessed_dataset_dir, tc._transformed_train_filePath)
s3_validation_data = 's3://{}/{}{}'.format(tc._s3_bucket_name, tc._s3_preprocessed_dataset_dir, tc._transformed_test_filePath)
print("s3_train_data : ", s3_train_data)
print("s3_validation_data : ", s3_validation_data)

s3_train_data :  s3://sagemaker-ap-south-1-768516036628/003_dataset/dbpedia.train
s3_validation_data :  s3://sagemaker-ap-south-1-768516036628/003_dataset/dbpedia.validation


In [26]:
train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': validation_data}

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [27]:
bt_model.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating training-job with name: blazingtext-2023-04-07-08-28-51-060


2023-04-07 08:28:51 Starting - Starting the training job...
2023-04-07 08:29:05 Starting - Preparing the instances for training......
2023-04-07 08:30:18 Downloading - Downloading input data
2023-04-07 08:30:18 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[04/07/2023 08:30:29 INFO 139676824971072] nvidia-smi took: 0.025225162506103516 secs to identify 0 gpus[0m
[34m[04/07/2023 08:30:29 INFO 139676824971072] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[04/07/2023 08:30:29 INFO 139676824971072] Processing /opt/ml/input/data/train/dbpedia.train . File size: 35.036842346191406 MB[0m
[34m[04/07/2023 08:30:29 INFO 139676824971072] Processing /opt/ml/input/data/validation/dbpedia.validation . File size: 21.887577056884766 MB[0m
[34mRead 6M words[0m
[34mNumber of words:  149105[0m
[34mLoading validation data from /opt/ml/input/data/validati

In [28]:
serverless_config = ServerlessInferenceConfig()
text_classifier = bt_model.deploy(serverless_inference_config=serverless_config)
print("endpoint path : ", text_classifier.endpoint)

INFO:sagemaker:Creating model with name: blazingtext-2023-04-07-08-31-38-684
INFO:sagemaker:Creating endpoint-config with name blazingtext-2023-04-07-08-31-38-684
INFO:sagemaker:Creating endpoint with name blazingtext-2023-04-07-08-31-38-684


----!

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


endpoint path :  blazingtext-2023-04-07-08-31-38-684


In [30]:
text_classifier_cp = text_classifier

In [31]:
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def review_to_words(review):
    words = REPLACE_NO_SPACE.sub("", review.lower())
    words = REPLACE_WITH_SPACE.sub(" ", words)
    return words

In [32]:
def test_the_model(original_sentences):
    sentences = [' '.join(nltk.word_tokenize(sent)) for sent in original_sentences]
    tokenized_sentences = [review_to_words(sentence) for sentence in sentences]
    payload = {"instances" : tokenized_sentences}
    results = text_classifier.predict(json.dumps(payload), initial_args={'ContentType': 'application/json'})
    predictions = json.loads(results)
    for index, predict in enumerate(predictions):
        predict.update({'input': original_sentences[index]})
    return predictions

In [33]:
original_sentences = [
"beilschmiedia kweo beilschmiedia kweo is a species of plant in the lauraceae family . it is endemic to tanzania .",
"empire burlesque empire burlesque is the twenty-third studio album by american singer-songwriter bob dylan released on june 10 1985 on columbia records . self-produced the album peaked at number thirty-three in the us and at number eleven in the uk.accompanied by multiple session musicians—including tom petty & the heartbreakers members mike campbell benmont tench and howie epstein—the album foregrounds a distinct 80s style aesthetic .",
"synodontis sp . nov. 'lower tana ' synodontis sp . nov. 'lower tana ' is a species of fish in the mochokidae family . it is endemic to kenya . its natural habitat is rivers .",
"songs from the earth songs from the earth is the first album by horrorpunk/deathrock supergroup son of sam . the album was a tribute to glenn danzig 's former band samhain though there are no cover songs on the album .",
"stay in your own sled stay in your own sled ( russian : не в свои сани не садись ) an idiom meaning do n't bite off more than you can chew is a play by alexander ostrovsky written in 1852 and first published in the no.5 ( march book 1 ) 1853 issue of moskvityanin . it was premiered in bolshoi theatre on january 14 1853 ."
]
predictions = test_the_model(original_sentences)
print(json.dumps(predictions, indent=4))
tested_data = pd.DataFrame(predictions)
tested_data

[
    {
        "label": [
            "__label__Plant"
        ],
        "prob": [
            1.0000064373016357
        ],
        "input": "beilschmiedia kweo beilschmiedia kweo is a species of plant in the lauraceae family . it is endemic to tanzania ."
    },
    {
        "label": [
            "__label__Album"
        ],
        "prob": [
            0.999957799911499
        ],
        "input": "empire burlesque empire burlesque is the twenty-third studio album by american singer-songwriter bob dylan released on june 10 1985 on columbia records . self-produced the album peaked at number thirty-three in the us and at number eleven in the uk.accompanied by multiple session musicians\u2014including tom petty & the heartbreakers members mike campbell benmont tench and howie epstein\u2014the album foregrounds a distinct 80s style aesthetic ."
    },
    {
        "label": [
            "__label__Animal"
        ],
        "prob": [
            0.9945312738418579
        ],
       

Unnamed: 0,label,prob,input
0,[__label__Plant],[1.0000064373016357],beilschmiedia kweo beilschmiedia kweo is a spe...
1,[__label__Album],[0.999957799911499],empire burlesque empire burlesque is the twent...
2,[__label__Animal],[0.9945312738418579],synodontis sp . nov. 'lower tana ' synodontis ...
3,[__label__Album],[1.0000098943710327],songs from the earth songs from the earth is t...
4,[__label__WrittenWork],[0.9991356730461121],stay in your own sled stay in your own sled ( ...
