# Topic Classification with BlazingText using Kaggle News Dataset
Reference: https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/blazingtext_text_classification_dbpedia/blazingtext_text_classification_dbpedia.html#Training-the-BlazingText-model-for-supervised-text-classification

## Setup

In [1]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()

role = get_execution_role()
print(
    role
)  # the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = "urop1100f"
print(bucket)
prefix = "topic_classification/blazingtext"

arn:aws:iam::698264780541:role/service-role/AmazonSageMaker-ExecutionRole-20220330T145844
urop1100f


## Load Dataset and Store in S3

In [2]:
import pandas as pd
import json

list_ = []
with open(r"News_Category_Dataset_v2.json") as files:
    for file in files:
        list_.append(json.loads(file))
df = pd.DataFrame(list_)
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


### Cluster similar categories: from 41 categories to 29 categories

In [None]:
# cluster the categories
clusters = [['ARTS', 'ARTS & CULTURE', 'CULTURE & ARTS'],
 ['POLITICS', 'RELIGION'],
 ['PARENTING', 'PARENTS'],
 ['BLACK VOICES', 'LATINO VOICES', 'QUEER VOICES'],
 ['HEALTHY LIVING', 'HOME & LIVING'],
 ['STYLE', 'STYLE & BEAUTY'],
 ['COMEDY', 'ENTERTAINMENT'],
 ['WORLD NEWS', 'THE WORLDPOST', 'WORLDPOST'],
 ['COLLEGE', 'EDUCATION']]

for ind, cluster in enumerate(clusters):
    for c in cluster[1:]:
        df['category'] = df['category'].str.replace(c, cluster[0])

In [3]:
# group headline and description together
df['text'] = df['headline'] + '. ' + df['short_description']
df = df[['text', 'category']]
df.head()

Unnamed: 0,text,category
0,There Were 2 Mass Shootings In Texas Last Week...,CRIME
1,Will Smith Joins Diplo And Nicky Jam For The 2...,ENTERTAINMENT
2,Hugh Grant Marries For The First Time At Age 5...,ENTERTAINMENT
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,ENTERTAINMENT
4,Julianna Margulies Uses Donald Trump Poop Bags...,ENTERTAINMENT


### Train Test Split

In [4]:
import numpy as np
train_test_ratio = 0.9
train_val_ratio = 0.9

# train:test = 0.9:0.1
train_val, test = np.split(df.sample(frac=1, random_state=1), [int(train_test_ratio * len(df))])
train, val = np.split(train_val.sample(frac=1, random_state=1), [int(train_val_ratio * len(train_val))])

# upload csv
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)
val.to_csv("val.csv", index=False)

train.shape, val.shape, test.shape

((162690, 2), (18077, 2), (20086, 2))

## Data Preprocessing

In [5]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk

nltk.download("punkt")

### preprocess the training data into space separated tokenized text format which can be consumed by BlazingText algorithm. 
### the class label(s) should be prefixed with __label__
### it should be present in the same line along with the original sentence.

def transform_instance(row):
    cur_row = []
    label = "__label__" + row[1]  # Prefix the index-ed label with __label__
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[0].lower()))
    return cur_row


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
def preprocess(input_file, output_file, keep=1):
    all_rows = []
    with open(input_file, "r") as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=",")
        for row in csv_reader:
            all_rows.append(row)
    shuffle(all_rows)
    all_rows = all_rows[: int(keep * len(all_rows))]
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(transform_instance, all_rows)
    pool.close()
    pool.join()

    with open(output_file, "w") as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=" ", lineterminator="\n")
        csv_writer.writerows(transformed_rows)

In [7]:
%%time

# Preparing the training dataset
preprocess("train.csv", "kaggle.train", keep=1)

# Preparing the validation dataset
preprocess("val.csv", "kaggle.validation")

CPU times: user 3.16 s, sys: 322 ms, total: 3.48 s
Wall time: 38.3 s


## Upload dataset to s3

In [8]:
%%time

train_channel = prefix + "/train"
validation_channel = prefix + "/validation"

sess.upload_data(path="kaggle.train", bucket=bucket, key_prefix=train_channel)
sess.upload_data(path="kaggle.validation", bucket=bucket, key_prefix=validation_channel)

s3_train_data = "s3://{}/{}".format(bucket, train_channel)
s3_validation_data = "s3://{}/{}".format(bucket, validation_channel)

CPU times: user 396 ms, sys: 41.9 ms, total: 438 ms
Wall time: 960 ms


In [9]:
# output location
s3_output_location = "s3://{}/{}/output".format(bucket, prefix)

## Training

In [10]:
region_name = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print("Using SageMaker BlazingText container: {} ({})".format(container, region_name))

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:1 (us-east-1)


In [24]:
# BlazingText Model

bt_model = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    volume_size=30,
    max_run=360000,
    input_mode="File",
    output_path=s3_output_location,
    hyperparameters={
        "mode": "supervised",
        "min_count": 2,
        "learning_rate": 0.05,
        "vector_dim": 64,
        "early_stopping": True,
        "patience": 4,
        "min_epochs": 5
        "word_ngrams": 2
    }
)

### Link data channels to algorithm

In [25]:
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="S3Prefix",
)
data_channels = {"train": train_data, "validation": validation_data}

### Fit model

In [26]:
bt_model.fit(inputs=data_channels, logs=True)

2022-03-30 07:28:06 Starting - Starting the training job...
2022-03-30 07:28:32 Starting - Preparing the instances for trainingProfilerReport-1648625286: InProgress
......
2022-03-30 07:29:32 Downloading - Downloading input data...
2022-03-30 07:29:52 Training - Downloading the training image..[34mArguments: train[0m
[34m[03/30/2022 07:30:14 INFO 139674839508800] nvidia-smi took: 0.02520132064819336 secs to identify 0 gpus[0m
[34m[03/30/2022 07:30:14 INFO 139674839508800] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[03/30/2022 07:30:14 INFO 139674839508800] Processing /opt/ml/input/data/train/kaggle.train . File size: 31.106579780578613 MB[0m
[34m[03/30/2022 07:30:14 INFO 139674839508800] Processing /opt/ml/input/data/validation/kaggle.validation . File size: 3.4609127044677734 MB[0m
[34mRead 5M words[0m
[34mNumber of words:  57515[0m
[34mLoading validation data from /opt/ml/input/da

## Deploy Model

In [27]:
from sagemaker.serializers import JSONSerializer

text_classifier = bt_model.deploy(
    initial_instance_count=1, instance_type="ml.m4.xlarge", serializer=JSONSerializer()
)

-----------!

## Test Model

In [28]:
import time
sentences = list(test.text)[:10]
print(f"test data: {len(sentences)}")

start_time = time.time()
# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [" ".join(nltk.word_tokenize(sent)) for sent in sentences]

payload = {"instances": tokenized_sentences}

response = text_classifier.predict(payload)

predictions = json.loads(response)

[
  {
    "label": [
      "__label__WOMEN"
    ],
    "prob": [
      0.8411675095558167
    ]
  },
  {
    "label": [
      "__label__WOMEN"
    ],
    "prob": [
      0.28858622908592224
    ]
  },
  {
    "label": [
      "__label__WORLDPOST"
    ],
    "prob": [
      0.2353600412607193
    ]
  },
  {
    "label": [
      "__label__IMPACT"
    ],
    "prob": [
      0.2537541687488556
    ]
  },
  {
    "label": [
      "__label__POLITICS"
    ],
    "prob": [
      0.9926941990852356
    ]
  },
  {
    "label": [
      "__label__SPORTS"
    ],
    "prob": [
      0.5051364898681641
    ]
  },
  {
    "label": [
      "__label__TRAVEL"
    ],
    "prob": [
      0.402586966753006
    ]
  },
  {
    "label": [
      "__label__WELLNESS"
    ],
    "prob": [
      0.48565739393234253
    ]
  },
  {
    "label": [
      "__label__WELLNESS"
    ],
    "prob": [
      0.5202071070671082
    ]
  },
  {
    "label": [
      "__label__WEDDINGS"
    ],
    "prob": [
      0.3270853161811828

In [29]:
test.category.values[:10]

array(['WOMEN', 'ENTERTAINMENT', 'CRIME', 'GREEN', 'POLITICS', 'CRIME',
       'FOOD & DRINK', 'POLITICS', 'WELLNESS', 'WEDDINGS'], dtype=object)

In [18]:
sess.delete_endpoint(text_classifier.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
