First, download the dataset from http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip, unzip it, and upload the extracted files to this Google Drive folder:
./drive/MyDrive/text_classification_test.

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset folder used by the following cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Next we do some raw data processing. You don't need to run the following cell again if you already have the processed dataset ready.

In [None]:
import pandas as pd

workspace_path = "./drive/MyDrive/text_classification_test"
original_dataset_path = workspace_path + "/training.1600000.processed.noemoticon.csv"

# The raw file is not valid UTF-8, so the default decoder fails with
# "'utf-8' codec can't decode bytes ...". Decode it explicitly as Latin-1
# (which can decode every byte value) instead of relying on the parser engine
# to hide the problem. The file has no header row.
tweetsDF = pd.read_csv(original_dataset_path, encoding="latin-1", header=None)

# check the first few records
print(tweetsDF.head(5))

# count how many records there are per value of the first column
first_column = tweetsDF[0]
print(first_column.value_counts())

# use the first column's value as the label: keep it as a categorical column
# and add its integer category code as a second, compact label column
tweetsDF["sentiment_category"] = first_column.astype('category')
tweetsDF["sentiment"] = tweetsDF["sentiment_category"].cat.codes

# save the processed data without a header row or the index column, so the
# file keeps the original six columns followed by the two new label columns
tweetsDF.to_csv(workspace_path + "/train-processed.csv", header=False, index=False)
# smaller dataset: a random sample of 10000 rows for quick experiments
tweetsDF.sample(10000).to_csv(
    workspace_path + "/train-processed-sample.csv", header=False, index=False
)


   0  ...                                                  5
0  0  ...  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1  0  ...  is upset that he can't update his Facebook by ...
2  0  ...  @Kenichan I dived many times for the ball. Man...
3  0  ...    my whole body feels itchy and like its on fire 
4  0  ...  @nationwideclass no, it's not behaving at all....

[5 rows x 6 columns]
4    800000
0    800000
Name: 0, dtype: int64


Then we are going to use torchtext to process the csv file and get the dataset for training.

In [None]:
# note newer torchtext's API has changed and we are only using the legacy API for learning purpose.
import torchtext.legacy
from torchtext.legacy import data
import torch

# Field describing the label column.
LABEL = data.LabelField()
# Field describing the tweet text: tokenized with spaCy and lower-cased.
# See https://spacy.io/usage/linguistic-features#how-tokenizer-works for how
# the spaCy tokenizer works; it can improve the quality of the vocabulary.
TWEET = data.Field(tokenize="spacy", lower=True)

# One (name, field) pair per CSV column, in file order. Columns paired with
# None are not loaded; only the tweet text and the label are kept.
fields = [
    ("score", None),
    ("id", None),
    ("date", None),
    ("query", None),
    ("name", None),
    ("tweet", TWEET),
    ("category", None),
    ("label", LABEL),
]

# The sample file was written without a header row, so nothing is skipped.
twitterDataset = data.TabularDataset(
    path=workspace_path + "/train-processed-sample.csv",
    format="CSV",
    fields=fields,
    skip_header=False,
)

# 80% / 10% / 10% split of the dataset
train_data, val_data, test_data = twitterDataset.split(split_ratio=[0.8, 0.1, 0.1])

print((len(train_data), len(test_data), len(val_data)))
print(vars(train_data.examples[7]))

# Build both vocabularies from the training split only.
vocab_size = 10000
LABEL.build_vocab(train_data, max_size=vocab_size)
TWEET.build_vocab(train_data, max_size=vocab_size)

# torchtext adds <unk> and <pad> on top of the requested vocabulary size,
# so the reported length can exceed vocab_size by two.
print("length of vocabulary")
print(len(TWEET.vocab))

# most frequent tokens in the training split
print("top 10 common words")
print(TWEET.vocab.freqs.most_common(10))

# Batches are placed on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32

# sort_key / sort_within_batch are given explicitly: without them the
# training loop raises an error about not being able to sort the tweets.
train_data_iter, val_data_iter, test_data_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=batch_size,
    device=device,
    sort_key=lambda x: len(x.tweet),
    sort_within_batch=False,
)


(8000, 1000, 1000)
{'tweet': ['i', 'do', 'nt', 'know', 'what', 'to', 'do', 'on', 'my', 'lame', 'fridaay', ' ', 'maybe', 'go', 'out', 'with', 'my', 'besties', '!', '!'], 'label': '0'}
length of vocabulary
10002
top 10 common words
[('i', 5017), ('!', 4455), ('.', 3972), (' ', 2966), ('to', 2802), ('the', 2575), (',', 2371), ('a', 1811), ('my', 1529), ('you', 1498)]


Now we are going to define the LSTM model (search for LSTM for its definition and advantages in natural language processing).

In [None]:
import torch.nn as nn
class simpleLSTM(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over token ids.

    Args:
        hidden_size: size of the LSTM hidden state.
        embedding_dim: size of each token embedding vector.
        vocab_size: number of rows in the embedding table; every token id fed
            to ``forward`` must be smaller than this.
        num_classes: number of output logits per sequence. Defaults to 2,
            which matches the previous hard-coded binary classifier.
    """

    def __init__(self, hidden_size, embedding_dim, vocab_size, num_classes=2):
        super().__init__()
        # maps each token id to a dense vector of size embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first is left at its default, so the LSTM consumes
        # (seq_len, batch, embedding_dim) tensors
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, num_classes)

    def forward(self, seq):
        """Return one row of class logits per sequence.

        Args:
            seq: integer tensor of token ids laid out as (seq_len, batch).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized scores.
        """
        # Only the final hidden state is needed; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.encoder(self.embedding(seq))
        # hidden is (num_layers, batch, hidden_size) with num_layers == 1;
        # drop the leading layer dimension before classifying.
        return self.predictor(hidden.squeeze(0))

# Size the embedding table from the vocabulary that was actually built rather
# than repeating its length as a literal; the two then cannot drift apart when
# the vocabulary size or the dataset changes.
my_model = simpleLSTM(hidden_size = 100, embedding_dim = 300, vocab_size = len(TWEET.vocab))
my_model.to(device)


simpleLSTM(
  (embedding): Embedding(10002, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=2, bias=True)
)

Begin the training process.

In [None]:
import torch.optim as optim
# Adam optimizer over all parameters of my_model, with a learning rate of 0.02.
optimizer = optim.Adam(my_model.parameters(), lr = 0.02)

def loss_update(model, batch, loss_fn):
    """Run a forward pass on one batch and return its loss.

    Args:
        model: callable applied to ``batch.tweet``.
        batch: object exposing ``tweet`` (model input) and ``label`` (targets).
        loss_fn: callable invoked as ``loss_fn(model_output, batch.label)``.

    Returns:
        The value returned by ``loss_fn`` for this batch.
    """
    return loss_fn(model(batch.tweet), batch.label)


def _run_epoch(model, data_iter, loss_fn, optimizer=None):
    """Make one pass over data_iter and return the mean loss per example.

    When ``optimizer`` is given, every batch also triggers a backward pass and
    a parameter update; when it is None the pass is evaluation-only.
    """
    total_loss = 0.0
    total_examples = 0
    for batch in data_iter:
        if optimizer is not None:
            optimizer.zero_grad()
        loss = loss_update(model, batch, loss_fn)
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        # Weight each batch by its number of examples. The labels carry one
        # entry per example, whereas dimension 0 of the tweet tensor is the
        # sequence length and must not be used as the batch size.
        # NOTE: this assumes loss_fn returns the batch mean (the default
        # reduction of CrossEntropyLoss).
        examples_in_batch = batch.label.size(0)
        total_loss += loss.item() * examples_in_batch
        total_examples += examples_in_batch
    if total_examples == 0:
        # nothing was iterated; report "no value" instead of dividing by zero
        return float("nan")
    return total_loss / total_examples


def train(model, optimizer, loss_fn, train_data_iter, val_data_iter, epochs):
    """Train model for a number of epochs, printing the losses after each one.

    Args:
        model: module to train; switched between train and eval mode here.
        optimizer: optimizer that updates the model's parameters.
        loss_fn: loss callable, see ``loss_update``.
        train_data_iter: iterable of training batches.
        val_data_iter: iterable of validation batches.
        epochs: number of passes over the training data.

    Returns:
        None. One line per epoch is printed with the mean per-example
        training and validation loss.
    """
    for epoch in range(epochs):
        # training mode, with a parameter update after every batch
        model.train()
        training_loss = _run_epoch(model, train_data_iter, loss_fn, optimizer)

        # evaluation mode; gradients are not needed for validation
        model.eval()
        with torch.no_grad():
            valid_loss = _run_epoch(model, val_data_iter, loss_fn)

        print(f'Epoch: {epoch}, Training Loss: {training_loss:.2f}, Validation Loss: {valid_loss:.2f}')

# Train my_model for 10 epochs with cross-entropy loss; one line with the
# training and validation loss is printed per epoch.
train(my_model, optimizer, torch.nn.CrossEntropyLoss(), train_data_iter, val_data_iter, 10)


Epoch: 0, Training Loss: 24.69, Validation Loss: 12.90
Epoch: 1, Training Loss: 24.21, Validation Loss: 13.26
Epoch: 2, Training Loss: 22.73, Validation Loss: 13.41
Epoch: 3, Training Loss: 21.61, Validation Loss: 13.75
Epoch: 4, Training Loss: 20.98, Validation Loss: 13.67
Epoch: 5, Training Loss: 19.83, Validation Loss: 14.78
Epoch: 6, Training Loss: 19.46, Validation Loss: 14.15
Epoch: 7, Training Loss: 19.05, Validation Loss: 14.21
Epoch: 8, Training Loss: 18.72, Validation Loss: 15.27
Epoch: 9, Training Loss: 18.36, Validation Loss: 16.25


The prediction in torchtext is not as trivial as in torchvision. Here is the process.

In [None]:
# Single-tweet tensors are built on the CPU below, so the model must be there too.
my_model.to("cpu")


def classify_tweet(tweet, model):
    """Print the model's sentiment prediction for one raw tweet string.

    Args:
        tweet: raw tweet text.
        model: trained classifier returning one row of class logits.

    Returns:
        None. The tweet, the raw logits and the predicted sentiment name are
        printed.
    """
    # Keys are the label strings stored in the processed CSV: "0" and "1" are
    # the category codes of the original scores 0 and 4 respectively.
    categories = {"0": "Negative", "1": "Positive"}
    # Tokenize with the TWEET field, then numericalize as a batch of one.
    tokens = TWEET.preprocess(tweet)
    # Inference only, so gradient tracking is not needed.
    with torch.no_grad():
        prediction = model(TWEET.process([tokens]))
    print("the tweet is ")
    print(tweet)
    print(prediction)
    print("the prediction is ")
    # The position of a label in the LABEL vocabulary depends on how frequent
    # each label was in the training split, so translate the predicted index
    # back to its label string instead of assuming index 0 means negative.
    predicted_label = LABEL.vocab.itos[prediction.argmax().item()]
    print(categories[predicted_label])
    return



# Two example tweets to run through the trained model.
tweet1 = "Forgot to bring socks to the gym. I bet I get blisters!"
tweet2 = "Wishing I could sneak in to watch the Star Trek premiere"

# Each call prints the tweet, the model output and the predicted sentiment.
classify_tweet(tweet1, my_model)
classify_tweet(tweet2, my_model)

the tweet is 
Forgot to bring socks to the gym. I bet I get blisters!
tensor([[ 2.5766, -2.7281]], grad_fn=<AddmmBackward>)
the prediction is 
Negative
the tweet is 
Wishing I could sneak in to watch the Star Trek premiere
tensor([[-1.2038,  1.4498]], grad_fn=<AddmmBackward>)
the prediction is 
Postive


Next we demonstrate a few data augmentation techniques. We do not apply them to the training in this notebook, but you can always try them out when you have time.

In [None]:
import random
from random import randrange

# example sentence used by the augmentation demos
my_sentence = "The cat sat on the mat"


def get_words_lists(x):
    """Return the whitespace-separated words of x as a list."""
    words = x.split()
    return words

# get a random word from choices available
def get_random_word():
    """Return one word picked at random from a small fixed list."""
    random_word_list = ["apple", "banana", "candy", "drink", "egg", "fish", "grape"]
    return random_word_list[randrange(0, len(random_word_list))]


def random_insertion(sentence, n):
    """Insert n random words at random positions of sentence.

    Args:
        sentence: text to augment; it is split on whitespace.
        n: number of random words to insert.

    Returns:
        The augmented sentence, with words joined by single spaces.
    """
    words = sentence.split()
    for _ in range(n):
        # Any slot of the current word list is a valid target, including the
        # one after the last word. The bound must follow the sentence length,
        # not the number of insertions, otherwise only the first n slots are
        # ever used.
        words.insert(randrange(0, len(words) + 1), get_random_word())
    # combine the words back into one sentence
    return ' '.join(words)


# Demo: insert three random words into the example sentence and show the result.
my_new_sentence = random_insertion(my_sentence, 3)
print(my_new_sentence)

apple The egg banana cat sat on the mat


In [None]:
def random_deletion(sentence):
    """Remove one randomly chosen word from sentence.

    Args:
        sentence: text to augment; it is split on whitespace.

    Returns:
        The sentence without the removed word, joined by single spaces. A
        sentence that contains no words yields an empty string.
    """
    words = sentence.split()
    if not words:
        # nothing to delete; randrange would raise on an empty range
        return ''
    del words[randrange(len(words))]
    # combine the words back into one sentence
    return ' '.join(words)


# Demo: drop one random word from the example sentence and show the result.
my_new_sentence = random_deletion(my_sentence)
print(my_new_sentence)

The cat sat on the


In [None]:
def random_swap(sentence):
    """Swap two randomly chosen, distinct word positions of sentence.

    Args:
        sentence: text to augment; it is split on whitespace.

    Returns:
        The sentence with the two words exchanged, joined by single spaces.
        A sentence with fewer than two words has nothing to swap and is
        returned with its words unchanged.
    """
    words = sentence.split()
    if len(words) < 2:
        # Two distinct positions do not exist: retrying forever (one word) or
        # drawing from an empty range (no words) must be avoided.
        return ' '.join(words)

    index1 = randrange(len(words))
    # Draw the second position from the remaining len - 1 slots and shift it
    # past index1, which gives a distinct position without a retry loop.
    index2 = randrange(len(words) - 1)
    if index2 >= index1:
        index2 += 1
    words[index1], words[index2] = words[index2], words[index1]

    # combine the words back into one sentence
    return ' '.join(words)


# Demo: swap two random words of the example sentence and show the result.
my_new_sentence = random_swap(my_sentence)
print(my_new_sentence)

The sat cat on the mat


In the following we are going to try to use Google Translate Service for Data Augmentation.

In [None]:
# Install googletrans pinned to version 3.1.0a0; newer versions may have API
# issues. The leading "!" runs the command in the notebook's shell.
!pip install googletrans==3.1.0a0



In [None]:
# NOTE calling this too often would cause Google Translate to stop serving you for a while...
import googletrans
import random
from random import randrange

def tranlate_call(sentence_list, dest_language):
    """Translate every sentence in sentence_list and return the translated texts.

    Uses the module-level ``translator`` object. The translated texts are
    also printed.

    Args:
        sentence_list: list of sentences to translate.
        dest_language: language code passed to the translator as ``dest``.

    Returns:
        List with the ``text`` of each translation result, in result order.
    """
    results = translator.translate(sentence_list, dest=dest_language)
    translated_texts = []
    for item in results:
        translated_texts.append(item.text)
    print(translated_texts)
    return translated_texts

# Translator client shared by every tranlate_call invocation below.
translator = googletrans.Translator()
# tranlate_call iterates over the translation results, so the sentence is
# wrapped in a list.
my_sentence = ['The cat sat on the mat']

# first leg of the round trip: translate to the 'zh-CN' language code
my_sentence_cn = tranlate_call(my_sentence, 'zh-CN')

# second leg: translate the result back into English
my_sentence_en = tranlate_call(my_sentence_cn, 'en')

# translate into a randomly chosen language; googletrans.LANGUAGES is keyed
# by language code and is also used to print the chosen language
available_langs = list(googletrans.LANGUAGES.keys())
tr_lang = random.choice(available_langs)
print(f"Translating to {googletrans.LANGUAGES[tr_lang]}")
my_sentence_random = tranlate_call(my_sentence, tr_lang)
