In [None]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]
test_dataset = dataset["test"]

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# convert datasets to pandas dataframe
train_df = train_dataset.to_pandas()
validation_df = validation_dataset.to_pandas()
test_df = test_dataset.to_pandas()

In [None]:
# view train dataset
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [None]:
# view example of NLTK tokenizer without cleaning the text
print("Original sentence: ", train_df['text'][1])
word_tokens = word_tokenize(train_df['text'][1])
print("\nAfter tokenising: \n")
print("Word Tokens:", word_tokens)

Original sentence:  the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .

After tokenising: 

Word Tokens: ['the', 'gorgeously', 'elaborate', 'continuation', 'of', '``', 'the', 'lord', 'of', 'the', 'rings', '``', 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'can', 'not', 'adequately', 'describe', 'co-writer/director', 'peter', 'jackson', "'s", 'expanded', 'vision', 'of', 'j', '.', 'r', '.', 'r', '.', 'tolkien', "'s", 'middle-earth', '.']


As we see in one of the text examples above, the NLTK `word_tokenize` function does not give a very satisfactory tokenization result where more than 1 word are grouped as a single token and there are quotation marks surrounding a token which may be treated differently from the same word without quotation marks. Thus we shall clean the text first before using NLTK's `word_tokenize` function to improve the final quality of the tokens.

We have come up with a `clean_text` function that includes the handling of the word-internal punctuation that may appear in names or special nouns and separating the different words that are placed right next to a punctuation mark.

<font color="red">Note that we will not remove the stopwords since we would like to take into account the stopwords into the dictionary to answer question 1.</font>

In [None]:
def clean_text(text, stopwords=None):
    # remove non-ascii characters
    text = text.encode("ascii", "ignore").decode('utf-8', 'ignore')

    # remove URLs, mentions, hashtags
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)

    # case folding
    text = text.lower()

    # split up hyphenated words into separate tokens
    text = re.sub(r'(\w+)-(\w+)', r'\1 \2', text)

    # remove slashes or dots between words - separate the words
    text = re.sub(r'(\w)[/\.](\w)', r'\1 \2', text)

    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # normalize whitespaces (multiple whitespaces --> single space)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def tokenize_text(text):
    # Word tokenizer
    word_tokens = word_tokenize(text)
    return word_tokens

In [None]:
# viewing tokenisation results on the same text after cleaning the text
print("Original sentence: ", train_df['text'][1])
cleaned_text = clean_text(train_df['text'][1])
word_tokens = tokenize_text(cleaned_text)

print("\nAfter normalising and tokenising: \n")
print("Word Tokens:", word_tokens)

Original sentence:  the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .

After normalising and tokenising: 

Word Tokens: ['the', 'gorgeously', 'elaborate', 'continuation', 'of', 'the', 'lord', 'of', 'the', 'rings', 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'can', 'not', 'adequately', 'describe', 'co', 'writer', 'director', 'peter', 'jacksons', 'expanded', 'vision', 'of', 'j', 'r', 'r', 'tolkiens', 'middle', 'earth']


We can see that the tokens look much better according to our understanding of word parsing. Now let's apply the functions on all the training examples.

In [None]:
# apply the clean_text function to the whole df
train_df['cleaned_text'] = train_df['text'].apply(clean_text)
train_df['tokens'] = train_df['cleaned_text'].apply(tokenize_text)
train_df.head()

Unnamed: 0,text,label,cleaned_text,tokens
0,the rock is destined to be the 21st century's ...,1,the rock is destined to be the st centurys new...,"[the, rock, is, destined, to, be, the, st, cen..."
1,"the gorgeously elaborate continuation of "" the...",1,the gorgeously elaborate continuation of the l...,"[the, gorgeously, elaborate, continuation, of,..."
2,effective but too-tepid biopic,1,effective but too tepid biopic,"[effective, but, too, tepid, biopic]"
3,if you sometimes like to go to the movies to h...,1,if you sometimes like to go to the movies to h...,"[if, you, sometimes, like, to, go, to, the, mo..."
4,"emerges as something rare , an issue movie tha...",1,emerges as something rare an issue movie thats...,"[emerges, as, something, rare, an, issue, movi..."


## Question 1a
To get the size of vocabulary, we can use `Counter` as a better alternative to `set()` function so as to keep track of each token's frequency from the beginning. We can get the vocabulary size from the number of unique keys in the `token_counts` dictionary.

In [None]:
# get the vocabulary size of training data
from collections import Counter
all_tokens = [token for sublist in train_df['tokens'] for token in sublist]

# count token frequencies
token_counts = Counter(all_tokens)
print("Vocabulary size:", len(token_counts))

Vocabulary size: 17029


## Question 1b
**We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but
not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?**

We will import Word2vec dictionary and compare our training data tokens against the words in the dictionary.

In [None]:
import gensim.downloader as api # to download the model instead of storing in file path
from gensim.models import Word2Vec
# load word2vec model
pretrained_vectors = api.load('word2vec-google-news-300')



In [None]:
# identify OOV words
oov_words = [word for word in token_counts if word not in pretrained_vectors.key_to_index]
num_oov_words = len(oov_words)

print(f"Number of OOV words in the training data: {num_oov_words}")
oov_percentage = (num_oov_words / sum(token_counts.values())) * 100  # Percentage of OOV words
print(f"Percentage of OOV words in the training data: {oov_percentage:.2f}%")

Number of OOV words in the training data: 2062
Percentage of OOV words in the training data: 1.29%


In [None]:
# example of oov words
print(oov_words[:50])

['to', 'centurys', 'and', 'a', 'claud', 'damme', 'segal', 'of', 'jacksons', 'tolkiens', 'doesnt', 'wisegirls', 'wendigo', 'stevensons', 'cantet', 'vincents', 'fulford', 'wierzbicki', 'wahlberg', 'compleja', 'intelectualmente', 'retadora', 'ladrn', 'orqudeas', 'filmes', 'precisamente', 'originalidad', 'karmen', 'johnsons', 'nettelbeck', 'derridas', 'derrida', 'ormiss', 'labour', 'werner', 'herzog', 'morvern', 'callar', 'humour', 'ofage', 'treebeard', 'gollums', 'haneke', 'kapur', 'liman', 'damons', 'toreel', 'meeropol', 'earlys', 'mcfarlanes']


## Question 1c
**The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove).
Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you
think is the best strategy to mitigate such limitation? Implement your solution in your source
code. Show the corresponding code snippet.**

Based on the first 50 OOV words we have sampled, they are mainly made up of names or words in different languages. Since we cannot collect another movie domain-specific dataset to additionally encode for these unknown words, and because these names are not relevant emotion-related words or phrases, these OOV words that make up less than 2% of the training data may not affect the model's performance in determining sentiment from actual relevant contextual clues. Thus, we can perhaps **substitute these OOV words with `<UNK>`** instead!

In [None]:
# revised tokenize function with consideration of OOV words
def new_tokenize_text(text):
    word_tokens = word_tokenize(text)
    # Replace OOV words with <UNK>
    word_tokens = [token if token in pretrained_vectors.key_to_index else '<UNK>' for token in word_tokens]
    return word_tokens

train_df_new = train_df.copy()
train_df_new['tokens'] = train_df_new['cleaned_text'].apply(new_tokenize_text)
train_df_new.head()

Unnamed: 0,text,label,cleaned_text,tokens
0,the rock is destined to be the 21st century's ...,1,the rock is destined to be the st centurys new...,"[the, rock, is, destined, <UNK>, be, the, st, ..."
1,"the gorgeously elaborate continuation of "" the...",1,the gorgeously elaborate continuation of the l...,"[the, gorgeously, elaborate, continuation, <UN..."
2,effective but too-tepid biopic,1,effective but too tepid biopic,"[effective, but, too, tepid, biopic]"
3,if you sometimes like to go to the movies to h...,1,if you sometimes like to go to the movies to h...,"[if, you, sometimes, like, <UNK>, go, <UNK>, t..."
4,"emerges as something rare , an issue movie tha...",1,emerges as something rare an issue movie thats...,"[emerges, as, something, rare, an, issue, movi..."


In [None]:
all_tokens_new = [token for sublist in train_df_new['tokens'] for token in sublist]

token_counts_new = Counter(all_tokens_new)

# Check count of OOV words to ensure it is equal to 0
oov_words = [word for word in token_counts_new if word not in pretrained_vectors.key_to_index]
num_oov_words = len(oov_words)

print(f"Number of OOV words in the training data: {num_oov_words}")
oov_percentage = (num_oov_words / sum(token_counts.values())) * 100  # Percentage of OOV words
print(f"Percentage of OOV words in the training data: {oov_percentage:.2f}%")

Number of OOV words in the training data: 1
Percentage of OOV words in the training data: 0.00%


In [None]:
if oov_words:
    print("OOV words:", oov_words)
else:
    print("No OOV words found.")

OOV words: ['<UNK>']


Ok, we have replaced our OOV words with `<UNK>`. Now let's train the word2vec model on the tokenized training data.

In [None]:
training_sentences = train_df_new['tokens'].tolist() # feeding in sequences of tokens
assert train_df.shape[0] == len(training_sentences) # checking the number of sequences is the same as all rows of the train dataset

In [None]:
# train model on this dataset
model_new = Word2Vec(sentences=training_sentences, window=5, min_count=1, workers=4)

In [None]:
import numpy as np
# mapping from word to index (including UNK)
word_index = {word: i for i, word in enumerate(model_new.wv.index_to_key)}
word_index['UNK'] = len(word_index)  # add UNK at the end

# initialise the embedding matrix
embedding_dim = model_new.vector_size
embedding_matrix = np.zeros((len(word_index), embedding_dim))

# fill the embedding matrix
for word, i in word_index.items():
    if word in model_new.wv:
        embedding_matrix[i] = model_new.wv[word]
    else:
        embedding_matrix[i] = np.zeros(embedding_dim)  # Use zero vector for <UNK> words


print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (14969, 100)


In [None]:
# save the input word embedding matrix to be used in RNN model
np.save('input_embedding_matrix.npy', embedding_matrix)

In [None]:
# save the fine tuned word2vec model
model_new.save('fine_tuned_word2vec.model')