# Part 0

In [None]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]
test_dataset = dataset["test"]

In [3]:
# convert datasets to pandas dataframe
train_df = train_dataset.to_pandas()
validation_df = validation_dataset.to_pandas()
test_df = test_dataset.to_pandas()

In [4]:
import gensim.downloader
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np

In [5]:
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


The `simple_preprocess` function from `gensim.utils` package makes cleaning and tokenizing text easier with standard text pre-processing procedures like converting text to lowercase, eliminating punctuation, and splitting text into individual words.

In [6]:
# view on an example from train df
print(simple_preprocess(train_df.text[1]))

['the', 'gorgeously', 'elaborate', 'continuation', 'of', 'the', 'lord', 'of', 'the', 'rings', 'trilogy', 'is', 'so', 'huge', 'that', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co', 'writer', 'director', 'peter', 'jackson', 'expanded', 'vision', 'of', 'tolkien', 'middle', 'earth']


In [7]:
# apply gensim utils simple preprocess to tokenize the text in all datasets
train_df["tokens"] = train_df["text"].apply(simple_preprocess)
validation_df["tokens"] = validation_df["text"].apply(simple_preprocess)
test_df["tokens"] = test_df["text"].apply(simple_preprocess)

In [8]:
train_df.head()

Unnamed: 0,text,label,tokens
0,the rock is destined to be the 21st century's ...,1,"[the, rock, is, destined, to, be, the, st, cen..."
1,"the gorgeously elaborate continuation of "" the...",1,"[the, gorgeously, elaborate, continuation, of,..."
2,effective but too-tepid biopic,1,"[effective, but, too, tepid, biopic]"
3,if you sometimes like to go to the movies to h...,1,"[if, you, sometimes, like, to, go, to, the, mo..."
4,"emerges as something rare , an issue movie tha...",1,"[emerges, as, something, rare, an, issue, movi..."


In [9]:
# save the train, test and validation dataframes after tokenizing
train_df.to_csv("movie_train.csv", index=False)
validation_df.to_csv("movie_validation.csv", index=False)
test_df.to_csv("movie_test.csv", index=False)

In [10]:
# load pretrained word2vec dictionary
word2vec_model = gensim.downloader.load('word2vec-google-news-300')



# Part 1

## Question 1a
To get the size of vocabulary, we can use `Counter` as a better alternative to `set()` function so as to keep track of each token's frequency from the beginning. We can get the vocabulary size from the number of unique keys in the `token_counts` dictionary.

In [11]:
# get the vocabulary size of training data
from collections import Counter
all_tokens = [token for sublist in train_df['tokens'] for token in sublist]

# count token frequencies
token_counts = Counter(all_tokens)
print("Vocabulary size:", len(token_counts))

# 5 most common tokens and their frequencies
top_5_tokens = token_counts.most_common(5)
print("Top 5 most popular tokens and their frequencies:")
for token, frequency in top_5_tokens:
    print(f"{token}: {frequency}")

Vocabulary size: 16288
Top 5 most popular tokens and their frequencies:
the: 8115
and: 4971
of: 4886
to: 3447
it: 2743


## Question 1b
**We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but
not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?**


In [12]:
# get vocabulary of word2vec dictionary
word2vec_vocab = set(word2vec_model.key_to_index)
print("Vocabulary size of word2vec:", len(word2vec_vocab))

Vocabulary size of word2vec: 3000000


In [13]:
oov_tokens = []  # store OOV words

for token in all_tokens:
    if token not in word2vec_vocab:
        oov_tokens.append(token)

oov_count = len(oov_tokens)
print("Number of OOV words:", oov_count)
oov_percentage = (oov_count/len(token_counts)) * 100
print(f"Percentage of OOV words out of the whole dataset: {oov_percentage:.2f}%")

Number of OOV words: 15984
Percentage of OOV words out of the whole dataset: 98.13%


In [14]:
# Count the occurrences of each OOV token
oov_token_counts = Counter(oov_tokens)

# Get the top 20 most common OOV words
top_20_oov_tokens = oov_token_counts.most_common(20)

print("The top 20 most common OOV words:")
for word, count in top_20_oov_tokens:
    print(f"{word}: {count}")

The top 20 most common OOV words:
and: 4971
of: 4886
to: 3447
spielberg: 21
niro: 20
soderbergh: 17
solondz: 15
sandler: 14
seagal: 14
scorsese: 13
polanski: 12
frida: 12
sade: 12
herzog: 11
humour: 11
kissinger: 11
benigni: 11
broomfield: 10
carvey: 10
pinocchio: 10


In [15]:
# print the number of distinct OOV tokens
print("Number of distinct OOV tokens:", len(set(oov_tokens)))

Number of distinct OOV tokens: 1473


Most of these OOV tokens are named entities, let us see how many distinct named entities there are.

In [16]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [17]:
identified_entities = []

# identify named entities using spaCy NER
def identify_named_entities(oov_tokens):
    for token in oov_tokens:
        doc = nlp(token)

        # detect any entities
        for ent in doc.ents:
            if ent.text == token:
                identified_entities.append(ent.text)

    return set(identified_entities)

identified_entities = identify_named_entities(oov_tokens)
print("Number of identified entity tokens:", len(identified_entities))
print(f"Percentage of distinct OOV tokens identified as entities: {(len(identified_entities)/len(set(oov_tokens)))*100}%")
print("\nIdentified named entities in OOV tokens:", list(identified_entities))

Number of identified entity tokens: 233
Percentage of distinct OOV tokens identified as entities: 15.81805838424983%

Identified named entities in OOV tokens: ['neeson', 'arwen', 'elie', 'nettelbeck', 'gilliam', 'lapaglia', 'senegalese', 'incoloro', 'melville', 'abandono', 'hanukkah', 'leavitt', 'grandiosa', 'kurosawa', 'ivans', 'fabuleux', 'história', 'deniro', 'ahola', 'djeinaba', 'guión', 'dirigida', 'possui', 'alcatraz', 'ballhaus', 'avventura', 'kieslowski', 'direção', 'obligada', 'dench', 'famuyiwa', 'mesmos', 'lohman', 'eroti', 'nickleby', 'næs', 'colgate', 'mctiernan', 'girardot', 'carnahan', 'glizty', 'enternecedora', 'besco', 'juwanna', 'zoolander', 'contrária', 'veljohnson', 'eudora', 'magimel', 'armenians', 'ronn', 'fica', 'fracasso', 'caulfield', 'esforço', 'franz', 'besson', 'brecht', 'pasadena', 'superada', 'raimi', 'cronenberg', 'veronique', 'giovanni', 'fílmica', 'aidan', 'kiarostami', 'hitchcockian', 'fulford', 'carrey', 'tunisian', 'ararat', 'frida', 'galinsky', 'sco

## Question 1c
**The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove).
Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you
think is the best strategy to mitigate such limitation? Implement your solution in your source
code. Show the corresponding code snippet.**

Usually, the common approach to handling out-of-vocabulary (OOV) words is to replace them with `<UNK>`. However, the problem is with our high percentage of OOV words (i.e. 98.13%), the model is very likely to encounter the majority of the words as `<UNK>`. This means that a large proportion of text is seen as a single token, failing to capture the semantic differences between these OOV tokens.

While most research suggests character-level embeddings to handle `<UNK>`, they are not considered here, as they may introduce unnecessary complexity without significantly improving performance for this generic sentiment analysis, where word-level context is more crucial than individual character representations for capturing sentiment.

Furthermore, in the recent NER analysis of the OOV tokens, only 15.8% of the OOV tokens were identified as named entities, and the majority (84.2%) are non-entity words. To ensure that the model can learn meaningful embeddings for both named entities and other relevant non-entity words critical in this generic sentiment analysis task, we shall simply initialise all OOV tokens randomly and allow the model to fine-tune these embeddings. This is a better alternative to initialising all the OOV tokens as zero vectors for OOV tokens which which can result in sparse embeddings, information loss and hinder model performance.


In [18]:
import numpy as np

For part 2, we will initialise an embedding matrix that doesn't handle OOV tokens.

In [19]:
def prepare_non_oov_embeddings(df, word2vec_model, embedding_dim=300):
    # intersect unique tokens from df with word2vec vocabulary
    df_vocab = set(word for token_list in df['tokens'] for word in token_list)
    common_vocab = df_vocab.intersection(word2vec_model.key_to_index)

    # create word-to-index mapping
    word_to_idx = {word: idx for idx, word in enumerate(common_vocab)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}

    # initialize embedding matrix for common vocabulary only
    vocab_size = len(common_vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # fill embedding matrix with Word2Vec vectors for common words
    for word, idx in word_to_idx.items():
        embedding_matrix[idx] = word2vec_model[word]

    print(f"Non OOV vocabulary size: {vocab_size}")
    print(f"Embedding matrix shape: {embedding_matrix.shape}")

    return embedding_matrix, word_to_idx, idx_to_word

embedding_matrix_non_oov, word_to_idx, idx_to_word = prepare_non_oov_embeddings(df=train_df, word2vec_model=word2vec_model)

Non OOV vocabulary size: 14815
Embedding matrix shape: (14815, 300)


Now, let's initialise an embedding matrix that handles the OOV tokens with random initialisation.

In [20]:
def prepare_oov_embeddings(df, word2vec_model, embedding_dim=300):
    # intersect unique tokens from df with word2vec vocabulary
    df_vocab = set(word for token_list in df['tokens'] for word in token_list)
    common_vocab = df_vocab.intersection(word2vec_model.key_to_index)

    # create word-to-index mapping for all tokens, including OOV tokens
    word_to_idx = {word: idx for idx, word in enumerate(df_vocab)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}

    # initialize embedding matrix for ALL TOKENS (OOV and in-vocabulary)
    vocab_size = len(df_vocab)
    embedding_matrix = np.random.randn(vocab_size, embedding_dim)  # random initialization for all words

    # fill embedding matrix with Word2Vec vectors for words in common vocabulary
    for word, idx in word_to_idx.items():
        if word in word2vec_model.key_to_index:
            embedding_matrix[idx] = word2vec_model[word]

    print(f"Vocabulary size with OOV tokens: {vocab_size}")
    print(f"Embedding matrix shape: {embedding_matrix.shape}")

    return embedding_matrix, word_to_idx, idx_to_word

embedding_matrix_oov, word_to_idx, idx_to_word = prepare_oov_embeddings(df=train_df, word2vec_model=word2vec_model)

Vocabulary size with OOV tokens: 16288
Embedding matrix shape: (16288, 300)


In [21]:
# save the embedding matrix with no oov
np.save("embedding_matrix_non_oov.npy", embedding_matrix_non_oov)
# save embedding matrix with OOV
np.save("embedding_matrix_oov.npy", embedding_matrix_oov)

# References:

https://radimrehurek.com/gensim/utils.html