# Part 0

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from datasets import load_dataset

# Fetch the Rotten Tomatoes dataset and pull out its three predefined splits.
dataset = load_dataset("rotten_tomatoes")
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [3]:
# Turn each dataset split into a pandas DataFrame for easier manipulation.
train_df, validation_df, test_df = (
    split.to_pandas()
    for split in (train_dataset, validation_dataset, test_dataset)
)

In [4]:
import gensim.downloader
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np

In [5]:
# preview the first rows of the raw training data (text and label columns)
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


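The `simple_preprocess` function from the `gensim.utils` module simplifies cleaning and tokenizing text by applying standard pre-processing steps: it lowercases the text, strips punctuation, and splits the text into individual words. Note that it also drops digits and tokens shorter than two characters, so `21st` becomes `st` and a trailing `'s` disappears, as the examples below show.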

In [6]:
# show the tokens simple_preprocess produces for one training example (row with index label 1)
print(simple_preprocess(train_df.text[1]))

['the', 'gorgeously', 'elaborate', 'continuation', 'of', 'the', 'lord', 'of', 'the', 'rings', 'trilogy', 'is', 'so', 'huge', 'that', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co', 'writer', 'director', 'peter', 'jackson', 'expanded', 'vision', 'of', 'tolkien', 'middle', 'earth']


In [7]:
# Tokenize the "text" column of every split with gensim's simple_preprocess
# and keep the resulting token lists in a new "tokens" column.
train_df = train_df.assign(tokens=train_df["text"].apply(simple_preprocess))
validation_df = validation_df.assign(tokens=validation_df["text"].apply(simple_preprocess))
test_df = test_df.assign(tokens=test_df["text"].apply(simple_preprocess))

In [8]:
# preview the training data again to check the new "tokens" column
train_df.head()

Unnamed: 0,text,label,tokens
0,the rock is destined to be the 21st century's ...,1,"[the, rock, is, destined, to, be, the, st, cen..."
1,"the gorgeously elaborate continuation of "" the...",1,"[the, gorgeously, elaborate, continuation, of,..."
2,effective but too-tepid biopic,1,"[effective, but, too, tepid, biopic]"
3,if you sometimes like to go to the movies to h...,1,"[if, you, sometimes, like, to, go, to, the, mo..."
4,"emerges as something rare , an issue movie tha...",1,"[emerges, as, something, rare, an, issue, movi..."


In [9]:
# save the train, test and validation dataframes after tokenizing
# NOTE(review): the "tokens" column holds Python lists; CSV stores them as their
# string representation, so whoever reads these files back has to parse that
# column into lists again (e.g. with ast.literal_eval).
train_df.to_csv("movie_train.csv", index=False)
validation_df.to_csv("movie_validation.csv", index=False)
test_df.to_csv("movie_test.csv", index=False)

In [10]:
# load pretrained word2vec dictionary through gensim's downloader
# (the 'word2vec-google-news-300' vectors; this is the slow step of the notebook)
word2vec_model = gensim.downloader.load('word2vec-google-news-300')



# Part 1

## Question 1a
To get the vocabulary size, we use `Counter` rather than `set()`, because it also keeps track of each token's frequency from the start. The vocabulary size is then the number of unique keys in the `token_counts` dictionary.

In [11]:
# get the vocabulary size of training data
from collections import Counter
# flatten the per-review token lists into a single list of token occurrences
all_tokens = [token for sublist in train_df['tokens'] for token in sublist]
# count the total number of tokens (occurrences, not distinct words)
total_tokens = len(all_tokens)

# count token frequencies; the number of distinct keys is the vocabulary size
token_counts = Counter(all_tokens)
print("Vocabulary size:", len(token_counts))

# 5 most common tokens and their frequencies
top_5_tokens = token_counts.most_common(5)
print("Top 5 most popular tokens and their frequencies:")
for token, frequency in top_5_tokens:
    print(f"{token}: {frequency}")

Vocabulary size: 16288
Top 5 most popular tokens and their frequencies:
the: 8115
and: 4971
of: 4886
to: 3447
it: 2743


## Question 1b
**We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but
not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?**


In [12]:
# get vocabulary of word2vec dictionary
# (a set of the model's keys, used below for fast membership tests)
word2vec_vocab = set(word2vec_model.key_to_index)
print("Vocabulary size of word2vec:", len(word2vec_vocab))

Vocabulary size of word2vec: 3000000


In [13]:
# every token occurrence in the training data that has no word2vec entry (duplicates kept)
oov_tokens = [token for token in all_tokens if token not in word2vec_vocab]

# NOTE: despite the "OOV words" label, this is the number of OOV token occurrences;
# the number of distinct OOV words is the second figure printed below
oov_count = len(oov_tokens)
print("Number of OOV words:", oov_count)
# print the number of distinct OOV tokens
print("Number of distinct OOV tokens:", len(set(oov_tokens)))
# share of all token occurrences (not of the vocabulary) that are OOV
oov_percentage = (oov_count / total_tokens) * 100
print(f"Percentage of OOV tokens out of the whole dataset: {oov_percentage:.2f}%")

Number of OOV words: 15984
Number of distinct OOV tokens: 1473
Percentage of OOV tokens out of the whole dataset: 10.39%


## Question 1c
**The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove).
Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you
think is the best strategy to mitigate such limitation? Implement your solution in your source
code. Show the corresponding code snippet.**

Out-of-vocabulary (OOV) tokens are relatively infrequent: they make up only 10.39% of all token occurrences in the training data (1,473 distinct words). To handle them, we randomly initialise character-level embeddings, which are later fine-tuned by the RNN/CNN/LSTM/GRU models.

In [15]:
import numpy as np

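For Part 2, we initialise an embedding matrix that does not handle OOV tokens: words missing from word2vec keep an all-zero row. The matrix is still 350 columns wide (300 word-level + 50 unused character-level) so that it has the same shape as the OOV-aware matrix built for Part 3.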

In [27]:
def prepare_non_oov_embeddings(df, word2vec_model, embedding_dim=300, char_embedding_dim=50):
    """Build an embedding matrix for the dataset vocabulary without OOV handling.

    Words present in ``word2vec_model`` get their pretrained vector in the first
    ``embedding_dim`` columns. Words missing from it keep an all-zero row. The
    trailing ``char_embedding_dim`` columns are always zero; they only pad the
    matrix to the same width as the one built by ``prepare_oov_embeddings``.

    Args:
        df: Table with a ``'tokens'`` column whose entries are iterables of
            word strings.
        word2vec_model: Object exposing ``key_to_index`` (used for membership
            tests) and ``word2vec_model[word]`` returning a vector of length
            ``embedding_dim``.
        embedding_dim: Width of the pretrained word vectors.
        char_embedding_dim: Number of zero padding columns appended to each row.

    Returns:
        Tuple ``(embedding_matrix, word_to_idx, idx_to_word)`` where
        ``embedding_matrix`` has shape
        ``(vocab_size, embedding_dim + char_embedding_dim)`` and the two dicts
        map words to row indices and back.
    """
    # Sort the vocabulary so the word -> row assignment is reproducible.
    # Iterating a set of strings follows hash order, which changes between
    # interpreter sessions, so an unsorted vocabulary would give a different
    # row order after every kernel restart and no longer match a saved matrix.
    vocab = sorted({word for token_list in df['tokens'] for word in token_list})

    # word-to-index mapping (and its inverse) for all tokens in the dataset vocabulary
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}

    # start from zeros so that OOV words (and the padding columns) stay zero
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim + char_embedding_dim))

    # copy the pretrained vector of every in-vocabulary word into its row
    for word, idx in word_to_idx.items():
        if word in word2vec_model.key_to_index:
            embedding_matrix[idx, :embedding_dim] = word2vec_model[word]

    print(f"Vocabulary size (without handling OOV): {vocab_size}")
    print(f"Embedding matrix shape: {embedding_matrix.shape}")

    return embedding_matrix, word_to_idx, idx_to_word

# build the Part 2 matrix from the training vocabulary (OOV words keep all-zero rows)
embedding_matrix_non_oov, word_to_idx, idx_to_word = prepare_non_oov_embeddings(df=train_df, word2vec_model=word2vec_model)

Vocabulary size (without handling OOV): 16288
Embedding matrix shape: (16288, 350)


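For Part 3, we initialise an embedding matrix that handles OOV tokens using character-level embeddings: in-vocabulary words keep their word2vec vector in the first 300 columns, while OOV words get a character-based vector in the last 50 columns.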

In [33]:
def prepare_oov_embeddings(df, word2vec_model, embedding_dim=300, char_embedding_dim=50, seed=42):
    """Build an embedding matrix that covers OOV words via character embeddings.

    Words present in ``word2vec_model`` get their pretrained vector in the first
    ``embedding_dim`` columns (character columns stay zero). Every other word
    gets the mean of its characters' embeddings in the last
    ``char_embedding_dim`` columns (word-level columns stay zero).

    Each distinct character owns one randomly initialised vector, so a given
    character contributes the same vector wherever it occurs.

    Args:
        df: Table with a ``'tokens'`` column whose entries are iterables of
            word strings.
        word2vec_model: Object exposing ``key_to_index`` (used for membership
            tests) and ``word2vec_model[word]`` returning a vector of length
            ``embedding_dim``.
        embedding_dim: Width of the pretrained word vectors.
        char_embedding_dim: Width of the character-level embedding.
        seed: Seed for the random character table, so repeated runs produce
            the same matrix.

    Returns:
        Tuple ``(embedding_matrix, word_to_idx, idx_to_word)`` where
        ``embedding_matrix`` has shape
        ``(vocab_size, embedding_dim + char_embedding_dim)`` and the two dicts
        map words to row indices and back.
    """
    # Sort the vocabulary so the word -> row assignment is reproducible.
    # Iterating a set of strings follows hash order, which changes between
    # interpreter sessions.
    vocab = sorted({word for token_list in df['tokens'] for word in token_list})
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim + char_embedding_dim))

    # One random vector per distinct character, drawn once from a seeded
    # generator and looked up through char_to_idx.
    chars = sorted(set(''.join(vocab)))
    char_to_idx = {char: idx for idx, char in enumerate(chars)}
    rng = np.random.default_rng(seed)
    char_table = rng.standard_normal((len(chars), char_embedding_dim))

    for word, idx in word_to_idx.items():
        if word in word2vec_model.key_to_index:
            # word-level embedding from the pretrained model
            embedding_matrix[idx, :embedding_dim] = word2vec_model[word]
        elif word:
            # OOV word: average the vectors of its characters.
            # An empty token has no characters to average and keeps its zero row.
            char_ids = [char_to_idx[char] for char in word]
            embedding_matrix[idx, embedding_dim:] = char_table[char_ids].mean(axis=0)

    print(f"Vocabulary size with OOV token handling: {vocab_size}")
    print(f"Embedding matrix shape: {embedding_matrix.shape}")

    return embedding_matrix, word_to_idx, idx_to_word

# build the Part 3 matrix; note this rebinds word_to_idx / idx_to_word from the previous cell
embedding_matrix_oov, word_to_idx, idx_to_word = prepare_oov_embeddings(df=train_df, word2vec_model=word2vec_model)

Vocabulary size with OOV token handling: 16288
Embedding matrix shape: (16288, 350)


In [34]:
# inspect the matrix without OOV handling (rows of OOV words are entirely zero)
embedding_matrix_non_oov

array([[-0.02990723,  0.15820312, -0.2890625 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.09814453,  0.12353516,  0.02355957, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.33007812,  0.2734375 , -0.1953125 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.05834961, -0.42773438,  0.25585938, ...,  0.        ,
         0.        ,  0.        ]])

In [35]:
# inspect the OOV-aware matrix (OOV rows are non-zero only in the trailing character columns)
embedding_matrix_oov

array([[-2.99072266e-02,  1.58203125e-01, -2.89062500e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-9.81445312e-02,  1.23535156e-01,  2.35595703e-02, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.26680550e-01,  5.24005060e-01,  2.65400463e-01],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         4.02229839e-04,  2.72861673e-01, -4.21928833e-01],
       [ 3.30078125e-01,  2.73437500e-01, -1.95312500e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 5.83496094e-02, -4.27734375e-01,  2.55859375e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [36]:
# save the embedding matrix with no oov
# NOTE(review): only the matrices are written here; the word_to_idx mapping that
# gives the rows their meaning is not persisted, so consumers must rebuild the
# exact same mapping — confirm how downstream notebooks do this.
np.save("embedding_matrix_non_oov.npy", embedding_matrix_non_oov)
# save embedding matrix with OOV
np.save("embedding_matrix_oov.npy", embedding_matrix_oov)

# References:

* Documentation on using gensim.utils library: https://radimrehurek.com/gensim/utils.html

* Idea on using character-level embeddings to handle OOV: https://medium.com/@tam.tamanna18/power-of-character-level-rnns-and-embeddings-in-natural-language-processing-b84321d199ad
