In [None]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]
test_dataset = dataset["test"]

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [None]:
# materialise each dataset split as a pandas DataFrame, in train / validation / test order
train_df, validation_df, test_df = (
    split.to_pandas()
    for split in (train_dataset, validation_dataset, test_dataset)
)

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [None]:
# preview the first rows of the training frame (columns shown: text, label)
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


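The `simple_preprocess` function from the `gensim.utils` module simplifies cleaning and tokenizing text by applying standard pre-processing steps: it converts the text to lowercase, removes punctuation, and splits the text into individual word tokens. It also discards digits and single-character tokens (in the outputs below, `21st` becomes `st` and the word `a` is dropped).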

In [None]:
# tokenise one training review (row label 1) to see what simple_preprocess produces
print(simple_preprocess(train_df["text"].loc[1]))

['the', 'gorgeously', 'elaborate', 'continuation', 'of', 'the', 'lord', 'of', 'the', 'rings', 'trilogy', 'is', 'so', 'huge', 'that', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co', 'writer', 'director', 'peter', 'jackson', 'expanded', 'vision', 'of', 'tolkien', 'middle', 'earth']


In [None]:
# tokenise the "text" column of every split with gensim's simple_preprocess,
# storing the resulting token lists in a new "tokens" column
train_df["tokens"] = train_df["text"].map(simple_preprocess)
validation_df["tokens"] = validation_df["text"].map(simple_preprocess)
test_df["tokens"] = test_df["text"].map(simple_preprocess)

In [None]:
# preview the training frame again to check the newly added "tokens" column
train_df.head()

Unnamed: 0,text,label,tokens
0,the rock is destined to be the 21st century's ...,1,"[the, rock, is, destined, to, be, the, st, cen..."
1,"the gorgeously elaborate continuation of "" the...",1,"[the, gorgeously, elaborate, continuation, of,..."
2,effective but too-tepid biopic,1,"[effective, but, too, tepid, biopic]"
3,if you sometimes like to go to the movies to h...,1,"[if, you, sometimes, like, to, go, to, the, mo..."
4,"emerges as something rare , an issue movie tha...",1,"[emerges, as, something, rare, an, issue, movi..."


In [None]:
# write the tokenised splits to CSV files; index=False leaves the row index out
# NOTE(review): CSV stores the list-valued "tokens" column as text, so reading
# these files back presumably yields strings rather than lists — confirm how
# downstream code parses them
train_df.to_csv(path_or_buf="movie_train.csv", index=False)
validation_df.to_csv(path_or_buf="movie_validation.csv", index=False)
test_df.to_csv(path_or_buf="movie_test.csv", index=False)

## Question 1a
To get the vocabulary size, we use `Counter` rather than `set()`, so that each token's frequency is tracked from the start. The vocabulary size is then the number of unique keys in the `token_counts` counter.

In [None]:
# measure how many distinct tokens occur in the training split
from collections import Counter

# flatten the per-review token lists into one long list (duplicates kept)
all_tokens = [tok for review_tokens in train_df["tokens"] for tok in review_tokens]

# tally every token; the number of distinct keys is the vocabulary size
token_counts = Counter()
token_counts.update(all_tokens)
print("Vocabulary size:", len(token_counts))

Vocabulary size: 16288


## Question 1b
**We use OOV (out-of-vocabulary) to refer to words that appear in the training data but
not in the Word2vec (or GloVe) dictionary. How many OOV words exist in your training data?**


In [None]:
# load pretrained word2vec dictionary
import gensim.downloader
word2vec_google_vect = gensim.downloader.load('word2vec-google-news-300')



In [None]:
# get vocabulary of word2vec dictionary
word2vec_vocab = set(word2vec_google_vect.key_to_index)

In [None]:
# Question 1b asks for the number of OOV *words*, so each distinct word is
# counted once. Iterating all_tokens directly would instead count every
# repeated occurrence of the same missing word.
oov_words = {token for token in all_tokens if token not in word2vec_vocab}
oov_count = len(oov_words)

# Occurrences are reported separately: they show how much of the running text
# (rather than of the vocabulary) has no pretrained vector.
oov_occurrences = sum(1 for token in all_tokens if token in oov_words)

print("Number of OOV words:", oov_count)
print("Number of OOV token occurrences:", oov_occurrences)

Number of OOV words: 15984


Building the embedding matrix for the training vocabulary from the pretrained Word2Vec vectors

In [None]:
import numpy as np

# 300 dimensions in the loaded pretrained word2vec embedding (in the name itself)
def prepare_embeddings(df, word2vec_model, embedding_dim=300):
    """Build an embedding matrix for every distinct token in ``df['tokens']``.

    Args:
        df: Table-like object whose ``'tokens'`` entry iterates over lists of
            word strings (one list per document).
        word2vec_model: Pre-trained lookup supporting ``word in model`` and
            ``model[word]``; each vector must have ``embedding_dim`` entries.
        embedding_dim: Width of each embedding row (300 by default).

    Returns:
        Tuple ``(embedding_matrix, word_to_idx, idx_to_word)``.
        ``embedding_matrix`` has shape ``(vocab_size, embedding_dim)`` and row
        ``i`` holds the vector of ``idx_to_word[i]``; words missing from
        ``word2vec_model`` keep an all-zero row.
    """
    # Sort the distinct words so indices are reproducible: iterating a raw set
    # of strings can yield a different order in each interpreter session, which
    # would misalign a matrix saved to disk with a mapping rebuilt later.
    vocab = sorted({word for token_list in df['tokens'] for word in token_list})

    # word -> row index, and the inverse lookup
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = dict(enumerate(vocab))

    # start from zeros so out-of-vocabulary words keep an all-zero row
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # copy the pretrained vector of every word the model knows
    for word, idx in word_to_idx.items():
        if word in word2vec_model:
            embedding_matrix[idx] = word2vec_model[word]

    print(f"Vocabulary size: {vocab_size}")

    return embedding_matrix, word_to_idx, idx_to_word

# build the matrix and both lookup tables from the training split,
# using the pretrained Google News vectors
embedding_matrix, word_to_idx, idx_to_word = prepare_embeddings(
    df=train_df,
    word2vec_model=word2vec_google_vect,
)

Vocabulary size: 16288


In [None]:
# display the matrix; all-zero rows belong to words with no pretrained vector
embedding_matrix

array([[ 0.1328125 ,  0.10644531, -0.07763672, ...,  0.06933594,
         0.19140625, -0.08886719],
       [-0.45507812,  0.13183594,  0.15820312, ...,  0.06982422,
         0.14941406, -0.06542969],
       [ 0.17675781, -0.11083984, -0.08935547, ..., -0.20703125,
         0.17382812,  0.10693359],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.12695312,  0.17578125, -0.02819824, ..., -0.18652344,
         0.0062561 ,  0.12304688],
       [ 0.10839844,  0.21386719,  0.21386719, ...,  0.05493164,
         0.04003906, -0.18164062]])

In [None]:
# save the embedding matrix
np.save("embedding_matrix.npy", embedding_matrix)

In [None]:
# report the (rows, columns) shape of the embedding matrix
print("\nEmbedding matrix shape:", np.shape(embedding_matrix))


Embedding matrix shape: (16288, 300)


## Question 1c
**The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove).
Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you
think is the best strategy to mitigate such limitation? Implement your solution in your source
code. Show the corresponding code snippet.**

Based on the first 50 OOV words we sampled, the OOV words are mainly names or words from other languages. Since we cannot collect another movie-domain dataset to learn embeddings for these unknown words, and because such names are not emotion-related words or phrases, these OOV words — which make up less than 2% of the training data — are unlikely to affect the model's ability to determine sentiment from the relevant contextual clues. Thus, we can **substitute these OOV words with `<UNK>`** instead!

References:

https://radimrehurek.com/gensim/utils.html