In [None]:
# Installing missing libraries
!pip install datasets
!pip install gensim
!pip install nltk

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

# Part 0: Dataset Preparation

In [None]:
import pandas as pd
import numpy as np
import nltk
import gensim
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from datasets import load_dataset
from collections import Counter

In [None]:
# Fetch the Rotten Tomatoes dataset and pull out its train / validation / test splits
dataset = load_dataset("rotten_tomatoes")
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Preview the first five records of each split, in the order train, validation, test
for split_preview in (train_dataset, validation_dataset, test_dataset):
    print(split_preview[:5])

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .', 'effective but too-tepid biopic', 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .', "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one ."], 'label': [1, 1, 1, 1, 1]}
{'text': ['compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', 'the soundtrack alone is worth the price of admission .', 'rodriguez does a splendid job of racial profiling hollywood style--casting excellent

# Part 1. Prepare Word Embeddings

## Answers to Question 1: Word Embedding

In [None]:
# Tokenise one piece of raw text with gensim's simple_preprocess
def preprocess_text(text):
    """Return the list of tokens that ``simple_preprocess`` produces for ``text``.

    ``deacc=True`` strips accent marks, e.g. "café" becomes "cafe".
    """
    tokens = simple_preprocess(text, deacc=True)
    return tokens

# Tokenise the text of every example in the training split
train_tokens = []
for example in train_dataset:
    train_tokens.append(preprocess_text(example['text']))

# Sanity check: show the token lists of the first 5 training examples
print("First 5 Tokens:", train_tokens[:5])

First 5 Tokens: [['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 'new', 'conan', 'and', 'that', 'he', 'going', 'to', 'make', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal'], ['the', 'gorgeously', 'elaborate', 'continuation', 'of', 'the', 'lord', 'of', 'the', 'rings', 'trilogy', 'is', 'so', 'huge', 'that', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co', 'writer', 'director', 'peter', 'jackson', 'expanded', 'vision', 'of', 'tolkien', 'middle', 'earth'], ['effective', 'but', 'too', 'tepid', 'biopic'], ['if', 'you', 'sometimes', 'like', 'to', 'go', 'to', 'the', 'movies', 'to', 'have', 'fun', 'wasabi', 'is', 'good', 'place', 'to', 'start'], ['emerges', 'as', 'something', 'rare', 'an', 'issue', 'movie', 'that', 'so', 'honest', 'and', 'keenly', 'observed', 'that', 'it', 'doesn', 'feel', 'like', 'one']]


### 1(a): What is the size of the vocabulary formed from your training data?

In [None]:
# Tally every token occurrence in the training data; the distinct keys of the
# counter (in first-seen order) form the vocabulary
vocab_counter = Counter()
for tokens in train_tokens:
    vocab_counter.update(tokens)
train_vocab = list(vocab_counter)

# The vocabulary size is the number of distinct tokens
vocab_size = len(train_vocab)

# Answer to 1(a)
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 16256


### 1(b): How many OOV words exist in your training data?

There are multiple pre-trained models in the Gensim library in Python:
- ```conceptnet-numberbatch-17-06-300```
- ```fasttext-wiki-news-subwords-300```
- ```glove-twitter-25```
- ```glove-twitter-50```
- ```glove-twitter-100```
- ```glove-twitter-200```
- ```glove-wiki-gigaword-50```
- ```glove-wiki-gigaword-100```
- ```glove-wiki-gigaword-200```
- ```glove-wiki-gigaword-300```
- ```word2vec-google-news-300```
- ```word2vec-ruscorpora-300```

where the number in ```model-name-number``` represents the dimensionality of the word vectors
- Lower-dimensional embeddings (like 25 or 50) are faster to compute and use less memory, but they might capture less nuance in word meanings.
- Higher-dimensional embeddings (like 100 or 200) capture more detailed relationships between words, which can improve model performance for NLP tasks but require more memory and computational power.




We will only analyse the OOV words for ```word2vec-google-news-300```, ```glove-twitter``` and ```glove-wiki-gigaword```, because the Rotten Tomatoes data contains informal, social-media-style terms and these models are the most likely to share vocabulary with it.

The benchmark for dimensionality will start at **100** before comparing to higher dimensionality models.

In [None]:
import gensim.downloader as api
import os

# Names of the pre-trained gensim models to download and save, grouped by family
word2vec_models = ["word2vec-google-news-300"]
glove_twitter_models = ["glove-twitter-100", "glove-twitter-200"]
glove_wiki_models = [
    "glove-wiki-gigaword-100",
    "glove-wiki-gigaword-200",
    "glove-wiki-gigaword-300",
]
target_models = word2vec_models + glove_twitter_models + glove_wiki_models

# Directory in which the pre-trained models are stored (change as necessary)
model_dir_path = "/content/drive/My Drive/SC4002/Models"

# Create the model directory, including any missing parent directories.
# exist_ok=True means no error is raised when the directory is already there, so this cell is safe to re-run.
os.makedirs(model_dir_path, exist_ok=True)

In [None]:
# Download each pre-trained model from the gensim library and save it under model_dir_path
# (a fresh download will take a few minutes per model).
# A model whose file is already on disk is skipped, so re-running the notebook does not
# download and save everything again.
for model_name in target_models:
    model_file = os.path.join(model_dir_path, f"{model_name}.model")

    # Cache guard: the saved file from a previous run is reused as-is
    if os.path.exists(model_file):
        print(f"Model already saved at {model_file}, skipping download")
        continue

    print(f"Downloading and saving model: {model_name}")
    model = api.load(model_name)
    model.save(model_file)
    print(f"Model saved to {model_file}")

print("All models downloaded and saved successfully.")

In [None]:
# Number of OOV words found for each pre-trained model, keyed by model name
oov_counts = {}

# Evaluate every saved model against the training vocabulary
for model_name in target_models:

    # Re-open the vectors that were saved to disk earlier
    print(f"\nLoading model: {model_name}")
    model_path = os.path.join(model_dir_path, f"{model_name}.model")
    model = KeyedVectors.load(model_path, mmap='r')

    # Report how many words the model itself knows
    model_vocab_size = len(model.key_to_index)
    print(f"Size of vocabulary in {model_name}:", model_vocab_size)

    # A training word is OOV when the model has no entry for it
    oov_words = list(filter(lambda word: word not in model.key_to_index, train_vocab))
    oov_count = len(oov_words)
    print(f"Number of OOV words for {model_name}:", oov_count)

    # Keep the count for the summary below
    oov_counts[model_name] = oov_count

# Summarise the OOV count of every model in one place
print("\n######## Summary of OOV counts per model ########")
for model_name, count in oov_counts.items():
    print(f"{model_name}: {count} OOV words")


Loading model: word2vec-google-news-300
Size of vocabulary in word2vec-google-news-300: 3000000
Number of OOV words for word2vec-google-news-300: 1454

Loading model: glove-twitter-100
Size of vocabulary in glove-twitter-100: 1193514
Number of OOV words for glove-twitter-100: 1477

Loading model: glove-twitter-200
Size of vocabulary in glove-twitter-200: 1193514
Number of OOV words for glove-twitter-200: 1477

Loading model: glove-wiki-gigaword-100
Size of vocabulary in glove-wiki-gigaword-100: 400000
Number of OOV words for glove-wiki-gigaword-100: 546

Loading model: glove-wiki-gigaword-200
Size of vocabulary in glove-wiki-gigaword-200: 400000
Number of OOV words for glove-wiki-gigaword-200: 546

Loading model: glove-wiki-gigaword-300
Size of vocabulary in glove-wiki-gigaword-300: 400000
Number of OOV words for glove-wiki-gigaword-300: 546

######## Summary of OOV counts per model ########
word2vec-google-news-300: 1454 OOV words
glove-twitter-100: 1477 OOV words
glove-twitter-200: 

From the summary above, we can deduce that the number of OOV words is independent of the dimensionality of the model.
> This is because GloVe embeddings are pre-trained word vectors with a fixed vocabulary determined by the corpus they were trained on; the dimensionality only changes the length of each vector, not which words are covered.

Hence, we will proceed to use ```glove-wiki-gigaword-300```, as it has the fewest OOV words of the three model families and offers a good balance between high dimensionality and computational cost.

### 1(c): Implement a strategy to solve the OOV problems present in Word2vec (or Glove).

### First Strategy: Use Lemmatization with Fallback Embedding
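- Lemmatization works by reducing words to their base or dictionary form, e.g. movies -> movie, runs -> run. Note that NLTK's `WordNetLemmatizer` treats every word as a noun unless a part-of-speech tag is supplied, so verb forms such as running or ran are left unchanged by the code below
- Then, for the words that are still not found or rare, create an embedding matrix that assigns the average vector of the vocabulary to these words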


In [None]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data used by the lemmatizer, then create the lemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Normalise a single word: lowercase it, then lemmatize it
def preprocess_word(word):
    """Return the WordNet lemma of the lowercased ``word``.

    ``lemmatize`` is called without a part-of-speech tag, so NLTK's default
    (noun) is applied to every word.
    """
    return lemmatizer.lemmatize(word.lower())

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Load the saved glove-wiki-gigaword-200 vectors.
# NOTE(review): the write-up above states that glove-wiki-gigaword-300 was chosen, yet the
# 200-dimensional file is loaded here — confirm which model is intended.
glove_model = KeyedVectors.load(os.path.join(model_dir_path, "glove-wiki-gigaword-200.model"))
embedding_dim = glove_model.vector_size

# Fallback vector for OOV words: the mean of all word vectors in the model.
# `vectors` is the matrix holding one row per word, so averaging it directly avoids one
# Python-level lookup per word and a temporary copy of the whole matrix.
avg_vector = np.mean(glove_model.vectors, axis=0)

# Look up the GloVe vector of a word, falling back to the average vector when it is unknown
def get_glove_embedding(word):
    """Return the GloVe embedding of ``word`` after ``preprocess_word`` normalisation.

    A word that is still missing from ``glove_model`` after normalisation is
    given ``avg_vector`` instead.
    """
    normalized_word = preprocess_word(word)
    if normalized_word not in glove_model.key_to_index:
        return avg_vector  # OOV word: fall back to the average vector
    return glove_model[normalized_word]

# Baseline: training words that are missing from GloVe when looked up unchanged
oov_count_before = len([word for word in train_vocab if word not in glove_model.key_to_index])
print("Number of OOV words (without Lemmatization and Fallback Embedding):", oov_count_before)

# After preprocess_word: training words whose lemmatized form is still missing from GloVe.
# Only vocabulary membership is checked here, so the fallback vector does not influence this count.
oov_count_after = len([word for word in train_vocab if preprocess_word(word) not in glove_model.key_to_index])
print("Number of OOV words (with Lemmatization and Fallback Embedding):", oov_count_after)

Number of OOV words (without Lemmatization and Fallback Embedding): 546
Number of OOV words (with Lemmatization and Fallback Embedding): 538


### Second Strategy: FastText with Subword Embeddings

FastText’s subword embeddings reduce OOV words by:
- Generating embeddings for any word through character n-grams, even if the full word does not appear in the training data
- Producing meaningful representations for morphologically rich words, incorrect spellings, and unknown terms by using common subword patterns
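- Eliminating the need for fallback strategies, unlike GloVe, which requires preprocessing or fallback vectors to handle OOV words

Note: these subword benefits only apply when the full FastText model (the `.bin` file, which stores the character n-gram vectors) is loaded. The `.vec` files used below contain whole-word vectors only, so words missing from the file are still counted as OOV.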

FastText has the following two models:
- [```wiki.en.vec```](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec)
  - Model was trained exclusively on English Wikipedia data
  - The vocabulary is focused on formal, encyclopedic language, which includes terms from diverse domains like history, science, arts, and popular culture
  - Typically smaller than Common Crawl models since Wikipedia has a limited (though diverse) lexicon, centered around factual, standardised language
- [```cc.en.300.vec```](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz)
  - Model was trained on the Common Crawl dataset, a massive, multilingual dataset pulled from a wide range of online sources
  - The vocabulary is larger and more diverse. It captures a broader array of language, including slang, informal speech, niche terminology, and multilingual content
  - Significantly larger due to the vast range of sources, making it suitable for general NLP applications

In [None]:
# Note: loading these models will take a few minutes
# Both .vec files are read from model_dir_path
wiki_fasttext_path = os.path.join(model_dir_path, "wiki.en.vec")
crawl_fasttext_path = os.path.join(model_dir_path, "cc.en.300.vec")

# Wikipedia FastText vectors (text format, hence binary=False)
wiki_fasttext_model = KeyedVectors.load_word2vec_format(wiki_fasttext_path, binary=False)

# Common Crawl FastText vectors (text format, hence binary=False)
crawl_fasttext_model = KeyedVectors.load_word2vec_format(crawl_fasttext_path, binary=False)

In [None]:
# Fetch the stored vector of a word from a loaded FastText model
def get_fasttext_embedding(model, word):
    """Return ``model[word]``.

    This is a plain lookup: no subword / n-gram composition happens here, so
    the caller is expected to check that ``word`` is present in ``model`` first.
    """
    embedding = model[word]
    return embedding

# Running totals of OOV words for the two FastText models
wiki_oov_count, crawl_oov_count = 0, 0

# Check every training-vocabulary word against both models
for word in train_vocab:
    preprocessed_word = preprocess_word(word)

    # Wikipedia model: fetch the vector when the word is known, otherwise count it as OOV
    if preprocessed_word in wiki_fasttext_model:
        wiki_embedding = get_fasttext_embedding(wiki_fasttext_model, preprocessed_word)
    else:
        wiki_oov_count += 1

    # Common Crawl model: fetch the vector when the word is known, otherwise count it as OOV
    if preprocessed_word in crawl_fasttext_model:
        crawl_embedding = get_fasttext_embedding(crawl_fasttext_model, preprocessed_word)
    else:
        crawl_oov_count += 1

# Report the OOV totals of both models
print("Number of OOV words (Wikipedia Model):", wiki_oov_count)
print("Number of OOV words (Common Crawl Model):", crawl_oov_count)

Number of OOV words (Wikipedia Model): 240
Number of OOV words (Common Crawl Model): 734


From the OOV counts of the two models, we can see that the ```wiki.en.vec``` model performed better. This was not expected, as the Rotten Tomatoes reviews are more conversational and could be thought to contain slang or informal language.

Some possible reasons are:
- **Relevant Vocabulary:** Wikipedia includes extensive formal descriptions and movie-related vocabulary that aligns with the language used in reviews.
- **Reduced Informal Noise:** ```cc.en.300.vec``` covers broader, more informal internet language, leading to mismatches with the structured, descriptive tone of movie reviews.
- **Coverage of Proper Nouns and Critic Terms:** Wikipedia’s curated content better captures names, technical terms, and critic jargon, reducing OOV rates for datasets centered on entertainment media.

Hence, we will use the ```wiki.en.vec``` embedding matrix to reduce the number of OOV words.