<a href="https://colab.research.google.com/github/sahanyafernando/My_NLP_Learning/blob/main/Project_01_Public_Responce_Analysis/notebooks/Project_01_Public_Response_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load preprocessing artifacts
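Loads the outputs saved by `01_data_loading_and_preprocessing.ipynb` from `artifacts/preprocessing_outputs.pkl`. Run that notebook first if the artifacts file is missing; otherwise the cell below only prints a reminder and later cells will fail because `df` and the vectorizer/matrix variables are never defined.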

In [None]:
import pickle
import pathlib

# Project root inside the mounted Google Drive; edit this if your copy lives elsewhere.
artifacts_root = pathlib.Path("/content/drive/MyDrive/My_NLP_Learning/Project_01_Public_Responce_Analysis")
artifacts_path = artifacts_root / "artifacts/preprocessing_outputs.pkl"

if not artifacts_path.exists():
    # Nothing is loaded in this case, so the names assigned below stay undefined.
    print("Run 01_data_loading_and_preprocessing.ipynb to generate artifacts first.")
else:
    # NOTE: unpickling can execute arbitrary code; only load artifact files you created yourself.
    with artifacts_path.open("rb") as f:
        artifacts = pickle.load(f)

    # Unpack the stored objects into the names used by the rest of the notebook.
    df = artifacts["df"]

    one_hot_vectorizer = artifacts["one_hot_vectorizer"]
    one_hot_matrix = artifacts["one_hot_matrix"]

    bow_vectorizer = artifacts["bow_vectorizer"]
    bow_matrix = artifacts["bow_matrix"]

    tfidf_vectorizer = artifacts["tfidf_vectorizer"]
    tfidf_matrix = artifacts["tfidf_matrix"]

    cooccurrence_vectorizer = artifacts["cooccurrence_vectorizer"]
    cooccurrence_matrix = artifacts["cooccurrence_matrix"]

    print("Loaded preprocessing outputs from artifacts/preprocessing_outputs.pkl")


## Dense Embeddings Creation
Generate Word2Vec embeddings from the preprocessed text data (`cleaned_tokens`) using the `gensim` library. This will involve training a Word2Vec model and then displaying a sample of learned embeddings and demonstrating how to find similar words for a given term.

In [19]:
!pip install gensim
print("Gensim library installed.")

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Gensim library installed.


In [20]:
from gensim.models import Word2Vec

# Build an untrained Word2Vec model.
#   vector_size: dimensionality of each word embedding
#   window:      maximum distance between the current and predicted word within a sentence
#   min_count:   words with a total frequency below this value are ignored
#   workers:     number of worker threads used for training
model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)

# Train on the 'cleaned_tokens' column; each entry is used as one sentence (a list of words).
model.build_vocab(df['cleaned_tokens'])
model.train(df['cleaned_tokens'], total_examples=model.corpus_count, epochs=10)

print("Word2Vec model trained successfully.")

# Show one embedding and its nearest neighbours, provided the vocabulary is not empty.
if len(model.wv.key_to_index) > 0:
    # Keep the requested word separate from `sample_word` so that the fallback
    # message names the word that was actually missing, not its replacement.
    requested_word = "educationpolicy"
    if requested_word in model.wv.key_to_index:
        sample_word = requested_word
        print(f"\nVector for '{sample_word}':")
    else:
        # Fall back to the first key stored in the vocabulary mapping.
        sample_word = next(iter(model.wv.key_to_index))
        print(f"\n'{requested_word}' was not in vocabulary. Displaying vector for '{sample_word}':")
    print(model.wv[sample_word])

    # `sample_word` is known to be in the vocabulary at this point, so no extra
    # membership check or KeyError handling is required around the lookup.
    print(f"\nTop 5 similar words to '{sample_word}':")
    similar_words = model.wv.most_similar(sample_word, topn=5)
    for word, similarity in similar_words:
        print(f"{word}: {similarity:.4f}")
else:
    print("\nWord2Vec vocabulary is empty. Cannot display vectors or similar words.")

Word2Vec model trained successfully.

Vector for 'educationpolicy':
[-0.00888482  0.00406113  0.00528503  0.00613152  0.00766228 -0.00712012
  0.00097215  0.00734187 -0.00319307 -0.00609324 -0.0010288  -0.00907577
 -0.00539532  0.0072668   0.00321684  0.00669209  0.00704312  0.00698161
 -0.00381457 -0.00140761  0.00271286 -0.00427031  0.00859728 -0.01029839
  0.0066366   0.00299851 -0.0056163   0.00349566 -0.00205988  0.00716666
  0.01083251 -0.00407961 -0.0006324  -0.00577595  0.00368347  0.00356117
  0.0064988   0.00535338  0.00910546  0.00779381  0.00769153 -0.00743954
 -0.00939768 -0.00062215 -0.00242623  0.0073053   0.00563285 -0.00148196
  0.00184443  0.00194863  0.00817344 -0.0102214  -0.00027507  0.0036235
 -0.00144402  0.00872862  0.00922212  0.00664775 -0.00145341  0.00798887
 -0.00845824  0.00342318 -0.00514189 -0.00560878  0.00283036  0.00561479
  0.00805331 -0.00530009  0.00672292  0.00718346 -0.00436599 -0.00887923
  0.00614025  0.00608644 -0.00036038 -0.00644362 -0.00701

## Generate FastText Embeddings
Generate FastText embeddings from the preprocessed text data (`cleaned_tokens`) using the `gensim` library. This will involve training a FastText model and then displaying a sample of learned embeddings and demonstrating how to find similar words for a given term.

In [21]:
from gensim.models import FastText

# Build an untrained FastText model.
#   vector_size: dimensionality of each word embedding
#   window:      maximum distance between the current and predicted word within a sentence
#   min_count:   words with a total frequency below this value are ignored
#   workers:     number of worker threads used for training
#   sg:          1 selects skip-gram training
fasttext_model = FastText(vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Train on the 'cleaned_tokens' column; each entry is used as one sentence (a list of words).
fasttext_model.build_vocab(df['cleaned_tokens'])
fasttext_model.train(df['cleaned_tokens'], total_examples=fasttext_model.corpus_count, epochs=10)

print("FastText model trained successfully.")

# Show one embedding and its nearest neighbours, provided the vocabulary is not empty.
if len(fasttext_model.wv.key_to_index) > 0:
    # Keep the requested word separate from `sample_word_ft` so that the fallback
    # message names the word that was actually missing, not its replacement.
    requested_word_ft = "educationpolicy"
    if requested_word_ft in fasttext_model.wv.key_to_index:
        sample_word_ft = requested_word_ft
        print(f"\nVector for '{sample_word_ft}' (FastText):")
    else:
        # Fall back to the first key stored in the vocabulary mapping.
        sample_word_ft = next(iter(fasttext_model.wv.key_to_index))
        print(f"\n'{requested_word_ft}' was not in vocabulary. Displaying vector for '{sample_word_ft}' (FastText):")
    print(fasttext_model.wv[sample_word_ft])

    # `sample_word_ft` is known to be in the vocabulary at this point, so no extra
    # membership check or KeyError handling is required around the lookup.
    print(f"\nTop 5 similar words to '{sample_word_ft}' (FastText):")
    similar_words_ft = fasttext_model.wv.most_similar(sample_word_ft, topn=5)
    for word, similarity in similar_words_ft:
        print(f"{word}: {similarity:.4f}")
else:
    print("\nFastText vocabulary is empty. Cannot display vectors or similar words.")

FastText model trained successfully.

Vector for 'educationpolicy' (FastText):
[-0.02180171 -0.00214988 -0.00262839 -0.01025171  0.00643344  0.03016493
 -0.0005664  -0.00301163 -0.00150538 -0.02377232  0.00433622 -0.00604296
 -0.00388171  0.02126461 -0.0078538  -0.04925325 -0.024071    0.02225541
 -0.02101325 -0.01215765 -0.03873902  0.03926503 -0.04683848 -0.01634804
  0.00625108 -0.00161708 -0.02398336  0.01321854  0.03458745  0.00321404
 -0.01849218  0.00056485  0.03159113 -0.01305179 -0.02635822  0.0062166
 -0.00345002  0.00716515 -0.03167881  0.00127246  0.02498819 -0.03717434
  0.01392756 -0.02894269 -0.05074468 -0.03692038 -0.00187663 -0.02918363
 -0.02457221  0.00573102  0.00615972 -0.01595497  0.01608039 -0.00454777
 -0.01033915  0.0066973  -0.04249965 -0.01159947 -0.00989624 -0.0045944
  0.01352564 -0.04472865 -0.00841846  0.01732462  0.02793944  0.03424306
  0.00578606  0.02987581  0.00644182  0.01611372  0.00897061 -0.00127
  0.00906626 -0.01583804  0.01422153  0.00200769  

## Apply Non-Negative Matrix Factorization (NMF) for Topic Modeling
Apply Non-Negative Matrix Factorization (NMF) to the TF-IDF matrix to extract latent topics from the text data.


In [22]:
from sklearn.decomposition import NMF

# Number of latent topics to extract (an example value).
n_topics = 5

# Factorise the TF-IDF matrix with NMF, using a fixed random_state.
# fit_transform returns the document-topic matrix; the fitted estimator
# exposes the topic-word matrix as `components_`.
nmf_model = NMF(random_state=42, n_components=n_topics)
doc_topic_matrix = nmf_model.fit_transform(tfidf_matrix)
topic_word_matrix = nmf_model.components_

# Report the shape of both factor matrices.
print(f"Shape of Document-Topic Matrix: {doc_topic_matrix.shape}")
print(f"Shape of Topic-Word Matrix: {topic_word_matrix.shape}")

Shape of Document-Topic Matrix: (100, 5)
Shape of Topic-Word Matrix: (5, 745)


## Interpret NMF Topics

Interpret the topics extracted by the NMF model by displaying the top words associated with each topic.


In [23]:
# Feature names of the fitted TF-IDF vectorizer, used to label topic-word weights.
feature_names = tfidf_vectorizer.get_feature_names_out()
n_top_words = 10

print("\nTopics in NMF model:")
for topic_idx, topic in enumerate(topic_word_matrix):
    print(f"Topic #{topic_idx + 1}:")
    # argsort is ascending: reverse it, then keep the first n_top_words indices
    # to obtain the heaviest-weighted words first.
    top_words_indices = topic.argsort()[::-1][:n_top_words]
    top_words = [feature_names[i] for i in top_words_indices]
    print(', '.join(top_words))


Topics in NMF model:
Topic #1:
economicrelief, paper, way, boy, face, difficult, market, officer, such, remember
Topic #2:
publictransport, although, worker, others, choice, job, reveal, prove, everyone, never
Topic #3:
healthcarereform, wait, state, doctor, add, seek, mr, move, agent, maintain
Topic #4:
educationpolicy, second, most, develop, plan, itself, mrs, car, much, reflect
Topic #5:
environmentallaws, current, occur, painting, various, hear, specific, generation, class, industry


## Generate GloVe Embeddings

Obtain GloVe embeddings for use with the preprocessed text data (`cleaned_tokens`). GloVe is not trained on our corpus here: the pre-trained `glove.6B.100d` vectors are downloaded and loaded into a dictionary, after which a sample embedding is displayed and the most similar words for a given term are looked up.


In [24]:
import numpy as np
import os
import urllib.request
import zipfile

# File names (relative to the working directory) and the download location.
glove_zip_file = 'glove.6B.zip'
glove_txt_file = 'glove.6B.100d.txt'
glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'

# Fetch the pre-trained GloVe vectors only if the extracted text file is not present yet.
if not os.path.exists(glove_txt_file):
    if not os.path.exists(glove_zip_file):
        print(f"Downloading {glove_zip_file}...")
        # Download under a temporary name and rename only on success. Otherwise an
        # interrupted download would leave a truncated archive under the final name,
        # and the existence check above would treat it as complete on the next run.
        partial_zip_file = glove_zip_file + '.part'
        try:
            urllib.request.urlretrieve(glove_url, partial_zip_file)
            os.replace(partial_zip_file, glove_zip_file)
        finally:
            # Only still present when the download or rename failed.
            if os.path.exists(partial_zip_file):
                os.remove(partial_zip_file)
        print("Download complete.")

    print(f"Extracting {glove_txt_file} from {glove_zip_file}...")
    with zipfile.ZipFile(glove_zip_file, 'r') as zf:
        zf.extract(glove_txt_file, path='.')
    print("Extraction complete.")
else:
    print(f"{glove_txt_file} already exists. Skipping download and extraction.")

# Parse the text file into {word: vector}. Each line is split on whitespace:
# the first field is the word, the remaining fields are parsed as float32 components.
glove_embeddings = {}
print(f"Loading GloVe embeddings from {glove_txt_file}...")
with open(glove_txt_file, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        word = parts[0]
        vector = np.array(parts[1:], dtype=np.float32)
        glove_embeddings[word] = vector
print(f"Loaded {len(glove_embeddings)} GloVe embeddings.")
print(f"Embedding dimension: {len(next(iter(glove_embeddings.values())))}")

Downloading glove.6B.zip...
Download complete.
Extracting glove.6B.100d.txt from glove.6B.zip...
Extraction complete.
Loading GloVe embeddings from glove.6B.100d.txt...
Loaded 400000 GloVe embeddings.
Embedding dimension: 100


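Define a function that retrieves a word's GloVe vector (returning a zero vector for out-of-vocabulary words) and a function that finds the most similar words by cosine similarity. Then display the vector for a sample word ('educationpolicy') and its top 5 most similar words; if the sample word is not in the GloVe vocabulary, the first token from `cleaned_tokens` that GloVe knows is used instead.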

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

def get_glove_vector(word, embeddings_dict, vector_dim=100):
    """Look up the embedding stored for `word`.

    Args:
        word: Key to look up.
        embeddings_dict: Mapping from word to embedding vector.
        vector_dim: Length of the zero vector returned for unknown words.

    Returns:
        The stored vector if `word` is a key of `embeddings_dict`, otherwise a
        new `np.zeros(vector_dim)` array.
    """
    if word in embeddings_dict:
        return embeddings_dict[word]
    # Out-of-vocabulary word: fall back to an all-zero vector.
    return np.zeros(vector_dim)

def find_similar_words_glove(word, embeddings_dict, topn=5, vector_dim=100):
    """Return the `topn` words whose vectors are most cosine-similar to `word`.

    All similarities are computed in one vectorised NumPy pass instead of one
    pairwise-similarity call per vocabulary entry, which matters for large
    vocabularies.

    Args:
        word: Query word.
        embeddings_dict: Mapping from word to a 1-D NumPy vector; all vectors
            must have the same length.
        topn: Number of neighbours to return.
        vector_dim: Unused; kept so existing calls keep working.

    Returns:
        A list of `(word, similarity)` tuples sorted by descending similarity,
        with similarities as Python floats. Words with equal similarity keep
        the iteration order of `embeddings_dict`. Returns an empty list when
        `word` is not a key or when there is no other word to compare against.
    """
    if word not in embeddings_dict:
        return []

    # Every key except the query itself.
    candidate_words = [vocab_word for vocab_word in embeddings_dict if vocab_word != word]
    if not candidate_words:
        return []

    query_vector = np.asarray(embeddings_dict[word]).ravel()
    candidate_matrix = np.stack([embeddings_dict[vocab_word] for vocab_word in candidate_words])

    dot_products = candidate_matrix @ query_vector
    # Row-wise squared norms via einsum avoid a full-size temporary of squared values.
    candidate_norms = np.sqrt(np.einsum('ij,ij->i', candidate_matrix, candidate_matrix))
    norm_products = candidate_norms * np.linalg.norm(query_vector)
    # A zero vector has no direction: divide by 1 instead of 0 so its similarity is 0.
    similarities = dot_products / np.where(norm_products > 0, norm_products, 1.0)

    # A stable sort on the negated scores gives descending order with ties left in dict order.
    best_first = np.argsort(-similarities, kind='stable')[:topn]
    return [(candidate_words[i], float(similarities[i])) for i in best_first]

# Preferred demonstration word.
sample_word_glove = "educationpolicy"

if sample_word_glove not in glove_embeddings:
    # The preferred word is unknown to GloVe: scan the corpus in document order and
    # take the first token that does have a GloVe vector.
    fallback_token = next(
        (
            token
            for token_list in df['cleaned_tokens']
            for token in token_list
            if token in glove_embeddings
        ),
        None,
    )
    found_in_glove = fallback_token is not None

    if not found_in_glove:
        print("\nCould not find any common words between cleaned_tokens and GloVe vocabulary.")
    else:
        sample_word_glove = fallback_token
        print(f"\n'{sample_word_glove}' from our data was found in GloVe. Displaying vector:")
        print(glove_embeddings[sample_word_glove])
else:
    print(f"\nVector for '{sample_word_glove}' (GloVe):")
    print(glove_embeddings[sample_word_glove])

# Nearest neighbours of whichever sample word ended up selected.
if sample_word_glove not in glove_embeddings:
    print("Cannot find similar words as the sample word is not in GloVe vocabulary.")
else:
    print(f"\nTop 5 similar words to '{sample_word_glove}' (GloVe):")
    similar_words_glove = find_similar_words_glove(sample_word_glove, glove_embeddings, topn=5)
    if not similar_words_glove:
        print(f"Could not find similar words for '{sample_word_glove}' (not in GloVe vocabulary).")
    else:
        for word, similarity in similar_words_glove:
            print(f"{word}: {similarity:.4f}")


'agent' from our data was found in GloVe. Displaying vector:
[ 5.4263e-01 -9.4556e-01  6.3338e-01 -1.1186e+00 -3.3968e-02 -5.0336e-01
  2.6620e-01 -4.3339e-01 -6.8826e-02  1.3260e-01 -1.1673e-01 -7.4280e-01
  2.4098e-01  3.1927e-01  1.4900e-01  2.5225e-01  8.9375e-01  1.3620e-01
 -2.0531e-01  2.2922e-01 -4.5712e-01  3.7074e-02 -9.4237e-01 -2.0870e-01
  1.0498e-01 -2.6186e-02  3.9272e-01 -6.1634e-01 -3.3848e-02  5.2247e-01
 -9.9738e-02  1.4282e+00 -9.5186e-02 -5.8463e-01  1.9847e-01 -2.1672e-01
 -6.6678e-01  2.5273e-01  6.6437e-01  1.1844e-01  4.9575e-01 -2.0699e-01
  5.4199e-01 -2.2103e-01  8.4711e-01 -1.7925e-01 -8.5337e-01 -8.4244e-01
  2.2508e-01 -2.3347e-01  1.0846e+00  3.4772e-01  1.7441e-01  7.9287e-01
  1.6292e-01 -1.2031e+00 -8.5298e-01 -2.0582e-01  1.0274e+00  8.1467e-01
  2.3544e-02  9.9344e-01 -2.2389e-01 -5.0794e-01  6.1108e-01 -8.4063e-01
  1.1529e+00  7.7760e-02  1.0547e-01  5.7986e-01 -4.7526e-01 -1.0866e+00
 -7.3660e-01 -3.4891e-01  1.2625e-01 -3.9275e-01  1.0230e-01 -

## Generate Byte-Pair Encoding (BPE) Embeddings
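Generate Byte-Pair Encoding (BPE)-style subword embeddings from the preprocessed text data. Note that the implementation below trains a `BertWordPieceTokenizer`, i.e. a WordPiece tokenizer (a BPE-like subword algorithm whose continuation pieces are prefixed with `##`), and then trains Word2Vec on the resulting subword tokens.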

In [27]:
!pip install tokenizers
print("Tokenizers library installed.")

Tokenizers library installed.


In [28]:
from tokenizers import BertWordPieceTokenizer
from gensim.models import Word2Vec
import collections

# Training corpus for the subword tokenizer: the preprocessed strings as a plain list.
text_for_bpe_training = df['preprocessed_text_string'].tolist()

# NOTE: although this notebook labels the step "BPE", BertWordPieceTokenizer is a
# WordPiece tokenizer (a BPE-like subword scheme), not classic byte-pair encoding.
tokenizer = BertWordPieceTokenizer(
    lowercase=True,
    strip_accents=True,
    clean_text=True,
    handle_chinese_chars=False,  # the data is assumed to contain no Chinese characters
)

# Learn the subword vocabulary from the corpus.
tokenizer.train_from_iterator(
    text_for_bpe_training,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=30000,   # target vocabulary size requested from the trainer
    min_frequency=2,    # frequency threshold passed to the trainer
    show_progress=True,
)

def tokenize_with_bpe(text):
    """Encode `text` with the trained tokenizer and return its list of subword tokens."""
    return tokenizer.encode(text).tokens

# Tokenize every preprocessed string and store the token lists in a new column.
df['bpe_tokens'] = df['preprocessed_text_string'].apply(tokenize_with_bpe)

print("BPE tokenizer trained and 'bpe_tokens' column added to DataFrame.")
print("Displaying head of DataFrame with new 'bpe_tokens' column:")
print(df[['preprocessed_text_string', 'bpe_tokens']].head())

BPE tokenizer trained and 'bpe_tokens' column added to DataFrame.
Displaying head of DataFrame with new 'bpe_tokens' column:
                            preprocessed_text_string  \
0  agent every development say quality throughout...   
1   serve civil institution everyone publictransport   
2  benefit suggest page southern role movie win n...   
3  law street class great prove reduce raise auth...   
4  detail food shoulder argue start source husban...   

                                          bpe_tokens  
0  [agent, every, develop, ##ment, say, quality, ...  
1  [serve, civil, institution, everyone, publictr...  
2  [be, ##n, ##ef, ##it, suggest, page, southern,...  
3  [law, st, ##ree, ##t, class, great, prove, red...  
4  [detail, food, shoulder, ar, ##g, ##ue, start,...  


In [29]:
from gensim.models import Word2Vec

# Train a gensim Word2Vec model on the 'bpe_tokens' column; each entry is used as
# one sentence (a list of subword tokens).
bpe_word2vec_model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
bpe_word2vec_model.build_vocab(df['bpe_tokens'])
bpe_word2vec_model.train(df['bpe_tokens'], total_examples=bpe_word2vec_model.corpus_count, epochs=10)

print("Word2Vec model trained on BPE tokens successfully.")

# Show one subword embedding and its nearest neighbours, provided the vocabulary is not empty.
if len(bpe_word2vec_model.wv.key_to_index) > 0:
    # Keep the requested subword separate from `sample_bpe_subword` so that the
    # fallback message names the subword that was actually missing.
    requested_bpe_subword = "education"
    if requested_bpe_subword in bpe_word2vec_model.wv.key_to_index:
        sample_bpe_subword = requested_bpe_subword
        print(f"\nVector for BPE subword '{sample_bpe_subword}':")
    else:
        # Fall back to the first key stored in the vocabulary mapping.
        sample_bpe_subword = next(iter(bpe_word2vec_model.wv.key_to_index))
        print(f"\n'{requested_bpe_subword}' was not in BPE vocabulary. Displaying vector for '{sample_bpe_subword}':")
    print(bpe_word2vec_model.wv[sample_bpe_subword])

    # `sample_bpe_subword` is known to be in the vocabulary at this point, so no
    # extra membership check or KeyError handling is required around the lookup.
    print(f"\nTop 5 similar BPE subwords to '{sample_bpe_subword}':")
    similar_bpe_subwords = bpe_word2vec_model.wv.most_similar(sample_bpe_subword, topn=5)
    for word, similarity in similar_bpe_subwords:
        print(f"{word}: {similarity:.4f}")
else:
    print("\nBPE Word2Vec vocabulary is empty. Cannot display vectors or similar subwords.")

Word2Vec model trained on BPE tokens successfully.

Vector for BPE subword 'education':
[ 4.1449177e-03 -1.0060999e-03 -2.8769341e-03 -6.9859810e-03
 -9.9111143e-03  7.1347994e-03  7.6038316e-03  1.0216403e-02
 -7.9620266e-03  7.4384669e-03  7.5606173e-03  4.1001765e-03
 -9.7969752e-03  1.3557842e-03  6.9591422e-03  5.7924306e-03
 -2.0944480e-04 -3.4422602e-03 -7.2046281e-03 -1.8854584e-03
  1.0077420e-02  1.7108109e-03  8.1060181e-04  4.2135911e-03
  7.1855192e-04 -5.4841670e-03 -1.6661159e-04 -1.0205462e-02
  2.9596600e-03 -7.9015046e-03  9.3029784e-03  4.3631825e-03
  1.8954043e-03 -2.5742857e-03  3.4966755e-03 -7.3880595e-03
  4.8619844e-03 -9.2516998e-03 -1.5971689e-03 -1.0558230e-02
  7.1480931e-03  1.4942370e-03  1.4381642e-04  4.3144426e-03
 -3.5410291e-03 -3.6841575e-03  6.7394222e-03  5.5679237e-03
  9.1663823e-03 -3.1387031e-03  4.6052951e-03  1.0650485e-04
 -5.6375782e-03  8.3988084e-04  2.3408670e-03 -1.4302431e-04
  7.0589781e-03  3.8767247e-03  5.9767631e-03  5.3352020e-

## Generate Unigram Language Models Embeddings

Generate Unigram Language Models embeddings from the preprocessed text data (`cleaned_tokens`). This will involve calculating word probabilities based on unigram frequencies, displaying a sample word's probability, and identifying words with similar probabilities.


In [30]:
import collections

# Concatenate the token lists of all documents into one flat list, in corpus order.
all_cleaned_words = []
for sublist in df['cleaned_tokens']:
    all_cleaned_words.extend(sublist)

# Count how often each distinct word occurs.
word_frequencies = collections.Counter(all_cleaned_words)

print("Total unique words:", len(word_frequencies))
print("Total words in corpus:", len(all_cleaned_words))
print("Sample word frequencies (top 10):")
for word, freq in word_frequencies.most_common(10):
    print(f"{word}: {freq}")

Total unique words: 746
Total words in corpus: 1463
Sample word frequencies (top 10):
economicrelief: 24
educationpolicy: 23
healthcarereform: 21
publictransport: 19
environmentallaws: 13
current: 6
police: 6
law: 5
boy: 5
rest: 5


Compute the unigram probability of each word (its count divided by the total number of tokens), display the probability of a sample word ('educationpolicy'), and then list the 5 words whose probabilities are closest to it. Here "similar" means the smallest absolute difference in probability, i.e. similar frequency rather than similar meaning.

In [31]:
# Relative frequency of every word: count divided by the total number of tokens.
total_words = len(all_cleaned_words)
unigram_probabilities = {}
for word, freq in word_frequencies.items():
    unigram_probabilities[word] = freq / total_words

sample_word_unigram = "educationpolicy"

if sample_word_unigram not in unigram_probabilities:
    print(f"\n'{sample_word_unigram}' not found in vocabulary.")
    print(f"Cannot find similar words as '{sample_word_unigram}' is not in the unigram vocabulary.")
else:
    print(f"\nUnigram probability for '{sample_word_unigram}': {unigram_probabilities[sample_word_unigram]:.6f}")
    sample_prob = unigram_probabilities[sample_word_unigram]

    # Pair every other word with its absolute probability gap to the sample word,
    # ordered from the smallest gap (most similar) to the largest.
    similarity_scores = sorted(
        (
            (word, abs(prob - sample_prob))
            for word, prob in unigram_probabilities.items()
            if word != sample_word_unigram
        ),
        key=lambda pair: pair[1],
    )

    print(f"\nTop 5 words with most similar unigram probabilities to '{sample_word_unigram}':")
    for word, diff in similarity_scores[:5]:
        print(f"{word}: (Probability: {unigram_probabilities[word]:.6f}, Abs Diff: {diff:.6f})")


Unigram probability for 'educationpolicy': 0.015721

Top 5 words with most similar unigram probabilities to 'educationpolicy':
economicrelief: (Probability: 0.016405, Abs Diff: 0.000684)
healthcarereform: (Probability: 0.014354, Abs Diff: 0.001367)
publictransport: (Probability: 0.012987, Abs Diff: 0.002734)
environmentallaws: (Probability: 0.008886, Abs Diff: 0.006835)
current: (Probability: 0.004101, Abs Diff: 0.011620)


## Generate Hybrid Character+Word Embeddings

Generate hybrid character+word embeddings from the preprocessed text data (`cleaned_tokens`) by combining character n-gram features with existing Word2Vec embeddings.


In [32]:
from sklearn.feature_extraction.text import CountVectorizer

# 1. Collect the distinct words that occur in the 'cleaned_tokens' column.
# The set is sorted because iteration order over a set of strings is not guaranteed and
# can differ between interpreter runs; sorting gives a reproducible word order for
# everything derived from this list.
all_unique_words = sorted({word for sublist in df['cleaned_tokens'] for word in sublist})
print(f"Number of unique words: {len(all_unique_words)}")

# 2. Character-level CountVectorizer over n-grams of length 2 and 3.
char_ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))

# Fit on the unique words to build the character n-gram vocabulary.
char_ngram_vectorizer.fit(all_unique_words)
print("Character n-gram vectorizer fitted.")
print(f"Number of character n-gram features: {len(char_ngram_vectorizer.get_feature_names_out())}")

Number of unique words: 746
Character n-gram vectorizer fitted.
Number of character n-gram features: 1505


Create a dictionary of hybrid embeddings: for each unique word, compute its character n-gram count vector, look up its Word2Vec embedding (a zero vector if the word is missing from the model), concatenate the two, and store the result under the word.



In [33]:
word_vector_dim = model.vector_size  # dimensionality of the Word2Vec vectors
char_ngram_vector_dim = len(char_ngram_vectorizer.get_feature_names_out())

# 3. Build {word: hybrid vector}, where the hybrid vector is the character n-gram
# count vector followed by the word's Word2Vec embedding.
# NOTE(review): the n-gram part holds raw counts while the Word2Vec components printed
# earlier are much smaller in magnitude, so cosine similarity on the concatenation is
# presumably dominated by the n-gram part. Consider scaling each part; confirm intent.
hybrid_embeddings = {}

# Vectorise all words in one call instead of one transform() call per word;
# row i of the dense matrix is the n-gram count vector of all_unique_words[i].
char_ngram_matrix = char_ngram_vectorizer.transform(all_unique_words).toarray()

for word, char_ngram_representation in zip(all_unique_words, char_ngram_matrix):
    # Word2Vec embedding, or zeros when the word is not in the model.
    word_embedding = model.wv[word] if word in model.wv else np.zeros(word_vector_dim)

    # Concatenate: n-gram counts first, then the word embedding.
    hybrid_embedding = np.concatenate((char_ngram_representation, word_embedding))

    hybrid_embeddings[word] = hybrid_embedding

print(f"Generated hybrid embeddings for {len(hybrid_embeddings)} unique words.")
print(f"Dimension of hybrid embeddings: {len(next(iter(hybrid_embeddings.values())))}")

Generated hybrid embeddings for 746 unique words.
Dimension of hybrid embeddings: 1605


Display the hybrid vector for a sample word ('educationpolicy'), then implement a function that finds the top 5 most similar words based on the cosine similarity of their hybrid embeddings and apply it to the sample word.

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

# 4. Show the hybrid embedding of the sample word, if one was built for it.
sample_word_hybrid = "educationpolicy"
if sample_word_hybrid not in hybrid_embeddings:
    print(f"\n'{sample_word_hybrid}' not found in hybrid embeddings vocabulary.")
else:
    print(f"\nHybrid embedding for '{sample_word_hybrid}':")
    print(hybrid_embeddings[sample_word_hybrid])

# 5. Nearest-neighbour lookup over the hybrid embeddings.
def find_similar_hybrid_words(word, hybrid_embeddings_dict, topn=5):
    """Return the `topn` words whose hybrid vectors are most cosine-similar to `word`.

    All similarities are computed in one vectorised NumPy pass instead of one
    pairwise-similarity call per vocabulary entry.

    Args:
        word: Query word.
        hybrid_embeddings_dict: Mapping from word to a 1-D NumPy vector; all
            vectors must have the same length.
        topn: Number of neighbours to return.

    Returns:
        A list of `(word, similarity)` tuples sorted by descending similarity,
        with similarities as Python floats. Words with equal similarity keep
        the iteration order of `hybrid_embeddings_dict`. Returns an empty list
        when `word` is not a key or when there is no other word to compare
        against.
    """
    if word not in hybrid_embeddings_dict:
        return []

    # Every key except the query itself.
    candidate_words = [vocab_word for vocab_word in hybrid_embeddings_dict if vocab_word != word]
    if not candidate_words:
        return []

    query_vector = np.asarray(hybrid_embeddings_dict[word]).ravel()
    candidate_matrix = np.stack([hybrid_embeddings_dict[vocab_word] for vocab_word in candidate_words])

    dot_products = candidate_matrix @ query_vector
    # Row-wise squared norms via einsum avoid a full-size temporary of squared values.
    candidate_norms = np.sqrt(np.einsum('ij,ij->i', candidate_matrix, candidate_matrix))
    norm_products = candidate_norms * np.linalg.norm(query_vector)
    # A zero vector has no direction: divide by 1 instead of 0 so its similarity is 0.
    similarities = dot_products / np.where(norm_products > 0, norm_products, 1.0)

    # A stable sort on the negated scores gives descending order with ties left in dict order.
    best_first = np.argsort(-similarities, kind='stable')[:topn]
    return [(candidate_words[i], float(similarities[i])) for i in best_first]

# Look up and print the nearest neighbours of the sample word (skipped when the
# sample word has no hybrid embedding).
if sample_word_hybrid in hybrid_embeddings:
    print(f"\nTop 5 similar words to '{sample_word_hybrid}' (Hybrid Embeddings):")
    similar_hybrid_words = find_similar_hybrid_words(sample_word_hybrid, hybrid_embeddings, topn=5)
    if not similar_hybrid_words:
        print(f"Could not find similar words for '{sample_word_hybrid}'.")
    else:
        for word, similarity in similar_hybrid_words:
            print(f"{word}: {similarity:.4f}")



Hybrid embedding for 'educationpolicy':
[ 0.          0.          0.         ... -0.00321824 -0.00937957
  0.00448368]

Top 5 similar words to 'educationpolicy' (Hybrid Embeddings):
education: 0.7453
policy: 0.5772
police: 0.4489
station: 0.4061
political: 0.3975
