In [None]:
# !pip install nltk
# !pip install gensim
# !pip install scikit-learn
# !pip install matplotlib
# !pip install requests

### Loading the wikipedia English text Data available from Hugging Face

https://huggingface.co/datasets/wikipedia

In [2]:
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.en", trust_remote_code=True)

In [3]:
type(data)

datasets.dataset_dict.DatasetDict

Converting DatasetDict type to Pandas Dataframe

In [5]:
from datasets import DatasetDict
import pandas as pd

# Convert a specific split to DataFrame
train_df = data['train'].to_pandas()

# Now you can view the DataFrame
print(train_df.head())

    id                                      url      title  \
0   12  https://en.wikipedia.org/wiki/Anarchism  Anarchism   
1   25     https://en.wikipedia.org/wiki/Autism     Autism   
2   39     https://en.wikipedia.org/wiki/Albedo     Albedo   
3  290          https://en.wikipedia.org/wiki/A          A   
4  303    https://en.wikipedia.org/wiki/Alabama    Alabama   

                                                text  
0  Anarchism is a political philosophy and moveme...  
1  Autism is a neurodevelopmental disorder charac...  
2  Albedo (; ) is the measure of the diffuse refl...  
3  A, or a, is the first letter and the first vow...  
4  Alabama () is a state in the Southeastern regi...  


In [6]:
len(train_df['text'])

6458670

## Sampling out 99999 Rows initially

In [7]:
# Assuming train_df is your original DataFrame
df = train_df.sample(n=99999, random_state=42)  # random_state for reproducibility

# Now df contains a random sample of 99999 rows from train_df

In [8]:
text_data = df['text'].apply(str)  # Ensure all data is string

In [9]:
text_data

5061375    Manmohan Tiwari (born 20 January 1984) is an I...
558397     Everything Is Wrong is the third studio album ...
4552231    Avalanche Canada is a non-government, non-prof...
297554     The fukiya (吹き矢) is the Japanese blowgun, as w...
426754     Vasanta or Vasantha may refer to:\n\n Vasanta ...
                                 ...                        
6209050    John J. Cronin is a Massachusetts state senato...
4824072    Radiate is a mobile app that connects people g...
4526894    Eddie Castrodad is an American former film, te...
4309994    Qaleh-ye Chum (, also Romanized as Qal‘eh-ye C...
3288019    Mahogany Bluff () is a rocky bluff  southwest ...
Name: text, Length: 99999, dtype: object

### Having sampled out about 99k rows we will move to pre-process the text

### Pre-processing the text, cleaning and preparing it for 

1) Lowercase all text.
2) Remove Punctuations.
3) Leaverged Stopwords API from NLTK to remove unnecessary stopwords.
4) Leaveraged word tokenizer API from NLTK to tokenise the words.

In [35]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def lowercase(text):
    return text.lower()

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

def tokenize(text):
    return word_tokenize(text)

[nltk_data] Downloading package punkt to /home/km.s/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/km.s/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Applying Preprocessing

In [11]:
# Apply preprocessing steps
preprocessed_text = text_data.apply(lowercase).apply(remove_punctuation).apply(remove_stopwords)
tokens = preprocessed_text.apply(tokenize)

# Flattening the list of tokens for building vocabulary
flat_list_tokens = [item for sublist in tokens.tolist() for item in sublist]

In [12]:
len(flat_list_tokens)

30582358

In [13]:
type(flat_list_tokens)

list

In [14]:
preprocessed_text

5061375    manmohan tiwari born 20 january 1984 indian te...
558397     everything wrong third studio album american e...
4552231    avalanche canada nongovernment nonprofit organ...
297554     fukiya 吹き矢 japanese blowgun well term associat...
426754     vasanta vasantha may refer vasanta ritu basant...
                                 ...                        
6209050    john j cronin massachusetts state senator repr...
4824072    radiate mobile app connects people going event...
4526894    eddie castrodad american former film televisio...
4309994    qalehye chum also romanized qalehye chūm villa...
3288019    mahogany bluff rocky bluff southwest cape gord...
Name: text, Length: 99999, dtype: object

In [15]:
# Convert each row of text into a list of words
text_list = [text.split() for text in preprocessed_text]

# Text ready for GENSIM
* Now text_list is in the correct format for Word2Vec
* It is a list of lists, where each sublist is a sequence of words (tokens) from a single document, which can be fed into the GENSIM Model.

NOTE: Processing main dataset - the list of Tokens have now taken up more than 3 hours... Moving to a simpler Dataset

In [16]:
from collections import Counter

def build_vocab(tokens):
    vocab = {word: idx for idx, word in enumerate(set(tokens))}
    return vocab

def encode_tokens(tokens, vocab):
    encoded = [vocab[token] for token in tokens]
    return encoded

vocab = build_vocab(flat_list_tokens)
encoded_tokens = [encode_tokens(token_list, vocab) for token_list in tokens]

# Example output
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 1040482


In [17]:
number_of_lists = sum(isinstance(item, list) for item in encoded_tokens)
print(number_of_lists)

99999


### Building Word2Vec Model
With our data preprocessed, now we define and train our Word2Vec model. I have used the Gensim library, which is straightforward for training Word2Vec models.

### Word2Vec Parameters
* **sentences** : List of Lists inside which preprocessed text sits, which is used by GENSIM to train word2vec based on the context and sematic relationships captured

* **vector_size**: It is the size of dimension's (2D, 3D, in our case 100D) which is being initialized to help the model catch complex semntic relationships

* **window**: number of words to be observed to catch context. The window size determines the span of words on either side of a target_word that can be considered a context word

* **sg**: Setting the Approach (1-Skip Gram approach, else if the status is set to "0", then it is CBOW approach)

* **min_count**: 

* **epochs**: Number of Loops over the data for 

Also we will be building models with both CBOW and Skip-gram. And we will compare the performance of the two models on this dataset alone.

### Approach 1: Skip-Gram Approach

* Continuous bag-of-words model: predicts the middle word based on surrounding context words. The context consists of a few words before and after the current (middle) word. This architecture is called a bag-of-words model as the order of words in the context is not important.

In [18]:
from gensim.models import Word2Vec

# Define Word2Vec model - Using Skip-gram (sg=1)
model = Word2Vec(sentences=text_list, vector_size=100, window=5, sg=1, min_count=1, epochs=10)

In [19]:
# from gensim.models import Word2Vec

# Assuming your Gensim model is loaded in the variable `model`

# Define the analogy components
positive = ['king', 'woman']
negative = ['man']

# Find the words that complete the analogy
result = model.wv.most_similar(positive=positive, negative=negative, topn=1)

# Print the result
print(f"man is to king as woman is to {result[0][0]}")

man is to king as woman is to throne


## The Skip-gram approach has failed the Analogy test. Whereas thr CBOW approach performed fine.

## Next we will be comparing both the approaches. 

A brief summary might include observations on which model captures semantic relationships better, their training time comparison, and any differences in performance on specific tasks (e.g., analogy solving).



In [24]:
# Words to compare
word1 = 'sun'
word2 = 'cloth'

# Check if both words are in the vocabulary of the model (assuming Skip-gram model here)
if word1 in model.wv.key_to_index and word2 in model.wv.key_to_index:
    similarity_skip_gram = model.wv.similarity(word1, word2)
    print(f'Skip-gram model similarity between {word1} and {word2}: {similarity_skip_gram}')
else:
    print(f'One or both words not in Skip-gram model vocabulary.')

Skip-gram model similarity between sun and cloth: 0.336364209651947


In [25]:
# Specify your input word
input_word = 'example'

# Assuming 'model' is your Skip-gram trained model
# Find the 10 words closest to the input word
closest_words = model.wv.most_similar(input_word, topn=10)

print(f"10 words closest to '{input_word}':")
for word, similarity in closest_words:
    print(f"{word}: {similarity}")

10 words closest to 'example':
instance: 0.9114886522293091
examples: 0.8914328217506409
particular: 0.8338468074798584
contrast: 0.8154786229133606
consider: 0.8134335875511169
likewise: 0.8124775886535645
similarly: 0.8118988871574402
therefore: 0.8118278384208679
necessarily: 0.8104322552680969
possible: 0.8036367297172546


In [26]:
import numpy as np

# Specify your input word
input_word = 'sun'  # Replace 'example' with your actual input word

# Assuming 'model' is your Skip-gram trained model
# Retrieve all words from the model's vocabulary
all_words = list(model.wv.key_to_index.keys())

# Calculate similarity of input word with all other words in the vocabulary
similarities = [(word, model.wv.similarity(input_word, word)) for word in all_words]

# Sort the words by similarity in ascending order (least similar first)
least_similar_words = sorted(similarities, key=lambda x: x[1])

# Display the 10 least similar words to the input word
print(f"10 words least similar to '{input_word}':")
for word, similarity in least_similar_words[:10]:
    print(f"{word}: {similarity}")

10 words least similar to 'sun':
id977: -0.3495708703994751
kargo: -0.3426513969898224
sw20: -0.3407485783100128
421778: -0.32830116152763367
manogi: -0.3118351399898529
460836: -0.30719104409217834
id919: -0.2978152334690094
04490: -0.29093706607818604
id971: -0.28943178057670593
846th: -0.2891865372657776


In [27]:
# List first 10 words in Skip-gram model's vocabulary
skip_gram_vocab = list(model.wv.key_to_index.keys())
print("First 10 words in Skip-gram model's vocabulary:")
print(skip_gram_vocab[:10])

First 10 words in Skip-gram model's vocabulary:
['also', 'first', 'new', 'references', 'one', 'people', 'american', 'two', 'united', 'university']


## Looks like we found the Issue

suggests that the words in your models' vocabularies are represented by numerical IDs rather than by the actual words themselves. This typically happens when the input data to the Word2Vec model is preprocessed into numerical form (e.g., indexed or encoded) rather than being left as raw text tokens.

Debugging: If retraining is not immediately feasible or if you're curious about the current state of your models, you might want to investigate further by checking the preprocessing steps to see how these numeric IDs were generated from text. This could involve reviewing the tokenization, encoding, or any custom preprocessing logic that we have applied.


In [28]:
# Correct format: List of sentences, where each sentence is a list of word tokens
correct_format_example = [['hello', 'world'], ['word2vec', 'model']]

# Then, train your models again with this format
skip_gram_model = Word2Vec(sentences=correct_format_example, vector_size=100, window=5, sg=1, min_count=1)

# Step 3: Evaluate Word Embeddings
3.1 Compute Similarity Between Word Pairs

In [29]:
similarity_king_queen = model.wv.similarity('man', 'women')
print(f'Similarity between king and queen: {similarity_king_queen}')

Similarity between king and queen: 0.5139315724372864


In [30]:
print('studio' in model.wv.key_to_index)  # For Gensim version 4.x
# or for older versions: 'canada' in model.wv.vocab

True


In [34]:
model.wv.similarity('laptop'.lower(), 'pc'.lower())

0.52500683

In [32]:
print(model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))

[('throne', 0.7363272309303284)]


## 3.3 Visualization

For visualization, you can use the Embedding Projector by exporting the model's embeddings and metadata. Here's a quick guide to do so:

In [33]:
# Save model vectors and metadata for projector
# saved_vectors = model.wv.save_word2vec_format('vecs.tsv', 'meta.tsv', binary=False)

Now, Upload vecs.tsv and meta.tsv to the TensorFlow Embedding Projector for visualization.

* Save your model vectors and metadata correctly for the TensorFlow Embedding Projector

## This provide qualitative evidence of your model's performance and insights into the data it was trained on.

In [None]:
from sklearn.cluster import KMeans

# Assume you want to create 10 clusters
kmeans = KMeans(n_clusters=10, random_state=0).fit(vectors)

# Check the cluster assignments for each word
word_clusters = kmeans.labels_
for word, cluster in zip(words, word_clusters):
    print(f"{word}: Cluster {cluster}")

alternative ways to visualize and evaluate your Word2Vec embeddings using Python libraries such as matplotlib, seaborn, and especially plotly for interactive visualizations. Additionally, you can use dimensionality reduction techniques like PCA (Principal Component Analysis) or t-SNE (t-Distributed Stochastic Neighbor Embedding) to reduce the embeddings to two or three dimensions, which are easier to visualize.

### Visualization with PCA or t-SNE

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Extract the word vectors from the model
words = list(model.wv.key_to_index.keys())
vectors = [model.wv[word] for word in words]

# Use t-SNE to reduce dimensionality
tsne = TSNE(n_components=2, random_state=0)
vectors_2d = tsne.fit_transform(vectors)

# Plotting the results
plt.figure(figsize=(10, 10))
plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], edgecolors='k', c='r')
for word, (x, y) in zip(words, vectors_2d):
    plt.text(x, y, word)
plt.show()

In [53]:
result = model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

[('throne', 0.7547318935394287)]


In [54]:
import numpy as np

# Assuming you have a Word2Vec model named 'model'
vectors = np.array([model.wv[word] for word in model.wv.key_to_index.keys()])

# Save the vectors to a .npy file
np.save('skip_gram_word_embeddings.npy', vectors)

# Optionally, you might also want to save the corresponding words (metadata) to a separate file
words = list(model.wv.key_to_index.keys())
with open('metadata.txt', 'w', encoding='utf-8') as f:
    for word in words:
        f.write(word + '\n')

The .npy file format, which is a binary file format for storing NumPy arrays, offers a convenient way to persist and later access your Word2Vec embeddings in Python. Here are several scenarios where the .npy file containing your word embeddings might be useful:

1. Transfer Learning and Feature Extraction
Machine Learning Models: Use the word embeddings as features in machine learning models for various NLP tasks such as sentiment analysis, text classification, or clustering. Embeddings can provide rich, pre-trained representations of words that capture semantic relationships, improving model performance.

Deep Learning Architectures: Embeddings can be used as the initial layer in deep learning models for tasks like sequence modeling, language translation, or text generation. Using pre-trained embeddings can speed up training and improve model performance, especially when you have limited labeled data for training.

2. Data Analysis and Visualization
Similarity and Clustering Analyses: Load the embeddings to perform cosine similarity calculations between words, cluster words based on their embeddings, or identify outlier words. These analyses can reveal semantic structures in your dataset.

Visualization: Beyond TensorFlow's Embedding Projector, you can use tools like t-SNE or PCA for dimensionality reduction on the embeddings and then visualize them with libraries such as matplotlib, seaborn, or plotly. This can help you understand the embeddings' quality and the semantic relationships they've captured.

3. Natural Language Processing Applications
Information Retrieval: Enhance search algorithms or recommender systems by using word embeddings to find semantically related documents or items based on textual descriptions.

Text Generation and Auto-completion: Implement models that generate text or complete sentences where embeddings serve as a foundational layer, capturing linguistic nuances.

4. Combining with Other Models
Ensemble Models: Combine the embeddings with other data sources and models. For example, use embeddings alongside image features in a multimodal model that understands both text and visual input.
5. Educational and Experimental Purposes
Learning and Experimentation: Load the embeddings in different environments to experiment with new NLP libraries or to teach concepts related to word embeddings and their applications.


import numpy as np

* Load the embeddings
embeddings = np.load('word_embeddings.npy')

* If you've also saved the words in a separate file, you can load them to map embeddings back to words
with open('metadata.txt', 'r', encoding='utf-8') as f:
    words = [line.strip() for line in f]

* Example: Access the embedding for the first word
first_word_embedding = embeddings[0]
print(f"Embedding for the first word: {first_word_embedding}")



**Storing your embeddings in an .npy file thus provides a versatile, efficient way to reuse pre-trained word representations across a wide range of applications and analyses, enhancing the overall NLP workflow.** 

**Performing Analogy Reasoning**

Analogy reasoning involves finding a word that relates to another word in the same way that a third word relates to a fourth. For example, "man" is to "king" as "woman" is to "queen". To perform this task with your embeddings, you need the embeddings loaded (as you have) and a way to compute similarities:

In [55]:
from sklearn.metrics.pairwise import cosine_similarity

# Assuming you have a function to get the embedding for a word
def get_embedding(word, word_index, embeddings):
    idx = word_index[word]
    return embeddings[idx]

# Function to find the closest word for the vector created by "word_a - word_b + word_c"
def find_closest_word(word_a, word_b, word_c, word_index, embeddings):
    embedding_a = get_embedding(word_a, word_index, embeddings)
    embedding_b = get_embedding(word_b, word_index, embeddings)
    embedding_c = get_embedding(word_c, word_index, embeddings)
    analogy_vector = embedding_a - embedding_b + embedding_c

    # Compute similarity with all words
    similarities = cosine_similarity([analogy_vector], embeddings)[0]
    
    # Exclude the original words from consideration
    for word in [word_a, word_b, word_c]:
        idx = word_index[word]
        similarities[idx] = -1
        
    closest_word_idx = similarities.argmax()
    # Assuming you have a list or array `index_word` that maps indices back to words
    return index_word[closest_word_idx]

# Example usage:
# word_index = {word: idx for idx, word in enumerate(words)}  # Assuming 'words' is your list of words
# index_word = {idx: word for word, idx in word_index.items()}
# print(find_closest_word('king', 'man', 'woman', word_index, embeddings))

This method is effective for exploring semantic relationships captured by your embeddings. To get this code running:

Ensure you have precomputed word embeddings (embeddings) and a mapping of words to their indices in the embedding matrix (word_index and its inverse index_word).
Make sure scikit-learn is installed for cosine_similarity.
The example usage provided in the comment is a good starting point. You would just need to replace the placeholders (word_index, index_word, embeddings) with your actual data structures.