In [None]:
from datasets import load_dataset

# Load the "20220301.en" configuration of the Hugging Face `wikipedia` dataset.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script to
# run locally -- keep it only for sources you trust.
data = load_dataset("wikipedia", "20220301.en", trust_remote_code=True)

### Loading the English Wikipedia text data available from Hugging Face

https://huggingface.co/datasets/wikipedia

In [6]:
# Inspect the container type returned by load_dataset
type(data)

datasets.dataset_dict.DatasetDict

Converting one split of the DatasetDict to a pandas DataFrame

In [7]:
from datasets import DatasetDict
import pandas as pd

# Convert the 'train' split to a DataFrame.
# NOTE(review): to_pandas() materialises the whole split in memory (about 6.46M rows
# according to the len() check in the next cell) even though only a sample is used later.
train_df = data['train'].to_pandas()

# Preview the first rows (columns shown in the output: id, url, title, text)
print(train_df.head())

    id                                      url      title  \
0   12  https://en.wikipedia.org/wiki/Anarchism  Anarchism   
1   25     https://en.wikipedia.org/wiki/Autism     Autism   
2   39     https://en.wikipedia.org/wiki/Albedo     Albedo   
3  290          https://en.wikipedia.org/wiki/A          A   
4  303    https://en.wikipedia.org/wiki/Alabama    Alabama   

                                                text  
0  Anarchism is a political philosophy and moveme...  
1  Autism is a neurodevelopmental disorder charac...  
2  Albedo (; ) is the measure of the diffuse refl...  
3  A, or a, is the first letter and the first vow...  
4  Alabama () is a state in the Southeastern regi...  


In [8]:
# Number of rows (articles) in the train DataFrame
len(train_df)

6458670

## Sampling out 99999 Rows initially

In [20]:
# Draw a random subset of train_df so the later preprocessing steps stay tractable
df = train_df.sample(n=99999, random_state=42)  # random_state for reproducibility

# Now df contains a random sample of 99,999 rows from train_df

In [21]:
# Coerce every entry of the 'text' column to a Python str
text_data = df['text'].map(str)

In [22]:
# Display the sampled text Series (pandas truncates the middle rows in the output)
text_data

5061375    Manmohan Tiwari (born 20 January 1984) is an I...
558397     Everything Is Wrong is the third studio album ...
4552231    Avalanche Canada is a non-government, non-prof...
297554     The fukiya (吹き矢) is the Japanese blowgun, as w...
426754     Vasanta or Vasantha may refer to:\n\n Vasanta ...
                                 ...                        
6209050    John J. Cronin is a Massachusetts state senato...
4824072    Radiate is a mobile app that connects people g...
4526894    Eddie Castrodad is an American former film, te...
4309994    Qaleh-ye Chum (, also Romanized as Qal‘eh-ye C...
3288019    Mahogany Bluff () is a rocky bluff  southwest ...
Name: text, Length: 99999, dtype: object

### Having sampled out about 99k rows we will move to pre-process the text

### Pre-processing the text: cleaning and preparing it for Word2Vec training

1) Lowercase all text.
2) Remove punctuation.
3) Leveraged the stopwords API from NLTK to remove unnecessary stopwords.
4) Leveraged the word tokenizer API from NLTK to tokenise the words.

In [23]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the NLTK resources used below are available locally. The calls are
# cheap no-ops when the packages are already up to date, and without them a fresh
# environment fails on stopwords.words() / word_tokenize() with a LookupError.
nltk.download('punkt')
nltk.download('stopwords')

# English stop words as a set for O(1) membership tests in remove_stopwords()
stop_words = set(stopwords.words('english'))

def lowercase(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Underscores are treated as word characters by the pattern, so they are kept.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

def remove_stopwords(text):
    """Drop whitespace-separated words found in the module-level ``stop_words`` set.

    The surviving words are re-joined with single spaces, so runs of whitespace in
    the input are collapsed.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    token_list = word_tokenize(text)
    return token_list

[nltk_data] Downloading package punkt to /home/km.s/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/km.s/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Applying Preprocessing

In [24]:
# Clean each document in one pass: lowercase -> strip punctuation -> drop stop words
def _clean_document(text):
    return remove_stopwords(remove_punctuation(lowercase(text)))

preprocessed_text = text_data.apply(_clean_document)
tokens = preprocessed_text.apply(tokenize)

# Concatenate the per-document token lists into one flat list for vocabulary building
flat_list_tokens = []
for document_tokens in tokens.tolist():
    flat_list_tokens.extend(document_tokens)

In [25]:
# Total number of tokens across all sampled documents
len(flat_list_tokens)

30582358

In [26]:
# One whitespace-split word list per cleaned document
text_list = []
for document in preprocessed_text:
    text_list.append(document.split())

In [27]:
# Number of documents in text_list (one entry per sampled row)
len(text_list)

99999

In [28]:
# Confirm the outer container is a plain Python list
type(text_list)

list

In [29]:
# Count how many entries of text_list are themselves lists
count_of_lists = sum(1 for item in text_list if isinstance(item, list))

print(f"There are {count_of_lists} lists inside the list.")

There are 99999 lists inside the list.


### Approach 2: Continuous Bag of Words (CBOW)

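* Continuous bag-of-words (CBOW) model: predicts the current word from the words within a certain range before and after it in the same sentence. This is the reverse of the continuous skip-gram model, which predicts the words within that range from the current word. A worked example of CBOW is given below.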

In [30]:
# Run the cleaning pipeline step by step (the same steps as the earlier
# "Applying Preprocessing" cell, recomputed here from text_data)
preprocessed_text = text_data
for cleaning_step in (lowercase, remove_punctuation, remove_stopwords):
    preprocessed_text = preprocessed_text.apply(cleaning_step)
tokens = preprocessed_text.apply(tokenize)

# Flatten the per-document token lists into one list for vocabulary building
flat_list_tokens = [token for document_tokens in tokens.tolist() for token in document_tokens]

In [31]:
# Split every cleaned document on whitespace, giving one word list per row
text_list = list(map(str.split, preprocessed_text))

In [32]:
from collections import Counter

def build_vocab(tokens):
    """Map each distinct token to a unique integer index.

    Indices run from 0 to ``n_distinct - 1`` and are assigned in order of first
    appearance, so the mapping is identical on every run for the same input.

    Args:
        tokens: Iterable of hashable tokens (duplicates allowed).

    Returns:
        dict mapping token -> index.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a set of
    # strings can yield a different order in each interpreter session because of
    # hash randomisation, which made the indices non-reproducible.
    vocab = {word: idx for idx, word in enumerate(dict.fromkeys(tokens))}
    return vocab

def encode_tokens(tokens, vocab):
    """Translate a sequence of tokens into their integer ids from ``vocab``.

    Raises:
        KeyError: if a token is missing from ``vocab``.
    """
    encoded = []
    for token in tokens:
        encoded.append(vocab[token])
    return encoded

# Build the lookup table from the corpus-wide token list, then encode each document
vocab = build_vocab(flat_list_tokens)
encoded_tokens = list(map(lambda token_list: encode_tokens(token_list, vocab), tokens))

# Example output
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 1040482


In [33]:
# Sanity check: every encoded document should itself be a list
number_of_lists = len([item for item in encoded_tokens if isinstance(item, list)])
print(number_of_lists)

99999


### Building Word2Vec Model
With our data preprocessed, now we define and train our Word2Vec model. I have used the Gensim library, which is straightforward for training Word2Vec models.

### Word2Vec Parameters
* **sentences**: A list of lists holding the preprocessed text (one inner list of words per document), which Gensim uses to train Word2Vec so that it captures context and semantic relationships

* **vector_size**: The number of dimensions of each word vector (e.g. 2D, 3D; 100D in our case); more dimensions give the model room to capture complex semantic relationships

* **window**: number of words to be observed to catch context. The window size determines the span of words on either side of a target_word that can be considered a context word

* **sg**: Selects the training approach: `1` for the Skip-gram approach, `0` for the CBOW approach

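* **min_count**: Words whose total frequency is lower than this value are ignored; it is set to 1 below, so every word is kept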

* **epochs**: Number of passes over the data during training; it is not set below, so the Gensim default is used

Also we will be building models with both CBOW and Skip-gram. And we will compare the performance of the two models on this dataset alone.

In [34]:
from gensim.models import Word2Vec

# Train a CBOW model (sg=0) on text_list, the list of per-document word lists.
# Passing `sentences` to the constructor builds the vocabulary and trains right away,
# so no separate train() call is needed.
# NOTE(review): min_count=1 keeps every word, including ones seen only once, and no
# seed is set, so the similarity scores in the cells below may vary between runs.
cbow_model = Word2Vec(sentences = text_list, vector_size=100, window=5, sg=0, min_count=1)
# Unused alternative kept for reference (explicit extra training pass):
# cbow_model.train(encoded_tokens, total_examples=len(encoded_tokens), epochs=10)

# Now you can compare Skip-gram and CBOW based on:
# - Quality of the embeddings via similarity measures or analogy tasks
# - Performance on downstream tasks (if any)

* Training the CBOW model was significantly quicker than the skip gram approach

In [35]:
# Words to compare
word1 = 'sun'
word2 = 'cloth'

# Guard against out-of-vocabulary words before querying the CBOW model.
# The labels name the CBOW model because that is the model actually queried here.
if word1 in cbow_model.wv.key_to_index and word2 in cbow_model.wv.key_to_index:
    similarity_cbow = cbow_model.wv.similarity(word1, word2)
    print(f'CBOW model similarity between {word1} and {word2}: {similarity_cbow}')
else:
    print('One or both words not in CBOW model vocabulary.')

Skip-gram model similarity between sun and cloth: 0.22371233999729156


In [36]:
# Query word for the nearest-neighbour lookup
input_word = 'example'  # Replace 'example' with your actual input word

# Uses cbow_model, the CBOW model trained above
# Find the 10 words closest to the input word
closest_words = cbow_model.wv.most_similar(input_word, topn=10)

# Each entry is a (word, similarity) pair
print(f"10 words closest to '{input_word}':")
for word, similarity in closest_words:
    print(f"{word}: {similarity}")

10 words closest to 'example':
instance: 0.8416684865951538
examples: 0.7914485335350037
likewise: 0.7651951313018799
particular: 0.7328760027885437
contrast: 0.7312443852424622
clearly: 0.7232483625411987
similarly: 0.7210996150970459
necessarily: 0.7077968716621399
manner: 0.6973150372505188
rather: 0.695795476436615


In [44]:
# List the first 10 words in the CBOW model's vocabulary.
# The name and label refer to CBOW because cbow_model is the model being inspected.
cbow_vocab = list(cbow_model.wv.key_to_index.keys())
print("First 10 words in CBOW model's vocabulary:")
print(cbow_vocab[:10])

First 10 words in Skip-gram model's vocabulary:
['also', 'first', 'new', 'references', 'one', 'people', 'american', 'two', 'united', 'university']


In [39]:
import numpy as np

# Query word whose least similar vocabulary entries we want
input_word = 'sun'

# Vocabulary keys in index order, aligned with the similarity array computed below
all_words = list(cbow_model.wv.index_to_key)

# With topn=None gensim returns the cosine similarity between the query and every
# vocabulary key as one array. This replaces one Python-level similarity() call per
# word (about a million calls for this vocabulary) with a single vectorised operation.
all_similarities = cbow_model.wv.most_similar(input_word, topn=None)
similarities = list(zip(all_words, all_similarities))

# Sort the words by similarity in ascending order (least similar first); a stable
# sort keeps tied words in vocabulary order, as sorted() would
ascending_order = np.argsort(all_similarities, kind='stable')
least_similar_words = [similarities[i] for i in ascending_order]

# Display the 10 least similar words to the input word
print(f"10 words least similar to '{input_word}':")
for word, similarity in least_similar_words[:10]:
    print(f"{word}: {similarity}")

10 words least similar to 'sun':
zistersdorf: -0.4427976906299591
arithmomaniac: -0.44182586669921875
mattani: -0.42715543508529663
schünemanns: -0.4065355360507965
hostagetakers: -0.40505439043045044
atiglio: -0.4049690365791321
ollerías: -0.4011055529117584
acalyphus: -0.4006273150444031
ghid: -0.39958053827285767
savene: -0.3926926553249359


In [40]:
# Similarity between two words. The printed label is built from the words actually
# queried so it cannot disagree with the computation.
# NOTE(review): 'man' vs 'women' mixes singular and plural -- confirm the intended pair.
word_a, word_b = 'man', 'women'
pair_similarity = cbow_model.wv.similarity(word_a, word_b)
print(f'Similarity between {word_a} and {word_b}: {pair_similarity}')

Similarity between king and queen: 0.46683260798454285


In [45]:
# Cosine similarity between two (already lowercase) vocabulary words
cbow_model.wv.similarity('laptop', 'pc')

0.65765923

In [41]:
# Analogy query (king - man + woman): print the single best match with its score
analogy_top_match = cbow_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(analogy_top_match)

[('queen', 0.7001268863677979)]


In [43]:
from gensim.models import Word2Vec

# Define the analogy components: words in `positive` pull the query towards them,
# words in `negative` push it away (king - man + woman)
positive = ['king', 'woman']
negative = ['man']

# Find the words that complete the analogy; with topn=1 the result is a
# one-element list holding a (word, score) tuple
result = cbow_model.wv.most_similar(positive=positive, negative=negative, topn=1)

# Print the result (result[0][0] is the best-matching word)
print(f"man is to king as woman is to {result[0][0]}")

man is to king as woman is to queen


This test suggests that the model has learned meaningful word relationships and performs well on this analogy example.

## Odd one out Test

In [46]:
# Save model vectors and metadata for projector
# saved_vectors = cbow_model.wv.save_word2vec_format('vecs.tsv', 'meta.tsv', binary=False)