In [1]:
# Using the Hugging Face datasets library, we load the NewsQA dataset.
# Since we don't need the entire thing, we take only the first 1000 samples.
# From each data entry, we extract the "context" field;
# this gives us a collection of real-world English sentences from news articles.

from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import pandas as pd
import nltk
import multiprocessing

# Download the NLTK tokenizer resources 'punkt' and 'punkt_tab'.
# Presumably these back the sent_tokenize function imported above — confirm
# against the installed NLTK version. Re-running is harmless: the recorded
# output reports both packages as already up-to-date.
# The second call is the cell's last expression, so its return value is what
# the notebook displays (True in the recorded output).
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LOCALACCOUNT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LOCALACCOUNT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Load the training split of the NewsQA dataset and keep only its first
# 1000 rows; the full dataset is not needed here.
dataset = load_dataset("lucadiliello/newsqa", split="train").select(range(1000))

# Gather the "context" text of every selected row, skipping rows whose
# context is missing or empty.
texts = [row['context'] for row in dataset if row.get('context')]




In [3]:
# Word2Vec is trained on tokenized sentences rather than on long paragraphs,
# so each text is first split into sentences with NLTK's sent_tokenize and
# every sentence is then run through Gensim's simple_preprocess (lowercasing,
# cleaning, tokenizing). Only sentences that yield more than two tokens are
# kept; shorter ones are dropped.
sentences = [
    tokens
    for text in texts
    for tokens in map(simple_preprocess, sent_tokenize(text))
    if len(tokens) > 2
]


In [None]:
# Train a Word2Vec model on the tokenized sentences using the CBOW
# (Continuous Bag of Words) architecture.
cbow_model = Word2Vec(
    sentences=sentences,
    # --- architecture ---
    sg=0,             # 0 selects CBOW (1 would select skip-gram)
    vector_size=100,  # dimensionality of the learned word vectors
    window=10,        # max distance between the current and predicted word
    # --- vocabulary ---
    min_count=2,      # ignore words whose total frequency is below this
    # --- training loop ---
    epochs=10,        # number of passes over the corpus
    workers=multiprocessing.cpu_count(),  # one worker thread per CPU core
)

