In [31]:
# ===== (a) Data preparation =====
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# Load and clean textual document
with open("CBOW.txt", "r", encoding="utf-8") as f:
    text = f.read().lower()

# Tokenize and remove stopwords
tokens = [word for word in simple_preprocess(text) if word not in STOPWORDS]

# Convert into list of sentences (Word2Vec expects list of lists)
sentences = [tokens]
print("Sample tokens (after stopword removal):", sentences[0][:500])



Sample tokens (after stopword removal): ['speed', 'transmission', 'important', 'point', 'difference', 'viruses', 'influenza', 'shorter', 'median', 'incubation', 'period', 'time', 'infection', 'appearance', 'symptoms', 'shorter', 'serial', 'interval', 'time', 'successive', 'cases', 'covid', 'virus', 'serial', 'interval', 'covid', 'virus', 'estimated', 'days', 'influenza', 'virus', 'serial', 'interval', 'days', 'means', 'influenza', 'spread', 'faster', 'covid', 'transmission', 'days', 'illness', 'potentially', 'pre', 'symptomatic', 'transmission', 'transmission', 'virus', 'appearance', 'symptoms', 'major', 'driver', 'transmission', 'influenza', 'contrast', 'learning', 'people', 'shed', 'covid', 'virus', 'hours', 'prior', 'symptom', 'onset', 'present', 'appear', 'major', 'driver', 'transmission', 'reproductive', 'number', 'number', 'secondary', 'infections', 'generated', 'infected', 'individual', 'understood', 'covid', 'virus', 'higher', 'influenza', 'estimates', 'covid', 'influenza', 'vi

In [48]:
model = Word2Vec(sentences=sentences,
                 vector_size=100,
                 window=10,
                 min_count=1,
                 sg=0,
                 epochs=100)


print("Model training complete.")


Model training complete.


In [49]:
# ===== (d) Output =====
# Example: find most similar words to a given word
word = "covid"
if word in model.wv:
    print(f"Top similar words to '{word}':")
    for similar_word, similarity in model.wv.most_similar(word, topn=10):
        print(f"{similar_word} ({similarity:.3f})")
else:
    print(f"'{word}' not found in vocabulary.")

# Check vector representation of a word
print("\nEmbedding vector for 'virus':")
print(model.wv[word])

Top similar words to 'covid':
virus (0.983)
influenza (0.982)
transmission (0.978)
days (0.977)
number (0.976)
symptom (0.975)
serial (0.974)
symptomatic (0.972)
driver (0.972)
shorter (0.971)

Embedding vector for 'virus':
[-0.02293033  0.01738895  0.00860402  0.02352998 -0.00010871 -0.0673238
  0.03812869  0.08235797 -0.04353629 -0.07143451  0.00072724 -0.07900583
  0.03295023  0.03925068  0.03291447 -0.02146041  0.05138344 -0.00249045
 -0.02688164 -0.05538324  0.04661997  0.00789939  0.07560404 -0.07030333
  0.01660445 -0.00441933 -0.05430967  0.01234618 -0.03069012 -0.00141679
  0.04942898 -0.01174885  0.04618625 -0.09532764 -0.02210496  0.03727218
  0.00412673  0.01891489 -0.0254542  -0.02912026  0.00690944 -0.00808964
 -0.01602949 -0.00905918  0.0216405  -0.02692271 -0.01035081 -0.0365674
  0.01788008  0.01935412  0.03725318 -0.05376064 -0.02870009 -0.03534573
  0.02398488  0.00930145  0.04173145  0.01841035  0.00236125  0.00837199
 -0.01481997 -0.04194589  0.05367661  0.02364012