In [13]:
# ===== (a) Data preparation =====
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# Load and clean textual document
with open("CBOW.txt", "r", encoding="utf-8") as f:
    text = f.read().lower()

# Tokenize and remove stopwords
tokens = [word for word in simple_preprocess(text) if word not in STOPWORDS]

# Convert into list of sentences (Word2Vec expects list of lists)
sentences = [tokens]
print("Sample tokens (after stopword removal):", sentences[0][:15])



Sample tokens (after stopword removal): ['speed', 'transmission', 'important', 'point', 'difference', 'viruses', 'influenza', 'shorter', 'median', 'incubation', 'period', 'time', 'infection', 'appearance', 'symptoms']


In [15]:
model = Word2Vec(sentences=sentences,
                 vector_size=100,
                 window=5,
                 min_count=1,
                 sg=0,
                 epochs=100)


print("Model training complete.")


Model training complete.


In [16]:
# ===== (d) Output =====
# Example: find most similar words to a given word
word = "virus"
if word in model.wv:
    print(f"Top similar words to '{word}':")
    for similar_word, similarity in model.wv.most_similar(word, topn=10):
        print(f"{similar_word} ({similarity:.3f})")
else:
    print(f"'{word}' not found in vocabulary.")

# Check vector representation of a word
print("\nEmbedding vector for 'virus':")
print(model.wv[word])


Top similar words to 'virus':
influenza (0.921)
appearance (0.909)
serial (0.908)
time (0.902)
appear (0.900)
covid (0.889)
symptoms (0.888)
days (0.887)
driver (0.885)
transmission (0.883)

Embedding vector for 'virus':
[-0.00744741  0.00976061 -0.00230966  0.0106724  -0.01204116 -0.03080683
  0.01706928  0.04660475 -0.02097418 -0.02893195  0.00889159 -0.02654956
  0.00477829  0.01619087  0.00623949 -0.01410797  0.02830138 -0.01332968
 -0.01641751 -0.04521523  0.02968431  0.00614589  0.04022946 -0.02144717
  0.008104   -0.0003295  -0.02938236  0.00956712 -0.01674792 -0.00534568
  0.00524743 -0.00076037  0.0323168  -0.05044021 -0.00687755  0.01049629
  0.00372245 -0.00310479 -0.02157882 -0.01479657 -0.01139371 -0.0009721
 -0.01005893 -0.00682696  0.00906566 -0.01927518 -0.00991515 -0.00350771
  0.01079033  0.0198154   0.00569895 -0.01145478 -0.01810568 -0.00563003
  0.01804575 -0.01962474  0.01934262 -0.0125496  -0.00930856  0.01076112
 -0.000827   -0.02456477  0.02659628  0.00928206 -