# Loading Data

In [None]:
import nltk

# Fetch the corpus and the Punkt tokenizer data. Newer NLTK releases load the
# tokenizer from "punkt_tab" instead of "punkt"; requesting both keeps this
# cell working across versions (an unknown resource id is reported by
# nltk.download rather than raised).
for resource in ("reuters", "punkt", "punkt_tab"):
    nltk.download(resource)
from nltk.corpus import reuters

# Build one token list per *sentence* of the first 500 Reuters documents.
# Sentence splitting runs on the original-case text and tokens are lowercased
# afterwards; preserve_line=True stops word_tokenize from re-splitting a
# string that is already a single sentence.
sentences = [
    [token.lower() for token in nltk.word_tokenize(sentence, preserve_line=True)]
    for fid in reuters.fileids()[:500]
    for sentence in nltk.sent_tokenize(reuters.raw(fid))
]
print("Number of sentences:", len(sentences))
print("Example sentence:", sentences[0][:15])

# Training Word2Vec

In [None]:
from gensim.models import Word2Vec
# Skip-gram Word2Vec model trained on the tokenised Reuters text.
model = Word2Vec(
    sentences,
    vector_size=100,   # embedding dimensionality (same as the CBOW model trained later)
    window=5,          # context window size
    min_count=1,       # keep every token; raise this to drop rare words
    sg=1,              # 1=Skip-gram, 0=CBOW
    epochs=10,
    seed=42,           # fixed initialisation seed; NOTE(review): fully deterministic
                       # runs also need workers=1 and a fixed PYTHONHASHSEED
)

In [None]:
# Report how many keys the trained vectors hold, then show the first 50 keys
# in the order they appear in the key -> index mapping.
vocab_size = len(model.wv)
print("Vocabulary size:", vocab_size)
list(model.wv.key_to_index)[:50]

# Similarity Check

In [None]:
def in_vocab(m, w):
    """Return True when ``w`` is a key of ``m.wv.key_to_index``, else False."""
    vocabulary = m.wv.key_to_index
    return w in vocabulary

def safe_most_similar(m, w, topn=10):
    """Look up the ``topn`` nearest neighbours of ``w`` without raising on unknown words.

    Returns ``m.wv.most_similar(w, topn=topn)`` when ``w`` is in the vocabulary;
    otherwise returns a message string saying the word is not in the vocab.
    """
    if not in_vocab(m, w):
        return f"'{w}' not in vocab"
    return m.wv.most_similar(w, topn=topn)

def safe_similarity(m, w1, w2):
    """Return ``m.wv.similarity(w1, w2)``, or a message listing out-of-vocabulary words.

    If either word is missing from the vocabulary, the result is the string
    ``"Missing: [...]"`` naming the missing word(s) in argument order.
    """
    miss = []
    for word in (w1, w2):
        if not in_vocab(m, word):
            miss.append(word)
    if miss:
        return f"Missing: {miss}"
    return m.wv.similarity(w1, w2)

def safe_analogy(m, positive, negative, topn=10):
    """Run an analogy query, reporting unknown words instead of raising.

    Every word in ``positive + negative`` is checked against the vocabulary.
    If any are missing, the string ``"Missing: [...]"`` is returned; otherwise
    the result of ``m.wv.most_similar(positive=..., negative=..., topn=...)``.
    """
    unknown = list(filter(lambda word: not in_vocab(m, word), positive + negative))
    if unknown:
        return f"Missing: {unknown}"
    return m.wv.most_similar(positive=positive, negative=negative, topn=topn)

In [None]:
# Nearest neighbours for a couple of finance terms, then one pairwise similarity.
for query in ("bank", "oil"):
    print(safe_most_similar(model, query))
print(safe_similarity(model, "money", "currency"))

# Country <-> currency analogies: country + "dollar" - "usa".
# Hoped-for answers: japan ≈ 'yen', britain ≈ 'sterling',
# france ≈ 'franc' (1980s corpus), germany ≈ 'mark'.
# NOTE(review): confirm "usa" is actually in the vocabulary; if it is not,
# every line below prints a "Missing: [...]" message instead of neighbours.
for country in ("japan", "britain", "france", "germany"):
    print(safe_analogy(model, positive=[country, "dollar"], negative=["usa"]))

# Visualisation of Semantic Space

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

words = ["bank", "money", "currency", "oil", "trade", "market", "yen", "sterling", "franc", "mark"]

# Filter once, up front, so the labels and the vectors always line up.
kept_words = [w for w in words if w in model.wv]

if len(kept_words) < 2:
    # A 2-component PCA needs at least two samples; skip the plot rather than crash.
    print("Not enough in-vocabulary words to plot:", kept_words)
else:
    vectors = [model.wv[w] for w in kept_words]
    coords = PCA(n_components=2).fit_transform(vectors)

    fig, ax = plt.subplots(figsize=(8, 6))
    for (x, y), word in zip(coords, kept_words):
        ax.scatter(x, y)
        # Offset the label in screen points so it stays readable at any data scale.
        ax.annotate(word, (x, y), xytext=(4, 4), textcoords="offset points")
    ax.set_title("Word Embedding Visualisation")
    ax.set_xlabel("PC 1")
    ax.set_ylabel("PC 2")
    plt.show()

In [None]:
# Train a CBOW model (sg=0) on the same data for comparison with the skip-gram
# model. The embedding size is taken from the skip-gram model so that the two
# differ only in architecture, not in dimensionality.
model_cbow = Word2Vec(
    sentences,
    vector_size=model.wv.vector_size,
    window=5,
    min_count=1,
    sg=0,
    epochs=10,
)
print("Skip-gram → ", safe_most_similar(model, "bank"))
print("CBOW     → ", safe_most_similar(model_cbow, "bank"))

# Compare CBOW and Skip-gram

In [None]:
import gensim.downloader as api
# Load the pretrained "glove-wiki-gigaword-50" vectors through gensim's
# downloader, then print the nearest neighbours of "car" as a sanity check.
glove = api.load("glove-wiki-gigaword-50")
car_neighbours = glove.most_similar("car")
print(car_neighbours)

# Bias Checking

In [None]:
# Similarity score between the vectors for "man" and "woman".
man_woman_similarity = glove.similarity("man", "woman")
print(man_woman_similarity)

In [None]:
# Nearest neighbours of "woman" in the GloVe space.
woman_neighbours = glove.most_similar("woman")
print(woman_neighbours)

In [None]:
# Analogy query with positive terms "man" and "doctor" and negative term "woman".
doctor_analogy = glove.most_similar(positive=["man", "doctor"], negative=["woman"])
print(doctor_analogy)

In [None]:
# Mirror-image analogy: positive terms "woman" and "nurse", negative term "man".
nurse_analogy = glove.most_similar(positive=["woman", "nurse"], negative=["man"])
print(nurse_analogy)