In [6]:
import heapq
import os
import re
import string
from pathlib import Path

import nltk
import numpy as np
# Load a list of common stopwords to filter out
from nltk.corpus import stopwords

# Build the English stopword set. The corpus is only downloaded when it is
# not installed yet (NLTK raises LookupError for a missing resource), so
# re-running this cell neither contacts the download server nor prints
# downloader log lines once the data is present.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# ✅ Step 1: Load the GloVe embeddings
def load_glove_embeddings(file_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        file_path: Location of the UTF-8 encoded text file (anything
            accepted by ``open``, e.g. ``str`` or ``pathlib.Path``).

    Returns:
        dict mapping each token (``str``) to a 1-D ``np.float32`` array.
        If a token occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # split() without arguments already ignores leading/trailing
            # whitespace, so no separate strip() is required.
            parts = line.split()
            if not parts:
                # Blank or whitespace-only line (e.g. a trailing newline at
                # the end of the file): nothing to parse, and indexing
                # parts[0] would raise IndexError.
                continue
            word, *values = parts
            embeddings[word] = np.array(values, dtype=np.float32)
    return embeddings

# ✅ Step 2: Cosine similarity function
from numpy.linalg import norm

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1: 1-D array-like of numbers.
        v2: 1-D array-like of the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        norm the cosine is undefined; ``0.0`` is returned in that case
        instead of NaN so that callers can still sort by the score.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        # Avoids 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(v1, v2) / denominator

# ✅ Step 3: Filter candidate words and rank nearest neighbours
def is_valid_word(word):
    """Decide whether ``word`` may appear in similarity results.

    A word qualifies only when it is longer than two characters, consists
    solely of alphabetic characters, and its lowercase form is not in the
    module-level ``stop_words`` set.

    Args:
        word: Candidate token (``str``).

    Returns:
        ``True`` if the word passes all three checks, otherwise ``False``.
    """
    if len(word) <= 2:
        return False
    if not word.isalpha():
        return False
    return word.lower() not in stop_words

def get_similar_words(word, embeddings, top_n=5):
    """Find the words whose vectors are most similar to ``word``'s vector.

    Candidates are every other key of ``embeddings`` accepted by
    ``is_valid_word``; they are ranked by ``cosine_similarity`` against
    the query vector, highest score first.

    Args:
        word: Query token; looked up in ``embeddings`` exactly as given.
        embeddings: Mapping of token -> vector.
        top_n: Maximum number of results. ``None`` or a negative value is
            applied as a plain slice bound on the fully sorted list.

    Returns:
        A list of ``(other_word, similarity)`` tuples in descending order
        of similarity, or ``[]`` when ``word`` is not in ``embeddings``.
    """
    if word not in embeddings:
        return []
    query_vec = embeddings[word]

    def scored_candidates():
        # Score lazily so the whole vocabulary never has to be held in a
        # list when only the best few entries are wanted.
        for other_word, other_vec in embeddings.items():
            if other_word == word or not is_valid_word(other_word):
                continue
            sim = cosine_similarity(query_vec, other_vec)
            if np.isnan(sim):
                # NaN compares False against everything, which would make
                # the ranking order undefined; drop such scores.
                continue
            yield other_word, sim

    def by_score(pair):
        return pair[1]

    if top_n is None or top_n < 0:
        # heapq.nlargest cannot express these slice bounds; fall back to a
        # full sort followed by the same slice the list version used.
        return sorted(scored_candidates(), key=by_score, reverse=True)[:top_n]
    # Equivalent to sorted(..., reverse=True)[:top_n] but without sorting
    # every candidate.
    return heapq.nlargest(top_n, scored_candidates(), key=by_score)

# ✅ Step 4: Load file with pathlib
# The directory holding the GloVe files can be overridden through the
# GLOVE_DIR environment variable; when it is unset, the original local
# path is used, so existing runs behave as before.
base_path = Path(os.environ.get("GLOVE_DIR", "C:/Users/drumw/Dev Projects/thesaurus/glove.6B"))
file_path = base_path / "glove.6B.100d.txt"

print("Loading embeddings... (this may take a minute)")
glove = load_glove_embeddings(file_path)
print("Done.")

# ✅ Step 5: Try it out!
# Look up the nearest neighbours of one query word in the loaded `glove`
# mapping and print each neighbour with its cosine score (4 decimals).
query = "happy"
results = get_similar_words(query, glove)  # default top_n=5

print(f"Top words similar to '{query}':")
for word, score in results:
    print(f"{word} ({score:.4f})")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\drumw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading embeddings... (this may take a minute)
Done.
Top words similar to 'happy':
feel (0.8133)
really (0.7904)
glad (0.7833)
good (0.7822)
sure (0.7788)


In [7]:
# Repeat the lookup for a second query word, reusing the `glove` mapping
# loaded by the earlier cell (this cell fails on a fresh kernel otherwise).
# NOTE(review): the recorded output lists 'happy' among the neighbours of
# 'sad' — presumably because antonyms occur in similar contexts; confirm
# whether that is acceptable for the intended use.
# NOTE(review): this query/print pattern is repeated across cells; a small
# helper taking the query word would remove the duplication.
query = "sad"
results = get_similar_words(query, glove)

print(f"Top words similar to '{query}':")
for word, score in results:
    print(f"{word} ({score:.4f})")

Top words similar to 'sad':
sorry (0.7547)
awful (0.7284)
tragic (0.7239)
horrible (0.7049)
happy (0.6801)


In [8]:
# Third example query; again depends on `glove` and `get_similar_words`
# having been defined by the earlier cell.
# NOTE(review): same query/print pattern as the previous cells — a shared
# helper function would avoid the copy-paste.
query = "phone"
results = get_similar_words(query, glove)

print(f"Top words similar to '{query}':")
for word, score in results:
    print(f"{word} ({score:.4f})")

Top words similar to 'phone':
telephone (0.9113)
cellphone (0.8122)
phones (0.8031)
mobile (0.7307)
mail (0.7292)
