### Ruben Abbou

## Word Embeddings using PMI
## (a)

In [1]:
import numpy as np, pickle
from collections import Counter
from math import log
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [2]:
# Read the corpus into one flat list of whitespace-separated tokens.
# Tokens are accumulated across lines: rebinding `words` on every iteration
# would keep only the final line's tokens and would leave `words` undefined
# for an empty file. For a single-line corpus the result is unchanged.
words = []
with open('/project2/cmsc25025/wikipedia/wiki-text.txt') as f:
    for ws in f:
        words.extend(ws.split())
# Words removed from the corpus before counting; membership tests only,
# so the order in which they are listed is irrelevant.
stopwords = set("""
    ourselves hers between yourself but again there about once during
    out very having with they own an be some for
    do its yours such into of most itself other off
    is s am or who as from him each the
    themselves until below are we these your his through don
    nor me were her more himself this down should our
    their while above both up to ours had she all
    no when at any before them same and been have
    in will on does yourselves then that because what over
    why so can did not now under he you herself
    has just where too only myself which those i after
    few whom t being if theirs my against a by
    doing it how further was here than
""".split())

In [3]:
# Strip stopwords, then keep only words that occur strictly more than `n`
# times; `v` maps each surviving word to its count and `N` is its size.
n = 414
words = [tok for tok in words if tok not in stopwords]
v = {tok: count for tok, count in Counter(words).items() if count > n}
words = [tok for tok in words if tok in v]
N = len(v)
print("Length of vocab: {0} words".format(N))

Length of vocab: 14989 words


In [4]:
# Slide a window of up to 5 tokens on each side over the corpus and count:
#   embedding_pairs[(center, context)] - occurrences of each ordered pair
#   embedding_centers[center]          - pairs in which a word is the center
#   Ns                                 - total number of (center, context) pairs
embedding_pairs = Counter()
embedding_centers = Counter()
Ns = 0
last_pos = len(words) - 1
for center_pos, word in enumerate(words):
    window_start = max(0, center_pos - 5)
    window_end = min(center_pos + 5, last_pos)
    for context_pos in range(window_start, window_end + 1):
        if context_pos == center_pos:
            continue  # a word is not its own context
        Ns += 1
        embedding_centers[word] += 1
        embedding_pairs[(word, words[context_pos])] += 1

In [5]:
# Smoothed PMI matrix: each entry is
#   log((#(wi, wj) + 1) * Ns / (#(wi) * #(wj)))
# with one row per `wj` and one column per `wi`, both in the iteration
# order of `v`.
pmi_rows = []
for wj in v:
    center_j = embedding_centers[wj]
    pmi_rows.append([
        log((embedding_pairs[(wi, wj)] + 1) * Ns / (embedding_centers[wi] * center_j))
        for wi in v
    ])
M = csr_matrix(pmi_rows)
del pmi_rows  # the nested Python list is only a temporary; don't keep it alive

## (b)

In [6]:
# Truncated SVD of the PMI matrix, keeping k=50 singular triplets.
# NOTE(review): scipy documents the third return value of `svds` as `vh`
# (the right factor, already transposed), so `V` here is presumably V^T.
U, s, V = svds(M, k=50)

## (c)

In [7]:
# Embedding matrix: scale each column of U by the square root of its
# singular value, i.e. W = U * sqrt(diag(s)).
W = np.dot(U, np.sqrt(np.diag(s)))
# Write through a context manager so the file is flushed and closed
# deterministically instead of leaking the handle returned by open().
with open("matrix_W.p", "wb") as out_file:
    pickle.dump(W, out_file)

In [4]:
# Reload the cached embedding matrix. The context manager closes the file
# handle that a bare open() inside the call would leak.
# NOTE: unpickling can execute arbitrary code; only load a file that this
# notebook itself produced.
with open("matrix_W.p", "rb") as in_file:
    W = pickle.load(in_file)

In [5]:
# Sanity check on the reloaded matrix: expected to be
# (vocabulary size, number of singular values kept).
W.shape

(14989, 50)

## (d)

To obtain the words closest to a given word, I subtracted the word's vector $v_w$ from every row of $W$ and took the Euclidean norm of each resulting row. This yields a vector of $14,989$ distances, one for each word in the vocabulary. I then took the indices of the 5 smallest distances (skipping the word itself, which is at distance $0$) and looked up the corresponding words.

In [6]:
def print_top_5(word):
    """Print the five vocabulary words whose embeddings are closest to ``word``.

    Closeness is the Euclidean distance between rows of the global embedding
    matrix ``W``. Row ``i`` of ``W`` is treated as the embedding of the
    ``i``-th key of the global vocabulary ``v``.

    Args:
        word: the query word; must be a key of ``v``.

    Raises:
        ValueError: if ``word`` is not in the vocabulary.
    """
    # Materialise the vocabulary once; rebuilding list(v) for every lookup
    # costs O(len(v)) each time.
    vocab = list(v)
    word_index = vocab.index(word)
    diff_norms = np.linalg.norm(W - W[word_index], axis=1)
    # Remove the query by index rather than assuming it sorts first: another
    # row at distance 0 could otherwise push the query itself into the results.
    ranked = diff_norms.argsort()
    indexes = ranked[ranked != word_index][:5]
    print("The 5 closest words in the embedding space of %s are:" % word)
    for rank, idx in enumerate(indexes, start=1):
        print("%i." % rank, "%s" % vocab[idx])

In [7]:
# Nearest neighbours for a handful of probe words.
for query in ("physics", "republican", "einstein", "algebra", "fish"):
    print_top_5(query)

The 5 closest words in the embedding space of physics are:
1. mechanics
2. quantum
3. chemistry
4. theoretical
5. mathematics
The 5 closest words in the embedding space of republican are:
1. senator
2. democrat
3. democrats
4. presidential
5. candidate
The 5 closest words in the embedding space of einstein are:
1. relativity
2. physicists
3. paradox
4. maxwell
5. mechanics
The 5 closest words in the embedding space of algebra are:
1. algebraic
2. finite
3. theorem
4. topology
5. calculus
The 5 closest words in the embedding space of fish are:
1. fruit
2. eggs
3. eat
4. seeds
5. feed


## (e)

In [38]:
def top_5_analogy(w1, w2, w3, w4, exclude_inputs=False):
    """Print the top 5 candidates completing the analogy ``w1 : w2 :: w3 : ?``.

    The target point is ``W[w2] - W[w1] + W[w3]``, and candidates are ranked
    by Euclidean distance to it. Row ``i`` of the global matrix ``W`` is
    treated as the embedding of the ``i``-th key of the global vocabulary ``v``.

    Args:
        w1, w2, w3: the three analogy words; each must be a key of ``v``.
        w4: the expected answer; only echoed in the header, never looked up.
        exclude_inputs: if True, ``w1``, ``w2`` and ``w3`` are removed from
            the candidates, so fewer than 5 may be printed for a tiny
            vocabulary. Defaults to False, which ranks every word.

    Raises:
        ValueError: if ``w1``, ``w2`` or ``w3`` is not in the vocabulary.
    """
    # Materialise the vocabulary once instead of once per lookup.
    vocab = list(v)
    i1, i2, i3 = (vocab.index(w) for w in (w1, w2, w3))
    new = W[i2] - W[i1] + W[i3]
    diff_norms = np.linalg.norm(W - new, axis=1)
    ranked = diff_norms.argsort()
    if exclude_inputs:
        # The query words tend to sit close to the target point themselves.
        ranked = ranked[~np.isin(ranked, [i1, i2, i3])]
    indexes = ranked[:5]
    print("Top 5 words that completes the analogy {0}:{1}::{2}:______\nExpected: {3}" \
          .format(w1, w2, w3, w4))
    for rank, idx in enumerate(indexes, start=1):
        print("%i." % rank, "%s" % vocab[idx])

In [40]:
# Each tuple is (w1, w2, w3, expected answer) for the analogy w1:w2::w3:?
analogy_cases = [
    ("france", "paris", "england", "london"),
    ("hospital", "doctor", "college", "professor"),
    ("day", "light", "night", "dark"),
    ("winter", "cold", "summer", "hot/warm"),
    ("fall", "leaves", "spring", "flowers"),
    ("wine", "alcohol", "cocaine", "drug"),
]
for case in analogy_cases:
    top_5_analogy(*case)

Top 5 words that completes the analogy france:paris::england:______
Expected: london
1. london
2. oxford
3. england
4. cambridge
5. edinburgh
Top 5 words that completes the analogy hospital:doctor::college:______
Expected: professor
1. college
2. school
3. academy
4. university
5. professor
Top 5 words that completes the analogy day:light::night:______
Expected: dark
1. light
2. medium
3. dark
4. ground
5. surface
Top 5 words that completes the analogy winter:cold::summer:______
Expected: hot/warm
1. cold
2. intense
3. hot
4. break
5. cool
Top 5 words that completes the analogy fall:leaves::spring:______
Expected: flowers
1. leaves
2. fruit
3. flowers
4. leaf
5. eggs
Top 5 words that completes the analogy wine:alcohol::cocaine:______
Expected: drug
1. drugs
2. cocaine
3. drug
4. chronic
5. addiction
