# Part 1 Meaning from Text

## 1) Imports and Downloads

In [1]:
import os
import requests
import nltk
import numpy as np
import scipy.sparse
import scipy.stats
import sklearn.decomposition
import sklearn.metrics.pairwise

import warnings
warnings.filterwarnings("ignore")

os.makedirs('data', exist_ok = True)
nltk.download(info_or_id = 'brown', download_dir = 'data')
nltk.data.path = ['data'] # makihng sure I use the one in data/

import nltk.corpus

[nltk_data] Downloading package brown to data...
[nltk_data]   Package brown is already up-to-date!


## 2) Extracting common words

In [2]:
brownWords = nltk.corpus.brown.words()
wordsDist = nltk.FreqDist((w.lower() for w in brownWords))
sortedWords = sorted(wordsDist.items(), reverse=True, key = lambda x : x[1])

This gets us all the words from there we can remove nonword tokens and reduce to the top 5000

In [3]:
Wc = [(w,c) for w,c in sortedWords if not re.match('.*[^a-z]+', w)][:5000]

Then the top and bottom words are:

In [4]:
print("Top 5 words:")
for w,c in Wc[:5]:
    print(w)
print()
print("Bottom 5 words:")
for w,c in Wc[:-6:-1]:
    print(w)

Top 5 words:
the
of
and
to
a

Bottom 5 words:
letch
haney
killpath
rourke
vertex


Gettign the RG65 dataset

In [5]:
r = requests.get('https://raw.githubusercontent.com/AlexGrinch/ro_sgns/master/datasets/rg65.csv') # could not fing a better source
rgWords = list(set([w for l in r.text.split('\n') for w in l.strip().split(';')[:2]]))
print(f"{len(rgWords)} words found in RG65")

48 words found in RG65


In [6]:
for rgW in rgWords:
    if rgW not in [w[0] for w in Wc]: # polynomial O is best O
        Wc.append((rgW, wordsDist[rgW]))
W = set((w for w,c in Wc))
print(f"Now we have {len(W)} words in W")

Now we have 5030 words in W


## 3) Construct Bigram Matrix

In [7]:
M1dict = {}
M1index = {w : i for i, w in enumerate(W)}
for w1, w2 in zip(brownWords[:-1], brownWords[1:]):
    try:
        w1i = M1index[w1.lower()]
        w2i = M1index[w2.lower()]
    except KeyError:
        pass
    else:
        try:
            M1dict[(w1i, w2i)] += 1
        except KeyError:
            M1dict[(w1i, w2i)] = 1
dat = []
i = []
j = []
for (k1, k2), v in M1dict.items():
    dat.append(v)
    i.append(k1)
    j.append(k2)

In [8]:
M1 = scipy.sparse.coo_matrix((dat, (i,j)), shape = (len(W), len(W)))
M1 = M1 + M1.T

In [9]:
print(f"Testing for the count on 'the' and 'driver': {M1[M1index['the'], M1index['driver']]}")

Testing for the count on 'the' and 'driver': 23


## 4) Calculate PPMI

In [10]:
m = np.matrix(M1.toarray())
m = m / m.sum() # Convert to probs

#some annoying zeros show up
cond = (M1 > 0).toarray()
x = m.sum(axis = 0)@m.sum(axis = 0).T
y = np.ones((len(W), len(W)))


M1P = np.where(cond, np.log2(M1 / np.where(cond, x, y)), np.zeros((len(W), len(W))))
M1P[M1P < 0] = 0

## 5) Do the LSA

In [11]:
M2_10 = sklearn.decomposition.PCA(n_components = 10).fit_transform(M1P)
M2_100 = sklearn.decomposition.PCA(n_components = 100).fit_transform(M1P)
M2_300 = sklearn.decomposition.PCA(n_components = 300).fit_transform(M1P)

# 6) Extract the Word Pairs, with Human Rankings

In [12]:
rgSets = [s.strip().split(';') for s in r.text.split('\n')]
P = [(w1,w2) for w1, w2, s in rgSets]
S = [float(s) for w1, w2, s in rgSets]

# 7) Extract the Word Pairs, for Matrices Vectors

In [13]:
S_M1 = []
S_M1P = []
S_M2_10 = []
S_M2_100 = []
S_M2_300 = []

for w1,w2 in P:
    w1i = M1index[w1]
    w2i = M1index[w2]
    for m, s in zip([M1, M1P, M2_10, M2_100, M2_300], [S_M1, S_M1P, S_M2_10, S_M2_100, S_M2_300]):
        try:
            m = m.toarray()
        except:
            pass
        s.append(sklearn.metrics.pairwise.cosine_similarity(m[w1i].reshape(1,-1), m[w2i].reshape(1,-1))[0][0])

# 8) Report results

In [14]:
print(f"The Pearson coefficient between between humans and M1 is: {scipy.stats.pearsonr(S_M1, S)[0]:.4f}")
print(f"The Pearson coefficient between between humans and M1+ is: {scipy.stats.pearsonr(S_M1P, S)[0]:.4f}")
print(f"The Pearson coefficient between between humans and M2_10 is: {scipy.stats.pearsonr(S_M2_10, S)[0]:.4f}")
print(f"The Pearson coefficient between between humans and M2_100 is: {scipy.stats.pearsonr(S_M2_100, S)[0]:.4f}")
print(f"The Pearson coefficient between between humans and M2_300 is: {scipy.stats.pearsonr(S_M2_300, S)[0]:.4f}")

The Pearson coefficient between between humans and M1 is: 0.1047
The Pearson coefficient between between humans and M1+ is: 0.1957
The Pearson coefficient between between humans and M2_10 is: 0.0709
The Pearson coefficient between between humans and M2_100 is: 0.1497
The Pearson coefficient between between humans and M2_300 is: 0.1592


# Part 2 Meaning Construction from Text
## 1) Import

In [15]:
import gensim

#the word2vec pretrain is hard to download without a browser so I did that manually

# 2) Load the word2vec model

In [16]:
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [17]:
S_w2v = []
for w1, w2 in P:
    S_w2v.append(sklearn.metrics.pairwise.cosine_similarity(model[w1].reshape(1,-1), model[w2].reshape(1,-1))[0][0])

In [18]:
print(f"The pearson correlation coeffienct betweeen humans and the word2vec model is:\n{scipy.stats.pearsonr(S_w2v, S)[0]:.4f}")

The pearson correlation coeffienct betweeen humans and the word2vec model is:
0.7721


# 3) Do Analogies test


In [19]:
r_analogies = requests.get('http://www.fit.vutbr.cz/~imikolov/rnnlm/word-test.v1.txt')

In [20]:
analogies = {}     
for l in r_analogies.text.split('\n')[1:]:
    if l.startswith(':'):
        currentName = l[1:].strip()
        analogies[currentName] = []
    else:
        analogies[currentName].append(l.lower().strip().split())

In [21]:
#For the analogies we'll limit it to family for sematic and gram7-past-tense for syntatic

M1index_reverse = {v : k for k,v in M1index.items()}

words_sem = analogies['family']

good_analogies = []
for analogy in words_sem:
    if len([w for w in analogy if w in M1index]) == 4:
        good_analogies.append(analogy)
#Reducing set some more for better run time
good_analogies_sem = sorted(good_analogies)

#This seems to have at least a few hits
words_sem = analogies['gram7-past-tense']

good_analogies = []
for analogy in words_sem:
    if len([w for w in analogy if w in M1index]) == 4:
        good_analogies.append(analogy)
#Reducing set some more for better run time
good_analogies_syn = sorted(good_analogies)

print(f"We have {len(good_analogies_sem)} sematic and {len(good_analogies_syn)} syntatic analogies to test")

We have 90 sematic and 600 syntatic analogies to test


In [22]:
word2vec_results_syn = []
M300_results_syn = []


for w1, w2, w3, w4 in good_analogies_syn:
    w2vRet = model.most_similar(positive=[w3, w1], negative=[w2], topn = 1)
    if w2vRet[0][0].lower() == w4:
        word2vec_results_syn.append(True)
    else:
        word2vec_results_syn.append(False)
    v = M2_300[M1index[w1]] + M2_300[M1index[w2]] - M2_300[M1index[w3]]
    idM300 = np.linalg.norm((M2_300 - v), axis = 1).argmin()
    if M1index_reverse[idM300] == w4:
        M300_results_syn.append(True)
    else:
        M300_results_syn.append(False)
    print(w1, w2, w3, w4, ':' ,w2vRet[0][0], M1index_reverse[idM300])

falling fell feeding fed : feed fell
falling fell flying flew : fly fell
falling fell going went : coming fell
falling fell increasing increased : decreasing fell
falling fell knowing knew : Knowing fell
falling fell listening listened : Listening fell
falling fell looking looked : seeing fell
falling fell moving moved : Moving fell
falling fell paying paid : pay fell
falling fell playing played : play fell
falling fell reading read : read fell
falling fell running ran : Running fell
falling fell saying said : telling fell
falling fell seeing saw : noticing fell
falling fell selling sold : buying fell
falling fell singing sang : sing fell
falling fell sitting sat : sit fell
falling fell sleeping slept : sleep fell
falling fell spending spent : Spending fell
falling fell striking struck : fanning fell
falling fell taking took : take fell
falling fell thinking thought : talking fell
falling fell walking walked : Walking fell
falling fell writing wrote : penning fell
feeding fed falling f

looking looked striking struck : fanning looked
looking looked taking took : seeking looked
looking looked thinking thought : talking looked
looking looked walking walked : Walking looked
looking looked writing wrote : penning looked
moving moved falling fell : dropping moved
moving moved feeding fed : Feeding moved
moving moved flying flew : zipping moved
moving moved going went : gonna moved
moving moved increasing increased : decreasing moved
moving moved knowing knew : Knowing moved
moving moved listening listened : Listening moved
moving moved looking looked : seeing moved
moving moved paying paid : pay moved
moving moved playing played : play moved
moving moved reading read : read moved
moving moved running ran : Running moved
moving moved saying said : telling moved
moving moved seeing saw : witnessing moved
moving moved selling sold : buying moved
moving moved singing sang : sing moved
moving moved sitting sat : Sitting moved
moving moved sleeping slept : falling_asleep moved
m

singing sang feeding fed : feed sang
singing sang flying flew : fly sang
singing sang going went : gonna sang
singing sang increasing increased : decreasing sang
singing sang knowing knew : Knowing sang
singing sang listening listened : Listening singing
singing sang looking looked : searching sang
singing sang moving moved : move sang
singing sang paying paid : pay sang
singing sang playing played : play sang
singing sang reading read : writing sang
singing sang running ran : Running sang
singing sang saying said : telling sang
singing sang seeing saw : noticing sang
singing sang selling sold : buying sang
singing sang sitting sat : sit sang
singing sang sleeping slept : sleep sang
singing sang spending spent : Spending sang
singing sang striking struck : fanning sang
singing sang taking took : take sang
singing sang thinking thought : daydreaming sang
singing sang walking walked : walk sang
singing sang writing wrote : composing sang
sitting sat falling fell : dropping sat
sitting sa

walking walked taking took : Taking walked
walking walked thinking thought : imagining walked
walking walked writing wrote : reading walked
writing wrote falling fell : dropping wrote
writing wrote feeding fed : Feeding wrote
writing wrote flying flew : fly wrote
writing wrote going went : doing wrote
writing wrote increasing increased : decreasing wrote
writing wrote knowing knew : having wrote
writing wrote listening listened : listen wrote
writing wrote looking looked : searching wrote
writing wrote moving moved : Moving wrote
writing wrote paying paid : pay wrote
writing wrote playing played : play wrote
writing wrote reading read : solving_crossword_puzzles wrote
writing wrote running ran : Running wrote
writing wrote saying said : telling wrote
writing wrote seeing saw : noticing writing
writing wrote selling sold : buying wrote
writing wrote singing sang : sing wrote
writing wrote sitting sat : sit wrote
writing wrote sleeping slept : falling_asleep wrote
writing wrote spending 

In [34]:
print(f"M_300 was correct {np.sum(M300_results_syn)} out of {len(M300_results_syn)} times on the syntactic task")
print(f"word2vec was correct {np.sum(word2vec_results_syn)} out of {len(word2vec_results_syn)} times on the syntactic task")

M_300 was correct 0 out of 600 times on the syntactic task
word2vec was correct 12 out of 600 times on the syntactic task


In [24]:
word2vec_results_sem = []
M300_results_sem = []


for w1, w2, w3, w4 in good_analogies_sem:
    w2vRet = model.most_similar(positive=[w3, w1], negative=[w2], topn = 1)
    if w2vRet[0][0].lower() == w4:
        word2vec_results_sem.append(True)
    else:
        word2vec_results_sem.append(False)
    v = M2_300[M1index[w1]] + M2_300[M1index[w2]] - M2_300[M1index[w3]]
    idM300 = np.linalg.norm((M2_300 - v), axis = 1).argmin()
    if M1index_reverse[idM300] == w4:
        M300_results_sem.append(True)
    else:
        M300_results_sem.append(False)
    print(w1, w2, w3, w4, ':' ,w2vRet[0][0], M1index_reverse[idM300])

boy girl brother sister : younger_brother girl
boy girl father mother : son girl
boy girl he she : him joke
boy girl his her : he lawyer
boy girl husband wife : father girl
boy girl king queen : kings girl
boy girl man woman : teenager boy
boy girl son daughter : father girl
boy girl uncle aunt : nephew girl
brother sister boy girl : man autograph
brother sister father mother : son autograph
brother sister he she : him autograph
brother sister his her : himself autograph
brother sister husband wife : father sister
brother sister king queen : kings brother
brother sister man woman : boy autograph
brother sister son daughter : younger_brother brother
brother sister uncle aunt : nephew sister
father mother boy girl : son father
father mother brother sister : younger_brother mother
father mother he she : him soul
father mother his her : himself neighbors
father mother husband wife : son mother
father mother king queen : kings mother
father mother man woman : boy soul
father mother son daug

In [33]:
print(f"M_300 was correct {np.sum(M300_results_sem)} out of {len(M300_results_sem)} times on the semantics task")
print(f"word2vec was correct {np.sum(word2vec_results_sem)} out of {len(M300_results_sem)} times on the semantics task")

M_300 was correct 0 out of 90 times on the semantics task
word2vec was correct 4 out of 90 times on the semantics task
