In [1]:
%cd "/content/drive/MyDrive/NLP/Lab1"

/content/drive/MyDrive/NLP/Lab1


# Initialize required elements

In [2]:
import numpy as np
import pdb
# from sklearn.metrics.pairwise import cosine_distances as cosine
from scipy import stats
from scipy.spatial.distance import cosine
from tqdm import tqdm

In [3]:
V = open("31190-a1-files/vocab-wordsim.txt").read().split("\n")[:-1]
V_C = open("31190-a1-files/vocab-25k.txt").read().split("\n")[:-1]
V_set = {word: idx for idx, word in enumerate(V)}
V_C_set = {word: idx for idx, word in enumerate(V_C)}

In [3]:
corpus = open("wiki-1percent.txt/wiki-1percent.txt").read().split("\n")[:-1]

## 1.1 Distributional Counting

In [4]:
def counting(corpus, V, V_C, V_set, V_C_set, w):
  C = np.zeros((len(V), len(V_C)), dtype=float)
  for line in tqdm(corpus):
    words = ['<s>'] + line.split(" ") + ['</s>']
    length = len(words)

    for idx, word in enumerate(words):
      if idx > 0 and idx  < length - 1:
        # print(idx, word)
        if word in V_set:
          # print(word, words[max(idx-w, 0): idx], words[idx+1: min(idx+w+1, length)])
          # pdb.set_trace();
          for context_word in words[max(idx-w, 0): idx] + words[idx+1: min(idx+w+1, length)]:
            if context_word in V_C_set:
              C[V_set[word], V_C_set[context_word]] +=1
  return C


In [6]:
C = counting(corpus, V, V_C, V_set, V_C_set, w=3)

100%|██████████| 997898/997898 [00:17<00:00, 57999.49it/s]


In [7]:
print(C)

[[ 75. 118. 360. ...   0.   0.   0.]
 [  7.  24.  62. ...   0.   0.   0.]
 [101.  72. 849. ...   0.   0.   0.]
 ...
 [ 10.  16.  28. ...   0.   0.   0.]
 [ 99. 203. 687. ...   0.   0.   0.]
 [ 47.  45. 196. ...   0.   0.   0.]]


In [5]:
def eval_word_similarity(C, path):
  gold = [line.split("\t") for line in open(path).read().split("\n")[1:-1]]

  y, x = [], [] #chứa giá trị tương đồng cho từng cặp từ của người và máy

  for line in gold:
    word_1 = line[0]
    word_2 = line[1]
    gold_score = float(line[2])
    y.append(gold_score)
    if word_1 not in V_set or word_2 not in V_set:
      x.append(0)
    else: 
      word_1_vec = C[V_set[word_1], :]
      word_2_vec = C[V_set[word_2], :]
      x.append(1 - cosine(word_1_vec, word_2_vec))
  return stats.spearmanr(x, y, axis = None).correlation

In [9]:
spearmanr_correlation_MEN = eval_word_similarity(C, "31190-a1-files/men.txt")
spearmanr_correlation_SimLex = eval_word_similarity(C, "31190-a1-files/simlex-999.txt")
print(spearmanr_correlation_MEN, spearmanr_correlation_SimLex)

0.22433852567075044 0.06134632093956428


  dist = 1.0 - uv / np.sqrt(uu * vv)


## 1.2 Computing PMIs

In [6]:
def compute_PMI(C):
  C_pmi = np.zeros(C.shape, dtype=float) 
  total_count_C = C.sum()
  row_count_C = C.sum(axis=1)
  column_count_C = C.sum(axis=0)

  for i in tqdm(range(C.shape[0])):
    for j in range(C.shape[1]):
      if(C[i][j] == 0): continue
      else: 
        C_pmi[i][j] = np.log2(C[i][j] * total_count_C / ( row_count_C[i] * column_count_C[j] ))
  
  return C_pmi

In [11]:
C_pmi = compute_PMI(C)

100%|██████████| 1577/1577 [00:23<00:00, 67.60it/s]


In [12]:
spearmanr_correlation_MEN = eval_word_similarity(C_pmi, "31190-a1-files/men.txt")
spearmanr_correlation_SimLex = eval_word_similarity(C_pmi, "31190-a1-files/simlex-999.txt")
print(spearmanr_correlation_MEN, spearmanr_correlation_SimLex)

0.5340533067559707 0.22557679856872018


## 1.3 Experiment

In [13]:
windows = [1, 3, 6]
for w in windows:
  # print("Windows: ", w)
  C = counting(corpus, V, V_C, V_set, V_C_set, w)
  C_pmi = compute_PMI(C)
  print("EvalWS(C) MEN.txt: ", eval_word_similarity(C, "31190-a1-files/men.txt"))
  print("EvalWS(C) SimLex-999.txt: ", eval_word_similarity(C, "31190-a1-files/simlex-999.txt"))
  print("EvalWS(C_pmi) MEN.txt: ", eval_word_similarity(C_pmi, "31190-a1-files/men.txt"))
  print("EvalWS(C_pmi) SimLex-999.txt: ", eval_word_similarity(C_pmi, "31190-a1-files/simlex-999.txt"))

100%|██████████| 997898/997898 [00:11<00:00, 85965.03it/s]
100%|██████████| 1577/1577 [00:21<00:00, 72.02it/s]


EvalWS(C) MEN.txt:  0.20992265447069547
EvalWS(C) SimLex-999.txt:  0.07753059821419862
EvalWS(C_pmi) MEN.txt:  0.46530280519315015
EvalWS(C_pmi) SimLex-999.txt:  0.26977579767342286


100%|██████████| 997898/997898 [00:18<00:00, 52975.66it/s]
100%|██████████| 1577/1577 [00:23<00:00, 66.74it/s]


EvalWS(C) MEN.txt:  0.22433852567075044
EvalWS(C) SimLex-999.txt:  0.06134632093956428
EvalWS(C_pmi) MEN.txt:  0.5340533067559707
EvalWS(C_pmi) SimLex-999.txt:  0.22557679856872018


100%|██████████| 997898/997898 [00:23<00:00, 42592.98it/s]
100%|██████████| 1577/1577 [00:26<00:00, 60.11it/s]


EvalWS(C) MEN.txt:  0.23790636810978752
EvalWS(C) SimLex-999.txt:  0.040553547139409986
EvalWS(C_pmi) MEN.txt:  0.5423454654769123
EvalWS(C_pmi) SimLex-999.txt:  0.18367996243967827


-- Spearmanr correlation increases following to the increasement of window in dataset MEN.txt

-- Spearmanr correlation decreases following to the increasement of window in dataset SimLex-999.txt

## 1.4 Analysis

In [7]:
V = open("31190-a1-files/vocab-25k.txt").read().split("\n")[:-1]
V_C = open("31190-a1-files/vocab-25k.txt").read().split("\n")[:-1]
V_set = {word: idx for idx, word in enumerate(V)}
V_C_set = {word: idx for idx, word in enumerate(V_C)}

In [8]:
def n_nearest_neighbors(C_pmi, query_word, V, V_set,  n):
  cosine_similarities = []
  query_word_vec = C_pmi[V_set[query_word], :]
  # print(query_word_vec)
  word_num = C_pmi.shape[0]
  n_nearest = np.arange(word_num)
  for i in range(word_num):
    word_vec = C_pmi[i, :]
    cosine_similarities.append(1 - cosine(query_word_vec, word_vec))
  
  # discard words with cosine similarity = 1
  n_nearest = np.array(sorted(n_nearest, key=lambda x: cosine_similarities[x], reverse=True))
  n_nearest = [nearest for nearest in n_nearest if cosine_similarities[nearest] < 1]
  return [ V[i] for i in n_nearest[:n] ]

In [9]:
def n_nearest_neighbors_of_words(C_pmi, words, V, V_set, n):
  return np.array([n_nearest_neighbors(C_pmi, word, V, V_set, n) for word in words])

In [19]:
nouns = ["school", "city", "information", "family", "government"]
verbs = ["edits", "held", "took", "published", "created"]
adjectives = ["old", "central", "late", "political", "important"]
prepositions = ["about", "for", "before", "in", "always"]
multiple_sences_words = ["bank", "cell", "apple", "apples", "axes", "frame", "light", "well"]

### Windows size = 1

In [11]:
C_pmi = []

In [12]:
C_pmi = compute_PMI(counting(corpus, V, V_C, V_set, V_C_set, w=1))
# loại các giá trị ko dương để khi tính cosine sẽ ko trả về giá trị âm(trái nghĩa) trong khi thuật toán ko biểu hiện sự trái nghĩa
# C_pmi_w = np.where(C_pmi_w > 0, C_pmi_w, 0)

100%|██████████| 997898/997898 [01:08<00:00, 14492.76it/s]
100%|██████████| 25000/25000 [05:31<00:00, 75.51it/s]


In [13]:
print(n_nearest_neighbors(C_pmi, "monster", V, V_set, 10))

  dist = 1.0 - uv / np.sqrt(uu * vv)


['dragon', 'tyrant', 'creatures', 'monsters', 'jar', 'hornet', 'rhinoceros', 'invaders', 'gangster', 'robot']


#### Nouns

In [14]:
print(nouns)
print(n_nearest_neighbors_of_words(C_pmi, nouns, V, V_set, 10))

['school', 'city', 'information', 'family', 'government']
[['college' 'schools' 'university' 'education' 'high' 'church' 'city'
  "'s" 'at' 'new']
 ['town' 'county' 'university' 'area' 'district' 'state' 'school' 'south'
  'north' 'near']
 ['data' 'info' 'content' 'evidence' 'material' 'knowledge' 'sources'
  'details' 'articles' 'coverage']
 ["'s" 'john' 'house' 'and' 'william' 'families' 'who' 'james' 'father'
  '(']
 ['politician' 'authorities' 'army' 'footballer' 'constitution'
  'national' 'parliament' 'military' 'state' 'party']]


#### Verbs

In [15]:
print(verbs)
print(n_nearest_neighbors_of_words(C_pmi, verbs, V, V_set, 10))

['edits', 'held', 'took', 'published', 'created']
[['comments' 'articles' 'contributions' 'copies' 'edit' 'editing'
  'editors' 'contribs' 'people' 'years']
 ['organized' 'used' 'took' 'taken' 'found' 'hosted' 'were' 'played'
  'published' 'referred']
 ['take' "'s" 'won' 'had' 'takes' ',' 'and' '(' 'taking' 'became']
 ['written' 'printed' 'cited' 'released' 'edited' 'collected' 'appeared'
  'wrote' 'publish' 'reprinted']
 ['produced' 'added' 'made' 'done' 'designed' 'released' 'built' 'used'
  'developed' 'published']]


#### Adjectives

In [16]:
print(adjectives)
print(n_nearest_neighbors_of_words(C_pmi, adjectives, V, V_set, 10))

['old', 'central', 'late', 'political', 'important']
[['former' 'new' 'and' 'ancient' "'s" 'original' 'road' 'near'
  'traditional' 'own']
 ['southern' 'northern' 'western' 'eastern' 'east' 'northeastern' 'south'
  'southeastern' 'north' 'west']
 ['early' 'mid' 'october' 'december' 'september' 'january' 'february'
  'july' 'november' 'june']
 ['social' 'religious' 'economic' 'cultural' 'legal' 'communist'
  'financial' 'military' 'liberal' 'government']
 ['significant' 'major' 'notable' 'main' 'key' 'interesting' 'valuable'
  'useful' 'crucial' 'specific']]


#### Prepositions

In [17]:
print(prepositions)
print(n_nearest_neighbors_of_words(C_pmi, prepositions, V, V_set, 10))

['about', 'for', 'before', 'in', 'always']
[['over' 'approximately' 'than' 'around' 'years' 'people' 'some' 'when'
  'or' 'are']
 ['or' 'with' 'and' 'on' 'as' 'at' 'from' 'are' 'by' 'of']
 ['after' 'while' 'when' 'without' 'from' 'until' 'during' 'into' 'and'
  'began']
 ['.' 'at' 'from' 'february' ')' 'january' 'june' 'november' 'october'
  'july']
 ['usually' 'often' 'never' 'generally' 'still' 'really' 'actually' "'t"
  'sometimes' 'just']]


#### Multiple senses words 

In [20]:
# multiple senses words 
print(multiple_sences_words)
print(n_nearest_neighbors_of_words(C_pmi, multiple_sences_words, V, V_set, 10))

['bank', 'cell', 'apple', 'apples', 'axes', 'frame', 'light', 'well']


  dist = 1.0 - uv / np.sqrt(uu * vv)


[['banks' 'insurance' 'company' 'corporation' 'banking' 'government'
  'railway' 'library' 'business' 'companies']
 ['cells' 'cellular' 'tissue' 'neuronal' 'protein' 'proteins' 'syndrome'
  'brain' 'tissues' 'molecules']
 ['chili' 'cherry' 'olive' 'plum' 'almond' 'desktop' 'xm' 'pear' 'peach'
  'palm']
 ['bananas' 'brains' 'israelis' 'kinds' 'olives' 'oranges' 'candles'
  'jokes' 'specifics' 'translucent']
 ['facets' 'paths' 'phases' 'tributaries' 'concurrency' 'atypical'
  'engravings' 'b-sides' 'antagonists' 'rubble']
 ['frames' 'brick' 'two-story' 'tubing' 'rear' 'framed' 'cornice'
  'panels' 'framing' 'structure']
 ['heavy' 'lights' 'bright' 'dark' 'radiation' 'pale' 'fire' 'water' 'or'
  'regiment']
 ['poorly' 'be' 'been' 'however' 'there' 'not' 'debate' 'preserved'
  'discussion' 'united']]


### Windows size = 6

In [21]:
C_pmi = []

In [22]:
C_pmi = compute_PMI(counting(corpus, V, V_C, V_set, V_C_set, w=6))

100%|██████████| 997898/997898 [02:59<00:00, 5565.11it/s]
100%|██████████| 25000/25000 [06:09<00:00, 67.70it/s]


In [26]:
print(n_nearest_neighbors(C_pmi, "monster", V, V_set, 10))

['evil', 'giant', 'creature', 'monsters', 'godzilla', 'dragon', 'dog', 'ghost', 'horror', 'girl']


#### Nouns

In [36]:
print(nouns)
print(n_nearest_neighbors_of_words(C_pmi, nouns, V, V_set, 10))

['school', 'city', 'information', 'family', 'government']
[['college' 'university' 'high' 'city' 'students' 'schools' 'district'
  'year' 'county' 'state']
 ['town' 'north' 'located' 'south' 'county' 'area' 'west' 'district'
  'school' 'york']
 ['sources' 'data' 'wikipedia' 'any' 'articles' 'content' 'can' 'source'
  'use' 'if']
 ['father' 'known' 'house' 'her' 'son' 'born' 'species' 'church' 'who'
  'john']
 ['party' 'federal' 'political' 'minister' 'state' 'military' 'council'
  'president' 'public' 'act']]


#### Verbs

In [37]:
print(verbs)
print(n_nearest_neighbors_of_words(C_pmi, verbs, V, V_set, 10))

['edits', 'held', 'took', 'published', 'created']
[['edit' 'comments' 'user' 'discussion' 'talk' 'page' 'please'
  'wikipedia' 'appropriate' 'me']
 ['won' 'took' 'september' 'august' 'june' 'july' 'national' 'october'
  'november' 'march']
 ['place' 'held' 'became' 'march' 'began' 'june' 'january' 'april' 'july'
  'during']
 ['book' 'books' 'wrote' 'written' 'magazine' 'journal' 'author' 'novel'
  'born' 'released']
 ['using' 'including' 'produced' 'based' 'published' 'called' 'written'
  'create' 'developed' 'named']]


#### Adjectives

In [38]:
print(adjectives)
print(n_nearest_neighbors_of_words(C_pmi, adjectives, V, V_set, 10))

['old', 'central', 'late', 'political', 'important']
[['built' 'near' 'road' 'town' 'street' 'house' 'village' 'along' 'named'
  'around']
 ['southern' 'south' 'east' 'northern' 'north' 'eastern' 'located' 'west'
  'western' 'region']
 ['early' 'century' 'during' 'until' 'period' 'began' 'april' 'july'
  'september' 'august']
 ['social' 'government' 'party' 'economic' 'movement' 'religious'
  'rights' 'politics' 'civil' 'cultural']
 ['significant' 'most' 'such' 'these' 'various' 'many' 'development'
  'include' 'often' 'especially']]


#### Prepositions

In [39]:
print(prepositions)
print(n_nearest_neighbors_of_words(C_pmi, prepositions, V, V_set, 10))

['about', 'for', 'before', 'in', 'always']
[['what' 'there' 'you' 'than' 'if' 'more' 'some' 'people' 'so' 'i']
 ['as' 'be' 'to' 'not' 'this' 'on' 'that' 'provide' 'their' 'a']
 ['after' 'then' 'when' 'until' 'during' 'later' 'him' 'again' 'out'
  'back']
 ['at' 'was' 'of' 'the' 'university' 'from' 'first' 'he' 'city' 'january']
 ['your' 'something' 'we' 'me' 'even' 'often' 'might' 'because' 'very'
  'must']]


#### Multiple senses words 

In [23]:
# multiple senses words 
print(multiple_sences_words)
print(n_nearest_neighbors_of_words(C_pmi, multiple_sences_words, V, V_set, 10))

['bank', 'cell', 'apple', 'apples', 'axes', 'frame', 'light', 'well']
[['banks' 'river' 'corporation' 'company' 'west' 'capital' 'located'
  'railway' 'banking' 'east']
 ['cells' 'protein' 'proteins' 'membrane' 'cellular' 'dna' 'receptor'
  'signaling' 'molecules' 'blood']
 ['os' 'macintosh' 'microsoft' 'ios' 'mac' 'software' 'desktop' 'ipad'
  'iphone' 'atari']
 ['oranges' 'sugarcane' 'fruits' 'fruit' 'citrus' 'tomatoes' 'corn'
  'wheat' 'mango' 'seeds']
 ['grind' 'angles' 'vectors' 'axe' 'flint' 'intersect' 'orthogonal'
  'coordinate' 'directions' 'neolithic']
 ['roof' 'steel' 'rear' 'brick' 'frames' 'structure' 'two-story' 'wooden'
  'metal' 'one-story']
 ['using' 'surface' 'dark' 'water' 'red' 'usually' 'color' 'body' 'often'
  'white']
 ['such' 'many' 'other' 'including' 'most' 'some' 'like' 'several'
  'these' 'both']]


### Analysis result

Q1: Do nearest neighbors tend to have the same part-of-speech tag as the query word, or do they differ? (Nearest neighbors có cùng xu hướng cùng kiểu từ loại hay khác nhau?)

A: Chúng có cùng kiểu

Q2: Does the pattern differ across different part-of-speech tags for the query word? How does window size affect this?(Các từ trong mỗi kiểu từ loại có khác nhau về kiểu ý nghĩa hay không? Window size ảnh hưởng như thế nào?)

A: Hầu hết các từ đơn nghĩa sẽ không khác nhau về kiểu ý nghĩa, các từ đa nghĩa sẽ trộn lẫn với nhau gây sự khác nhau về kiểu ý nghĩa. Window size khiến cho các từ tương đồng khác nhau ở mỗi window size, có thể sai đi hoặc đúng hơn.

Q3: When the neighbors differ between window sizes, how do
they differ? Can you find any query words that have almost exactly the same nearest neighbors with
the two window sizes?

A: ?????

## 2 Dimensionality Reduction

In [30]:
from sklearn.decomposition import TruncatedSVD

In [24]:
V = open("31190-a1-files/vocab-25k+wordsim.txt").read().split("\n")[:-1]
V_C = open("31190-a1-files/vocab-3k.txt").read().split("\n")[:-1]
V_set = {word: idx for idx, word in enumerate(V)}
V_C_set = {word: idx for idx, word in enumerate(V_C)}

In [26]:
C_pmi = []

In [27]:
C_pmi = compute_PMI(counting(corpus, V, V_C, V_set, V_C_set, w=3))

100%|██████████| 997898/997898 [01:44<00:00, 9565.13it/s] 
100%|██████████| 26577/26577 [01:00<00:00, 440.52it/s]


In [29]:
# calculate baseline of Spearmanr
spearmanr_correlation_MEN = eval_word_similarity(C_pmi, "31190-a1-files/men.txt")
spearmanr_correlation_SimLex = eval_word_similarity(C_pmi, "31190-a1-files/simlex-999.txt")
print(spearmanr_correlation_MEN, spearmanr_correlation_SimLex)

0.4253589402017067 0.18795052209223617


  dist = 1.0 - uv / np.sqrt(uu * vv)


### Compute the truncated SVD of Cpmi

In [31]:
k_values = [10, 50, 100, 500, 1000]

In [47]:
def calculateSVD_C_pmi(C_pmi, k):
  svd = TruncatedSVD(n_components=k, n_iter=7)
  C_svd = svd.fit_transform(C_pmi)
  print(C_svd.shape)
  spearmanr_correlation_MEN = eval_word_similarity(C_svd, "31190-a1-files/men.txt")
  spearmanr_correlation_SimLex = eval_word_similarity(C_svd, "31190-a1-files/simlex-999.txt")
  print(spearmanr_correlation_MEN, spearmanr_correlation_SimLex)

In [48]:
for k in k_values:
  calculateSVD_C_pmi(C_pmi, k)

(26577, 10)
0.2519587441740883 0.14792814542124177


  dist = 1.0 - uv / np.sqrt(uu * vv)


(26577, 50)
0.38950313339406056 0.1809739338171111


  dist = 1.0 - uv / np.sqrt(uu * vv)


(26577, 100)


  dist = 1.0 - uv / np.sqrt(uu * vv)


0.4212942064337926 0.19407990004506143
(26577, 500)


  dist = 1.0 - uv / np.sqrt(uu * vv)


0.43046451544195596 0.19020648979005148
(26577, 1000)
0.42805560669596865 0.19240255889479801


  dist = 1.0 - uv / np.sqrt(uu * vv)


Với k = {500, 1000} của spearmanr correlation của 2 tập dataset có giá trị cao hơn so với C_pmi ban đầu khi chưa giảm số chiều