In [None]:
'''
Utilized code from https://github.com/mfaruqui/retrofitting for implementation
Reference paper: https://arxiv.org/abs/1411.4166 by Faruqui et al

'''

In [None]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [None]:
import os
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
os.chdir('/gdrive/My Drive/NLP_Project_Programs/Paper_Implementation')
os.listdir()

['Vector Representation of Words',
 'lexicons',
 '.gitignore',
 'README.md',
 'sample_vec.txt',
 'LICENSE',
 'retrofit.py',
 'out_vec.txt',
 'Evaluation_Benchmarks',
 'retrofit.ipynb',
 'Untitled0.ipynb']

In [None]:
import argparse
import gzip
import math
import numpy
import re
import sys
#pawann helloooooooooo    
from copy import deepcopy

# Pattern for tokens that contain at least one digit.
isNumber = re.compile(r'\d+.*')


def norm_word(word):
  """Collapse a raw token onto the form used as a lexicon key.

  Tokens containing a digit become '---num---', tokens with no word
  characters at all (including the empty string) become '---punc---',
  and everything else is returned lower-cased.
  """
  lowered = word.lower()
  if isNumber.search(lowered) is not None:
    return '---num---'
  # Removing every run of non-word characters leaves nothing behind for
  # pure punctuation.
  if not re.sub(r'\W+', '', word):
    return '---punc---'
  return lowered

''' Read all the word vectors and normalize them '''
def read_word_vecs(filename):
  """Load word vectors from a text file and scale each one to unit length.

  Every non-blank line is lower-cased and split on whitespace: the first
  token is the word, the remaining tokens are the vector components. If a
  word occurs on several lines the last occurrence wins.

  Parameters
  ----------
  filename : str
    Path of the vector file. Names ending in '.gz' are read through gzip.

  Returns
  -------
  dict
    Maps each word (str) to a 1-D float numpy array divided by
    sqrt(sum of squares + 1e-6).
  """
  wordVectors = {}
  # gzip.open defaults to *binary* for mode 'r', which made every key a bytes
  # object for .gz input; 'rt' yields str lines from both openers.
  opener = gzip.open if filename.endswith('.gz') else open

  with opener(filename, 'rt') as fileObject:
    for line in fileObject:
      tokens = line.strip().lower().split()
      if not tokens:
        # Blank line: nothing to parse (previously an IndexError).
        continue
      vector = numpy.array([float(vecVal) for vecVal in tokens[1:]], dtype=float)
      # The small epsilon keeps an all-zero vector from dividing by zero.
      vector /= math.sqrt((vector**2).sum() + 1e-6)
      wordVectors[tokens[0]] = vector

  sys.stderr.write("Vectors read from: "+filename+" \n")
  return wordVectors
    
''' Read the PPDB word relations as a dictionary '''
def read_lexicon(filename):
  """Read a lexicon file into a dict of word -> list of related words.

  Each non-blank line is lower-cased and split on whitespace; the first
  token is the key and the remaining tokens are its neighbours. Every token
  is passed through norm_word. If a key occurs on several lines the last
  line wins.

  Parameters
  ----------
  filename : str
    Path of the UTF-8 encoded lexicon file.

  Returns
  -------
  dict
    Normalised head word -> list of normalised neighbour words.
  """
  lexicon = {}
  # The context manager closes the handle, which the bare open() never did.
  with open(filename, 'r', encoding="utf8") as lexiconFile:
    for line in lexiconFile:
      words = line.lower().strip().split()
      if not words:
        # Blank line: there is no head word to index.
        continue
      lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
  return lexicon

''' Write word vectors to file '''
def print_word_vecs(wordVectors, outFileName):
  """Write word vectors to a text file, one word per line.

  Each line is the word followed by its components formatted with '%.4f';
  every field, including the last, is followed by a single space.

  Parameters
  ----------
  wordVectors : dict
    Maps each word to an iterable of numbers.
  outFileName : str
    Path of the file to create or overwrite.
  """
  sys.stderr.write('\nWriting down the vectors in '+outFileName+'\n')
  # `with` guarantees the file is closed even if formatting a value fails.
  with open(outFileName, 'w') as outFile:
    for word, values in wordVectors.items():
      outFile.write(word+' ')
      outFile.write(''.join('%.4f' %(val)+' ' for val in values))
      outFile.write('\n')

''' Retrofit word vectors to a lexicon '''
def retrofit(wordVecs, lexicon, numIters):
  """Move word vectors towards their lexicon neighbours.

  Works on a deep copy, so `wordVecs` itself is not modified. Only words that
  have both a vector and a lexicon entry are updated; neighbours without a
  vector are ignored. Each update replaces the word's vector with

    (n * original_vector + sum of current neighbour vectors) / (2 * n)

  where n is the number of usable neighbours. Updates within an iteration
  see the neighbours' already-updated vectors.

  Parameters
  ----------
  wordVecs : dict
    Word -> numpy vector.
  lexicon : dict
    Word -> list of neighbour words.
  numIters : int
    Number of passes over the updatable words.

  Returns
  -------
  dict
    Word -> retrofitted vector, with the same keys as `wordVecs`.
  """
  retrofitted = deepcopy(wordVecs)
  vocabulary = set(retrofitted.keys())
  updatable = vocabulary.intersection(set(lexicon.keys()))
  for _ in range(numIters):
    for word in updatable:
      neighbours = set(lexicon[word]).intersection(vocabulary)
      degree = len(neighbours)
      if degree > 0:
        # The original (data) estimate is weighted by the neighbour count,
        # each neighbour's current vector by 1.
        accumulated = degree * wordVecs[word]
        for neighbour in neighbours:
          accumulated += retrofitted[neighbour]
        retrofitted[word] = accumulated/(2*degree)
      # Words with no usable neighbours keep their data estimate.
  return retrofitted
  
# if __name__=='__main__':

#   parser = argparse.ArgumentParser()
#   parser.add_argument("-i", "--input", type=str, default=None, help="Input word vecs")
#   parser.add_argument("-l", "--lexicon", type=str, default=None, help="Lexicon file name")
#   parser.add_argument("-o", "--output", type=str, help="Output word vecs")
#   parser.add_argument("-n", "--numiter", type=int, default=10, help="Num iterations")
#   args = parser.parse_args()

  # wordVecs = read_word_vecs(args.input)
  # lexicon = read_lexicon(args.lexicon)
  # numIter = int(args.numiter)
  # outFileName = args.output
  
  # ''' Enrich the word vectors using ppdb and print the enriched vectors '''
  # print_word_vecs(retrofit(wordVecs, lexicon, numIter), outFileName) 


# Inputs and outputs of the retrofitting run, relative to the working directory.
input_vector = 'sample_vec.txt'
lexicon_path = 'lexicons/ppdb-xl.txt'
numiter = 10
output = 'out_vec.txt'

wordVecs = read_word_vecs(input_vector)
# The path and the parsed lexicon live under different names so the cell can be
# re-run: rebinding `lexicon` from the path to the dict made a second run pass
# the dict to open().
lexicon = read_lexicon(lexicon_path)
numIter = int(numiter)
outFileName = output

# Enrich the word vectors using ppdb and print the enriched vectors
print_word_vecs(retrofit(wordVecs, lexicon, numIter), outFileName)

In [None]:
def process_1_word_vecs(path):
  """Stub: no processing is implemented yet.

  `path` is accepted but not used, and None is returned.
  """
  return

path = 'Vector Representation of Words/1/...'
process_1_word_vecs(path)

In [None]:
def process_2_word_vecs(path):
  """Convert a binary word2vec file to text format in the same directory.

  Loads `path` with KeyedVectors.load_word2vec_format(binary=True) and saves
  it with binary=False as 'GoogleNews-vectors-negative300.txt' in the
  directory that contains `path`. Prints a progress message after each step.

  Parameters
  ----------
  path : str
    Path of the binary word2vec file.

  Returns
  -------
  None
  """
  model = KeyedVectors.load_word2vec_format(path, binary=True)
  print('Done step 1')
  # 3000001 lines. 1st line is just description and to be ignored.
  # os.path.dirname keeps a bare file name in the working directory; joining
  # the split parts with '/' produced a filesystem-root path in that case.
  outPath = os.path.join(os.path.dirname(path), 'GoogleNews-vectors-negative300.txt')
  model.save_word2vec_format(outPath, binary=False)
  print('Done step 2')
  return None

path = 'Vector Representation of Words/2/GoogleNews-vectors-negative300.bin'
process_2_word_vecs(path)

In [None]:
def process_3_word_vecs(path_vocab, path_wordvectors):
  """Merge a vocabulary file and a parallel vector file into one text file.

  Line i of `path_vocab` supplies the word and line i of `path_wordvectors`
  supplies its whitespace-separated components. Each output line is the word
  followed by the components formatted with '%.6f'; every field is followed
  by a single space. Pairing stops at the end of the shorter file.

  The result is written to '3_word_vectors.txt' in the directory that
  contains `path_wordvectors`.

  Parameters
  ----------
  path_vocab : str
    File with one word per line.
  path_wordvectors : str
    File with one vector per line.

  Returns
  -------
  None
  """
  # os.path.dirname keeps a bare file name in the working directory; joining
  # the split parts with '/' produced a filesystem-root path in that case.
  outFileName = os.path.join(os.path.dirname(path_wordvectors), '3_word_vectors.txt')

  # All three handles are closed on exit, including when a value fails to parse.
  with open(path_vocab, 'r') as vocabObject, \
       open(path_wordvectors, 'r') as wordvectorsObject, \
       open(outFileName, 'w') as outFile:
    for word, vector in zip(vocabObject, wordvectorsObject):
      formatted = ''.join('%.6f' %(float(val))+' ' for val in vector.strip().split())
      outFile.write(word.strip()+' '+formatted+'\n')

  return None

path_vocab = 'Vector Representation of Words/3/vocab.txt'
path_wordvectors = 'Vector Representation of Words/3/wordVectors.txt'
process_3_word_vecs(path_vocab, path_wordvectors)

In [None]:
def process_4_word_vecs(path):
  """Stub: no processing is implemented yet.

  `path` is accepted but not used, and None is returned.
  """
  return

path = 'Vector Representation of Words/4/...'
process_4_word_vecs(path)

In [None]:
# NOTE(review): this loop has no exit condition; it keeps appending to `temp`
# until the kernel is interrupted or memory runs out, so "Run All" cannot get
# past this cell. Presumably a deliberate way to exhaust the runtime's RAM —
# confirm, and remove or guard it if it is not needed.
temp = []
while True:
  temp.append('aaaaaaaaaaaaabbbbbbbbbbbbbbbcccccccccccc')

In [None]:
# Path of the file to inspect (the text file written by process_2_word_vecs).
path_vocab = 'Vector Representation of Words/2/GoogleNews-vectors-negative300.txt'

# The handle is not closed in this cell; the following cell iterates over it.
vocabObject = open(path_vocab, 'r')
# wordvectorsObject = open(path_wordvectors, 'r')
# with open(path_vocab, "r+") as f:
#     d = f.readlines()
#     f.seek(0)
#     for index, i in enumerate(d[1:]):
#       if index%10000==0:print('->', index,'/ 3000000')
#       f.write(i)
#     f.truncate()

In [None]:
# Print the next 11 lines of the open handle, each with its whitespace-token
# count. `vocabObject` is a shared file iterator, so re-running this cell
# continues from where the previous run stopped rather than from the top.
for index, line in enumerate(vocabObject):
  print(line, len(line.split()))
  if index ==10:break

</s> 0.0011291504 -0.00089645386 0.00031852722 0.0015335083 0.0011062622 -0.0014038086 -3.0517578e-05 -0.0004196167 -0.0005760193 0.0010757446 -0.0010223389 -0.00061798096 -0.00075531006 0.0014038086 -0.0016403198 -0.00063323975 0.0016326904 -0.0010070801 -0.0012664795 0.00065231323 -0.000415802 -0.0010757446 0.0015258789 -0.0002746582 0.00014019012 0.0015716553 0.0013580322 -0.000831604 -0.0014038086 0.0015792847 0.00025367737 -0.0007324219 -0.00010538101 -0.0011672974 0.0015792847 0.00065612793 -0.0006599426 2.9206276e-06 0.0011291504 0.0004272461 -0.00037002563 -0.0011520386 0.0012664795 -3.516674e-06 0.00026512146 -0.00040245056 0.0001411438 -3.361702e-05 0.00075912476 -0.0005187988 -7.104874e-05 0.00060272217 -0.00050735474 -0.001625061 -0.00043678284 -0.0009918213 -0.0012207031 -0.00032234192 6.866455e-05 -0.0011672974 -0.00051116943 0.001411438 0.00033569336 -0.0004749298 -0.001373291 0.00036621094 -0.0014419556 -0.00060653687 0.0008010864 0.0011291504 -0.0008354187 -0.001159668

In [None]:
# sum(1 for line in vocabObject)

Check WS353 Dataset


In [None]:
def _count_ws353_lines(path, skip_hash_lines=False):
  """Count the lines of a text file and close it afterwards.

  With `skip_hash_lines=True`, a line is left out when its first
  space-separated token (after stripping surrounding spaces) is exactly '#'.
  """
  with open(path, 'r') as handle:
    if skip_hash_lines:
      return sum(1 for line in handle if line.strip(' ').split(' ')[0] != '#')
    return sum(1 for _line in handle)


path1 = 'Evaluation_Benchmarks/Word_Similarity/WS353/wordsim_similarity_goldstandard.txt'
path2 = 'Evaluation_Benchmarks/Word_Similarity/WS353/wordsim_relatedness_goldstandard.txt'
path3 = 'Evaluation_Benchmarks/Word_Similarity/WS353/wordsim353_agreed.txt'
path4 = 'Evaluation_Benchmarks/Word_Similarity/WS353/wordsim353_annotator1.txt'
path5 = 'Evaluation_Benchmarks/Word_Similarity/WS353/wordsim353_annotator2.txt'

# The two gold-standard files are counted in full; '#' lines are skipped in
# the agreed and per-annotator files.
count1 = _count_ws353_lines(path1)
count2 = _count_ws353_lines(path2)
count3 = _count_ws353_lines(path3, skip_hash_lines=True)
count4 = _count_ws353_lines(path4, skip_hash_lines=True)
count5 = _count_ws353_lines(path5, skip_hash_lines=True)

print("sim",count1)
print("rel",count2)
print("ag",count3)
print("ann1",count4)
print("ann2",count5)


sim 203
rel 252
ag 352
ann1 352
ann2 352


Check SYN-REL dataset

In [None]:
path_1 = 'Evaluation_Benchmarks/Syntactic_Relns/questions-words.txt'
# Count every line of the file; the context manager closes the handle, which
# the bare open() never did.
with open(path_1,'r') as synrel_obj:
  count = sum(1 for _line in synrel_obj)
print("total", count)


total 19558


In [None]:
# Number of leading question lines (headers excluded) to skip before copying.
# NOTE(review): 8869 is presumably the number of semantic questions that come
# before the syntactic ones in questions-words.txt — confirm against the file.
NUM_LEADING_QUESTIONS = 8869

path_1 = 'Evaluation_Benchmarks/Syntactic_Relns/questions-words.txt'
outFile_path = '/'.join(path_1.split('/')[:-1]) + '/syntactic_alone.txt'

count1 = 0
# Both handles are closed on exit, including when an error interrupts the copy.
with open(path_1,'r') as synrel_obj1, open(outFile_path, 'w') as outFile_obj:
  for line in synrel_obj1:
    # Lines whose first token is ':' are neither counted nor copied.
    if line.strip().split(' ')[0] == ':':
      continue
    # Copy only once the leading questions have all been seen.
    if count1 >= NUM_LEADING_QUESTIONS:
      outFile_obj.write(line.strip()+'\n')
    count1 += 1

print("total", count1)
with open(outFile_path, 'r') as synrel_obj2:
  count2 = sum(1 for _line in synrel_obj2)
print("only syntactic", count2)




total 19544
only syntactic 10675


Check Sentiment Analysis (SA) dataset

In [None]:
path_1 = 'Evaluation_Benchmarks/Sentiment_Analysis/train.txt'
# Count every line of the file; the context manager closes the handle, which
# the bare open() never did.
with open(path_1,'r') as sen_obj:
  count = sum(1 for _line in sen_obj)
print("total", count)

total 8544


In [None]:
import math
import numpy as np
from numpy import dot
from numpy.linalg import norm
from scipy.stats import spearmanr
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors


def _benchmark_spearman(path_to_dataset, word_vec, score_scale, label):
  """Spearman correlation between gold scores and cosine similarities for one benchmark file.

  Each non-blank line is lower-cased and split on whitespace into
  `word1 word2 score`. A pair with a missing vector gets similarity 0 and is
  counted; a non-zero count is printed as 'Missed in <label>: <count>'.
  """
  sim_score_true = []
  sim_score_vecs = []
  missed_count = 0
  with open(path_to_dataset, 'r') as dataset:
    for line in dataset:
      fields = line.strip().lower().split()
      if not fields:
        # Blank line: nothing to score.
        continue
      sim_score_true.append(float(fields[2])/score_scale)
      try:
        a = word_vec[fields[0]]
        b = word_vec[fields[1]]
      except KeyError:
        # No vector for at least one word of the pair.
        missed_count += 1
        cos_sim = 0
      else:
        cos_sim = dot(a, b)/(norm(a)*norm(b))
      sim_score_vecs.append(cos_sim)
  if missed_count>0:print('Missed in '+label+':', missed_count)

  return spearmanr(sim_score_true, sim_score_vecs)[0]


def word_similarity_tasks(word_vec=None):
  """
  Compute Spearman's Correlation Coefficient for each of the 3 tasks for the given word_vec.

  The benchmark files are read from fixed paths relative to the working
  directory. Word pairs without a vector for either word are scored 0 and
  reported on stdout.

  Parameters
  ----------
  word_vec : dict
    A dictionary with keys as words and values as vectors. Any object whose
    `word_vec[word]` raises KeyError for unknown words also works. None is
    treated as an empty mapping, so every pair counts as missed.

  Returns
  -------
  arg1 : float
    Spearman's Correlation Coefficient for WS-353 task.
  arg2 : float
    Spearman's Correlation Coefficient for RG-65 task.
  arg3 : float
    Spearman's Correlation Coefficient for MEN task.
  """
  if word_vec is None:
    word_vec = {}

  # Gold scores are divided by 10 (WS-353, RG-65) or 50 (MEN) as before.
  WS_353_spearman = _benchmark_spearman(
      'Evaluation_Benchmarks/Word_Similarity/WS353_Exact/EN-WS-353-ALL.txt', word_vec, 10, 'WS353')
  RG_65_spearman = _benchmark_spearman(
      'Evaluation_Benchmarks/Word_Similarity/RG65/EN-RG-65.txt', word_vec, 10, 'RG65')
  MEN_spearman = _benchmark_spearman(
      'Evaluation_Benchmarks/Word_Similarity/MEN/EN-MEN-TR-3k.txt', word_vec, 50, 'MEN')

  return WS_353_spearman, RG_65_spearman, MEN_spearman

''' Read all the word vectors and normalize them '''
def read_word_vecs(filename):
  """Load word vectors from a plain-text file and scale each one to unit length.

  This definition replaces the earlier read_word_vecs in the notebook and,
  unlike it, does not handle '.gz' files or write to stderr.

  Every non-blank line is lower-cased and split on whitespace: the first
  token is the word, the remaining tokens are the vector components. If a
  word occurs on several lines the last occurrence wins.

  Parameters
  ----------
  filename : str
    Path of the vector file.

  Returns
  -------
  dict
    Maps each word to a 1-D float numpy array divided by
    sqrt(sum of squares + 1e-6).
  """
  wordVectors = {}
  # The context manager closes the handle, which the bare open() never did.
  with open(filename, 'r') as fileObject:
    for line in fileObject:
      tokens = line.strip().lower().split()
      if not tokens:
        # Blank line: nothing to parse (previously an IndexError).
        continue
      vector = np.array([float(vecVal) for vecVal in tokens[1:]], dtype=float)
      # The small epsilon keeps an all-zero vector from dividing by zero.
      vector /= math.sqrt((vector**2).sum() + 1e-6)
      wordVectors[tokens[0]] = vector

  return wordVectors


# word_vec_filenames = ['Vector Representation of Words/1/glove.6B.300d.txt','Vector Representation of Words/2/GoogleNews-vectors-negative300.txt','Vector Representation of Words/3/3_word_vectors.txt']
word_vec_filenames = ['Vector Representation of Words/3/3_word_vectors.txt']

# One table row per vector file: file name (at most 20 characters) followed by
# the three Spearman coefficients with four decimals.
print('Word Vector\t\t WS-353\t\t RG-65\t\t MEN')
for filename in word_vec_filenames:
  word_vec = read_word_vecs(filename)
  # word_vec = KeyedVectors.load_word2vec_format('Vector Representation of Words/2/GoogleNews-vectors-negative300.bin', binary=True)
  WS_353_spearman, RG_65_spearman, MEN_spearman = word_similarity_tasks(word_vec)
  row = [filename.split('/')[-1][:20]]
  row.extend('%.4f' %(score) for score in (WS_353_spearman, RG_65_spearman, MEN_spearman))
  print(' \t '.join(row))


Word Vector		 WS-353		 RG-65		 MEN
Missed in MEN: 1
3_word_vectors.txt 	 0.6258 	 0.6299 	 0.3140


In [None]:
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Cross-check with gensim's own evaluator: load the binary GoogleNews vectors
# and score them on the wordsim353.tsv file from gensim's test data.
word_vec_bin = KeyedVectors.load_word2vec_format('Vector Representation of Words/2/GoogleNews-vectors-negative300.bin', binary=True)
# Last expression of the cell, so its return value is what the notebook shows.
word_vec_bin.evaluate_word_pairs(datapath('wordsim353.tsv'))
# word_vec = Word2Vec.load_word2vec_format(filename, binary=False)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if np.issubdtype(vec.dtype, np.int):


((0.6238773466616107, 1.7963237724171284e-39),
 SpearmanrResult(correlation=0.6589215888009288, pvalue=2.5346056459149263e-45),
 0.0)

In [None]:
# Average number of components per vector in `word_vec`.
lengths = [len(vector) for vector in word_vec.values()]
print(np.mean(lengths))


50.0


'/usr/local/lib/python3.6/dist-packages/gensim/test/test_data/wordsim353.tsv'

In [None]:

# `model` is not bound anywhere else at notebook level (the only other `model`
# is local to process_2_word_vecs), so this cell failed on a fresh kernel.
# Build it here: convert the merged vector file to word2vec text format and
# load the converted file with gensim.
glove_file = 'Vector Representation of Words/3/3_word_vectors.txt'
tmp_file = 'Vector Representation of Words/3/temp.txt'

glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)
model.evaluate_word_pairs(datapath('wordsim353.tsv')),word_vec_bin.evaluate_word_pairs(datapath('wordsim353.tsv'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if np.issubdtype(vec.dtype, np.int):


(((0.6139829092144378, 5.950038844375029e-38),
  SpearmanrResult(correlation=0.625762577349116, pvalue=9.08944637013904e-40),
  0.0),
 ((0.6238773466616107, 1.7963237724171284e-39),
  SpearmanrResult(correlation=0.6589215888009288, pvalue=2.5346056459149263e-45),
  0.0))