In [1]:
import evaluation.model as md 
# Load the embedding models used by the rest of the notebook.
# The "en" model is read with md.TextModel from a path relative to the notebook.
model_static_en = md.TextModel('../data/models/wiki.en.vec')
#model_static_es = md.TextModel('/gdrive/MyDrive/MUSE/data/wiki.es.vec')
# NOTE(review): this is an absolute, machine-specific path, so the cell will not
# run on another machine without editing it — consider a configurable data dir.
# The "fa" model is read with md.FastTextModel rather than md.TextModel.
model_static_fa = md.FastTextModel('/home/mahmoudi/data/blogs_skipgram_300_3.out')

['2519370', '300']


KeyboardInterrupt: 

In [None]:
from scipy.spatial import distance
from scipy.special import softmax

import numpy as np
import re 

class ProcrustesModel(md.Model):
    """Embedding model wrapper that applies the matrix ``w`` to every vector.

    Vocabulary queries are forwarded unchanged to the wrapped ``model``;
    only ``get_word_vector`` alters the result.
    """

    def __init__(self, model , w):
        self.w = w
        self.model = model

    def get_word_vector(self, word):
        """Return the wrapped model's vector for ``word`` multiplied by ``w``."""
        base_vector = self.model.get_word_vector(word)
        mapped = np.matmul(self.w, base_vector.transpose())
        return mapped.transpose()

    def word_exist(self, word):
        """Forward the vocabulary membership check to the wrapped model."""
        return self.model.word_exist(word)

    def get_word_in_index(self, index):
        """Forward the index-to-word lookup to the wrapped model."""
        return self.model.get_word_in_index(index)

    def get_word_index(self, word):
        """Forward the word-to-index lookup to the wrapped model."""
        return self.model.get_word_index(word)

class LocalConverted(md.Model):
    """Embedding model wrapper that converts each vector via ``batch_convert``.

    The wrapped model's vector is used as a single query against the stored
    support vectors; vocabulary queries are forwarded to the wrapped model.
    """

    def __init__(self, src_model , support_vecs_src , support_vecs_dst):
        self.support_vecs_dst = support_vecs_dst
        self.support_vecs_src = support_vecs_src
        self.model = src_model

    def get_word_vector(self, word):
        """Return the converted vector for ``word`` as a flat array."""
        # batch_convert works on a matrix of queries, so wrap the vector as one row.
        query_row = np.reshape(self.model.get_word_vector(word), (1 , -1))
        converted = batch_convert(self.support_vecs_src, self.support_vecs_dst, query_row)
        return converted.flatten()

    def word_exist(self, word):
        """Forward the vocabulary membership check to the wrapped model."""
        return self.model.word_exist(word)

    def get_word_in_index(self, index):
        """Forward the index-to-word lookup to the wrapped model."""
        return self.model.get_word_in_index(index)

    def get_word_index(self, word):
        """Forward the word-to-index lookup to the wrapped model."""
        return self.model.get_word_index(word)


def load_dictionary(src_tgt_file):
  """Read a whitespace-separated bilingual dictionary file.

  Each non-blank line is split on runs of whitespace; the first field is the
  source word and the second the target word. Lines with fewer than two
  fields (including blank lines) are skipped.

  Args:
    src_tgt_file: path to the dictionary text file (UTF-8).

  Returns:
    A tuple ``(entries, dic_map, dic_rev_map)`` where ``entries`` is the list
    of split lines in file order, ``dic_map`` maps source word -> target word
    and ``dic_rev_map`` maps target word -> source word. For repeated keys
    the last occurrence in the file wins.
  """
  ret = []
  dic_map = {}
  dic_rev_map = {}
  with open(src_tgt_file, 'r' , encoding='utf8 ') as dict_file:
    for line in dict_file:
      # Split on whitespace runs so repeated separators do not yield empty fields.
      entry = re.split(r'\s+' , line.strip())
      if len(entry) < 2:
        # Blank or malformed line: there is no (source, target) pair to record.
        continue
      ret.append(entry)
      dic_map[entry[0]] = entry[1]
      # The reverse map must be filled for every pair, not only the last one.
      dic_rev_map[entry[1]] = entry[0]
  return ret , dic_map , dic_rev_map



def find_k_neighbhor(support_vectors, query, k = 5):
  """Find the ``k`` rows of ``support_vectors`` closest to ``query``.

  Closeness is measured with cosine distance, so smaller means nearer.

  Returns:
    ``(indices, distances)``: the row indices of the nearest rows in
    ascending distance order, and their cosine distances to ``query``.
  """
  row_count = support_vectors.shape[0]
  cosine_dists = np.fromiter(
    (distance.cosine(support_vectors[row, :], query) for row in range(row_count)),
    dtype=float,
    count=row_count,
  )
  nearest = np.argsort(cosine_dists)[:k]
  return nearest, cosine_dists[nearest]

def load_vectors(src_model, tgt_model, dict):
  """Build aligned source/target matrices for the dictionary pairs both models know.

  Pairs whose source word is missing from ``src_model`` or whose target word
  is missing from ``tgt_model`` are printed and left out.

  Args:
    src_model: object exposing ``word_exist`` and ``get_word_vector``.
    tgt_model: object exposing ``word_exist`` and ``get_word_vector``.
    dict: sequence of entries where ``entry[0]`` is the source word and
      ``entry[1]`` the target word.

  Returns:
    ``(src_vec, tgt_vec)``: two arrays with one row per retained pair; row
    ``i`` of each belongs to the same dictionary entry. The column count is
    taken from the first retained source vector (0 if nothing is retained).
  """
  print('Loading Vectors')
  matched = []
  for pair in dict:
    if src_model.word_exist(pair[0]) and tgt_model.word_exist(pair[1]):
      matched.append(pair)
    elif len(pair) == 2:
      print('{}  {}'.format(pair[0] , pair[1]))
    else:
      print(pair)

  dim = src_model.get_word_vector(matched[0][0]).shape[0] if matched else 0
  found = len(matched)
  src_vec = np.zeros((found, dim))
  tgt_vec = np.zeros((found, dim))
  for row, pair in enumerate(matched):
    src_vec[row, :] = src_model.get_word_vector(pair[0])
    tgt_vec[row, :] = tgt_model.get_word_vector(pair[1])

  print('Total {} processed, {} word found'.format(len(dict), found))
  return src_vec, tgt_vec


def batch_convert(support_vectors_src, support_vectors_dst ,  queries, k = 5):
  """Convert each query vector into the destination space via its nearest supports.

  For every row of ``queries`` the ``k`` nearest rows of
  ``support_vectors_src`` (by cosine distance) are found, and the result is
  the softmax-weighted average of the corresponding rows of
  ``support_vectors_dst``.

  Args:
    support_vectors_src: 2-D array of support vectors in the query space.
    support_vectors_dst: 2-D array whose row ``i`` is the counterpart of
      ``support_vectors_src[i]``.
    queries: 2-D array with one query vector per row.
    k: number of neighbours to average over (default 5).

  Returns:
    Array of shape ``(queries.shape[0], support_vectors_dst.shape[1])``.
  """
  ret = np.zeros((queries.shape[0] , support_vectors_dst.shape[1]))
  for i in range(queries.shape[0]):
    k_n, k_n_d = find_k_neighbhor(support_vectors_src , queries[i , :], k)
    # k_n_d holds cosine *distances* (smaller = closer). Negate them so the
    # softmax gives the largest weight to the nearest neighbour, not the farthest.
    weights = softmax(-k_n_d)
    ret[i, :] = np.matmul(weights, support_vectors_dst[k_n , :])

  return ret

def calculate_mean_dict_distance(src_vec, tgt_vec):
  """Summarise how far apart two row-aligned vector sets are.

  Returns:
    ``(mean_cosine, scaled_norm)``: the average cosine distance between
    matching rows, and the norm of ``src_vec - tgt_vec`` (as computed by
    ``np.linalg.norm`` on the whole difference matrix) divided by the
    number of rows.
  """
  row_count = src_vec.shape[0]
  cosine_total = 0
  for row_idx, src_row in enumerate(src_vec):
    cosine_total += distance.cosine(src_row, tgt_vec[row_idx, :])
  difference_norm = np.linalg.norm(src_vec - tgt_vec)
  mean_cosine = cosine_total / row_count
  return mean_cosine, difference_norm / row_count

def procrustes(A, B):
    """
    Solve the Orthogonal Procrustes problem for row-aligned matrices A and B.

    The returned orthogonal matrix is built from the SVD of ``B^T A`` as
    ``U V^T``.
    https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem
    """
    cross = B.transpose().dot(A)
    # Only the singular vectors are needed; the singular values are discarded.
    left, _, right_t = np.linalg.svd(cross, full_matrices=True)
    return left.dot(right_t)

# Load two dictionary files: the 0-5000 one supplies the support pairs below;
# the *_full variables are not used in the visible cells.
lst, dic_map , dic_rev_map = load_dictionary('../data/dictionaries/en-fa.0-5000.txt')
lst_full, dic_map_full , dic_rev_map_full = load_dictionary('../data/crosslingual/dictionaries/en-fa.txt')

# Source and destination models used for the rest of the notebook.
model_src = model_static_en
model_dst = model_static_fa
# Row-aligned vectors for the dictionary pairs present in both models.
support_vecs_src, support_vecs_dst = load_vectors(model_src, model_dst ,  lst)  
# Destination vectors are passed as A and source vectors as B.
W = procrustes(support_vecs_dst , support_vecs_src) 

# Both wrappers sit on top of the destination model: one applies W, the other
# converts through the support vectors (destination supports given first).
model_proc = ProcrustesModel(model_dst , W)
model_local = LocalConverted(model_dst , support_vecs_dst , support_vecs_src) 


In [None]:
import os
import io 
from scipy.stats import spearmanr


def get_word_pairs(path, lower=True):
    """
    Parse a word similarity file into (word1, word2, score) tuples.

    Lines are split on whitespace and optionally lower-cased first. Lines
    that do not have exactly three fields are treated as phrases and
    skipped; this is only tolerated (via assertions) for SEMEVAL17 and
    EN-IT_MWS353 files.
    """
    assert os.path.isfile(path) and type(lower) is bool
    pairs = []
    with io.open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.rstrip()
            if lower:
                text = text.lower()
            fields = text.split()
            if len(fields) == 3:
                first, second, score = fields
                pairs.append((first, second, float(score)))
                continue
            # Anything else must be a multi-word phrase from a known dataset.
            assert len(fields) > 3
            assert 'SEMEVAL17' in os.path.basename(path) or 'EN-IT_MWS353' in path
    return pairs

  

def get_spearman_rho(model_src , model_dst, path):
  """
  Score a pair of models on a word similarity file.

  For every (word1, word2, gold) triple in the file where ``model_src`` has
  word1 and ``model_dst`` has word2, the cosine similarity of the two
  vectors is compared against the gold score.

  Returns a tuple: (Spearman correlation between gold and predicted scores,
  number of pairs scored, number of pairs skipped because a word was missing).
  """
  gold_scores = []
  predicted_scores = []
  missing = 0
  for src_word, dst_word, gold in get_word_pairs(path):
      if not (model_src.word_exist(src_word) and model_dst.word_exist(dst_word)):
          missing += 1
          continue
      src_vector = model_src.get_word_vector(src_word)
      dst_vector = model_dst.get_word_vector(dst_word)
      norm_product = np.linalg.norm(src_vector) * np.linalg.norm(dst_vector)
      gold_scores.append(gold)
      predicted_scores.append(src_vector.dot(dst_vector) / norm_product)
  rho = spearmanr(gold_scores, predicted_scores).correlation
  return rho, len(gold_scores), missing
# Report the similarity benchmark for the raw destination model and for each
# of the two converted variants, always against the same source model.
for report_template, candidate_model in (
    ('Similarity Score - Base!: {} ', model_dst),
    ('Similarity Score - Proc: {} ', model_proc),
    ('Similarity Score - Local: {} ', model_local),
):
    print(report_template.format(get_spearman_rho(model_src , candidate_model , '../data/dictionaries/en-fa-SEMEVAL17.txt')))
