In [1]:
import numpy as np 
from Feat2Vec import Feat2Vec 
from PLSR import PLSR
from utils import * 

path = 'data/cslb_feature_matrix.csv'

print('Building feat2vec')
model = Feat2Vec(path = path)

Using TensorFlow backend.


Building feat2vec


In [2]:
train_concepts = model.concepts[:500]
test_concepts = model.concepts[500:]

In [3]:
print('Training feat2vec')
model.train(till_convergence = True, verbose = 0, tolerence = 1e-4, lr = 1e-4, negative_samples = 20, train_words = train_concepts)
print('')

Training feat2vec
Epoch: 99 delta: 0.00022754846373182133


In [4]:
# test for word dog
print('Example features learned for word: dog')
print(model.topFeatures('dog', top = 10))
print('')

Example features learned for word: dog
[['0.4874358124663263' 'has_four_legs']
 ['0.44311942656813064' 'has_fur_hair']
 ['0.4205652070482799' 'is_a_mammal']
 ['0.3976072042453551' 'has_a_tail']
 ['0.394490119159952' 'is_an_animal']
 ['0.3909242482739385' 'has_teeth']
 ['0.3858567289090658' 'has_ears']
 ['0.3753072610448306' 'is_a_pet']
 ['0.358741453867346' 'is_cute']
 ['0.3520630614506177' 'has_legs']]



In [5]:
# build baseline model (50 and 200)
print('Building partial least squared regression (baseline)')
plsr50 = PLSR(path = path)
plsr50.train(train_words = train_concepts, embedding_size = 50)

plsr200 = PLSR(path = path)
plsr200.train(train_words = train_concepts, embedding_size = 200)
print('')

Building partial least squared regression (baseline)







In [6]:
concept_dict_plsr50 = {}
for index, concept in enumerate(model.test_words):
    concept_dict_plsr50[concept] = plsr50.test_preds[index,:]

In [7]:
concept_dict_plsr200 = {}
for index, concept in enumerate(model.test_words):
    concept_dict_plsr200[concept] = plsr200.test_preds[index,:]

In [8]:
index = 20
print('Word:', model.test_words[index])
topNeighbours(concept_dict_plsr200[model.test_words[index]], model, top = 20, gsf = True)

Word: soup


array([['0.3873581898541869', 'mussel'],
       ['0.36057663412140073', 'oyster'],
       ['0.3464333145292914', 'artichoke'],
       ['0.3384501723863631', 'potato'],
       ['0.33828696850627504', 'bread'],
       ['0.3240404729491807', 'scallop'],
       ['0.3209531530680716', 'crab'],
       ['0.3095331335847945', 'turnip'],
       ['0.30716307589266034', 'mushroom'],
       ['0.30648663769143736', 'lobster'],
       ['0.29445732253386725', 'clam'],
       ['0.2931320377490968', 'cauliflower'],
       ['0.2910596319406533', 'soup'],
       ['0.2889041361992336', 'sweet_potato'],
       ['0.2845413996357119', 'sandwich'],
       ['0.28090072204818783', 'leek'],
       ['0.2776382784723095', 'yoghurt'],
       ['0.27731861025662274', 'ice_cream'],
       ['0.27288945280112664', 'crayfish'],
       ['0.270410578487398', 'squid']], dtype='<U19')

In [9]:
print('PLSR 50 neighbour scores')
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_plsr50, plsr50, top = n, gsf = True))

PLSR 50 neighbour scores
Top 1 5.072463768115942
Top 5 28.26086956521739
Top 10 42.028985507246375
Top 20 57.971014492753625


In [10]:
print('PLSR 200 neighbour scores')
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_plsr200, plsr200, top = n, gsf = True))

PLSR 200 neighbour scores
Top 1 5.072463768115942
Top 5 44.20289855072464
Top 10 62.31884057971014
Top 20 72.46376811594203


In [11]:
def constructVector(concept, model):
    new_vector = np.zeros(model.embedding_matrix.shape[0])
    for feature in [s[0] for s in model.concept_features[concept]]:
        new_vector += model.fvector(feature)
        
    return  new_vector / len(model.concept_features[concept])

In [12]:
concept_dict_f2v = {}
for index, concept in enumerate(model.test_words):
    concept_dict_f2v[concept] = constructVector(concept, model)

In [13]:
print('Word:', model.test_words[index])
topNeighbours(concept_dict_f2v[model.test_words[index]], model, top = 10)

Word: zebra


array([['0.4895793200423988', 'rhino'],
       ['0.4806879407382196', 'cheetah'],
       ['0.4616600598729683', 'hyena'],
       ['0.46081675426969737', 'lion'],
       ['0.44726759624116963', 'hippo'],
       ['0.43865938973578267', 'giraffe'],
       ['0.42842788139834437', 'elephant'],
       ['0.4127932940479721', 'deer'],
       ['0.3954507614515001', 'sheep'],
       ['0.38925437011254155', 'wolf']], dtype='<U19')

In [14]:
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_f2v, model, top = n, gsf = False))

Top 1 8.695652173913043
Top 5 40.57971014492754
Top 10 63.76811594202898
Top 20 76.81159420289855


In [15]:
print('PLSR 50 Scores')
print('Train:', np.mean(plsr50.feature_score(type = 'train'))*100)
print('Test:', np.mean(plsr50.feature_score(type = 'test'))*100)

PLSR 50 Scores
Train: 50.7850831106458
Test: 39.248256304666405


In [16]:
print('PLSR 200 Scores')
print('Train:', np.mean(plsr200.feature_score(type = 'train'))*100)
print('Test:', np.mean(plsr200.feature_score(type = 'test'))*100)

PLSR 200 Scores
Train: 77.40137030053522
Test: 40.206743136930065


In [17]:
print('Feat2Vec Scores')
print('Train:', np.mean(model.feature_score(type = 'train'))*100)
print('Test:', np.mean(model.feature_score(type = 'test'))*100)

Feat2Vec Scores
Train: 69.97012440436802
Test: 42.32375056906637


In [19]:
plsr50_embs = dict(zip(plsr50.concepts, plsr50.regressor.predict(plsr50.embedding_matrix.T)))
plsr200_embs = dict(zip(plsr200.concepts, plsr200.regressor.predict(plsr200.embedding_matrix.T)))

f2v_embs = {}
for index, concept in enumerate(model.concepts):
    f2v_embs[concept] = constructVector(concept, model)

In [20]:
saveEmbs(plsr50_embs, name = 'PLSR50_CSLB')
saveEmbs(plsr200_embs, name = 'PLSR200_CSLB')
saveEmbs(f2v_embs, name = 'F2V_CSLB')

In [21]:
import vecto
import vecto.embeddings 

plsr50_vsm = vecto.embeddings.load_from_dir('embeddings/PLSR50_CSLB')
plsr200_vsm = vecto.embeddings.load_from_dir('embeddings/PLSR200_CSLB')
f2v_vsm = vecto.embeddings.load_from_dir('embeddings/F2V_CSLB')

In [22]:
vectoBenchmark(plsr50_vsm)

simlex999 - (0.3752137482642991, 48)
men - (0.7530178312843285, 104)


In [23]:
vectoBenchmark(plsr200_vsm)

simlex999 - (0.5085362259461739, 48)
men - (0.7784559996754286, 104)


In [24]:
vectoBenchmark(f2v_vsm)

simlex999 - (0.44735771114671424, 48)
men - (0.7061614191188123, 104)


In [5]:
import vecto
import vecto.embeddings 

mcrae = vecto.embeddings.load_from_dir('embeddings/MCRAE')
cslb = vecto.embeddings.load_from_dir('embeddings/CSLB')

In [6]:
vectoBenchmark(mcrae)

simlex999 - (0.5725154311271989, 44)
men - (0.6760324599062513, 94)


In [7]:
vectoBenchmark(cslb)

simlex999 - (0.5130050024627594, 48)
men - (0.7544790806157509, 104)


In [49]:
words = []
with open('data/benchmarks/benchmarks/similarity/en/simlex999.txt', 'r') as f:
    for line in f:
        words.append(line.split()[0])
        words.append(line.split()[1])
        
# with open('data/benchmarks/benchmarks/similarity/en/men.txt', 'r') as f:
#     for line in f:
#         words.append(line.split()[0])
#         words.append(line.split()[1])

In [33]:
words = list(set(words))

In [34]:
len(words)

1577

In [35]:
import spacy 

nlp = spacy.load('en_core_web_lg')

spacy_embs = {}
for word in words:
    token = nlp(u'' + word)
    if token.has_vector:
        spacy_embs[word] = token.vector

In [39]:
saveEmbs(spacy_embs, name = 'SPACY')

In [40]:
import vecto
import vecto.embeddings 

spacy_vsm = vecto.embeddings.load_from_dir('embeddings/SPACY')

In [41]:
vectoBenchmark(spacy_vsm)

simlex999 - (0.389916600639166, 999)
men - (0.8079343906476782, 3000)


In [56]:
import pandas as pd 

mcrae_thing = pd.read_csv('data/mcrae_feature_matrix.csv', index_col=[0])
cslb_thing = pd.read_csv('data/cslb_feature_matrix.csv', index_col=[0])

In [57]:
words = []
for w in mcrae_thing['Vectors'].values:
    words.append(w)
    
for w in cslb_thing['Vectors'].values:
    words.append(w)

In [58]:
words = list(set(words))

In [61]:
counts = 0
with open('data/benchmarks/benchmarks/similarity/en/simlex999.txt', 'r') as f:
    for line in f:
        w1 = line.split()[0]
        w2 = line.split()[1]
        if w1 in words and w2 in words:
            counts += 1

In [62]:
counts

66

In [2]:
model.data_matrix.shape

(638, 2725)

In [3]:
model.concept2id['dog']

193

In [4]:
model.concept2id['cat']

113

In [5]:
np.sum(model.data_matrix[model.concept2id['dog'],:] - model.data_matrix[model.concept2id['cat'],:])

-68.0

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

mat = cosine_similarity([model.data_matrix[model.concept2id['dog'],:]] , model.data_matrix)

np.flip([(model.id2concept[num], mat[0,num]) for num in np.argsort(mat[0,:])[-10:]])

array([['0.9999999999999998', 'dog'],
       ['0.6073745169687377', 'cat'],
       ['0.5718375094655525', 'gerbil'],
       ['0.5636246379114933', 'guinea_pig'],
       ['0.5483845449592721', 'hamster'],
       ['0.5386415417273727', 'raccoon'],
       ['0.505346770544747', 'ox'],
       ['0.4928220864774071', 'rabbit'],
       ['0.483938929472962', 'chipmunk'],
       ['0.4761029120481616', 'llama']], dtype='<U18')