In [1]:
import numpy as np 
from Feat2Vec import Feat2Vec 
from PLSR import PLSR
from utils import * 
import os
# Intended to hide every CUDA device from this process so TensorFlow runs on CPU.
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

# Seed NumPy's global RNG; the train/test shuffle in the next cell draws from it.
SEED = 42
np.random.seed(SEED)

# Feature-norm matrix to load. Alternative dataset: 'data/cslb_feature_matrix.csv'
path = 'data/mcrae_feature_matrix.csv'

print('Building feat2vec')
model = Feat2Vec(path=path)

Using TensorFlow backend.


Building feat2vec


In [2]:
# Random permutation of concept positions: the first 400 shuffled concepts
# form the training set, everything after them the test set.
shuffle = np.random.permutation(len(model.concepts))
train_concepts = list(np.asarray(model.concepts)[shuffle[:400]])
test_concepts = list(np.asarray(model.concepts)[shuffle[400:]])

In [4]:
import tensorflow as tf 

# Report whether TensorFlow can see a GPU (the recorded run printed False).
print(tf.test.is_gpu_available())

False


In [5]:
# Fit Feat2Vec using only the training concepts.
# NOTE(review): model.test_words is used by later cells; presumably train()
# derives it from the concepts not listed in train_words — confirm in Feat2Vec.
print('Training feat2vec')
model.train(verbose = 0, epochs = 200, lr = 0.001, negative_samples = 20, train_words = train_concepts)
print('')

Training feat2vec
Epoch: 199 Loss: 0.0017884020326719284


In [6]:
# Sanity check: print the 10 top features the trained model returns for "dog"
# (the recorded output lists score/feature pairs in descending score order).
print('Example features learned for word: dog')
print(model.topFeatures('dog', top = 10))
print('')

Example features learned for word: dog
[['0.26643879377080965' 'is_domestic']
 ['0.2635950565122341' 'has_a_wet_nose']
 ['0.25031002594992974' 'beh_-_chases_cats']
 ['0.24618182238421044' 'beh_-_chases']
 ['0.23003760398155676' 'has_fur']
 ['0.2299577929414023' 'has_a_nose']
 ['0.2222580441846434' 'beh_-_barks']
 ['0.21511611470728875' "is_man's_best_friend"]
 ['0.21296843460324924' 'has_a_tail']
 ['0.20549251559938664' 'a_pet']]



In [7]:
# Build the PLSR baselines on the same data file and the same training concepts
# as Feat2Vec, once with embedding_size 50 and once with embedding_size 200.
print('Building partial least squared regression (baseline)')
plsr50 = PLSR(path = path)
plsr50.train(train_words = train_concepts, embedding_size = 50)

plsr200 = PLSR(path = path)
plsr200.train(train_words = train_concepts, embedding_size = 200)
print('')

Building partial least squared regression (baseline)



In [8]:
# Map every test concept to the PLSR-50 prediction row at the same position.
# Assumes plsr50.test_preds rows follow the order of model.test_words — TODO confirm.
# Built with a comprehension, so no loop variables stay in the notebook namespace.
concept_dict_plsr50 = {
    concept: plsr50.test_preds[position, :]
    for position, concept in enumerate(model.test_words)
}

In [9]:
# Map every test concept to the PLSR-200 prediction row at the same position.
# Assumes plsr200.test_preds rows follow the order of model.test_words — TODO confirm.
# Built with a comprehension, so no loop variables stay in the notebook namespace.
concept_dict_plsr200 = {
    concept: plsr200.test_preds[position, :]
    for position, concept in enumerate(model.test_words)
}

In [8]:
# Look at the neighbours of one PLSR-200 test prediction (test word at position 20).
# NOTE(review): the recorded run raised NameError — `topNeighbours` is not defined
# by anything visible in this notebook; confirm where it should be imported from.
# NOTE(review): a PLSR prediction is passed together with the Feat2Vec `model`;
# verify that this is the object topNeighbours is meant to search.
index = 20
print('Word:', model.test_words[index])
topNeighbours(concept_dict_plsr200[model.test_words[index]], model, top = 20)

Word: building


NameError: name 'topNeighbours' is not defined

In [10]:
# Print neighbourScore for the PLSR-50 test predictions at several `top` cut-offs.
print('PLSR 50 neighbour scores')
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_plsr50, plsr50, top = n))

PLSR 50 neighbour scores
Top 1 3.546099290780142
Top 5 14.184397163120568
Top 10 29.78723404255319
Top 20 47.5177304964539


In [11]:
# Print neighbourScore for the PLSR-200 test predictions at several `top` cut-offs.
# The predictions are scored against their own model object (plsr200), mirroring
# how the PLSR-50 and Feat2Vec cells pair each dictionary with its model.
# Re-run this cell to refresh the recorded output.
print('PLSR 200 neighbour scores')
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_plsr200, plsr200, top = n))

PLSR 200 neighbour scores
Top 1 2.8368794326241136
Top 5 29.78723404255319
Top 10 45.39007092198582
Top 20 58.86524822695035


In [12]:
def constructVector(concept, model):
    """Return the average feature vector of ``concept``.

    Each entry of ``model.concept_features[concept]`` contributes the vector
    ``model.fvector(entry[0])``; the vectors are summed into a zero vector of
    length ``model.embedding_matrix.shape[0]`` and the sum is divided by the
    number of entries.

    Parameters:
        concept: key into ``model.concept_features``.
        model: object providing ``embedding_matrix``, ``concept_features``
            and ``fvector``.

    Returns:
        NumPy array holding the element-wise mean of the feature vectors.

    Note:
        A concept with no feature entries leads to a division by zero.
    """
    accumulator = np.zeros(model.embedding_matrix.shape[0])
    entries = model.concept_features[concept]
    # Only the first element of each entry (the feature identifier) is used.
    feature_names = [entry[0] for entry in entries]
    for name in feature_names:
        accumulator += model.fvector(name)
    return accumulator / len(entries)

In [13]:
# Map every test concept to its averaged Feat2Vec feature vector (constructVector).
# Note: `index` and `concept` stay defined in the notebook namespace after the
# loop ends, holding the values from the last iteration.
concept_dict_f2v = {}
for index, concept in enumerate(model.test_words):
    concept_dict_f2v[concept] = constructVector(concept, model)

In [14]:
# Look at the neighbours of the Feat2Vec vector built for the last test word.
# The position is set explicitly so the cell does not depend on whichever loop
# last assigned `index` (in a top-to-bottom run this is the same value).
# NOTE(review): the recorded run raised NameError — `topNeighbours` is not defined
# by anything visible in this notebook; confirm where it should be imported from.
index = len(model.test_words) - 1
print('Word:', model.test_words[index])
topNeighbours(concept_dict_f2v[model.test_words[index]], model, top = 10)

Word: worm


NameError: name 'topNeighbours' is not defined

In [15]:
# Print neighbourScore for the Feat2Vec test vectors at several `top` cut-offs.
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_f2v, model, top = n))

Top 1 4.25531914893617
Top 5 32.62411347517731
Top 10 43.97163120567376
Top 20 60.99290780141844


In [16]:
# Mean PLSR-50 feature score on each split, scaled by 100 for display.
print('PLSR 50 Scores')
print('Train:', 100 * np.mean(plsr50.feature_score(type='train')))
print('Test:', 100 * np.mean(plsr50.feature_score(type='test')))

PLSR 50 Scores
Train: 49.523889981520554
Test: 31.67454974804622


In [17]:
# Mean PLSR-200 feature score on each split, scaled by 100 for display.
print('PLSR 200 Scores')
print('Train:', 100 * np.mean(plsr200.feature_score(type='train')))
print('Test:', 100 * np.mean(plsr200.feature_score(type='test')))

PLSR 200 Scores
Train: 83.27890187522203
Test: 30.361290408718116


In [18]:
# Mean Feat2Vec feature score on each split, scaled by 100 for display.
print('Feat2Vec Scores')
print('Train:', 100 * np.mean(model.feature_score(type='train')))
print('Test:', 100 * np.mean(model.feature_score(type='test')))

Feat2Vec Scores
Train: 91.12825373408393
Test: 34.38880626944655


In [18]:
# Pair each concept with the regressor output at the same position, where the
# regressor is applied to the transposed embedding matrix of the same model.
# Assumes the output rows are ordered like `.concepts` — TODO confirm.
plsr50_embs = {
    concept: vector
    for concept, vector in zip(
        plsr50.concepts,
        plsr50.regressor.predict(plsr50.embedding_matrix.T),
    )
}
plsr200_embs = {
    concept: vector
    for concept, vector in zip(
        plsr200.concepts,
        plsr200.regressor.predict(plsr200.embedding_matrix.T),
    )
}

In [19]:
# Averaged Feat2Vec feature vector for every concept known to the model.
# Built with a comprehension, so no loop variables stay in the notebook namespace.
f2v_embs = {
    concept: constructVector(concept, model)
    for concept in model.concepts
}

In [20]:
# Persist the three embedding dictionaries under distinct names.
# NOTE(review): presumably written below 'embeddings/<name>', since the loading
# cell reads from those directories — confirm in saveEmbs.
saveEmbs(plsr50_embs, name = 'PLSR50_MCRAE')
saveEmbs(plsr200_embs, name = 'PLSR200_MCRAE')
saveEmbs(f2v_embs, name = 'F2V_MCRAE')

In [21]:
import vecto
import vecto.embeddings 

# Load each saved embedding directory through vecto for benchmarking.
plsr50_vsm = vecto.embeddings.load_from_dir('embeddings/PLSR50_MCRAE')
plsr200_vsm = vecto.embeddings.load_from_dir('embeddings/PLSR200_MCRAE')
f2v_vsm = vecto.embeddings.load_from_dir('embeddings/F2V_MCRAE')

In [22]:
# Benchmark the PLSR-50 embeddings (the recorded run reports simlex999 and men).
vectoBenchmark(plsr50_vsm)

simlex999 - (0.46063852394335336, 44)
men - (0.7081742681841207, 94)


In [23]:
# Benchmark the PLSR-200 embeddings (the recorded run reports simlex999 and men).
vectoBenchmark(plsr200_vsm)

simlex999 - (0.5446472633161313, 44)
men - (0.6996770736911174, 94)


In [24]:
# Benchmark the Feat2Vec embeddings (the recorded run reports simlex999 and men).
vectoBenchmark(f2v_vsm)

simlex999 - (0.5587426893853894, 44)
men - (0.6643865978393246, 94)
