In [1]:
import numpy as np 
from Feature2Vec import Feature2Vec 
from PLSR import PLSR
from utils import * 
import os
# Hide every GPU from CUDA so the whole notebook runs on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Fixed seed for NumPy's global random number generator.
SEED = 42
np.random.seed(SEED)

# Feature matrix used throughout the notebook.
# To run on the CSLB matrix instead, point `path` at
# 'data/cslb_feature_matrix.csv'.
path = 'data/mcrae_feature_matrix.csv'

# Instantiate the Feature2Vec model from the feature matrix at `path`.
# The class imported at the top of the notebook is `Feature2Vec`; the
# previous spelling `FeatureVec` is not defined by that import and raised
# NameError on a fresh kernel.
print('Building feature2vec')
model = Feature2Vec(path = path)

Using TensorFlow backend.


Building feat2vec


In [2]:
# Restore the saved NumPy RNG state needed to reproduce the experiments
# from the paper.
import pickle as pkl 

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only use a state_zero.pkl obtained from a trusted source.
with open('state_zero.pkl', 'rb') as state_file:
    st0 = pkl.load(state_file)

np.random.set_state(st0)

In [3]:
# Split the concepts at random: the first N_TRAIN entries of the shuffled
# order form the training set, everything after that is held out for testing.
N_TRAIN = 400
shuffle = np.random.permutation(len(model.concepts))
shuffled_concepts = np.asarray(model.concepts)[shuffle]
train_concepts = list(shuffled_concepts[:N_TRAIN])
test_concepts = list(shuffled_concepts[N_TRAIN:])

In [5]:
# Fit the model on the training concepts only.
print('Training feature2vec')
model.train(
    train_words = train_concepts,
    epochs = 20,
    lr = 5e-3,
    negative_samples = 20,
    verbose = 1,
)
print('')

Training feat2vec
Epoch: 19 Loss: 0.0026860087393787446


In [6]:
# Sanity check: show the ten top-ranked features for one example word.
word = 'dog'
print('Example features learned for word:', word)
word_vector = model.wvector(word)
print(model.top_features(word_vector, top = 10))
print('')

Example features learned for word: dog
[['0.2758738349159628' 'has_a_wet_nose']
 ['0.2730467093512676' 'beh_-_barks']
 ['0.2724924595664283' 'has_4_legs']
 ['0.2700655491388194' 'is_domestic']
 ['0.2508572465465824' 'beh_-_chases']
 ['0.24673958423106437' 'has_fur']
 ['0.24578991111563422' 'beh_-_chases_cats']
 ['0.2441416081749012' 'has_a_tail']
 ['0.24304865630294026' 'a_pet']
 ['0.24184137316070192' 'a_carnivore']]



In [7]:
# Build the PLSR baseline models with embedding sizes 50 and 120.
# Both are restricted to the same training concepts as the Feature2Vec model.
# (PLSR is already imported in the first cell, so it is not re-imported here.)
print('Building partial least squared regression (50)')
plsr50 = PLSR(path = path)
plsr50.set_vocabulary(train_concepts)
plsr50.train(embedding_size = 50)

print('Building partial least squared regression (120)')
plsr120 = PLSR(path = path)
plsr120.set_vocabulary(train_concepts)
plsr120.train(embedding_size = 120)
print('')

Building partial least squared regression (50)
Building partial least squared regression (120)



In [8]:
# Map each PLSR-50 test word to the matching row of its test predictions.
concept_dict_plsr50 = {
    test_word: plsr50.test_preds[row, :]
    for row, test_word in enumerate(plsr50.test_words)
}

In [9]:
# Map each PLSR-120 test word to the matching row of its test predictions.
concept_dict_plsr120 = {
    test_word: plsr120.test_preds[row, :]
    for row, test_word in enumerate(plsr120.test_words)
}

In [10]:
# Report neighbour_score for the PLSR-50 predictions at each `top` cutoff.
print('PLSR 50 neighbour scores')
tops = [1, 5, 10, 20]
for top_n in tops:
    score = neighbour_score(concept_dict_plsr50, plsr50, top = top_n)
    print('Top', top_n, score)

PLSR 50 neighbour scores
Top 1 3.546099290780142
Top 5 14.184397163120568
Top 10 29.78723404255319
Top 20 47.5177304964539


In [11]:
# Report neighbour_score for the PLSR-120 predictions at each `top` cutoff.
print('PLSR 120 neighbour scores')
tops = [1, 5, 10, 20]
for top_n in tops:
    score = neighbour_score(concept_dict_plsr120, plsr120, top = top_n)
    print('Top', top_n, score)

PLSR 120 neighbour scores
Top 1 3.546099290780142
Top 5 25.53191489361702
Top 10 43.262411347517734
Top 20 53.90070921985816


In [12]:
# Map each Feature2Vec test word to the vector construct_vector builds for it.
concept_dict_f2v = {
    test_word: construct_vector(test_word, model)
    for test_word in model.test_words
}

In [13]:
# Report neighbour_score for the Feature2Vec vectors at each `top` cutoff
# (same cutoffs as the PLSR baselines).
tops = [1, 5, 10, 20]
for top_n in tops:
    score = neighbour_score(concept_dict_f2v, model, top = top_n)
    print('Top', top_n, score)

Top 1 4.25531914893617
Top 5 32.62411347517731
Top 10 47.5177304964539
Top 20 63.829787234042556


In [14]:
# Mean feature_score (scaled by 100) for PLSR-50 on the train and test splits.
print('PLSR 50 Scores')
for label, split in (('Train:', 'train'), ('Test:', 'test')):
    split_scores = feature_score(plsr50, data_type = split, max_features = 0)
    print(label, np.mean(split_scores)*100)

PLSR 50 Scores
Train: 49.523889981520554
Test: 31.67454974804622


In [15]:
# Mean feature_score (scaled by 100) for PLSR-120 on the train and test splits.
print('PLSR 120 Scores')
for label, split in (('Train:', 'train'), ('Test:', 'test')):
    split_scores = feature_score(plsr120, data_type = split, max_features = 0)
    print(label, np.mean(split_scores)*100)

PLSR 120 Scores
Train: 68.6591561040999
Test: 32.96850865151701


In [16]:
# Mean feature_score (scaled by 100) for Feature2Vec on the train and test splits.
print('Feature2Vec Scores')
for label, split in (('Train:', 'train'), ('Test:', 'test')):
    split_scores = feature_score(model, data_type = split, max_features = 0)
    print(label, np.mean(split_scores)*100)

Feat2Vec Scores
Train: 89.47868798410427
Test: 35.05511595558365
