In [1]:
import numpy as np 
from Feat2Vec import Feat2Vec 
from PLSR import PLSR
from utils import * 

path = 'data/mcrae_feature_matrix.csv'
#path = 'data/cslb_feature_matrix.csv'

print('Building feat2vec')
model = Feat2Vec(path = path)

Using TensorFlow backend.


Building feat2vec


In [2]:
train_concepts = model.concepts[:400]
test_concepts = model.concepts[400:]

In [3]:
print('Training feat2vec')
model.train(till_convergence = True, verbose = 0, tolerence = 1e-4, lr = 1e-4, negative_samples = 20, train_words = train_concepts)
print('')

Training feat2vec
Epoch: 102 delta: 0.00024992341359439743


In [4]:
# test for word dog
print('Example features learned for word: dog')
print(model.topFeatures('dog', top = 10))
print('')

Example features learned for word: dog
[['0.43974062751589965' 'has_a_tail']
 ['0.4313424028298511' 'has_4_legs']
 ['0.4255573513706064' 'has_legs']
 ['0.41896888970903545' 'has_fur']
 ['0.41756476194926023' 'beh_-_eats']
 ['0.41220311616586325' 'an_animal']
 ['0.41099413218793607' 'a_mammal']
 ['0.3883812948003694' 'is_domestic']
 ['0.3870957216402644' 'a_pet']
 ['0.3750088358910616' 'has_hair']]



In [5]:
# build baseline model (50 and 200)
print('Building partial least squared regression (baseline)')
plsr50 = PLSR(path = path)
plsr50.train(train_words = train_concepts, embedding_size = 50)

plsr200 = PLSR(path = path)
plsr200.train(train_words = train_concepts, embedding_size = 200)
print('')

Building partial least squared regression (baseline)







In [6]:
concept_dict_plsr50 = {}
for index, concept in enumerate(model.test_words):
    concept_dict_plsr50[concept] = plsr50.test_preds[index,:]

In [7]:
concept_dict_plsr200 = {}
for index, concept in enumerate(model.test_words):
    concept_dict_plsr200[concept] = plsr200.test_preds[index,:]

In [8]:
index = 20
print('Word:', model.test_words[index])
topNeighbours(concept_dict_plsr200[model.test_words[index]], model, top = 20, gsf = True)

Word: shack


array([['0.32552525026173945', 'hut'],
       ['0.28971284875553865', 'cottage'],
       ['0.28189199556072553', 'cabin'],
       ['0.2710194796273678', 'cabinet'],
       ['0.26037179950357425', 'barrel'],
       ['0.24784760730668834', 'radio'],
       ['0.24717833364476072', 'crab'],
       ['0.23524295332035286', 'shack'],
       ['0.23450088081534548', 'house'],
       ['0.22868859610869371', 'shelves'],
       ['0.2269287790183703', 'board_(wood)'],
       ['0.22277244458902098', 'lobster'],
       ['0.2219940364897932', 'peg'],
       ['0.22149595799837699', 'stick'],
       ['0.21311575832971333', 'shed'],
       ['0.20724803437914505', 'asparagus'],
       ['0.2066278406857381', 'bungalow'],
       ['0.20659232338857692', 'caribou'],
       ['0.20542780295451799', 'table'],
       ['0.2052819044452731', 'buffalo']], dtype='<U19')

In [9]:
print('PLSR 50 neighbour scores')
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_plsr50, plsr50, top = n, gsf = True))

PLSR 50 neighbour scores
Top 1 4.25531914893617
Top 5 22.69503546099291
Top 10 35.46099290780142
Top 20 55.319148936170215


In [10]:
print('PLSR 200 neighbour scores')
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_plsr200, plsr50, top = n, gsf = True))

PLSR 200 neighbour scores
Top 1 5.673758865248227
Top 5 30.49645390070922
Top 10 42.5531914893617
Top 20 58.156028368794324


In [11]:
def constructVector(concept, model):
    new_vector = np.zeros(model.embedding_matrix.shape[0])
    for feature in [s[0] for s in model.concept_features[concept]]:
        new_vector += model.fvector(feature)
        
    return  new_vector / len(model.concept_features[concept])

In [12]:
concept_dict_f2v = {}
for index, concept in enumerate(model.test_words):
    concept_dict_f2v[concept] = constructVector(concept, model)

In [13]:
print('Word:', model.test_words[index])
topNeighbours(concept_dict_f2v[model.test_words[index]], model, top = 10)

Word: zucchini


array([['0.5881385403656619', 'cauliflower'],
       ['0.5762640634588051', 'broccoli'],
       ['0.5744984663637345', 'spinach'],
       ['0.5657014943110387', 'onions'],
       ['0.5406739140526505', 'zucchini'],
       ['0.5358089526698154', 'asparagus'],
       ['0.5351314578622572', 'celery'],
       ['0.5347323991825633', 'eggplant'],
       ['0.528428868415839', 'beets'],
       ['0.5144452262394842', 'garlic']], dtype='<U18')

In [14]:
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_f2v, model, top = n, gsf = False))

Top 1 7.801418439716312
Top 5 40.42553191489361
Top 10 52.4822695035461
Top 20 67.37588652482269


In [15]:
print('PLSR 50 Scores')
print('Train:', np.mean(plsr50.feature_score(type = 'train'))*100)
print('Test:', np.mean(plsr50.feature_score(type = 'test'))*100)

PLSR 50 Scores
Train: 49.77094877301921
Test: 33.239104227195746


In [16]:
print('PLSR 200 Scores')
print('Train:', np.mean(plsr200.feature_score(type = 'train'))*100)
print('Test:', np.mean(plsr200.feature_score(type = 'test'))*100)

PLSR 200 Scores
Train: 83.03836548340645
Test: 32.33900161230663


In [17]:
print('Feat2Vec Scores')
print('Train:', np.mean(model.feature_score(type = 'train'))*100)
print('Test:', np.mean(model.feature_score(type = 'test'))*100)

Feat2Vec Scores
Train: 70.15785016693198
Test: 34.53470193788469


In [18]:
plsr50_embs = dict(zip(plsr50.concepts, plsr50.regressor.predict(plsr50.embedding_matrix.T)))
plsr200_embs = dict(zip(plsr200.concepts, plsr200.regressor.predict(plsr200.embedding_matrix.T)))

In [19]:
f2v_embs = {}
for index, concept in enumerate(model.concepts):
    f2v_embs[concept] = constructVector(concept, model)

In [20]:
saveEmbs(plsr50_embs, name = 'PLSR50_MCRAE')
saveEmbs(plsr200_embs, name = 'PLSR200_MCRAE')
saveEmbs(f2v_embs, name = 'F2V_MCRAE')

In [21]:
import vecto
import vecto.embeddings 

plsr50_vsm = vecto.embeddings.load_from_dir('embeddings/PLSR50_MCRAE')
plsr200_vsm = vecto.embeddings.load_from_dir('embeddings/PLSR200_MCRAE')
f2v_vsm = vecto.embeddings.load_from_dir('embeddings/F2V_MCRAE')

In [22]:
vectoBenchmark(plsr50_vsm)

simlex999 - (0.46063852394335336, 44)
men - (0.7081742681841207, 94)


In [23]:
vectoBenchmark(plsr200_vsm)

simlex999 - (0.5446472633161313, 44)
men - (0.6996770736911174, 94)


In [24]:
vectoBenchmark(f2v_vsm)

simlex999 - (0.5587426893853894, 44)
men - (0.6643865978393246, 94)
