In [1]:
import numpy as np 
from Feat2Vec import Feat2Vec 
from PLSR import PLSR
from utils import * 
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"  

SEED = 42
np.random.seed(seed = SEED)

path = 'data/mcrae_feature_matrix.csv'

print('Building feat2vec')
model = Feat2Vec(path = path)

Using TensorFlow backend.


Building feat2vec


In [2]:
shuffle = np.random.permutation(len(model.concepts))
train_concepts = list(np.asarray(model.concepts)[shuffle][:500])
test_concepts = list(np.asarray(model.concepts)[shuffle][500:])

In [3]:
print('Training feat2vec')
model.train(verbose = 0, lr = 1e-3, epochs = 2, negative_samples = 20, train_words = train_concepts)
print('')

Training feat2vec
Epoch: 1 Loss: 0.3580756779236183


In [4]:
import copy 

mat = copy.deepcopy(model.data_matrix)

In [5]:
mat[mat > 0] = 1

In [9]:
mat.shape

(541, 2526)

In [25]:
np.sort([len(features) for concept in model.concepts for features in model.concept_features[concept]])

array([2, 2, 2, ..., 2, 2, 2])

In [19]:
for concept in model.concepts:
    features = model.concept_features[concept]
    if len(features) ==7:
        print(concept)

bayonet
bucket
cork
peg
pipe_(smoking)
spade
urn
yam


In [20]:
sorted(model.concept_features['cork'], key=lambda tup: tup[1], reverse = True)[:100]

[('used_for_sealing_wine_bottles', 16.0),
 ('made_of_wood', 9.0),
 ('used_for_sealing_bottles', 9.0),
 ('used_for_sealing', 7.0),
 ('used_for_message_boards', 7.0),
 ('is_brown', 6.0),
 ('comes_from_trees', 6.0)]

In [45]:
sorted([(concept, len(model.concept_features[concept])) for concept in model.concepts], 
       key=lambda tup: tup[1], reverse = True)

[('alligator', 32),
 ('tiger', 30),
 ('fox', 28),
 ('rabbit', 28),
 ('elephant', 26),
 ('lion', 26),
 ('calf', 25),
 ('frog', 25),
 ('pig', 25),
 ('washing_machine', 25),
 ('zebra', 25),
 ('lamb', 24),
 ('leopard', 24),
 ('monkey', 24),
 ('rhubarb', 24),
 ('wolf', 24),
 ('bee', 23),
 ('cow', 23),
 ('flip_flops', 23),
 ('hippo', 23),
 ('horse', 23),
 ('lobster', 23),
 ('milk', 23),
 ('potato', 23),
 ('sunflower', 23),
 ('swan', 23),
 ('tea', 23),
 ('television', 23),
 ('ambulance', 22),
 ('beer', 22),
 ('cat', 22),
 ('coffee', 22),
 ('eagle', 22),
 ('eye', 22),
 ('giraffe', 22),
 ('necklace', 22),
 ('owl', 22),
 ('shoes', 22),
 ('toad', 22),
 ('bacon', 21),
 ('book', 21),
 ('champagne', 21),
 ('chicken', 21),
 ('crocodile', 21),
 ('dolphin', 21),
 ('gorilla', 21),
 ('hamster', 21),
 ('harpoon', 21),
 ('heart', 21),
 ('jelly', 21),
 ('leg', 21),
 ('limousine', 21),
 ('mouse', 21),
 ('newspaper', 21),
 ('panther', 21),
 ('peacock', 21),
 ('pineapple', 21),
 ('pumpkin', 21),
 ('rhino', 21)

In [20]:
len(model.concept_features['oyster'])

15

In [6]:
# test for word dog
print('Example features learned for word: dog')
print(model.topFeatures('dog', top = 10))
print('')

Example features learned for word: dog
[['0.33736581472385563' 'is_an_animal']
 ['0.3164451337336031' 'is_a_mammal']
 ['0.31100116690861757' 'has_fur_hair']
 ['0.30825791426327265' 'has_teeth']
 ['0.27861600946408627' 'has_a_tail']
 ['0.27575490961174454' 'has_four_legs']
 ['0.2639988336137963' 'is_a_pet']
 ['0.2613066615694819' 'is_cute']
 ['0.25842316452853537' 'is_big_large']
 ['0.2535060842885607' 'does_eat_grass']]



In [9]:
model.concept_features['dog']

[('an_animal', 16.0),
 ('has_a_tail', 10.0),
 ('has_fur', 14.0),
 ('a_mammal', 6.0),
 ('has_4_legs', 18.0),
 ('has_legs', 18.0),
 ('a_pet', 12.0),
 ('has_hair', 7.0),
 ('a_carnivore', 5.0),
 ('is_domestic', 5.0),
 ('beh_-_barks', 21.0),
 ("is_man's_best_friend", 10.0),
 ('used_for_protection', 7.0),
 ('beh_-_chases', 6.0),
 ('beh_-_chases_cats', 6.0),
 ('has_a_nose', 5.0),
 ('has_a_wet_nose', 5.0),
 ('is_friendly', 5.0)]

In [10]:
# test for word dog
print('Example features learned for word: dog')
print(model.topFeatures('dog', top = 10))
print('')

Example features learned for word: dog
[['0.3334511562690621' 'has_a_tail']
 ['0.3114374763546076' 'has_4_legs']
 ['0.2778455114490192' 'is_domestic']
 ['0.27641124222178143' 'an_animal']
 ['0.2704217432051071' 'has_fur']
 ['0.26297182952729786' 'is_black']
 ['0.260977222385825' 'a_pet']
 ['0.2561419740389714' 'has_whiskers']
 ['0.2445012246793494' 'a_carnivore']
 ['0.23323119223134064' 'beh_-_is_nocturnal']]



In [11]:
# build baseline model (50 and 200)
print('Building partial least squared regression (50)')
plsr50 = PLSR(path = path)
plsr50.train(train_words = train_concepts, embedding_size = 50)

print('Building partial least squared regression (200)')
plsr200 = PLSR(path = path)
plsr200.train(train_words = train_concepts, embedding_size = 200)
print('')

Building partial least squared regression (50)
Building partial least squared regression (200)



In [12]:
concept_dict_plsr50 = {}
for index, concept in enumerate(model.test_words):
    concept_dict_plsr50[concept] = plsr50.test_preds[index,:]

24.390243902439025

In [11]:
concept_dict_plsr200 = {}
for index, concept in enumerate(model.test_words):
    concept_dict_plsr200[concept] = plsr200.test_preds[index,:]

In [8]:
index = 20
print('Word:', model.test_words[index])
#topNeighbours(concept_dict_plsr200[model.test_words[index]], model, top = 20)

Word: box


NameError: name 'topNeighbours' is not defined

In [12]:
print('PLSR 50 neighbour scores')
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_plsr50, plsr50, top = n))

PLSR 50 neighbour scores
Top 1 2.4390243902439024
Top 5 17.073170731707318
Top 10 24.390243902439025
Top 20 48.78048780487805


In [16]:
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, np.mean([int(concept in plsr50.topNeighbours(concept_dict_plsr50[concept], n)) 
                             for concept in test_concepts]) *100)

Top 1 2.4390243902439024
Top 5 17.073170731707318
Top 10 24.390243902439025
Top 20 48.78048780487805


In [22]:
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, len([concept for concept in test_concepts if concept in 
      plsr50.topNeighbours(concept_dict_plsr50[concept], n)]) / len(test_concepts) * 100)

Top 1 2.4390243902439024
Top 5 17.073170731707318
Top 10 24.390243902439025
Top 20 48.78048780487805


In [13]:
print('PLSR 200 neighbour scores')
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_plsr200, plsr200, top = n))

PLSR 200 neighbour scores
Top 1 4.878048780487805
Top 5 26.82926829268293
Top 10 41.46341463414634
Top 20 58.536585365853654


In [14]:
def constructVector(concept, model):
    new_vector = np.zeros(model.embedding_matrix.shape[0])
    for feature in [s[0] for s in model.concept_features[concept]]:
        new_vector += model.fvector(feature)
        
    return  new_vector / len(model.concept_features[concept])

In [15]:
concept_dict_f2v = {}
for index, concept in enumerate(model.test_words):
    concept_dict_f2v[concept] = constructVector(concept, model)

In [13]:
print('Word:', model.test_words[index])
topNeighbours(concept_dict_f2v[model.test_words[index]], model, top = 10)

Word: zebra


NameError: name 'topNeighbours' is not defined

In [16]:
tops = [1, 5, 10, 20]
for n in tops:
    print('Top', n, neighbourScore(concept_dict_f2v, model, top = n))

Top 1 9.75609756097561
Top 5 36.58536585365854
Top 10 51.21951219512195
Top 20 63.41463414634146


In [17]:
print('PLSR 50 Scores')
print('Train:', np.mean(plsr50.feature_score(type = 'train'))*100)
print('Test:', np.mean(plsr50.feature_score(type = 'test'))*100)

PLSR 50 Scores
Train: 47.76861821857716
Test: 29.939537754232315


In [18]:
print('PLSR 200 Scores')
print('Train:', np.mean(plsr200.feature_score(type = 'train'))*100)
print('Test:', np.mean(plsr200.feature_score(type = 'test'))*100)

PLSR 200 Scores
Train: 76.56106416425739
Test: 32.28709723159621


In [19]:
print('Feat2Vec Scores')
print('Train:', np.mean(model.feature_score(type = 'train'))*100)
print('Test:', np.mean(model.feature_score(type = 'test'))*100)

Feat2Vec Scores
Train: 75.18043197622886
Test: 33.163801827813


In [49]:
scores = plsr50.feature_score(type = 'test')

In [53]:
features = [x[0] for x in plsr50.concept_features[concept]]

In [55]:
print(concept)
print(features)

wheel
['made_of_metal', 'is_round', 'made_of_rubber', 'used_on_bicycles', 'inbeh_-_rolls', 'used_on_cars', 'used_for_mobility', 'is_invented', 'has_spokes', 'has_rims']


In [63]:
plsr50._topFeatures(plsr50.test_preds[36,:], top = len(features))

array([['11.44363242042159', 'made_of_metal'],
       ['8.110211952645932', 'a_tool'],
       ['6.459163401058991', 'has_a_handle'],
       ['6.214866354010754', 'a_bird'],
       ['6.210007458594687', 'is_sharp'],
       ['5.774026203701385', 'a_weapon'],
       ['4.784106737100295', 'a_fruit'],
       ['4.630671424875498', 'made_of_wood'],
       ['3.992531625494567', 'used_for_killing'],
       ['3.7598023379565606', 'is_dangerous']], dtype='<U18')

In [59]:
for index, c in enumerate(test_concepts):
    if c == concept:
        print(index)

36


In [61]:
[x[1] for x in plsr50._topFeatures(plsr50.test_preds[36,:], top = len(features))]

['made_of_metal',
 'a_tool',
 'has_a_handle',
 'a_bird',
 'is_sharp',
 'a_weapon',
 'a_fruit',
 'made_of_wood',
 'used_for_killing',
 'is_dangerous']

In [None]:
def feature_score(model, type = 'test'):
    '''
            Scores model on its ability to retrieve correct features
    '''
    if type == 'train':
        concepts = self.train_words
    if type == 'test':
        concepts = self.test_words

    total_scores = []
    for concept in concepts:
        features = [x[0] for x in self.concept_features[concept]]
        num_features = len(features)

        predicted_features = [x[1] for x in self.topFeatures(concept, top = num_features)]

        # scores = [int(features[num] == predicted_features[num]) for num in range(num_features)]
        positives = [f for f in predicted_features if f in features]

        # total_scores.append(f1_score(np.ones(num_features), scores))
        total_scores.append(len(positives) / num_features)

    return total_scores

In [19]:
plsr50_embs = dict(zip(plsr50.concepts, plsr50.regressor.predict(plsr50.embedding_matrix.T)))
plsr200_embs = dict(zip(plsr200.concepts, plsr200.regressor.predict(plsr200.embedding_matrix.T)))

f2v_embs = {}
for index, concept in enumerate(model.concepts):
    f2v_embs[concept] = constructVector(concept, model)

In [20]:
saveEmbs(plsr50_embs, name = 'PLSR50_CSLB')
saveEmbs(plsr200_embs, name = 'PLSR200_CSLB')
saveEmbs(f2v_embs, name = 'F2V_CSLB')

In [21]:
import vecto
import vecto.embeddings 

plsr50_vsm = vecto.embeddings.load_from_dir('embeddings/PLSR50_CSLB')
plsr200_vsm = vecto.embeddings.load_from_dir('embeddings/PLSR200_CSLB')
f2v_vsm = vecto.embeddings.load_from_dir('embeddings/F2V_CSLB')

In [22]:
vectoBenchmark(plsr50_vsm)

simlex999 - (0.3752137482642991, 48)
men - (0.7530178312843285, 104)


In [23]:
vectoBenchmark(plsr200_vsm)

simlex999 - (0.5085362259461739, 48)
men - (0.7784559996754286, 104)


In [24]:
vectoBenchmark(f2v_vsm)

simlex999 - (0.44735771114671424, 48)
men - (0.7061614191188123, 104)


In [5]:
import vecto
import vecto.embeddings 

mcrae = vecto.embeddings.load_from_dir('embeddings/MCRAE')
cslb = vecto.embeddings.load_from_dir('embeddings/CSLB')

In [6]:
vectoBenchmark(mcrae)

simlex999 - (0.5725154311271989, 44)
men - (0.6760324599062513, 94)


In [7]:
vectoBenchmark(cslb)

simlex999 - (0.5130050024627594, 48)
men - (0.7544790806157509, 104)


In [49]:
words = []
with open('data/benchmarks/benchmarks/similarity/en/simlex999.txt', 'r') as f:
    for line in f:
        words.append(line.split()[0])
        words.append(line.split()[1])
        
# with open('data/benchmarks/benchmarks/similarity/en/men.txt', 'r') as f:
#     for line in f:
#         words.append(line.split()[0])
#         words.append(line.split()[1])

In [33]:
words = list(set(words))

In [34]:
len(words)

1577

In [35]:
import spacy 

nlp = spacy.load('en_core_web_lg')

spacy_embs = {}
for word in words:
    token = nlp(u'' + word)
    if token.has_vector:
        spacy_embs[word] = token.vector

In [39]:
saveEmbs(spacy_embs, name = 'SPACY')

In [40]:
import vecto
import vecto.embeddings 

spacy_vsm = vecto.embeddings.load_from_dir('embeddings/SPACY')

In [41]:
vectoBenchmark(spacy_vsm)

simlex999 - (0.389916600639166, 999)
men - (0.8079343906476782, 3000)


In [56]:
import pandas as pd 

mcrae_thing = pd.read_csv('data/mcrae_feature_matrix.csv', index_col=[0])
cslb_thing = pd.read_csv('data/cslb_feature_matrix.csv', index_col=[0])

In [57]:
words = []
for w in mcrae_thing['Vectors'].values:
    words.append(w)
    
for w in cslb_thing['Vectors'].values:
    words.append(w)

In [58]:
words = list(set(words))

In [61]:
counts = 0
with open('data/benchmarks/benchmarks/similarity/en/simlex999.txt', 'r') as f:
    for line in f:
        w1 = line.split()[0]
        w2 = line.split()[1]
        if w1 in words and w2 in words:
            counts += 1

In [62]:
counts

66

In [2]:
model.data_matrix.shape

(638, 2725)

In [3]:
model.concept2id['dog']

193

In [4]:
model.concept2id['cat']

113

In [5]:
np.sum(model.data_matrix[model.concept2id['dog'],:] - model.data_matrix[model.concept2id['cat'],:])

-68.0

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

mat = cosine_similarity([model.data_matrix[model.concept2id['dog'],:]] , model.data_matrix)

np.flip([(model.id2concept[num], mat[0,num]) for num in np.argsort(mat[0,:])[-10:]])

array([['0.9999999999999998', 'dog'],
       ['0.6073745169687377', 'cat'],
       ['0.5718375094655525', 'gerbil'],
       ['0.5636246379114933', 'guinea_pig'],
       ['0.5483845449592721', 'hamster'],
       ['0.5386415417273727', 'raccoon'],
       ['0.505346770544747', 'ox'],
       ['0.4928220864774071', 'rabbit'],
       ['0.483938929472962', 'chipmunk'],
       ['0.4761029120481616', 'llama']], dtype='<U18')