In [1]:
from Feat2Vec import Feat2Vec 

model = Feat2Vec()

Using TensorFlow backend.


In [13]:
len(list(set([model.norms['feature'][m] for m in range(model.norms.shape[0]) if int(model.norms['pf'][m]) >= 5])))

2725

In [14]:
from collections import Counter 

counter = Counter(model.norms['feature'])

In [17]:
len([c for c in counter.keys() if counter[c] >= 5])

832

In [25]:
import numpy as np

len([x for x in np.count_nonzero(model.feature_matrix, axis = 0) if x >= 5])

391

In [26]:
list(set([x for x in np.count_nonzero(model.feature_matrix, axis = 0) if x >= 5]).intersection(set([c for c in counter.keys() if counter[c] >= 5])))

[]

In [2]:
model.train(till_convergence = True, verbose = 0, tolerence = 1e-4, lr = 1e-4, negative_samples = 5)

Epoch: 130 delta: 0.002444298367120734

In [11]:
word = 'dog'

In [49]:
model.model.history.history['loss'][-2:]

[0.1964224589969943]

In [12]:
model.topFeatures(word, top = 10)

array([['0.3183206131579732', 'is_a_mammal'],
       ['0.3170398471029976', 'has_four_legs'],
       ['0.30504247246113414', 'has_fur_hair'],
       ['0.27310649096401407', 'has_a_tail'],
       ['0.27019059815861113', 'has_legs'],
       ['0.2596700709677454', 'does_eat'],
       ['0.25310576277942576', 'is_an_animal'],
       ['0.23818764031203604', 'has_teeth'],
       ['0.23517142426915305', 'has_ears'],
       ['0.23346586653381', 'is_small']], dtype='<U19')

In [13]:
def score(word):
    features = [f[0] for f in model.concept_features[word]]
    outputs = model.topFeatures(word, top = len(features))
    return len([s[1] for s in outputs if s[1] in features]) / len(features)

In [14]:
import numpy as np

scores = []
for word in model.concepts:
    scores.append(score(word = word))
    
print(np.mean(scores))

0.5770241444095124


In [52]:
# create data matrix of X and y
train_concepts = model.concepts[:500]
test_concepts = model.concepts[500:]

X_train = np.zeros((len(model.concepts), 300))
y_train = np.zeros((len(model.concepts), len(model.features)))

for concept in model.concepts:
    X[model.concept2id[concept],:] = model.wvector(concept)

In [53]:
from sklearn.cross_decomposition import PLSRegression

reg = PLSRegression(n_components=50)

In [54]:
reg.fit(X[:500], model.data_matrix[:500,:])

PLSRegression(copy=True, max_iter=500, n_components=50, scale=True, tol=1e-06)

In [56]:
preds = reg.predict(X[500:])

In [64]:
from sklearn.metrics.pairwise import cosine_similarity

def topNeighbours(concepts, index, matrix, top = 10):
    
    mat = cosine_similarity([matrix[index]], matrix)

    return np.flip([(concepts[num], mat[0,num]) for num in np.argsort(mat[0,:])[-top:]])

In [65]:
index = 10
print(test_concepts[index])
print(topNeighbours(test_concepts, index, preds))

skateboard
[['0.9999999999999998' 'skateboard']
 ['0.7226682187911303' 'unicycle']
 ['0.7045670443230222' 'skis']
 ['0.6521901352592201' 'tricycle']
 ['0.6453897050289952' 'surfboard']
 ['0.6085433760578571' 'trolley']
 ['0.5904976316490351' 'sledge']
 ['0.586568716094972' 'tyre']
 ['0.5849189078903646' 'wheelchair']
 ['0.5792003549433515' 'tractor']]


In [66]:
index = 10
print(test_concepts[index])
print(topNeighbours(test_concepts, index, X[500:]))

skateboard
[['1.000000000000001' 'skateboard']
 ['0.5577612335158455' 'surfboard']
 ['0.4716231054506681' 'unicycle']
 ['0.46998857531686955' 'skis']
 ['0.44224858453245736' 'tricycle']
 ['0.40690815768017835' 'shoes']
 ['0.3901972310168721' 'wheelchair']
 ['0.36824323507032' 'wheel']
 ['0.36002317662324107' 'truck']
 ['0.34179090942923473' 'yoyo']]


In [71]:
def neighbourScore(concepts, vectors1, vectors2, top):
    
    scores = []
    for index, concept in enumerate(concepts):
        outputs1 = [s[1] for s in topNeighbours(concepts, index, vectors1, top)]
        outputs2 = [s[1] for s in topNeighbours(concepts, index, vectors2, top)]
        
        scores.append(len([x for x in outputs1 if x in outputs2]))
        
    return np.mean(scores)

In [76]:
print(neighbourScore(test_concepts, X[500:], preds, top = 50))

32.2463768115942


In [45]:
from sklearn.metrics.pairwise import cosine_similarity

def baseline_topFeatures(word, top = 10):
    '''
        Function that gives the top cosine similar features for a word 
    '''

    mat = cosine_similarity([reg.x_scores_[model.concept2id[word]]], reg.y_rotations_)

    return np.flip([(model.features[num], mat[0,num]) for num in np.argsort(mat[0,:])[-top:]])

In [46]:
print(baseline_topFeatures('dog'))

[['0.9388999383515564' 'is_a_companion']
 ['0.9388999383515564' 'does_bark']
 ['0.9388999383515564' "is_man's_best_friend"]
 ['0.9388999383515564' 'does_wag_its_tail']
 ['0.8062235963112354' 'does_like_milk']
 ['0.8062235963112354' 'has_nine_lives']
 ['0.8062235963112354' 'does_purr']
 ['0.8062235963112354' 'does_meow']
 ['0.7189621218632534' 'has_fur_hair']
 ['0.6842676382193786' 'has_whiskers']]


In [47]:
def baseline_score(word):
    features = [f[0] for f in model.concept_features[word]]
    outputs = baseline_topFeatures(word, top = len(features))
    return len([s[1] for s in outputs if s[1] in features]) / len(features)

In [48]:
scores = []
for word in model.concepts:
    scores.append(baseline_score(word = word))
    
print(np.mean(scores))

0.5328836027287316


In [42]:
reg.y_rotations_.shape

(2725, 50)

In [87]:
index = 15
print(test_concepts[index])
for x in np.argsort(preds[index])[-10:]:
    print(model.id2feature[x])

slippers
is_comfortable
is_long
made_of_leather
is_an_animal
has_legs
is_clothing
is_warm
made_of_plastic
made_of_fabric_cloth_material
is_worn


In [89]:
model.concept_features[model['slippers']]

TypeError: 'Feat2Vec' object is not subscriptable