In [2]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import matplotlib

import numpy as np
import torch

# Location of the GloVe word-vector text file (the 840B / 300-dimensional
# release, going by the file name). Handed to the model via set_glove_path.
GLOVE_PATH = '../dataset/GloVe/glove.840B.300d.txt'

# Load model

In [3]:
# Deserialize the pretrained sentence encoder (InferSent, per the file name).
# The map_location callback returns every storage unchanged, which keeps all
# tensors on the CPU instead of restoring them to the device they were saved from.
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code -- only load checkpoints obtained from a trusted source.
model = torch.load('infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# On CPU, setting the right number of threads with "torch.set_num_threads(k)" may improve performance

In [4]:
# Point the model at the GloVe vector file configured in GLOVE_PATH.
# Presumably the vocabulary-building step reads word vectors from it -- confirm in the model code.
model.set_glove_path(GLOVE_PATH)

In [5]:
# Build the model's word vocabulary, capped at K=100000 entries.
# NOTE(review): presumably these are the first K words of the GloVe file -- confirm in the model code.
model.build_vocab_k_words(K=100000)

Vocab size : 100000


# Load sentences

In [6]:
# Load the review texts: keep every other line of the file, starting with the
# very first one (0-based line numbers 0, 2, 4, ...), stripped of surrounding
# whitespace.
# NOTE(review): the skipped lines are presumably separators or metadata that
# alternate with the review text -- confirm against the data file.
with open('text_0_AutomotiveProd7.txt') as f:
    sentences = [line.strip() for line_no, line in enumerate(f) if line_no % 2 == 0]
print(len(sentences))

101


In [7]:
sentences[:5]

['Much needed tool for anyone that has a truck. Fits in any location and allows you to store equipment securely.',
 "I have a 2007 Tacoma that I broke the tailgate on, so as a temporary repair I got a tailgate net and one of these bars to help prevent bed flex and also for when there was stuff in the bed. Ratchet mechanism took all of 30 seconds to figure out. Works fine install took maybe 1 minute, and it stays in place. Tightened down a bit and it was secure. The only reason for 4 stars was it does seem a little bit cheap in the material it's made out of but for the price it's great and hopefully it doesn't rust.",
 "Good Cargo Bar.  Doesn't get excessively tight in my truck (thus the 4 stars).  Good construction, use it as a cargo separator under my tonneau cover.",
 'This device works well, especially on narrower truck beds. The ratchet mechanism works better than the screw or collet designs. The ratcheting lever works exactly the same as those found on cargo straps. It is no more 

# Encode sentences

In [8]:
# Encode every loaded sentence in a single call, in batches of 128, with the
# model's own tokenization switched off (tokenize=False) and progress
# statistics printed (verbose=True).
embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
# One embedding is expected per input sentence; print the count as a sanity check.
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 4758/5377 (88.49 %)
Speed : 8.69 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 101


# Sentence similarity

In [9]:
# Norm of the first sentence's embedding. encode() is given a one-element list,
# and np.linalg.norm with default arguments treats the result as a flat vector.
np.linalg.norm(model.encode([sentences[0]]))

4.7021031

In [10]:
def cosine(u, v):
    """Return the cosine similarity between two vectors.

    Args:
        u: First vector (1-D array-like of numbers).
        v: Second vector, same length as ``u``.

    Returns:
        ``dot(u, v) / (|u| * |v|)``. If either vector has zero length the
        ratio is undefined (0/0), so ``0.0`` is returned instead of NaN;
        a NaN score would also make any ranking sorted on it unreliable.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        # Zero-length vector: there is no direction to compare.
        return 0.0
    return np.dot(u, v) / norm_product

In [11]:
# Embedding vector of the first sentence: encode() takes a list of sentences,
# so wrap the sentence in a list and take element 0 of the result.
model.encode([sentences[0]])[0]

array([ 0.08670553,  0.04914635,  0.05915082, ..., -0.01867001,
       -0.00985753,  0.05564953], dtype=float32)

In [12]:
# Cosine similarity between reviews 0 and 3. Each operand re-runs the encoder
# on a single sentence with encode()'s default options.
# NOTE(review): the batch `embeddings` were produced with tokenize=False, so
# they are not necessarily identical to these per-sentence encodings -- confirm.
cosine(model.encode([sentences[0]])[0], model.encode([sentences[3]])[0])

0.81201011

In [13]:
# Show the first of the two reviews whose similarity was just computed.
sentences[0]

'Much needed tool for anyone that has a truck. Fits in any location and allows you to store equipment securely.'

In [None]:
# Show the second of the two reviews whose similarity was just computed.
sentences[3]

'This device works well, especially on narrower truck beds. The ratchet mechanism works better than the screw or collet designs. The ratcheting lever works exactly the same as those found on cargo straps. It is no more hazardous to fingers than any of the many tools that have pinching hazards. The ratchet increments are wide enough that you may need to test where along the the bed wall the bar can be most effectively tightened.'

In [29]:
# For every review, find the five most similar other reviews (by cosine
# similarity of their embeddings) and write them to 'infersent_results.txt'.

# Encode each sentence exactly once up front, using the same per-sentence call
# as before, instead of running the encoder twice per (i, j) pair inside the
# nested loop.
sentence_vectors = [model.encode([sentence])[0] for sentence in sentences]

# The context manager closes the file even if an iteration raises.
with open('infersent_results.txt', 'w') as f:
    for i in range(len(sentences)):
        # [score, i, j] triples compare on the score first, so a descending
        # sort puts the most similar review at the front.
        cosine_for_one = sorted(
            ([cosine(sentence_vectors[i], sentence_vectors[j]), i, j]
             for j in range(len(sentences)) if i != j),
            reverse=True,
        )
        top_similar = cosine_for_one[:5]
        # Review numbers in the file are 1-based; the progress message is 0-based.
        f.write('Similar to review '+str(i+1)+' - '+sentences[i]+" :\n")
        for ind, k in enumerate(top_similar):
            # k[2] is the index j of the matched review.
            f.write(str(ind+1) + '. ' + sentences[k[2]]+'\n')
        f.write('\n')
        print('Review '+str(i)+' done.')

Review 0 done.
Review 1 done.
Review 2 done.
Review 3 done.
Review 4 done.
Review 5 done.
Review 6 done.
Review 7 done.
Review 8 done.
Review 9 done.
Review 10 done.
Review 11 done.
Review 12 done.
Review 13 done.
Review 14 done.
Review 15 done.
Review 16 done.
Review 17 done.
Review 18 done.
Review 19 done.
Review 20 done.
Review 21 done.
Review 22 done.
Review 23 done.
Review 24 done.
Review 25 done.
Review 26 done.
Review 27 done.
Review 28 done.
Review 29 done.
Review 30 done.
Review 31 done.
Review 32 done.
Review 33 done.
Review 34 done.
Review 35 done.
Review 36 done.
Review 37 done.
Review 38 done.
Review 39 done.
Review 40 done.
Review 41 done.
Review 42 done.
Review 43 done.
Review 44 done.
Review 45 done.
Review 46 done.
Review 47 done.
Review 48 done.
Review 49 done.
Review 50 done.
Review 51 done.
Review 52 done.
Review 53 done.
Review 54 done.
Review 55 done.
Review 56 done.
Review 57 done.
Review 58 done.
Review 59 done.
Review 60 done.
Review 61 done.
Review 62 done.
Re

In [None]:
# For the first five reviews only, find the five most similar other reviews
# and write them, together with their cosine scores, to
# 'infersent_results_5.txt'.

# Encode each sentence exactly once up front, using the same per-sentence call
# as before, instead of running the encoder twice per (i, j) pair inside the
# nested loop.
sentence_vectors = [model.encode([sentence])[0] for sentence in sentences]

# The context manager closes the file even if an iteration raises.
with open('infersent_results_5.txt', 'w') as f:
    # Guard against fewer than five loaded sentences (range(5) would index
    # past the end of the list).
    for i in range(min(5, len(sentences))):
        # [score, i, j] triples compare on the score first, so a descending
        # sort puts the most similar review at the front.
        cosine_for_one = sorted(
            ([cosine(sentence_vectors[i], sentence_vectors[j]), i, j]
             for j in range(len(sentences)) if i != j),
            reverse=True,
        )
        top_similar = cosine_for_one[:5]
        # Review numbers in the file are 1-based; the progress message is 0-based.
        f.write('Similar to review '+str(i+1)+' - '+sentences[i]+" :\n")
        for ind, k in enumerate(top_similar):
            # k[0] is the similarity score, k[2] the index j of the matched review.
            f.write(str(ind+1) + '. ' + str(k[0]) + ' ' + sentences[k[2]]+'\n')
        f.write('\n')
        print('Review '+str(i)+' done.')

Review 0 done.
Review 1 done.
Review 2 done.
