In [1]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import matplotlib

import numpy as np
import torch

GLOVE_PATH = '../dataset/GloVe/glove.840B.300d.txt'

# Load model

In [2]:
model = torch.load('infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# On CPU, setting the right number of threads with "torch.set_num_threads(k)" may improve performance

In [3]:
model.set_glove_path(GLOVE_PATH)

In [4]:
model.build_vocab_k_words(K=100000)

Vocab size : 100000


# Load sentences

In [5]:
# Load sentences
sentences = []
indi_sent_rev = []
with open('text_0_AutomotiveProd7.txt') as f:
    i = 1
    rev_num = -1
    for line in f:
        if i == 1:
            rev_num += 1
            l = line.strip()
            indi_sent = l.split('.')
            indi_sent = [a.strip() for a in indi_sent]
            indi_sent = list(filter(None, indi_sent))
            for k in range(len(indi_sent)):
                indi_sent_rev.append(rev_num)
            sentences.extend(indi_sent)
            i *= -1
        else:
            i *= -1
print(len(sentences))

382


In [6]:
sentences[:5]

['Much needed tool for anyone that has a truck',
 'Fits in any location and allows you to store equipment securely',
 'I have a 2007 Tacoma that I broke the tailgate on, so as a temporary repair I got a tailgate net and one of these bars to help prevent bed flex and also for when there was stuff in the bed',
 'Ratchet mechanism took all of 30 seconds to figure out',
 'Works fine install took maybe 1 minute, and it stays in place']

# Encode sentences

In [7]:
embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 5656/5982 (94.55 %)
Speed : 47.38 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 382


In [8]:
print(embeddings.shape)

(382, 4096)


# Visualization

In [9]:
np.linalg.norm(model.encode([sentences[0]]))

3.7739689

In [10]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [11]:
model.encode([sentences[0]])[0]

array([ 0.04275879,  0.04914635,  0.05915082, ..., -0.00428127,
       -0.03567129,  0.05815831], dtype=float32)

In [12]:
cosine(model.encode([sentences[0]])[0], model.encode([sentences[3]])[0])

0.52351725

In [13]:
sentences[0]

'Much needed tool for anyone that has a truck'

In [14]:
sentences[3]

'Ratchet mechanism took all of 30 seconds to figure out'

In [17]:
f = open('infersent_results.txt', 'w')
for i in range(len(sentences)):
    cosine_for_one = []
    for j in range(len(sentences)):
        if i != j:
            cosine_for_one.append([cosine(model.encode([sentences[i]])[0], model.encode([sentences[j]])[0]), i, j])
    cosine_for_one = sorted(cosine_for_one, reverse=True)
    #print(cosine_for_one)
    top_similar = cosine_for_one[:5]
    f.write('Similar to review '+str(i+1)+' - '+sentences[i]+" :\n")
    for ind, k in enumerate(top_similar):
        f.write(str(ind+1) + '. ' + str(k[0]) + ' ' + sentences[k[2]]+'\n')
    f.write('\n')
    print('Review '+str(i)+' done.')
f.close()

Review 0 done.
Review 1 done.
Review 2 done.


KeyboardInterrupt: 

In [15]:
f = open('infersent_results_5.txt', 'w')
for i in range(5):
    cosine_for_one = []
    for j in range(len(sentences)):
        if i != j:
            cosine_for_one.append([cosine(model.encode([sentences[i]])[0], model.encode([sentences[j]])[0]), i, j])
    cosine_for_one = sorted(cosine_for_one, reverse=True)
    #print(cosine_for_one)
    top_similar = cosine_for_one[:5]
    f.write('Similar to review '+str(i+1)+' - '+sentences[i]+" :\n")
    for ind, k in enumerate(top_similar):
        f.write(str(ind+1) + '. ' + str(k[0]) + ' ' + sentences[k[2]]+'\n')
    f.write('\n')
    print('Review '+str(i)+' done.')
f.close()

Review 0 done.
Review 1 done.
Review 2 done.
Review 3 done.
Review 4 done.


In [20]:
docLabel = []
for i, j in enumerate(indi_sent_rev):
    docLabel.append('sent'+str(i)+'r'+str(j))

In [21]:
print(docLabel[:10])

['sent0r0', 'sent1r0', 'sent2r1', 'sent3r1', 'sent4r1', 'sent5r1', 'sent6r1', 'sent7r2', 'sent8r2', 'sent9r2']


In [22]:
import pandas as pd
from sklearn.cluster import KMeans
## K-means ##
num_clusters = 20
km = KMeans(n_clusters=num_clusters)
km.fit(embeddings)
clusters = km.labels_.tolist()

In [23]:
## Print Sentence Clusters ##
cluster_info = {'sentence': sentences, 'cluster' : clusters}
sentenceDF = pd.DataFrame(cluster_info, index=[clusters], columns = ['sentence','cluster'])

In [26]:
for num in range(num_clusters):
    print("Sentence cluster %d: " %int(num+1), end='')
    print()
    #print(sentenceDF.ix[num]['sentence'])
    flag = 0
    i = 1
    for sentence in sentenceDF.ix[num]['sentence']:
        print(str(i)+': '+sentence)
        i += 1
    print()

Sentence cluster 1: 
1: I have a 2007 Tacoma that I broke the tailgate on, so as a temporary repair I got a tailgate net and one of these bars to help prevent bed flex and also for when there was stuff in the bed
2: Just a wee bit tricky to learn to operate at first, blame instructions on one word, instead of saying "rotate" should say "pull over" for the lever, engage last black notch (so easy to miss), then adjust length to short or long quickly
3: This was an early Fathers Day gift for my husband, and he couldn't be happier! Easy to use, love the ratchet instead of a twist and lock,held the load in the truck without shifting when we went camping (literally 40 miles of bad road), good pads on ends so truck bed is protected
4: So far, no "pinches", either; perhaps I was forewarned by the misadventures of others, but I'm nevertheless cautious!  It grips tightly, and doesn't slip; it packs up to a small, manageable size; what's not to like?  BTW, I'm using this to hold up the folded rea

In [9]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
sc = SpectralClustering(20, affinity='nearest_neighbors', n_init=1000)
sc.fit(embeddings)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[18  4  2 14 14  4 13  9  4  4 15 11 11  0 10 17  4  0  4  4  4  3  9  4  1
 11  4 17  0  4  4  0  0  7 11  4 13 16  1 13  6 19 12 17  4 11  7  5  5 12
  4 13  4  8  2  3  9 18  0  3  5 17 18 12  3 17  1  0  3  0  2 11  2  9 14
  2 19 10  4 19  2 14  4  2  5  4  5  4  4 18 13  3  2 10  7 12 16  8  4 17
  7 10 10  4  7 16 13 14 15 13  9  6  4  4  4  4  0 10 10 15  8 18 13  8  6
 15  0 14  3  3 13  4  0  9 15  0 14 14  5  4 19  4  0  4  4  4  0  4 16 18
  4  3  4 13  4  3 10  4 17 10  4  1  0  2  4 18 13  4  0  6  3  4  4 18  3
  8 16  1  3  3  8  7 19 19  3  3  0  4 12 12 12  5 16 17  4 11 14  4 17  6
 16 12  1 14 14  4 19 13  4 19 14  2  0  4 16 13  4  0  4 18  2  0 18  3  4
 12 19  4 19  4  7  4 13  5  5 10 13  4 18 11  9  4 16  4  4  3  6  7  3  4
 15 13 11  9  4 19  0  9 12 15  7  8  4  4  6 16 16 18  1  4  9  6 18 12  3
  8 13  4  2 17  4  1  4 15  4  7 11 14  4  4 13  4  4 10 19  4  0  0  4 18
  4 11  0  7 17  4 19  8  3 15 18 18  8  1  2  4  4  4 18  4  2  0  

In [10]:
labels = sc.labels_.flatten()
print(labels.shape)
labels = list(labels)
for i in set(sc.labels_):
    print(str(i)+': '+str(labels.count(i)))

(382,)
0: 29
1: 10
2: 18
3: 22
4: 94
5: 10
6: 10
7: 13
8: 12
9: 13
10: 11
11: 16
12: 13
13: 23
14: 18
15: 10
16: 12
17: 12
18: 21
19: 15


In [11]:
print(sc.labels_[:5])

[18  4  2 14 14]


In [12]:
for i in range(40):
    print('Cluster '+str(i)+':')
    cnt = 1
    for k, l in enumerate(sentences):
        if sc.labels_[k] == i:
            print(str(cnt)+': '+l)
            cnt += 1
    print()

Cluster 0:
1: It is no more hazardous to fingers than any of the many tools that have pinching hazards
2: You have to make sure you don't ratchet this thing down too hard, if you do, you will regret that last click
3: Just a wee bit tricky to learn to operate at first, blame instructions on one word, instead of saying "rotate" should say "pull over" for the lever, engage last black notch (so easy to miss), then adjust length to short or long quickly
4: Of course once you know how to use it, does not mean others can, makes it difficult to quickly steal compare to others I've seen on friend's trucks
5: When not in use, it fits behind my seat in the regular cab (unfortunately always stuffed) without stealing any space
6: The product is cheaply made and would not stay locked in place on the desired adjustment
7: I received the order promptly but the ratcheting device did not function as well as I had expected
8: I just received 2 ratcheting cargo bars and I am just as quickly returning the

In [13]:
# Load reviews
reviews = []
with open('text_0_AutomotiveProd7.txt') as f:
    i = 1
    rev_num = -1
    for line in f:
        if i == 1:
            rev_num += 1
            l = line.strip()
            reviews.append(l)
            i *= -1
        else:
            i *= -1
print(len(reviews))

101


In [18]:
rev_cluster = []
for i, rev in enumerate(reviews):
    rev_specific_cluster = []
    for j, sent in enumerate(sentences):
        if sent in rev:
            rev_specific_cluster.append(sc.labels_[j])
    rev_cluster.append([i, list(set(rev_specific_cluster))])

In [19]:
print(rev_cluster[1])

[1, [1, 2, 4, 13, 14]]


### OfficeProd13

In [20]:
# Load sentences
sentences = []
indi_sent_rev = []
with open('text_0_OfficeProd13.txt') as f:
    i = 1
    rev_num = -1
    for line in f:
        if i == 1:
            rev_num += 1
            l = line.strip()
            indi_sent = l.split('.')
            indi_sent = [a.strip() for a in indi_sent]
            indi_sent = list(filter(None, indi_sent))
            for k in range(len(indi_sent)):
                indi_sent_rev.append(rev_num)
            sentences.extend(indi_sent)
            i *= -1
        else:
            i *= -1
print(len(sentences))

401


In [21]:
embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 5944/6476 (91.79 %)
Speed : 16.34 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 401


In [22]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
sc = SpectralClustering(20, affinity='nearest_neighbors', n_init=1000)
sc.fit(embeddings)

SpectralClustering(affinity='nearest_neighbors', assign_labels='kmeans',
          coef0=1, degree=3, eigen_solver=None, eigen_tol=0.0, gamma=1.0,
          kernel_params=None, n_clusters=20, n_init=1000, n_jobs=1,
          n_neighbors=10, random_state=None)

In [23]:
labels = sc.labels_.flatten()
print(labels.shape)
labels = list(labels)
for i in set(sc.labels_):
    print(str(i)+': '+str(labels.count(i)))

(401,)
0: 16
1: 111
2: 18
3: 11
4: 10
5: 9
6: 8
7: 8
8: 16
9: 9
10: 14
11: 14
12: 24
13: 16
14: 18
15: 15
16: 36
17: 18
18: 17
19: 13


In [24]:
for i in range(40):
    print('Cluster '+str(i)+':')
    cnt = 1
    for k, l in enumerate(sentences):
        if sc.labels_[k] == i:
            print(str(cnt)+': '+l)
            cnt += 1
    print()

Cluster 0:
1: spent a whole 10 bucks :) and I just threw these bastards arround the house on every drawer on every corner, everywhere
2: This box of 60 I bought, arrived and I counted an exact 44 pens in it, not sure where the remaining 16 went to
3: I have used at least 20 of them so far & have yet to come across one that doesn't work well
4: This package, as an 'add-on' item, cost under 10 cents per pen so I ordered two and took them in
5: Your basic pen in a high enough quantity that you don't mind if a few dozen go missing
6: I have to buy a bunch of these for the office since they keep disappearing
7: I tend to lose a lot of pens before they run out of ink, so can't speak to how long they'll last, but I'm sure i'll be buying these again once i lose all 60 again
8: I have found that even I have trouble loosing these things if I buy them a dozen at a time
9: 60 pens in one box for a low price! I haven't used all the pens so far, but what I've used and given away worked without probl