In [1]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import matplotlib

import numpy as np
import torch

GLOVE_PATH = '../dataset/GloVe/glove.840B.300d.txt'

In [2]:
# Load the serialized model object from a local file (presumably a pre-trained
# InferSent sentence encoder, judging by the file name -- confirm).
# The map_location callback returns each storage unchanged, i.e. tensors are
# kept on the CPU instead of being moved to the device they were saved from.
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code -- only load this file if it comes from a trusted source.
model = torch.load('infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# On CPU, setting the right number of threads with "torch.set_num_threads(k)" may improve performance

In [3]:
# Hand the model the location of the GloVe vectors file (GLOVE_PATH is defined in the first cell).
model.set_glove_path(GLOVE_PATH)

In [4]:
# Build the model's vocabulary with K=100000 entries (the recorded output reports "Vocab size : 100000").
# NOTE(review): presumably these are the first/most frequent K GloVe words -- confirm against the model class.
model.build_vocab_k_words(K=100000)

Vocab size : 100000


# Load sentences

In [5]:
# Load sentences
# Read the review text file, keeping only every other line (the 1st, 3rd, 5th, ...),
# and split each kept line into sentences on '.'.
#   sentences      -- flat list of whitespace-stripped, non-empty sentences
#   indi_sent_rev  -- for every sentence, the 0-based index of the kept line it came from
# NOTE(review): the skipped lines are presumably separators or non-review text -- confirm.
sentences = []
indi_sent_rev = []
with open('text_0_OfficeProd13.txt') as f:
    for line_no, line in enumerate(f):
        if line_no % 2 == 1:
            # Every second line is ignored.
            continue
        # Kept lines are numbered consecutively: 0, 1, 2, ...
        rev_num = line_no // 2
        pieces = [piece.strip() for piece in line.strip().split('.')]
        # Drop the empty fragments produced by trailing or repeated periods.
        indi_sent = [piece for piece in pieces if piece]
        indi_sent_rev.extend([rev_num] * len(indi_sent))
        sentences.extend(indi_sent)
print(len(sentences))

401


# Encode sentences

In [6]:
# Encode all sentences with the model, 128 sentences per batch, and report
# how many were encoded.
# NOTE(review): tokenize=False presumably skips the model's own tokenizer -- confirm.
embeddings = model.encode(
    sentences,
    bsize=128,
    tokenize=False,
    verbose=True,
)
n_encoded = len(embeddings)
print('nb sentences encoded : {0}'.format(n_encoded))

Nb words kept : 5944/6476 (91.79 %)
Speed : 34.55 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 401


In [7]:
# Sanity check: one row per sentence (the recorded output is (401, 4096)).
print(embeddings.shape)

(401, 4096)


# Load ratings

In [8]:
# Read one numeric rating per line of the ratings file into `ratings`.
# The file is opened in a `with` block so the handle is closed as soon as
# reading finishes (it was previously left open for the life of the kernel).
# A line that cannot be parsed as a float still raises ValueError.
with open('rating_0_OfficeProd13.txt', 'r') as f:
    ratings = [float(l) for l in f]

In [9]:
# Peek at the first ten ratings.
print(ratings[:10])

[5.0, 5.0, 5.0, 1.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0]


# Append ratings to embeddings

In [10]:
# Peek at the review index recorded for each of the first ten sentences.
print(indi_sent_rev[:10])

[0, 0, 1, 1, 1, 1, 2, 2, 2, 3]


In [11]:
# Append each sentence's review rating as one extra trailing feature column.
# indi_sent_rev[i] is the review index of sentence i, and that index selects
# the rating in `ratings`.
# The column is built and attached in one vectorised step instead of calling
# np.append once per row (every call allocated a fresh copy of the row).
# NOTE: this overwrites `embeddings`. Re-running the cell attaches another
# rating column, so re-run the encoding cell first if you need to repeat it.
sentence_ratings = np.array([ratings[indi_sent_rev[i]] for i in range(len(sentences))])
embeddings = np.column_stack([np.asarray(embeddings), sentence_ratings])

In [12]:
# Make sure `embeddings` is a single NumPy array (its shape is printed in the next cell).
embeddings = np.array(embeddings)

In [13]:
# Sanity check: the rating adds exactly one column (the recorded output is (401, 4097)).
print(embeddings.shape)

(401, 4097)


# Agglomerative Clustering

In [14]:
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

# Agglomerative Clustering
# Cluster the rating-augmented sentence vectors into 40 groups using cosine
# distance with complete linkage, then print the label assigned to each sentence.
# scikit-learn renamed the distance keyword from `affinity` to `metric` and later
# removed the old name, so try the current spelling first and fall back to the
# old one on releases that do not accept `metric`.
try:
    ag = AgglomerativeClustering(n_clusters=40, metric='cosine', linkage='complete')
except TypeError:
    # Older scikit-learn: the constructor has no `metric` parameter.
    ag = AgglomerativeClustering(n_clusters=40, affinity='cosine', linkage='complete')
ag.fit(embeddings)
print('agglomerative clustering')
print(ag.labels_)

agglomerative clustering
[18 18 18  0 16 18 17  7 18 25  4 11 17  7 17 17 17 16 17 17 17 18 18  5  7
 17  7  1  1 17 17 17  0 17  7  7 17 17 17 17 17 17 17 17  7 18  4 34 33 39
 16  7  7  7  6 18  7 17 18 18  1 17  7 17 17 17  5  7 36  1  0  5  0  0 17
 18 18 18 18 18 12 17  0 12 17 18 17  1  7  7 18  7 17 17 17  1 17  7 17  7
  1  5 17  0  0 17 18 17 17 17 16  5 36  0 17  7  7 18  7  7  7 17  7 17  7
 17  1  7 17  7  6  0  5  7 17 17  0  0  7 18  0 17 18  7 16 17  0 12  7  5
  9 18 17 17  7 17 12 30 17 22 21 11 17  1 36 12 36 18  7  9 17 13  5  7  7
 17 17 17 18 17 18 17 12 16  0  0  7  0  0 17 17 18 18 17 17 18  0  5 17 13
 18 17 18 18 18 17 17  7 16 16  1 17 17 17  7  6  5 18  1 12 12  1  6  1 17
  1 17  5  7 35 18 16 12  1 18 17  7 18 18 18  5 17 17 16 18 18  7 17  7 17
 10 23 28  8 17 12 18 18 17  0  0  0 18  0 17 17  7 17 17 17  8 18  5  0 16
  0  1  0  1  7  7 17 17  7  7 20 19  3 31 17 16 17  7  7 17 17  1 17 17  5
 18  1  0  1  7  1 17 17  7 36  1  1 17  7  6  7  7 17  7 17 17

In [15]:
# Print the shape of the label vector, then the size of every cluster as "<cluster id>: <count>".
# np.unique counts all labels in one pass and returns the ids in ascending order,
# replacing a list.count() call per cluster and the unspecified iteration order of a set.
labels = ag.labels_.flatten()
print(labels.shape)
cluster_ids, cluster_sizes = np.unique(labels, return_counts=True)
for cluster_id, cluster_size in zip(cluster_ids, cluster_sizes):
    print(str(cluster_id)+': '+str(cluster_size))
# `labels` is left as a plain list, as before.
labels = list(labels)

(401,)
0: 32
1: 27
2: 5
3: 2
4: 2
5: 18
6: 5
7: 64
8: 3
9: 2
10: 1
11: 2
12: 12
13: 4
14: 1
15: 1
16: 15
17: 123
18: 55
19: 1
20: 1
21: 1
22: 1
23: 1
24: 1
25: 1
26: 1
27: 1
28: 1
29: 1
30: 1
31: 1
32: 1
33: 1
34: 1
35: 1
36: 7
37: 1
38: 1
39: 1


In [17]:
# Print the sentences of every cluster, numbered from 1 within each cluster.
# The number of clusters is read from the estimator's `n_clusters` setting
# rather than repeating the literal 40, so this cell stays correct if the
# clustering cell is re-run with a different cluster count.
for cluster_id in range(ag.n_clusters):
    print('Cluster '+str(cluster_id)+':')
    members = [sentences[k] for k in range(len(sentences)) if ag.labels_[k] == cluster_id]
    for cnt, sentence in enumerate(members, start=1):
        print(str(cnt)+': '+sentence)
    print()

Cluster 0:
1: spent a whole 10 bucks :) and I just threw these bastards arround the house on every drawer on every corner, everywhere
2: I have taken these overseas for storage and use in our bookstore and they always work
3: I personally don't mind  using them, I live with a few people in my house and my pens tend to get lost  quite often
4: These should last my four grandchildren a couple years
5: The grandchildren were very happy to receive them just before school started
6: This package, as an 'add-on' item, cost under 10 cents per pen so I ordered two and took them in
7: I have to buy a bunch of these for the office since they keep disappearing
8: My employees like them so I think I will continue to buy them
9: I tend to lose a lot of pens before they run out of ink, so can't speak to how long they'll last, but I'm sure i'll be buying these again once i lose all 60 again
10: I have the little twerps! And I don't even have to be mean about it like I do with the endless bathroom and

# Test for Original embeddings

In [18]:
# Encode the sentences again to obtain vectors without the rating column
# (`embeddings` was overwritten with the rating-augmented array earlier).
embeddings_orig = model.encode(
    sentences,
    bsize=128,
    tokenize=False,
    verbose=True,
)
n_encoded = len(embeddings_orig)
print('nb sentences encoded : {0}'.format(n_encoded))

Nb words kept : 5944/6476 (91.79 %)
Speed : 46.27 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 401


In [19]:
# Sanity check: no rating column here (the recorded output is (401, 4096)).
print(embeddings_orig.shape)

(401, 4096)


In [20]:
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

# Agglomerative Clustering
# Cluster the plain sentence vectors (no rating column) into 40 groups using
# cosine distance with complete linkage, then print each sentence's label.
# NOTE: this rebinds `ag`, replacing the previously fitted estimator.
# scikit-learn renamed the distance keyword from `affinity` to `metric` and later
# removed the old name, so try the current spelling first and fall back to the
# old one on releases that do not accept `metric`.
try:
    ag = AgglomerativeClustering(n_clusters=40, metric='cosine', linkage='complete')
except TypeError:
    # Older scikit-learn: the constructor has no `metric` parameter.
    ag = AgglomerativeClustering(n_clusters=40, affinity='cosine', linkage='complete')
ag.fit(embeddings_orig)
print('agglomerative clustering')
print(ag.labels_)

agglomerative clustering
[26  8 12  8 25 22 12  6 24  3 30  5  6  6  9  5  1 37 12  8  5 22  9 34  3
 13  4 12 12 12  8 12  8  8 11 11 12  6  8  8  8 12 12 12 13  8 30  8  7 10
 17 11  3 13 27 25  5 22  9 22  8 12 24  8  8  8 12 30  8 26 12 26  5 25 13
  8 26  5 25 22  6 12  8 12 12 38 12 12 26  5  5 13  7  7  7 12 12  6  8 24
 12 22  7  8  8  8 24  8 12  7 39 34  3 12 12  4 14 38 13 14  3  5  3  8 30
 12 12  4  8 30  0 12 26  3  6 12 12  8 13  8  8  8 13 33 10 12  8  8 38  7
 36  8  8  5 30  8  6 26  8  8  1  5 26 12  3 12  8  8 28 36  7 16  6  2 24
  8  8 12 25  6 22 12 12 32 12  8 13  8  9  8 13 38  8  7 12  8 12 26  8 16
 22  6 38 22  9 12  7 24 17 20 12 12  8  8 38 31 34  6 12 12 12  8 29 12  8
 12 12  1 13  7 26 21 12 12 22 12 12 26 38  8 26 22  1 23 26  8 14 12  8 22
 13 19 28  8 12  8 13 26 12  8  8  8 12  8 12 12 13  8  8  5  8 22 26  8 21
 26  9  8  7  4  3 12 12 30  6 26 21  3  8  9 37  9  4 13 12  8  8  8 12  6
 30 12 12 12 30 12  5  5 13  3  7  8 12  6  0  2  8  7  6  7  8

In [21]:
# Print the shape of the label vector, then the size of every cluster as "<cluster id>: <count>".
# np.unique counts all labels in one pass and returns the ids in ascending order,
# replacing a list.count() call per cluster and the unspecified iteration order of a set.
labels = ag.labels_.flatten()
print(labels.shape)
cluster_ids, cluster_sizes = np.unique(labels, return_counts=True)
for cluster_id, cluster_size in zip(cluster_ids, cluster_sizes):
    print(str(cluster_id)+': '+str(cluster_size))
# `labels` is left as a plain list, as before.
labels = list(labels)

(401,)
0: 2
1: 4
2: 3
3: 13
4: 5
5: 16
6: 17
7: 19
8: 93
9: 9
10: 2
11: 3
12: 88
13: 19
14: 4
15: 2
16: 2
17: 2
18: 3
19: 3
20: 1
21: 3
22: 16
23: 1
24: 8
25: 7
26: 20
27: 1
28: 4
29: 1
30: 11
31: 1
32: 1
33: 1
34: 3
35: 1
36: 2
37: 2
38: 7
39: 1


In [22]:
# Print the sentences of every cluster, numbered from 1 within each cluster.
# The number of clusters is read from the estimator's `n_clusters` setting
# rather than repeating the literal 40, so this cell stays correct if the
# clustering cell is re-run with a different cluster count.
for cluster_id in range(ag.n_clusters):
    print('Cluster '+str(cluster_id)+':')
    members = [sentences[k] for k in range(len(sentences)) if ag.labels_[k] == cluster_id]
    for cnt, sentence in enumerate(members, start=1):
        print(str(cnt)+': '+sentence)
    print()

Cluster 0:
1: "  Ha
2: Went back

Cluster 1:
1: If you're into blue ball point pens, this is where its at
2: The ink is very scant and light colored
3: These pens are blue
4: BIC round Stic black 36 pens Medium point

Cluster 2:
1: held up well
2: They didn't have them
3: they explode!

Cluster 3:
1: These do not work well, cheaply made
2: Does not bleed
3: Wrote smoothly, color good, no flaws
4: Not the smoothest pen, nor the most comfortable to hold, but if you can get this at less than a dime per pen, it's a great value
5: Forth, they do no leak
6: And finally, they are not unpleasant to write with
7: They do not leak all over the place and smudge
8: They won't win any beauty contests, but they do put ink on paper, and they do it cheaply
9: Not too fine and not too thick
10: I would not suggest buying this product
11: Not as good as the Cristal pens, but decent
12: Nothing special whe you look at them but very enjoyable
13: I have not had any "duds" as you get sometimes with generic

# Try splitting clusters

In [23]:
# Encode the sentences for the cluster-splitting experiment.
# NOTE(review): this call has the same inputs and arguments as the one that
# produced `embeddings_orig`; presumably that array could be reused -- confirm.
embeddings_split = model.encode(
    sentences,
    bsize=128,
    tokenize=False,
    verbose=True,
)
n_encoded = len(embeddings_split)
print('nb sentences encoded : {0}'.format(n_encoded))

Nb words kept : 5944/6476 (91.79 %)
Speed : 40.87 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 401


In [24]:
# Sanity check: one row per sentence (the recorded output is (401, 4096)).
print(embeddings_split.shape)

(401, 4096)


In [25]:
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

# Agglomerative Clustering
# Cluster the sentence vectors into 30 groups (cosine distance, complete
# linkage); a later cell splits each group by review rating.
# NOTE: this rebinds `ag`, replacing the previously fitted estimator.
# scikit-learn renamed the distance keyword from `affinity` to `metric` and later
# removed the old name, so try the current spelling first and fall back to the
# old one on releases that do not accept `metric`.
try:
    ag = AgglomerativeClustering(n_clusters=30, metric='cosine', linkage='complete')
except TypeError:
    # Older scikit-learn: the constructor has no `metric` parameter.
    ag = AgglomerativeClustering(n_clusters=30, affinity='cosine', linkage='complete')
ag.fit(embeddings_split)
print('agglomerative clustering')
print(ag.labels_)

agglomerative clustering
[ 4  3 12  3 25 13 12  3 24  8  0 26  3  3 11 26  2 18 12  3 26 13 11  4  8
  0 10 12 12 12  3 12  3  3  5  5 12  3  3  3  3 12 12 12  0  3  0  3  2 22
  5  5  8  0 27 25 26 13 11 13  3 12 24  3  3  3 12  0  3  4 12  4 26 25  0
  3  4 26 25 13  3 12  3 12 12 11 12 12  4 26 26  0  2  2  2 12 12  3  3 24
 12 13  2  3  3  3 24  3 12  2 19  4  8 12 12 10 14 11  0 14  8 26  8  3  0
 12 12 10  3  0  9 12  4  8  3 12 12  3  0  3  3  3  0  6 22 12  3  3 11  2
  6  3  3 26  0  3  3  4  3  3  2 26  4 12  8 12  3  3 28  6  2 16  3 23 24
  3  3 12 25  3 13 12 12  1 12  3  0  3 11  3  0 11  3  2 12  3 12  4  3 16
 13  3 11 13 11 12  2 24  5 20 12 12  3  3 11 15  4  3 12 12 12  3 29 12  3
 12 12  2  0  2  4 21 12 12 13 12 12  4 11  3  4 13  2  1  4  3 14 12  3 13
  0 13 28  3 12  3  0  4 12  3  3  3 12  3 12 12  0  3  3 26  3 13  4  3 21
  4 11  3  2 10  8 12 12  0  3  4 21  8  3 11 18 11 10  0 12  3  3  3 12  3
  0 12 12 12  0 12 26 26  0  8  2  3 12  3  9 23  3  2  3  2  3

In [26]:
# Print the shape of the label vector, then the size of every cluster as "<cluster id>: <count>".
# np.unique counts all labels in one pass and returns the ids in ascending order,
# replacing a list.count() call per cluster and the unspecified iteration order of a set.
labels = ag.labels_.flatten()
print(labels.shape)
cluster_ids, cluster_sizes = np.unique(labels, return_counts=True)
for cluster_id, cluster_size in zip(cluster_ids, cluster_sizes):
    print(str(cluster_id)+': '+str(cluster_size))
# `labels` is left as a plain list, as before.
labels = list(labels)

(401,)
0: 30
1: 2
2: 23
3: 110
4: 23
5: 5
6: 3
7: 2
8: 13
9: 2
10: 5
11: 16
12: 91
13: 19
14: 4
15: 1
16: 2
17: 1
18: 2
19: 1
20: 1
21: 3
22: 2
23: 3
24: 8
25: 7
26: 16
27: 1
28: 4
29: 1


In [27]:
# Split every cluster into a "positive" and a "negative" bucket according to
# the rating of the review each sentence came from.
# cluster_new holds two consecutive entries per cluster:
#   cluster_new[2*c]     -- indices of sentences in cluster c whose review rating is >= 3.0
#   cluster_new[2*c + 1] -- indices of the remaining sentences in cluster c
# Indices inside each bucket are in ascending order.
# The cluster count comes from the estimator's `n_clusters` setting instead of
# a repeated literal, and the sentences are bucketed in a single pass rather
# than with one full scan per cluster.
POSITIVE_RATING_MIN = 3.0  # a review rated at least this much counts as positive
cluster_new = [[] for _ in range(2 * ag.n_clusters)]
for k in range(len(sentences)):
    is_positive = ratings[indi_sent_rev[k]] >= POSITIVE_RATING_MIN
    # Even slot = positive bucket, odd slot = negative bucket of the same cluster.
    slot = 2 * int(ag.labels_[k]) + (0 if is_positive else 1)
    cluster_new[slot].append(k)

In [28]:
# Sanity check: two buckets (positive, negative) per cluster -- the recorded output is 60.
print(len(cluster_new))

60


In [29]:
# Peek at the first 30 buckets, i.e. the positive/negative pairs of the first 15 clusters.
# Each bucket is a list of sentence indices.
print(cluster_new[:30])

[[25, 44, 53, 67, 74, 91, 118, 124, 129, 138, 142, 154, 186, 190, 228, 256, 266, 283, 293, 300, 304, 308, 336, 345, 364, 395], [10, 46, 250, 385], [183, 243], [], [16, 92, 93, 94, 102, 109, 149, 170, 193, 206, 227, 229, 242, 278, 310, 317, 319, 349, 377, 394, 398], [48, 160], [1, 3, 7, 12, 13, 19, 30, 32, 33, 37, 38, 39, 40, 45, 60, 63, 64, 65, 68, 75, 80, 82, 97, 98, 103, 104, 105, 107, 123, 128, 134, 137, 139, 140, 141, 146, 147, 151, 152, 155, 156, 158, 166, 167, 172, 175, 176, 179, 185, 187, 189, 192, 195, 198, 201, 212, 213, 217, 221, 224, 239, 245, 248, 255, 259, 260, 261, 263, 267, 268, 273, 277, 284, 295, 296, 297, 299, 311, 313, 316, 318, 320, 321, 323, 327, 329, 334, 350, 351, 352, 356, 359, 368, 370, 374, 376, 400], [47, 159, 253, 270, 288, 333, 380, 382, 386, 387, 388, 396, 397], [0, 23, 69, 71, 76, 88, 111, 132, 157, 162, 197, 216, 230, 237, 240, 244, 257, 272, 275, 338, 353, 365], [285], [34, 35, 50, 51, 208], [], [143, 150, 169], [], [363, 375], [], [24, 52, 112, 120, 12

In [30]:
# Print every cluster's positive bucket followed by its negative bucket, with
# the sentences numbered from 1 inside each bucket.
# cluster_new stores the buckets as consecutive (positive, negative) pairs, so
# the number of clusters is derived from its length instead of a hard-coded 30.
for i in range(len(cluster_new) // 2):
    for offset, tag in enumerate(('pos', 'neg')):
        print('Cluster '+str(i)+'_'+tag+':')
        for cnt, j in enumerate(cluster_new[2*i+offset], start=1):
            print(str(cnt)+'. '+sentences[j])
        print()

Cluster 0_pos:
1. We are sure to reorder this product
2. Turns out that they also write well
3. I would buy again and recommend to others
4. I'm glad I found them
5. If you're like me and constantly misplace pens, this is the product for you
6. They work and does its job
7. Second, they actually write
8. I never know
9. I forgot my pen
10. Anyway, they write smoothly and get the job done
11. Be that as it may, I certainly can recommend these things
12. I couldn't ask for more!
13. Okay lets be serious now
14. Everyone has heard of the "You get what you pay for" saying before
15. They write well
16. These guys are just what I need and the work great
17. We will order again
18. Now I only want to use this one
19. would buy again and again
20. I recomend this product
21. Luckily I did
22. They write the same as always, too
23. I recommend you buy
24. I recommend them
25. They do actually write every time for me
26. I find it to be a great balance, and on top of that they are cheap!

Clust