In [1]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import matplotlib

import numpy as np
import torch

GLOVE_PATH = '../dataset/GloVe/glove.840B.300d.txt'

In [2]:
model = torch.load('infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# On CPU, setting the right number of threads with "torch.set_num_threads(k)" may improve performance

In [3]:
model.set_glove_path(GLOVE_PATH)

In [4]:
model.build_vocab_k_words(K=100000)

Vocab size : 100000


# Load sentences

In [5]:
# Load sentences
sentences = []
indi_sent_rev = []
with open('text_0_OfficeProd13.txt') as f:
    i = 1
    rev_num = -1
    for line in f:
        if i == 1:
            rev_num += 1
            l = line.strip()
            indi_sent = l.split('.')
            indi_sent = [a.strip() for a in indi_sent]
            indi_sent = list(filter(None, indi_sent))
            for k in range(len(indi_sent)):
                indi_sent_rev.append(rev_num)
            sentences.extend(indi_sent)
            i *= -1
        else:
            i *= -1
print(len(sentences))

401


# Encode sentences

In [6]:
embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 5944/6476 (91.79 %)
Speed : 45.99 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 401


In [7]:
print(embeddings.shape)

(401, 4096)


# Load ratings

In [8]:
ratings = []
f = open('rating_0_OfficeProd13.txt', 'r')
for l in f:
    ratings.append(float(l))

In [9]:
print(ratings[:10])

[5.0, 5.0, 5.0, 1.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0]


# Append ratings to embeddings

In [10]:
print(indi_sent_rev[:10])

[0, 0, 1, 1, 1, 1, 2, 2, 2, 3]


In [11]:
embeddings = list(embeddings)
for i in range(len(sentences)):
    embeddings[i] = np.append(embeddings[i], np.array([ratings[indi_sent_rev[i]]]))

In [12]:
embeddings = np.array(embeddings)

In [13]:
print(embeddings.shape)

(401, 4097)


# Spectral Clustering

In [14]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
sc = SpectralClustering(40, affinity='nearest_neighbors', n_init=1000)
sc.fit(embeddings)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[38  8 38 20 38 13 36 29 10  4  9 24 22 19 37 22 26 33 38 23 22 18 38 30 17
 21 16  0 35 36 38 38 38 38  5  5 13  8 28  2 38 38 13 27  6 28  9  9  4 24
  5  2 17  2 38 27 10 13 21 27 26 13 10 25  8 28 12 19 31 31 32 16 22 22 21
 39 15 21 21 18 38  1 37 38 28 34 38 38 29 38 29  6  7  7  7 35  8 29 38 10
 13 38 12 14 14 23 10 38 38  7 33 30 38 32 15 38 34 34  6 34 17 22  6 25 29
 38 36 34 20 29 38 20 30 17 23  8 20 14  6  8 14  8 21  6  5 22 39 32 34 12
 38 38 35 38 19 25 38 38 28  9  4 24  1  0 38 32 38  8 16 38  7 38 38 16 38
 21 23 12 29 23 27 38 32 33 36 14 38 37 37 35  2 12 28  7 23 39 25 38 38 38
 18 38 18 38 38 38 12 10  5  5 38 20 23 25 38 38 30  6 36 36 36 26 38 36  8
 36 25 26  6 20 15 38 36 36 13 35 38 15 34 39 30 38 38 33 15  8 34 28 38 27
 31 24 24 31 20 32 38 39 39 14 29 38 15 22 38 13  2 25 38 22 31 18 30 11 16
 31 38 31 16 21 17 20 28 23 38 24 24  9  4 38 33 21 34  2 37 28 26 25 13 16
 16  0 11  0 29  0  1 22 16 38  0 26 38 15 38 34  2 26 29 12 38 14 3

In [15]:
labels = sc.labels_.flatten()
print(labels.shape)
labels = list(labels)
for i in set(sc.labels_):
    print(str(i)+': '+str(labels.count(i)))

(401,)
0: 6
1: 8
2: 8
3: 6
4: 7
5: 6
6: 9
7: 8
8: 11
9: 9
10: 7
11: 5
12: 8
13: 11
14: 8
15: 8
16: 10
17: 5
18: 7
19: 5
20: 10
21: 13
22: 11
23: 8
24: 8
25: 8
26: 8
27: 8
28: 10
29: 12
30: 7
31: 8
32: 8
33: 7
34: 11
35: 8
36: 12
37: 5
38: 81
39: 6


In [16]:
for i in range(40):
    print('Cluster '+str(i)+':')
    cnt = 1
    for k, l in enumerate(sentences):
        if sc.labels_[k] == i:
            print(str(cnt)+': '+l)
            cnt += 1
    print()

Cluster 0:
1: Works fine most of the time, blobs out ink other times, but hey, GREAT PRICE, and I've lost half of them already, so who cares?
2: They have a plain, cylindrical body, a thoroughly unsophisticated plastic cap, and the whole assembly is the same color as the ink contained within
3: We all are familiar with the Bic round stic pens, with the white barrel, medium or fine points
4: It says white barrel in the description, but I hoped to get this new color
5: The pen now comes in a clear blue tinted barrel so you can see the ink inside, and it looks more interesting than the traditional white
6: The Cristal pens have a wider point or something, even though these are both labeled as medium points

Cluster 1:
1: They go through a lot of pens - customers take them, bussers toss them, they drop them, etc
2: What can one say about these pens? They're about the most basic pens imaginable
3: But, I think the pen would benefit a lot more if they added a grip to it
4: These pens are fin

76: These pens are kind of my standard dependable pens
77: I have not had any "duds" as you get sometimes with generic brands
78: Good stuff
79: Definitely recommended if you have cashiers at work who chew through pens at an astonishing rate
80: Pens don't come much cheaper than this, and it shows in the quality of the product
81: Enough for at least 2 years! Smooth and durable

Cluster 39:
1: I've tried all the cheaper pens and nothing are as good as these right here
2: We were able to buy it for a much cheaper price
3: Are there better pens out there? Of course, but they cost a heck of a lot more
4: I find the less expensive pens to do an excellent jobs
5: I've had more expensive pens that just stop working
6: Nice to have a bunch of low cost pens you can depend on!



# Test for Original embeddings

In [17]:
embeddings_orig = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings_orig)))

Nb words kept : 5944/6476 (91.79 %)
Speed : 46.68 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 401


In [18]:
print(embeddings_orig.shape)

(401, 4096)


In [19]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
sc = SpectralClustering(40, affinity='nearest_neighbors', n_init=1000)
sc.fit(embeddings_orig)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[19 17 19 19 19 34 24 32 38  5 19 21 32  2 33 35  9  1  8 16 35 23 19 22  5
  3 14 20 10 19  3 24  0 19 23 23 33 19  6  4 19 39 34 28 14  6  7 19  9  1
 23 19  5  4 19 28 38 28  3 28  9 39 12 31 27 19 20  2 13 37 10 22 35 19  3
 30 21 28 19 23  0  8  6 10 19 26 39 20 38 21 21 14 25 25 25 10 24 19  0 12
 39 19 18 27 27 16 38 19 19  9  1 22 30 24 32 19 14 26 14 26  5 35  5 31 38
  8 11 14 31 38 19 10 22  5 16 19 11 27 14 17 19 10  3 14 19 32 36 10 26 18
 19 19 24 24  2 19  0 30  6 19 19 21 37 20 19 29 19 17 19 19 25 15 32  1 16
  3 16 18  2 33 28 39 29  1 29 27 19  6  6 10  4 19 17 25  8 30 11 19 19 15
 23  0 23 34 26 19 18 12 19 19 20  8 16 31 19 19 22 21 11 29 11 13 19 11 13
 20  8  9 14  8 21 19 29 11 34 36 19 21 26 30 22  0 33  1 21 17 26 19 19 13
  4 19 14 16  8 31 21 30 36 27 21 19 21 19  0 34 14 31 17 35 13 23 22 36  1
 21 26 27 26  3  5 11 32 38 19 32 19  7 16 19  1 35 19  4 33 19  9 24 34 38
  7 20 39 20  2 20 38 19 14  5 20  9 39 32 19 14  4 20  2 18  0 19 1

In [20]:
labels = sc.labels_.flatten()
print(labels.shape)
labels = list(labels)
for i in set(sc.labels_):
    print(str(i)+': '+str(labels.count(i)))

(401,)
0: 10
1: 9
2: 9
3: 11
4: 7
5: 8
6: 7
7: 5
8: 8
9: 7
10: 14
11: 8
12: 5
13: 6
14: 14
15: 4
16: 11
17: 8
18: 6
19: 73
20: 13
21: 15
22: 9
23: 10
24: 9
25: 6
26: 10
27: 8
28: 8
29: 8
30: 8
31: 7
32: 8
33: 6
34: 8
35: 7
36: 6
37: 6
38: 10
39: 9


In [21]:
for i in range(40):
    print('Cluster '+str(i)+':')
    cnt = 1
    for k, l in enumerate(sentences):
        if sc.labels_[k] == i:
            print(str(cnt)+': '+l)
            cnt += 1
    print()

Cluster 0:
1: I have taken these overseas for storage and use in our bookstore and they always work
2: I have several friends who are servers at my favorite restaurant
3: Why on earth?  Bic, you need to come back home to make these
4: We use them to write orders for food in our restaurant
5: used them during a graduation party for the guests
6: Good writing pen, use in the office daily, would purchase again
7: These pens were ordered for our church
8: I keep them around the house for many different uses
9: Used them for a baby shower
10: Will definitely order again for other events as they disappear quickly

Cluster 1:
1: Recommended
2: Ann
3: Awesome
4: held up well
5: Otherwise, they're perfect!
6: word
7: Go get it!
8: Completeley recommended
9: they explode!

Cluster 2:
1: I got them at $4
2: I'm glad I found them
3: I couldn't ask for more!
4: Bringing me joy where there once was stress, thank you!
5: Luckily I did
6: Wish they came in green, too
7: I recommend them
8: I like it a

# Try splitting clusters

In [22]:
embeddings_split = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings_split)))

Nb words kept : 5944/6476 (91.79 %)
Speed : 47.98 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 401


In [23]:
print(embeddings_split.shape)

(401, 4096)


In [24]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
sc = SpectralClustering(30, affinity='nearest_neighbors', n_init=1000)
sc.fit(embeddings_split)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[ 5  1  5 24  5  3 23 12  5 20  5  8 21  8 24 21 22  4 26 16 21 10  5  5 20
 13  2 18  0 24  5  5 11  5 10 10 24  1 24 27 21 17  3 15  2 24 27  5 22  4
 10  5 20 27  5 15 16 15 13 15 22 17 28 29  1  5 18  8 22 12  0 19 21 21 13
 25  6 15 15 10 11 26 24  0  9  5 26 18 16  5  8  2 14 14 14  0 23  8 11 28
 17  5 18  7  7 16 16  5  5 14  4 19  5 24 12  5  2  3  2  5 20 21 20 29  8
 26 23  2 29  8  5 26 19 20 16  1 26  7  2  1  7  0 13  2  5 12 25  0  5 18
 27  5  0  5  8  5 11  5 24  1  5  8  9 18  5 23  5  1  5  5 14 10 12  4 16
 13  5  9  8 16 15 17 23  4 24  7  5 24 24  0 27 18  1 14 26 25 23  5  5  5
 10 11 10  3 13  5 18 28  5  5 18 26 16 29  5  5  5  6 23 23 23 22  5 23  1
 18 26 19  2 26 12  5 23 23  3 17  5  6  3 25 19 11 18  4  6  1  5 24  5 15
 27  3  2  9 26 29  5  5 25  7  5  5  6  7 11  3  2 29  1 21  9 10 19 17  4
  6  5  1  5 13 20 23 12 16  2 19  5 27 16  5  4 21  5 27 24  9 22  9  3 16
 27 18 17 18  8 18  7  5  2 20 18 22 17 12  5  2 27 18  8 21 11  7  

In [25]:
labels = sc.labels_.flatten()
print(labels.shape)
labels = list(labels)
for i in set(sc.labels_):
    print(str(i)+': '+str(labels.count(i)))

(401,)
0: 12
1: 16
2: 15
3: 11
4: 9
5: 79
6: 7
7: 10
8: 15
9: 14
10: 11
11: 10
12: 8
13: 10
14: 8
15: 10
16: 13
17: 12
18: 20
19: 8
20: 8
21: 11
22: 7
23: 16
24: 16
25: 6
26: 15
27: 12
28: 5
29: 7


In [26]:
cluster_new = []
for i in range(30):
    temp_sentences = []
    pos_sent = []
    neg_sent = []
    #print('Cluster '+str(i)+':')
    for k, l in enumerate(sentences):
        if sc.labels_[k] == i:
            temp_sentences.append(k)
    for sent in temp_sentences:
        if ratings[indi_sent_rev[sent]] >= 3.0:
            pos_sent.append(sent)
        else:
            neg_sent.append(sent)
    cluster_new.append(pos_sent)
    cluster_new.append(neg_sent)

In [27]:
print(len(cluster_new))

60


In [28]:
print(cluster_new[:30])

[[28, 70, 83, 95, 141, 147, 152, 189, 326, 344, 351], [382], [1, 37, 64, 135, 139, 167, 192, 224, 245, 268, 277, 327, 369], [159, 396, 397], [26, 44, 91, 116, 118, 127, 138, 143, 228, 266, 284, 308, 315, 364], [252], [5, 42, 117, 203, 234, 238, 265, 298, 328, 361], [251], [17, 110, 173, 183, 243, 274, 290, 341], [49], [0, 2, 4, 8, 22, 23, 30, 31, 33, 51, 54, 65, 85, 89, 101, 107, 108, 112, 115, 119, 130, 144, 148, 151, 153, 155, 157, 164, 166, 168, 169, 176, 186, 197, 198, 199, 205, 208, 209, 214, 215, 216, 222, 231, 236, 246, 248, 256, 257, 260, 261, 276, 278, 289, 292, 307, 314, 322, 323, 324, 329, 339, 343, 363, 365, 373, 375, 378, 398], [10, 47, 160, 286, 332, 380, 381, 383, 385, 387], [76, 217, 237, 244, 262, 275, 337], [], [103, 104, 137, 140, 185, 259, 263, 306, 321, 331], [], [13, 67, 90, 97, 124, 129, 154, 178, 304, 318, 345, 399, 400], [11, 161], [84, 162, 177, 295, 297, 330, 338, 352, 360, 367, 371], [253, 270, 388], [21, 34, 35, 50, 79, 171, 200, 202, 271, 355, 357], [], [3

In [29]:
for i in range(30):
    print('Cluster '+str(i)+'_pos:')
    cnt = 1
    for j in cluster_new[2*i]:
        print(str(cnt)+'. '+sentences[j])
        cnt += 1
    print()
    cnt = 1
    print('Cluster '+str(i)+'_neg:')
    for j in cluster_new[2*i+1]:
        print(str(cnt)+'. '+sentences[j])
        cnt += 1
    print()

Cluster 0_pos:
1. comfortable in your hand but i was surprised how well the ink comes out and write, for dirt cheap pens I love the way they look and feel
2. I personally don't mind  using them, I live with a few people in my house and my pens tend to get lost  quite often
3. BIC pens are pretty reliable for taking orders and are cheap enough they don't mind putting them down with credit card receipts
4. Great cheap item so far every pen i have used has worked first try i would recommend buy this alot of pen for a great cheap price
5. I will note that when the company changed the manufacturing location, to save money I presume, I certainly did not see the price of these things, cheap though they  may be, going down any, for us, the consumer
6. these bic pens are great pens which come in a neatly designed box i use these pens in my business for customers to sign their receipts and work orders they last and work well
7. People lose pens all the time and these are so inexpensive that I do