In [1]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import matplotlib

import numpy as np
import torch

# Location of the GloVe word-vector file handed to model.set_glove_path() below.
# The path is relative, so it resolves against the notebook's working directory.
GLOVE_PATH = '../dataset/GloVe/glove.840B.300d.txt'

In [2]:
# Load the serialized model object (presumably a pre-trained InferSent encoder, going by the file name).
# map_location returns every storage unchanged, which keeps all tensors on the CPU.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load this file from a trusted source.
# NOTE(review): recent PyTorch releases may refuse to unpickle a full model object by default - confirm
# against the installed torch version.
model = torch.load('infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# On CPU, setting the right number of threads with "torch.set_num_threads(k)" may improve performance

In [3]:
# Point the model at the GloVe vectors file (GLOVE_PATH comes from the first cell).
model.set_glove_path(GLOVE_PATH)

In [4]:
# Build the model vocabulary with K=100000 entries (the cell output reports "Vocab size : 100000").
# NOTE(review): presumably these are the first K entries of the GloVe file - confirm in the model code.
model.build_vocab_k_words(K=100000)

Vocab size : 100000


# Load sentences

In [5]:
# Load sentences
# Only every other line of the file (1st, 3rd, 5th, ...) is treated as a review;
# the lines in between are skipped.
# NOTE(review): presumably the skipped lines are blank separators - verify against the data file.
sentences = []      # flat list of sentences across all reviews
indi_sent_rev = []  # indi_sent_rev[s] = zero-based index of the review that sentence s came from
rev_num = -1        # index of the last review read; reused later when building the matrix
with open('text_0_AutomotiveProd7.txt') as f:
    for line_no, line in enumerate(f):
        if line_no % 2 != 0:
            continue
        rev_num += 1
        # Naive sentence split on '.', dropping fragments that are empty after trimming.
        fragments = [piece.strip() for piece in line.strip().split('.')]
        fragments = [piece for piece in fragments if piece]
        indi_sent_rev.extend([rev_num] * len(fragments))
        sentences.extend(fragments)
print(len(sentences))

382


# Encode sentences

In [6]:
# Encode all sentences in batches of 128; the result has one row per sentence.
# NOTE(review): tokenize=False presumably means the sentences are used as-is (whitespace split) -
# confirm in model.encode.
embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 5656/5982 (94.55 %)
Speed : 52.47 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 382


In [7]:
# Sanity check: (number of sentences, embedding dimension).
print(embeddings.shape)

(382, 4096)


# Load ratings

In [8]:
# Read one numeric rating per line; later cells index this list by review number
# (ratings[indi_sent_rev[s]]).
# The context manager closes the file handle, which the previous open()/loop never did.
with open('rating_0_AutomotiveProd7.txt', 'r') as f:
    ratings = [float(line) for line in f]

In [9]:
# Peek at the first ten ratings.
print(ratings[:10])

[5.0, 4.0, 4.0, 5.0, 4.0, 5.0, 5.0, 1.0, 5.0, 5.0]


# Load review ids

In [10]:
# Read one review id per line. The raw lines (including the trailing newline) are kept
# unchanged; the CSV-writing cell strips them when they are used.
# The context manager closes the file handle, which the previous open()/loop never did.
with open('id_0_AutomotiveProd7.txt', 'r') as f:
    ids = list(f)

# Append ratings to embeddings

In [11]:
# First ten entries of the sentence -> review-index mapping built while loading the sentences.
print(indi_sent_rev[:10])

[0, 0, 1, 1, 1, 1, 1, 2, 2, 2]


In [12]:
# Append, to each sentence vector, the rating of the review that sentence belongs to.
# NOTE: this rebinds `embeddings`; running the cell twice appends the rating twice.
embeddings = list(embeddings)
sentence_ratings = [ratings[indi_sent_rev[pos]] for pos in range(len(sentences))]
for pos, rating in enumerate(sentence_ratings):
    embeddings[pos] = np.append(embeddings[pos], np.array([rating]))

In [13]:
# Stack the per-sentence vectors (a Python list after the previous cell) back into one array.
embeddings = np.array(embeddings)

In [14]:
# One column more than the raw embeddings: the appended rating.
print(embeddings.shape)

(382, 4097)


# Spectral Clustering

In [15]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
# Cluster the rating-augmented sentence vectors into 40 groups using a
# nearest-neighbours affinity graph. random_state pins the randomized K-Means
# label assignment so that a kernel restart reproduces the same cluster ids.
sc = SpectralClustering(n_clusters=40, affinity='nearest_neighbors', n_init=1000,
                        random_state=0)
sc.fit(embeddings)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[30 21 28 26 11 31 30  3 25 33 10 23 23 20 21 27 33 26 21 21 21 12  3 21  8
  5 16 27 30 12 38 30 20 13 23 13 30 21  8 30  4 37 17 27 32  4 13 24 38 17
 21 30 21 14  0 12  3  6  5 39  4 27  0 17 30 27  8 19 11 21 29  5 29 15 34
  0 21 19 21 21 22 34 22 29  6 39  5  2  6  2  2 12 22 28  1 36 21 14 36 27
  9 28 28 13 13 18 39  2  6 21  3  4  9 16 16  2  5 22 30 10 14  0 30 14  4
 10 20 21 12 34 38  3 24 15 25 24 24 26 24 26 11 21 26 36 21 28 26 36 21 21
 21  7 31  9 21 21 19 21 27 22 21  8 30 22 33 35  9 31  5  4 12 12 21 21 11
 37 18  8 36 12 14 13 32 37 12 12  5  5  2  2 17 38 18 27 11 26 21  9 27  4
 18 11 31  7  1 21 11 38 31 37 38 30 20 38 18 38 21 21 16  6  6  6  0 34 32
 17 37 38 37 32 13 11  9 24 24 28  9 30 39  5  2 31 31 15 21 34  4 13 12 32
 10  7 19 15 38 37 30  3 17 10  4 37 21 32 10 18 18 35  8 38  3  4 21 17 11
 30 21 16 29 27 33  8 21 25 16  5  5  2 16 39 39 21 21 28 11 21 19  7 21 10
  4 23 20 13 27 22 32 14 12 25  0  0 14  8 22 29 21 31 35 21 22 26  

In [16]:
# Report how many sentences were assigned to each cluster id.
flat_labels = sc.labels_.flatten()
print(flat_labels.shape)
labels = list(flat_labels)
for cluster_id in set(sc.labels_):
    cluster_size = labels.count(cluster_id)
    print(str(cluster_id), str(cluster_size), sep=': ')

(382,)
0: 11
1: 7
2: 11
3: 7
4: 11
5: 12
6: 10
7: 10
8: 9
9: 10
10: 7
11: 12
12: 14
13: 9
14: 8
15: 7
16: 9
17: 7
18: 8
19: 6
20: 6
21: 47
22: 9
23: 7
24: 7
25: 7
26: 8
27: 12
28: 8
29: 6
30: 17
31: 8
32: 9
33: 4
34: 5
35: 4
36: 6
37: 9
38: 12
39: 6


In [17]:
# Print the sentences of every cluster, numbered from 1 within each cluster.
# The cluster count is read from the fitted estimator instead of repeating the
# literal 40, so this cell stays correct if the clustering cell is re-run with
# a different number of clusters.
for cluster_id in range(sc.n_clusters):
    print('Cluster '+str(cluster_id)+':')
    members = [text for label, text in zip(sc.labels_, sentences) if label == cluster_id]
    for cnt, text in enumerate(members, start=1):
        print(str(cnt)+': '+text)
    print()

Cluster 0:
1: This was an early Fathers Day gift for my husband, and he couldn't be happier! Easy to use, love the ratchet instead of a twist and lock,held the load in the truck without shifting when we went camping (literally 40 miles of bad road), good pads on ends so truck bed is protected
2: I wish I had bought one of these for my truck years ago!  No more cargo rolling around in the truck's bed!
3: It's worth the money paid and I love it because it hold the item loaded in the rear in my truck bed
4: I bought two of these and use them to support my slide-out on my Lance truck camper when extended
5: I bought this for a trip this past summer to keep things from moving in the back of the pickup and worked great
6: If you are like me, I just through items in the back of my truck and hope they are still there wherever I'm going
7: But with this Ratchet Cargo Bar, I can still through things, but I can now through them in the right place
8: Even though I havent used this on my truckbed i

1: Super value
2: a good value for the cost
3: Would buy again!
4: The price is right too
5: This solved the problem immediately, and at reasonable cost
6: I'd buy it again
7: Outstanding price
8: definitely worth the price

Cluster 19:
1: I received the order promptly but the ratcheting device did not function as well as I had expected
2: The only con I have is the rubber pads on the ends are too hard and you just can't that last click to get it tight enough to stay in place
3: This item is a bit flimsy, the ratchet isn't quite as easy to use as expected, tends to slip on the bed wall
4: The ratcheting mechanism is very rough and requires some sort of ritual sacrifice in order to make it work properly
5: Seems like it will do the job, but when I extended it in my F150, it felt real flimsy at the ratchet point
6: Also - don't get your fingers pinched in the ratcheting mechanism - I can easily see that happening

Cluster 20:
1: It is no more hazardous to fingers than any of the many too

1: I would recommend them to a friend
2: I'm curious about how the folks who gave bad reviews are using this product
3: I would be on the fence to recommend this product
4: I would recommend this product, it does what it says, bounced to 4 star
5: would recommend it to friends  it is  great
6: Lets hope this second one lives up to it's potential

Cluster 37:
1: Seems to be built very well too
2: good  quality
3: Great product, works real well
4: This is much better
5: This is great
6: This one works great every time
7: This thing is heavy duty and works excellent
8: Also the finish seems to be good quality
9: work great

Cluster 38:
1: A bit of screwing around and you'll master it
2: When you hold it up to crank it tight, the bar is awkward to hold because it is off balance
3: If you have a pick up truck,this item is a must, they are extremely durable, and I like the fact that it really stretches out there
4: I wanted a strong ratchet type bar and this is exactly it
5: for sure!  I use

# Test with the original embeddings (no rating column appended)

In [18]:
# Re-encode the sentences to get vectors without the rating column
# (`embeddings` was overwritten when the ratings were appended).
# Same call and arguments as the first encoding cell.
embeddings_orig = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings_orig)))

Nb words kept : 5656/5982 (94.55 %)
Speed : 40.96 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 382


In [19]:
# Sanity check: (number of sentences, embedding dimension) - no rating column here.
print(embeddings_orig.shape)

(382, 4096)


In [20]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
# Same setup as the earlier clustering run, but on the plain sentence vectors
# (no rating column). random_state pins the randomized K-Means label assignment
# so that a kernel restart reproduces the same cluster ids.
sc = SpectralClustering(n_clusters=40, affinity='nearest_neighbors', n_init=1000,
                        random_state=0)
sc.fit(embeddings_orig)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[30 14  6 14 17 29 26 31 35 32  2 28 32 23  4 11 32  1  6 26 26  5 31 14  8
 32 29 11  1  5 14 33 33 36 21  7 26 14  8 26 37  5 15 11 31 21 36 24 24 25
 14 26 29 16 27 13 31 35 38 26 34 11 12 15 19 11  8 38  5 34 27 38 33  0  3
 27 21  4 14 16 27  3  6 35 35 26 28  6 35 30 20  5  6  4 36 15 14 13  5 11
 36  4  4  7 36 31 14  3 35 26 31 37 14 14  7 14  1  6  1  2 29 12 23 16 37
  2 23  3 13 19 20 14 34  0 14 34 17  3 34 23  9 14 38 15  7 32 23 14 10 27
 14 19  7 26 14 19  6 23 11  6 14  8 39  6 32 22 20 29  1  5 13 14 14 30 25
 16 10  8 15 13 16 36  9 18 13 13 39 14 25 15 25 24 10 11 14 38  3 14 11 37
 10 25 29 14 14 14 18 24 29 18 17 27 23 20 10 24 14 39 29 22 27 33 12  3 14
 25 18 14 18  9  7 14 20 14 24  4 20 14 12 28  0 29 29 14 14  3 37 23 13  9
  2 32  1  0 14 21 33  0 25  2 36 21  5  9 30 10 10 22  8 14  0 37 14 25 13
 21 26 14 27 11 14  8 14 35 29 23 38 17  7 26 26 14 39  4 18 14  1 39 14 30
 14 21 39 36 11 14 18 16 13  2 12 34 31  8 27  6 14  9 12 14  6  1 1

In [21]:
# Report how many sentences were assigned to each cluster id.
flat_labels = sc.labels_.flatten()
print(flat_labels.shape)
labels = list(flat_labels)
for cluster_id in set(sc.labels_):
    cluster_size = labels.count(cluster_id)
    print(str(cluster_id), str(cluster_size), sep=': ')

(382,)
0: 8
1: 11
2: 6
3: 11
4: 7
5: 8
6: 11
7: 7
8: 9
9: 8
10: 8
11: 12
12: 9
13: 12
14: 57
15: 8
16: 6
17: 7
18: 9
19: 7
20: 9
21: 10
22: 4
23: 9
24: 8
25: 8
26: 13
27: 10
28: 4
29: 11
30: 5
31: 8
32: 8
33: 5
34: 9
35: 9
36: 10
37: 6
38: 6
39: 9


In [22]:
# Print the sentences of every cluster, numbered from 1 within each cluster.
# The cluster count is read from the fitted estimator instead of repeating the
# literal 40, so this cell stays correct if the clustering cell is re-run with
# a different number of clusters.
for cluster_id in range(sc.n_clusters):
    print('Cluster '+str(cluster_id)+':')
    members = [text for label, text in zip(sc.labels_, sentences) if label == cluster_id]
    for cnt, text in enumerate(members, start=1):
        print(str(cnt)+': '+text)
    print()

Cluster 0:
1: I was looking for a sturdy, quality made cargo restraint bar and this is not it
2: I thought this cargo bar would be easier to place in a lot let time
3: I would recommend trying a different cargo bar
4: I do not recommend this cargo bar
5: If you want a quality, heavy duty cargo bar at a good price, this is it
6: Someone recommended a cargo bar and I bought this one
7: I'm fairly pleased with this cargo bar
8: I would recommend you keep looking for a sturdier cargo bar

Cluster 1:
1: You have to make sure you don't ratchet this thing down too hard, if you do, you will regret that last click
2: Just a wee bit tricky to learn to operate at first, blame instructions on one word, instead of saying "rotate" should say "pull over" for the lever, engage last black notch (so easy to miss), then adjust length to short or long quickly
3: When it is tightened it just pops off the gear and will not create enough tension to hold in place
4: Used to have one of the ones that used thre

Cluster 28:
1: The ratchet mechanism works better than the screw or collet designs
2: The ratchet mechanism is right in the middle of the bar instead of off center
3: The ratcheting mechanism slips when it is torqued enough to stay in place
4: Product is just as described, works just like a ratchet strap

Cluster 29:
1: Tightened down a bit and it was secure
2: It's useless now
3: One high, one low
4: It,however,is a low cost,durable multi use product
5: It's a bit tricky to get used too
6: One is broken
7: It was okay, but cumbersome
8: This product is trash
9: A little on the flimsy side
10: A bit weak but the price is cheap
11: This item was horrible

Cluster 30:
1: Much needed tool for anyone that has a truck
2: So, every time I want to use this I need to get in the back of the truck
3: i geass its good for truck only because its metel internal
4: Easy to place in the truck and easy to remove
5: This is a must have for pickup truck owners

Cluster 31:
1: Good Cargo Bar
2: The cargo

# Try splitting clusters by review rating (positive vs. negative)

In [23]:
# Encode the sentences once more for the split-cluster experiment.
# Same call and arguments as the earlier encoding cells.
embeddings_split = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings_split)))

Nb words kept : 5656/5982 (94.55 %)
Speed : 44.8 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 382


In [24]:
# Sanity check: (number of sentences, embedding dimension).
print(embeddings_split.shape)

(382, 4096)


In [25]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
# 30 clusters on the plain sentence vectors; each cluster is split by rating in
# a later cell. random_state pins the randomized K-Means label assignment so
# that a kernel restart reproduces the same cluster ids.
sc = SpectralClustering(n_clusters=30, affinity='nearest_neighbors', n_init=1000,
                        random_state=0)
sc.fit(embeddings_split)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[ 9 27  7 13 29 10  5 11 25 27 15 12 12 22 24 14 22  3 16 27  5  8 11 27 20
 12 10 14  3 27 27 22 22 18 12 27  5  1 20  5  0 25 28 14 27 12 18  4  2  6
 16  5 10 23  7 23 11 27 26  5  2 14 21 28  8 14 20 26  8  4  7 26 22 11 13
  7 12 24 27 19  7 13 27  7  2  5  2 16 27  9  5  8 16 24 18 28 27 23 27 14
 18 24 24 18 18  1  5 13  7  5 11  0 25 27 25 27  3 16 24 15 10 21  5 23  0
 15 22 13  8  8  5 27  4 11 27  4 29 13  4 25 25 27 26 27 25  4 22 27  1  7
 27  8 25  5 27  8 16 22 14 16 27 20 22 16 27 17  3 10  3  0 28 27 27  9  8
 19  1 20 28 27 23 18 25 19 27 23 22 25  6 28  6  2  1 14 27 26 13 27 14  0
  1  6 10 13 13 27 19 29 10 19 29  7 22  3  1 29 27  3 10 17  7 22 21  8 25
  6 19 27 19 25 18 27 27  2  2 24 27 27 21 26 11 10 10 22 27  8  0 18 28 25
 15  5 24 11 27 12 22 11  6 15 18 23 27 25  9  1  1 17 20 27 11  0 27  6  8
 23  5 27  7 14 27 20 27 15 10 18 26 29 25 27  5 27 22 24 19 27  3 22 27  9
 27 12 22 18 14 27 25 23 28 15 21  4 23 20  7 16 25 25 21 25 16  3 2

In [26]:
# Report how many sentences were assigned to each cluster id.
flat_labels = sc.labels_.flatten()
print(flat_labels.shape)
labels = list(flat_labels)
for cluster_id in set(sc.labels_):
    cluster_size = labels.count(cluster_id)
    print(str(cluster_id), str(cluster_size), sep=': ')

(382,)
0: 8
1: 10
2: 8
3: 12
4: 12
5: 16
6: 7
7: 14
8: 12
9: 6
10: 11
11: 13
12: 12
13: 14
14: 12
15: 7
16: 10
17: 4
18: 14
19: 10
20: 9
21: 9
22: 18
23: 12
24: 10
25: 21
26: 7
27: 64
28: 10
29: 10


In [27]:
# Split every cluster into a "positive" and a "negative" half, based on the
# rating of the review each sentence came from. The result is laid out as
#   cluster_new = [c0_pos, c0_neg, c1_pos, c1_neg, ...]
# where each entry is a list of indices into `sentences`.
POSITIVE_MIN_RATING = 3.0  # a rating >= this value counts as positive

cluster_new = []
# The cluster count comes from the fitted estimator rather than a repeated literal.
for cluster_id in range(sc.n_clusters):
    member_idx = [k for k in range(len(sentences)) if sc.labels_[k] == cluster_id]
    pos_sent = []
    neg_sent = []
    for k in member_idx:
        if ratings[indi_sent_rev[k]] >= POSITIVE_MIN_RATING:
            pos_sent.append(k)
        else:
            neg_sent.append(k)
    cluster_new.append(pos_sent)
    cluster_new.append(neg_sent)

In [28]:
# Two lists (positive, negative) per cluster, so this is twice the number of clusters.
print(len(cluster_new))

60


In [29]:
# First 30 sub-lists of sentence indices, alternating positive / negative per cluster.
print(cluster_new[:30])

[[40, 111, 124, 169, 199, 246, 271, 336], [], [37, 105, 148, 176, 192, 200, 214, 265, 266, 333], [], [48, 60, 191, 233, 234], [84, 86, 371], [17, 28, 166, 213, 296, 321, 334, 358, 360], [116, 168, 217], [47, 132, 135, 138, 145, 311, 331, 335, 338, 349], [69, 353], [6, 20, 36, 39, 51, 122, 130, 153, 251], [59, 85, 90, 106, 109, 276, 290], [49, 190, 201, 225, 258, 273], [188], [2, 54, 75, 80, 149, 211, 278, 314, 330, 341], [70, 83, 108, 220], [21, 64, 68, 91, 128, 129, 151, 155, 174, 223, 245, 274], [], [0, 264, 299, 326], [89, 173], [5, 52, 120, 167, 202, 208, 241, 242], [26, 218, 284], [7, 22, 56, 110, 133, 253, 257, 270, 328, 365], [73, 240, 354], [11, 12, 34, 45, 76, 255, 301, 357, 364, 366, 379], [25], [3, 74, 81, 127, 137, 196, 203, 204, 324, 342, 359, 367], [107, 323], [15, 27, 43, 61, 65, 99, 158, 193, 198, 279, 304, 376], []]


In [30]:
# Print the positive and then the negative sentences of every cluster.
# cluster_new stores [pos, neg] pairs back to back, so cluster i lives at
# indices 2*i and 2*i+1. The number of clusters is derived from its length
# instead of the hard-coded 30.
for i in range(len(cluster_new) // 2):
    halves = (('_pos:', cluster_new[2*i]), ('_neg:', cluster_new[2*i+1]))
    for suffix, sentence_ids in halves:
        print('Cluster '+str(i)+suffix)
        for cnt, j in enumerate(sentence_ids, start=1):
            print(str(cnt)+'. '+sentences[j])
        print()

Cluster 0_pos:
1. Holds things inplace, and very easy to use
2. Easy to use and racheting works good
3. Easy to snap in and take out
4. Easy to install, has worked well so far
5. It is easy to use and stays in place without damage
6. It is really easy to adjust and move around
7. It was very easy to install (old lady here) and it works very nicely
8. Easy to use, stays locked tight

Cluster 0_neg:

Cluster 1_pos:
1. Use the savings for gasoline (this 65 yr
2. Super value
3. It's one of those you get what you pay for
4. a good value for the cost
5. Would buy again!
6. The price is right too
7. This solved the problem immediately, and at reasonable cost
8. I'd buy it again
9. Outstanding price
10. definitely worth the price

Cluster 1_neg:

Cluster 2_pos:
1. When you hold it up to crank it tight, the bar is awkward to hold because it is off balance
2. This bar is easy to adjust and the ratcheting feature lets you tighten it down perfectly
3. I wanted a strong ratchet type bar and this is

In [31]:
# Dump every positive / negative sentence-index list (debugging aid; the output is long).
print(cluster_new)

[[40, 111, 124, 169, 199, 246, 271, 336], [], [37, 105, 148, 176, 192, 200, 214, 265, 266, 333], [], [48, 60, 191, 233, 234], [84, 86, 371], [17, 28, 166, 213, 296, 321, 334, 358, 360], [116, 168, 217], [47, 132, 135, 138, 145, 311, 331, 335, 338, 349], [69, 353], [6, 20, 36, 39, 51, 122, 130, 153, 251], [59, 85, 90, 106, 109, 276, 290], [49, 190, 201, 225, 258, 273], [188], [2, 54, 75, 80, 149, 211, 278, 314, 330, 341], [70, 83, 108, 220], [21, 64, 68, 91, 128, 129, 151, 155, 174, 223, 245, 274], [], [0, 264, 299, 326], [89, 173], [5, 52, 120, 167, 202, 208, 241, 242], [26, 218, 284], [7, 22, 56, 110, 133, 253, 257, 270, 328, 365], [73, 240, 354], [11, 12, 34, 45, 76, 255, 301, 357, 364, 366, 379], [25], [3, 74, 81, 127, 137, 196, 203, 204, 324, 342, 359, 367], [107, 323], [15, 27, 43, 61, 65, 99, 158, 193, 198, 279, 304, 376], [], [10, 119, 125, 250, 259, 283, 309], [], [18, 50, 92, 117, 156, 159, 163, 315, 320], [87], [165, 267, 350], [219], [33, 46, 94, 100, 103, 104, 181, 230, 247

In [32]:
# Dump the full sentence -> review-index mapping (debugging aid; the output is long).
print(indi_sent_rev)

[0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 16, 16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20, 21, 21, 22, 23, 23, 23, 23, 23, 24, 24, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 29, 30, 30, 30, 30, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 41, 42, 42, 42, 43, 43, 44, 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 47, 48, 48, 48, 49, 49, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 54, 54, 54, 54, 55, 55, 55, 55, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63

In [33]:
# The two lists are filled in lockstep while loading, so the lengths should match.
print(len(sentences), len(indi_sent_rev))

382 382


In [34]:
# Build a review x sub-cluster indicator matrix of '1' / '0' strings:
# matrix[r][c] is '1' when review r contributed at least one sentence to
# cluster_new[c], and '0' otherwise. rev_num (set while loading the sentences)
# is the index of the last review, hence rev_num + 1 rows.
reviews_per_cluster = [
    {indi_sent_rev[sent_ind] for sent_ind in clusters}
    for clusters in cluster_new
]
matrix = [
    ['1' if review in review_set else '0' for review_set in reviews_per_cluster]
    for review in range(rev_num + 1)
]

In [35]:
# rev_num is the zero-based index of the last review read while loading the sentences.
print(rev_num)

100


In [36]:
# Number of matrix rows: one per review (rev_num + 1).
print(len(matrix))

101


In [37]:
# Every row should have one entry per positive / negative sub-cluster.
for matrix_row in matrix:
    row_width = len(matrix_row)
    print(row_width)

60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60


In [38]:
# Spot-check one row: the '1' / '0' flags of the review with index 5.
print(matrix[5])

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']


In [39]:
# Write one CSV line per review: the review id followed by its '1' / '0' flags.
# The context manager closes (and flushes) the file even if a lookup or write
# raises part-way through, which the manual open()/close() pair did not.
# ids[i] is indexed explicitly so that a too-short ids list still fails loudly
# instead of silently dropping rows.
with open('AutomotiveProd7_matrix.csv', 'w') as f:
    for i, matrix_row in enumerate(matrix):
        f.write(ids[i].strip() + ',' + ','.join(matrix_row) + '\n')