In [1]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import subprocess
from random import randint

import matplotlib
import numpy as np
import torch

# Location of the GloVe vectors file; handed to model.set_glove_path() below.
GLOVE_PATH = '../../dataset/GloVe/glove.840B.300d.txt'

In [2]:
# Load the pickled model object. The map_location callable returns the storage
# unchanged, which is the documented way to keep all tensors on the CPU.
# NOTE(review): torch.load unpickles the file and can therefore execute
# arbitrary code -- only load this pickle from a trusted source.
model = torch.load('infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# On CPU, setting the right number of threads with "torch.set_num_threads(k)" may improve performance

In [3]:
# Tell the model where the GloVe vectors file (GLOVE_PATH) lives.
model.set_glove_path(GLOVE_PATH)

In [4]:
# Build the model vocabulary with K=100000 (the recorded run reports a vocab
# size of 100000).
# NOTE(review): presumably this keeps the first/most frequent K GloVe words --
# confirm against the model class.
model.build_vocab_k_words(K=100000)

Vocab size : 100000


# Make rating, id, text files

In [5]:
FILE = "0_VideoGamesProd3.txt"

# Run the three extraction scripts on FILE, in the same order as before.
# The command is passed as an argument list (no shell string building), and
# check=True raises subprocess.CalledProcessError when a script exits with a
# non-zero status, so "Done!" is only printed if every step succeeded.
# NOTE(review): the scripts presumably write the DEVELOPMENT_DATA/rating_*,
# id_* and text_* files that later cells read -- confirm against the scripts.
for script in ("extract_rating.py", "extract_review_id.py", "extract_review_text.py"):
    subprocess.run(["python", script, FILE], check=True)

print("Done!")

Done!


# Load sentences

In [6]:
# Load sentences
# Only every other line of the text file (1st, 3rd, 5th, ...) is treated as a
# review; the lines in between are skipped. Each review is split on '.' into
# stripped, non-empty sentences, and indi_sent_rev records the 0-based review
# number of every sentence. rev_num ends up as the index of the last review.
sentences = []
indi_sent_rev = []
rev_num = -1
with open('DEVELOPMENT_DATA/text_'+FILE) as f:
    for line_no, line in enumerate(f):
        if line_no % 2:
            continue
        rev_num += 1
        pieces = [piece.strip() for piece in line.strip().split('.')]
        pieces = [piece for piece in pieces if piece]
        indi_sent_rev.extend([rev_num] * len(pieces))
        sentences.extend(pieces)

print(len(sentences))

967


In [7]:
# Break sentences into smaller fragments on ' and '; sentences without ' and '
# are broken on ' or ' instead; everything else is kept whole. Each fragment
# inherits the review index of the sentence it came from, so the two output
# lists stay aligned.
new_sentences = []
new_indi_sent_rev = []
for idx, sentence in enumerate(sentences):
    review = indi_sent_rev[idx]
    if ' and ' in sentence:
        fragments = sentence.split(' and ')
    elif ' or ' in sentence:
        fragments = sentence.split(' or ')
    else:
        fragments = [sentence]
    new_sentences.extend(fragments)
    new_indi_sent_rev.extend([review] * len(fragments))

In [8]:
# Print every fragment on its own line to eyeball the result of the split.
for fragment in new_sentences:
    print(fragment)

I LOVE IT, IT CAME EARLY, THE MUSIC FOR THIS GAME IS FANTASTIC,
the cartridge wasn't broken or anything
Very Nice Condition
I love that it came in like 2 days
This game is frigging awesome if you're into games like Donkey Kong 64
Mario 64
5/5 will play it again
This is most likely one of the best N64 Games Ever made, Its way better then Super mario 64 in everyway (and possibly Zelda too), its rich, colorful
very addicting, The soundtracks are also one of my favorite things about the game- it has a happy
also very awesome tune to them making them addictive (you just cant stop listening to it!),
the world designs (10 total I believe) are all very Beautiful
Gorgeous, I've beaten the game to 100%
I still go back to the worlds
gaze in thier amazement, its just really cool, this game is a MUST HAVE for any Nintendo 64 player, Im serious if you dont have this, your missing out BIG time
PS
check out its Sequel Banjo-Tooie after you've bought this!
From my experience of owning
playing this game

In [9]:
# Sanity check: there must be exactly one review index per fragment.
print(len(new_sentences))
print(len(new_indi_sent_rev))
# Compare the first review indices before and after the and/or split.
print(indi_sent_rev[:5])
print(new_indi_sent_rev[:5])

1457
1457
[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]


# Encode sentences

In [10]:
# Encode every fragment with the loaded model in batches of 128; the recorded
# run yields one 4096-wide vector per fragment.
# NOTE(review): tokenize=False presumably means the text is passed through
# without extra tokenization -- confirm against the model's encode().
embeddings = model.encode(new_sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 16609/18545 (89.56 %)
Speed : 46.9 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 1457


In [11]:
# Shape check: (number of fragments, embedding width).
print(embeddings.shape)

(1457, 4096)


# Load ratings

In [12]:
# Read one numeric rating per line of the ratings file. Position in the list
# is the review index used by new_indi_sent_rev. The `with` block guarantees
# the file handle is closed (the handle used to be left open).
with open('DEVELOPMENT_DATA/rating_'+FILE, 'r') as f:
    ratings = [float(l) for l in f]

In [13]:
# Peek at the ratings of the first ten reviews.
print(ratings[:10])

[5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0]


# Load review ids

In [14]:
# Read one review id per line of the ids file. The `with` block guarantees
# the file handle is closed (the handle used to be left open).
# Lines are stored as read, including the trailing newline; the cell that
# writes the CSV strips them before use.
with open('DEVELOPMENT_DATA/id_'+FILE, 'r') as f:
    ids = list(f)

# Append ratings to embeddings

In [15]:
# Review index of the first 15 fragments; this mapping is used below to look
# up the rating that belongs to each fragment.
print(new_indi_sent_rev[:15])

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]


In [16]:
# Turn the embedding matrix into a list of row vectors and tack the rating of
# the owning review onto the end of each row as one extra feature.
# NOTE: this cell overwrites `embeddings`, so running it twice appends the
# rating twice -- re-run the encoding cell first if it must be repeated.
embeddings = list(embeddings)
for idx in range(len(new_sentences)):
    review_rating = ratings[new_indi_sent_rev[idx]]
    embeddings[idx] = np.append(embeddings[idx], np.array([review_rating]))

In [17]:
# Stack the list of per-fragment row vectors back into a single array.
embeddings = np.array(embeddings)

In [18]:
# Shape check after appending the rating: one more column than the encoder output.
print(embeddings.shape)

(1457, 4097)


# Try splitting clusters

In [19]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
# Group the rating-augmented embeddings into 30 clusters using a
# nearest-neighbours affinity graph. random_state is fixed so that cluster
# labels (and everything derived from them below) are reproducible across
# kernel restarts; without it each run produced a different labelling.
sc = SpectralClustering(
    n_clusters=30,
    affinity='nearest_neighbors',
    n_init=1000,
    random_state=0,
)
sc.fit(embeddings)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[ 2  2  2 ..., 28  2  3]


In [20]:
# Report how many fragments were assigned to each cluster label.
print(sc.labels_.flatten().shape)
labels = list(sc.labels_.flatten())
for cluster_id in set(sc.labels_):
    print('{}: {}'.format(cluster_id, labels.count(cluster_id)))

(1457,)
0: 18
1: 18
2: 656
3: 66
4: 73
5: 16
6: 15
7: 61
8: 15
9: 10
10: 42
11: 11
12: 28
13: 31
14: 15
15: 13
16: 57
17: 13
18: 25
19: 23
20: 25
21: 20
22: 11
23: 16
24: 25
25: 28
26: 27
27: 42
28: 30
29: 27


In [21]:
# Split every spectral cluster into two buckets of fragment indices:
#   cluster_new[2*c]     -> fragments of cluster c whose review rating is >= 3.0
#   cluster_new[2*c + 1] -> the remaining fragments of cluster c
# The number of clusters is taken from the fitted estimator instead of being
# hard-coded, so it cannot drift out of sync with the clustering cell.
cluster_new = []
for cluster_id in range(sc.n_clusters):
    members = [k for k in range(len(new_sentences)) if sc.labels_[k] == cluster_id]
    pos_sent = []
    neg_sent = []
    for k in members:
        # A fragment inherits the rating of the review it was cut from.
        if ratings[new_indi_sent_rev[k]] >= 3.0:
            pos_sent.append(k)
        else:
            neg_sent.append(k)
    cluster_new.append(pos_sent)
    cluster_new.append(neg_sent)

In [22]:
# Number of buckets: two (positive, negative) per cluster.
print(len(cluster_new))

60


In [23]:
# Fragment indices of the first 30 buckets, i.e. the pos/neg pairs of the
# first 15 clusters.
print(cluster_new[:30])

[[22, 27, 97, 112, 206, 420, 561, 609, 761, 763, 886, 966, 1024, 1218, 1250, 1252, 1327, 1329], [], [65, 74, 326, 458, 461, 471, 721, 1019, 1043, 1055, 1058, 1086, 1143, 1350, 1378, 1414, 1433, 1437], [], [0, 1, 2, 6, 7, 8, 9, 10, 14, 15, 16, 18, 20, 21, 23, 25, 31, 32, 35, 38, 41, 42, 45, 58, 61, 62, 63, 69, 70, 71, 72, 76, 78, 85, 88, 90, 91, 93, 94, 95, 96, 98, 102, 103, 104, 105, 109, 110, 113, 114, 115, 116, 118, 121, 122, 123, 124, 125, 129, 130, 132, 134, 138, 139, 142, 143, 144, 149, 150, 153, 155, 157, 165, 169, 171, 172, 176, 177, 179, 180, 182, 185, 188, 190, 192, 194, 197, 198, 202, 204, 205, 207, 209, 211, 213, 214, 215, 218, 219, 221, 223, 224, 225, 228, 229, 230, 231, 233, 234, 235, 237, 238, 239, 241, 242, 244, 245, 246, 247, 249, 250, 252, 256, 258, 259, 261, 262, 263, 264, 269, 271, 274, 275, 276, 277, 280, 284, 286, 290, 291, 293, 295, 296, 299, 301, 303, 305, 307, 309, 312, 315, 316, 317, 318, 322, 324, 325, 327, 371, 372, 373, 374, 382, 383, 387, 388, 389, 390, 392

In [24]:
# Print the fragments of every cluster, positive bucket first and negative
# bucket second, each numbered from 1 and followed by a blank line.
# The cluster count is derived from cluster_new (two buckets per cluster)
# rather than hard-coded, so this cell follows any change in cluster count.
for cluster_id in range(len(cluster_new) // 2):
    buckets = (
        ('_pos:', cluster_new[2*cluster_id]),
        ('_neg:', cluster_new[2*cluster_id+1]),
    )
    for suffix, members in buckets:
        print('Cluster '+str(cluster_id)+suffix)
        for cnt, j in enumerate(members, start=1):
            print(str(cnt)+'. '+new_sentences[j])
        print()

Cluster 0_pos:
1. took her to her lair
2. get Banjo's sister back
3. At the same time, my three-year-old daughter caught on to it
4. call her grunty
5. Gruntilda kidnaps Banjo's sister, Tooty who is then taken up to Grunty's lair, where Gruntilda then wants Tooty's beauty, where Grunty will "make herself young
6. Kazooie save Tooty before Grunty robs her of her beauty
7. Only the girl is the main character's little sister
8. then you must face the witch herself
9. even save your captured sister while doing so
10. make it her own
11. As my brother went
12. his brothers
13. Her brother
14. plans to suck the beauty out of her
15. My son
16. and great-grandson
17. So she kidnapps Banjo's sister
18. put it into her

Cluster 0_neg:

Cluster 1_pos:
1. This is one of the best games on the N64,
2. This has to be my favourite N64 game
3. It's just the best N64 adventure game ever
4. Banjo Kazooie is one of the best games i have ever played
5. This is one of the best rated games evere
6. This is 

298. Highly recommended
299. very deserving of its praise
300. -Being able to go through the game at your own pace is very nice
301. Cons:-The underwater swimming controls are horrible
302. annoying-The flying 'spear' move is very hard to master
303. Takes a LOT of practice
304. (Review written entirely from my opinion
305. This game is absolutely amazing
306. I'm going to get this again so i can play it outside of school
307. church
308. It can be frustrating at times, but it is very addictive
309. if it was worth buying the cartage or not
310. Have fun,
311. gotta admit, it's pretty entertaining! Can't wait to see his next pick!
312. One of the best games on the N64, it combines action, adventure, puzzle
313. other elements as you go through awsome based levels to beat it
314. This is the best game!There is no equal
315. The combination of hard to beat
316. My friends said it was stupid
317. It's the best
318. My friend Brian likes it too
319. the second one
320. Being able to contro

In [25]:
# Dump the fragment indices of every bucket (long output).
print(cluster_new)

[[22, 27, 97, 112, 206, 420, 561, 609, 761, 763, 886, 966, 1024, 1218, 1250, 1252, 1327, 1329], [], [65, 74, 326, 458, 461, 471, 721, 1019, 1043, 1055, 1058, 1086, 1143, 1350, 1378, 1414, 1433, 1437], [], [0, 1, 2, 6, 7, 8, 9, 10, 14, 15, 16, 18, 20, 21, 23, 25, 31, 32, 35, 38, 41, 42, 45, 58, 61, 62, 63, 69, 70, 71, 72, 76, 78, 85, 88, 90, 91, 93, 94, 95, 96, 98, 102, 103, 104, 105, 109, 110, 113, 114, 115, 116, 118, 121, 122, 123, 124, 125, 129, 130, 132, 134, 138, 139, 142, 143, 144, 149, 150, 153, 155, 157, 165, 169, 171, 172, 176, 177, 179, 180, 182, 185, 188, 190, 192, 194, 197, 198, 202, 204, 205, 207, 209, 211, 213, 214, 215, 218, 219, 221, 223, 224, 225, 228, 229, 230, 231, 233, 234, 235, 237, 238, 239, 241, 242, 244, 245, 246, 247, 249, 250, 252, 256, 258, 259, 261, 262, 263, 264, 269, 271, 274, 275, 276, 277, 280, 284, 286, 290, 291, 293, 295, 296, 299, 301, 303, 305, 307, 309, 312, 315, 316, 317, 318, 322, 324, 325, 327, 371, 372, 373, 374, 382, 383, 387, 388, 389, 390, 392

In [26]:
# Dump the review index of every fragment (long output).
print(new_indi_sent_rev)

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1

In [27]:
# Both lists must have the same length (one review index per fragment).
print(len(new_sentences), len(new_indi_sent_rev))

1457 1457


In [28]:
# Build the review x bucket presence matrix. Entry [r][b] is the string '1'
# when at least one fragment in bucket b comes from review r, otherwise '0'.
# Rows cover review indices 0..rev_num; columns follow the order of cluster_new.
reviews_per_bucket = [
    {new_indi_sent_rev[sent_ind] for sent_ind in bucket}
    for bucket in cluster_new
]
matrix = [
    ['1' if review in bucket_reviews else '0' for bucket_reviews in reviews_per_bucket]
    for review in range(rev_num+1)
]

In [29]:
# Index of the last review read (reviews are numbered from 0).
print(rev_num)

107


In [30]:
# Number of matrix rows; one per review, i.e. rev_num + 1.
print(len(matrix))

108


In [31]:
# Every row should have one entry per bucket in cluster_new.
for row in matrix:
    print(len(row))

60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60


In [32]:
# Inspect the presence flags of the review with index 5.
print(matrix[5])

['0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']


In [33]:
# Write the presence matrix as CSV, one row per review: the review id
# (stripped of surrounding whitespace) followed by that review's '0'/'1'
# flags. No header row is written. The output name is FILE with its
# extension replaced by '.csv'. The `with` block closes the file even when a
# write fails part-way through.
out_path = 'MATRICES/matrix_'+os.path.splitext(FILE)[0]+'.csv'
with open(out_path, 'w') as f:
    for i, row in enumerate(matrix):
        f.write(ids[i].strip()+','+','.join(row)+'\n')