In [1]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import matplotlib

import numpy as np
import torch

GLOVE_PATH = '../dataset/GloVe/glove.840B.300d.txt'

In [2]:
model = torch.load('infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# On CPU, setting the right number of threads with "torch.set_num_threads(k)" may improve performance

In [3]:
model.set_glove_path(GLOVE_PATH)

In [4]:
model.build_vocab_k_words(K=100000)

Vocab size : 100000


# Load sentences

In [5]:
# Load sentences
sentences = []
indi_sent_rev = []
with open('text_0_OfficeProd13.txt') as f:
    i = 1
    rev_num = -1
    for line in f:
        if i == 1:
            rev_num += 1
            l = line.strip()
            indi_sent = l.split('.')
            indi_sent = [a.strip() for a in indi_sent]
            indi_sent = list(filter(None, indi_sent))
            for k in range(len(indi_sent)):
                indi_sent_rev.append(rev_num)
            sentences.extend(indi_sent)
            i *= -1
        else:
            i *= -1

print(len(sentences))

401


In [6]:
new_sentences = []
new_indi_sent_rev = []
for ind, sent in enumerate(sentences):
    if ' and ' in sent:
        temp = sent.split(' and ')
        temp_rev = [indi_sent_rev[ind] for i in range(len(temp))]
        new_sentences.extend(temp)
        new_indi_sent_rev.extend(temp_rev)
    elif ' or ' in sent:
        temp = sent.split(' or ')
        temp_rev = [indi_sent_rev[ind]] * len(temp)
        new_sentences.extend(temp)
        new_indi_sent_rev.extend(temp_rev)
    else:
        new_sentences.append(sent)
        new_indi_sent_rev.append(indi_sent_rev[ind])

In [7]:
for i in new_sentences:
    print(i)

Great deal on a lot of ink pens
i use these for ccdw classes
havent had a problem yet
Hands down, best product for this kind of money, I was tired of never finding pens, so I bought three boxes of these
spent a whole 10 bucks :)
I just threw these bastards arround the house on every drawer on every corner, everywhere
I'm a happy man
These pens are durable, always ready, light, beautiful, at this price it's crazy!!
Bic Pens: from plain (like these) to "fancy", they write, they write, they write -- they are inexpensive, seem to last a looong time,
create GREAT multiple copies because you press em as hard as you want
they write, they write, they write
Best little pen I ever had
Use em far more often than I use my Parkers (which I also love)
These do not work well, cheaply made
I am bummed I have a lot of them
I ended up throwing out
60 pens should last a college student such as myself a long time
I got them at $4
69, making each pen cost about 8 cents, which is a satisfying price
They wri

In [8]:
print(len(new_sentences))
print(len(new_indi_sent_rev))
print(indi_sent_rev[:5])
print(new_indi_sent_rev[:5])

562
562
[0, 0, 1, 1, 1]
[0, 0, 0, 1, 1]


# Encode sentences

In [9]:
embeddings = model.encode(new_sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 6105/6637 (91.98 %)
Speed : 70.39 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 562


In [10]:
print(embeddings.shape)

(562, 4096)


# Load ratings

In [11]:
ratings = []
f = open('rating_0_OfficeProd13.txt', 'r')
for l in f:
    ratings.append(float(l))

In [12]:
print(ratings[:10])

[5.0, 5.0, 5.0, 1.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0]


# Load review ids

In [13]:
ids = []
f = open('id_0_OfficeProd13.txt', 'r')
for l in f:
    ids.append(l)

# Append ratings to embeddings

In [14]:
print(new_indi_sent_rev[:15])

[0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3]


In [15]:
embeddings = list(embeddings)
for i in range(len(new_sentences)):
    embeddings[i] = np.append(embeddings[i], np.array([ratings[new_indi_sent_rev[i]]]))

In [16]:
embeddings = np.array(embeddings)

In [17]:
print(embeddings.shape)

(562, 4097)


# Spectral Clustering

In [18]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
sc = SpectralClustering(40, affinity='nearest_neighbors', n_init=1000)
sc.fit(embeddings)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[12 24  1 12 30 12 12 12 12 12  2 35 12 25 25 25 24 36 31  2  0 21 19  0 12
 35 12 16 23 23 28  1  0 18 21 18  6 12 19 12  0  6 12  0 12 26 13 12 12 12
 31 24 37 12 30 31 12 12  8 12  8 33  2 30 25 25 17 17 12 12 11  1  5 13 12
  5 35  8 13  5 37 23 21  8 12 12 12  8  3 27 11  7 11 36 12 34 10 10 10 10
 12 10 36 37  8 13  6 37 37 32  9  6 10 31 18 12 12 27 32 23  8 12  7 12 35
 12 12  8 29 26 12 39 39 39 33 27 29 16 12  0 12 33 12 28 12 29 38 12  3 12
 12 27 11 12 32 28 12 14 20 13 13 12  2 12  1 16  1 24 24 35 12 12 20 35 12
 13 12 24  6 35 12  0 12 28  1 12 12  3 12 12  3  0 12 38  2 26  3 13 12 33
 37 12 12 20 22 10 18 29 32  7  5  8 26 12 38 12 12 36 12 12 10 20  1 25 17
 17 17 10 12 21 12 12 12 12 23 32  3 18 12 39  9 12 18 12 37 35  7 37 12 31
 37  8 12 10 39 18 27 12 12 27 12 31 31 12 12 12  7  3 39 12 35  6 22 27 12
 12  0 32 23  0 34 10 34  8 35 12 12 12 12 12  8  0 13 35 35 24 24 12 11 12
 28  0 12 33 12 12 18 18 14 10 14 21 12 21 18 18 20 22  3 23 27 27 1

In [19]:
labels = sc.labels_.flatten()
print(labels.shape)
labels = list(labels)
for i in set(sc.labels_):
    print(str(i)+': '+str(labels.count(i)))

(562,)
0: 15
1: 10
2: 8
3: 13
4: 11
5: 10
6: 15
7: 9
8: 15
9: 10
10: 19
11: 9
12: 160
13: 13
14: 9
15: 9
16: 5
17: 12
18: 17
19: 8
20: 9
21: 13
22: 7
23: 12
24: 8
25: 10
26: 6
27: 17
28: 7
29: 7
30: 7
31: 8
32: 12
33: 7
34: 6
35: 13
36: 7
37: 17
38: 4
39: 8


In [20]:
for i in range(40):
    print('Cluster '+str(i)+':')
    cnt = 1
    for k, l in enumerate(new_sentences):
        if sc.labels_[k] == i:
            print(str(cnt)+': '+l)
            cnt += 1
    print()

Cluster 0:
1: last a fair while before drying up
2: We play a lot of group games, which means we need a lot of pens,
3: We are sure to reorder this product
4: Prices seem to vary on this product, so I'd suggest browsing through the various sizes/colors to find the current best deal
5: I have taken these overseas for storage
6: Why on earth?  Bic, you need to come back home to make these
7: I have the little twerps! And I don't even have to be mean about it like I do with the endless bathroom
8: I am one of those people who are constantly misplacing pens all over the house
9: BIC lives up to their expectation - their slogan is &#34;writes the first time, every time&#34;
10: used them during a graduation party for the guests
11: keep some at home, in kitchen, by computer, books, in my bag, a few to give
12: I used them at work
13: I keep them around the house for many different uses
14: Used them for a baby shower
15: Will definitely order again for other events as they disappear quickly

4: clean,
5: Completeley recommended
6: again
7: They're basic
8: they explode!

Cluster 20:
1: Indeed, as another reviewer has stated, these pens have been around for almost ever
2: three packs of these pens with me as invariably, sure as the sun rises, there will be at least one
3: This is a pen we have used for several years
4: 60 pens in one box for a low price! I haven't used all the pens so far, but what I've used
5: five stars for the price, i got them very cheap;  with four  kids that use pens often 
6: These pens are the best!  They last a long time, too!  I will be using them mostly for school
7: Didn't like it at first but had to adjust after using so many 'bad' pens
8: They were the best pens I have ever used
9: Lots of pens that can be used for a life time, but

Cluster 21:
1: If you're into blue ball point pens, this is where its at
2: Works fine most of the time, blobs out ink other times, but hey, GREAT PRICE,
3: This red pen is SWEET! would highly recommond to someone 

# Test for Original embeddings

In [21]:
embeddings_orig = model.encode(new_sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings_orig)))

Nb words kept : 6105/6637 (91.98 %)
Speed : 70.41 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 562


In [22]:
print(embeddings_orig.shape)

(562, 4096)


In [23]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
sc = SpectralClustering(40, affinity='nearest_neighbors', n_init=1000)
sc.fit(embeddings_orig)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[ 7  0  7  7 36  7  7  7  7  7 11  7  7  8 30 22  0 12 36 11  7 26  6  7  7
 33  7 16  5 15  2  8 14 29  7 22  3  7  6 23  7 38  7 27  7 13  7  7  6  6
  7  0  7  7 36 36 32  7 27  7 19 19 11 36 30 32 26  6  6  7  4  8 18  7  7
 14  3 19 15 18 14  5 26 25 17  7 32 27 33 33 24  1 26 12  7  8 38 37 38 28
  0  0 12 14 20 15  3 14 14  6  9 37 23 36 22  7  7 23 15  5  7  7  1  7 28
  7 22 27 10 13  6 21 21 21 25 33 10 16  7 37 17 25  7 35 15 10 34  7 32  7
  7 19 24  7  6  2 33 34 38 29 17 15 11  7  8 16  8  0  0 12  7  7 38 32  7
 29  7  0  7  7  6  7  7  1  8 24  7 30 33  7 32 37  7 34 11 13 30  7  7 25
 14  7  7 38 20 23  7 10 15  1  7 27 13 28 34  7  7 12  0 37 20 38  8  7  7
 24 22  7  7 26  7  7 21 17  5  6 30  2  7 21  9  7  7 32  7 33  1 14  7  7
 14 27  7 21 21  4 23  7  7 33  7 36 36  7  7 32  1 30 21  7 39 32 20 23  7
 28  7 15  5 37  5 19 15  7 33  7 17  6  6  7 27 37  7 39 39  0  0  7 24  6
  2 37 13 25  7 23  4 18 34  7 33 32 23  7  4  6 38 20 33  5  3 23 2

In [24]:
labels = sc.labels_.flatten()
print(labels.shape)
labels = list(labels)
for i in set(sc.labels_):
    print(str(i)+': '+str(labels.count(i)))

(562,)
0: 14
1: 10
2: 6
3: 12
4: 8
5: 11
6: 22
7: 170
8: 11
9: 8
10: 5
11: 9
12: 10
13: 7
14: 13
15: 14
16: 5
17: 10
18: 9
19: 9
20: 8
21: 10
22: 6
23: 15
24: 7
25: 9
26: 10
27: 11
28: 5
29: 3
30: 12
31: 3
32: 16
33: 24
34: 7
35: 4
36: 14
37: 12
38: 18
39: 5


In [25]:
for i in range(40):
    print('Cluster '+str(i)+':')
    cnt = 1
    for k, l in enumerate(new_sentences):
        if sc.labels_[k] == i:
            print(str(cnt)+': '+l)
            cnt += 1
    print()

Cluster 0:
1: i use these for ccdw classes
2: 60 pens should last a college student such as myself a long time
3: Bic never disappoints!! As I've stated before I am a college student
4: These should last my four grandchildren a couple years
5: The grandchildren were very happy to receive them just before school started
6: I substitute teach which means I can be in a 3rd grade class one day
7: in an advanced high school class the next
8: some students will actually try to take advantage of a substitute teacher
9: I gave these to friends in a Nursing Home to write their life stories in!  Those who are writing their stories are having a good time remimicing
10: This is great school supplies value for Middle School
11: High School students who need pens but sometimes lose them in their daily shuffle
12: great for classroom use! My students are always excited to get to use a pen
13: These pens are the best!  They last a long time, too!  I will be using them mostly for school
14: I use them 

5: Not the smoothest pen, nor the most comfortable to hold, but if you can get this at less than a dime per pen, it's a great value
6: they are less likely to steal them as they do not really look all that cool nor do they have a lot of whistles
7: (one being not having working pen handy when needed) This product has solved that problem
8: It's true! What else could cause them to disappear? I think they're forming a colony somewhere in the house, I just hope they don't make a move to take over
9: desk that can never seem to find a pen (like mine)
10: so far have not bled out in my bag like some other cheaper pens of the past
11: Can't go wrong with these!  Love these ball point pens, they work great, never have any issues,
12: But to hand out to clients that will not return, it's perfect, that was we don't lose our better pens
13: If you don't use them for several months, you'll find that several pens do not actually write at all
14: its annoying when every other pen does not work
15: 

# Try splitting clusters

In [26]:
embeddings_split = model.encode(new_sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings_split)))

Nb words kept : 6105/6637 (91.98 %)
Speed : 72.06 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 562


In [27]:
print(embeddings_split.shape)

(562, 4096)


In [28]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Spectral Clustering
sc = SpectralClustering(30, affinity='nearest_neighbors', n_init=1000)
sc.fit(embeddings_split)
print('spectral clustering')
print(sc.labels_)

spectral clustering
[ 4 27  4  8 21  0  4  4  4  4  9  4  4 20 25 13 27 11 21  9 13 12 16  4  4
  6  4  5 28  4 19 20 22 10  4 13 17  4 16  8  4  4  4  0  4 14 10  4  4  4
 21 27  4  4 21 21  4  4  7  4 26  4  9 21 25  4 12  4  4  4 16 20 15  4  4
  4  6 26  3 15 22 28 12  8 10  4  6  4  4 24 29 23 12 11  4 20  7  0  7 19
  4 27 11 22 26 10 17 22 22  3 18  0 26 21 13  4  4 24  3 28  4  4 23  4 19
  4 13  0  2 14  4  1  1  1  8 24  2  5  4 15 10  8  4 19  3  2  0  4  6  4
  4 26 29  4  3 19 24  8  7 10 10  4  9 10 20  5 20 27 27 11  4  4  7  4  4
 10  4 27  4  6  4  0  4 19 20 29  6 25 24  4  4  0  4  8  9 14 25 10  4  8
 22  4  4  7 26  8  4  2  3 23  4  0 14 19  8  4  4 11  4  4 26  7 20  4  4
 29 13  4  4 12  4  4 24 10 28  3 25  4  4  1 18  4  4  6  4  4 23 22  4  6
 22  4  4  1  1 16  8  4  4 24  4 21 21  4  4  4 23 25  1  4  6  6 26 17  6
 19  4  3 28  4 28 26  3  4  6  4 10  4  4  4  0  0  4  6  6 27 27  4 29  4
 19  0  4  8 26  8 16 15  8  4  4  6  8  4 16  4  7 26  4 28 17  4 1

In [29]:
labels = sc.labels_.flatten()
print(labels.shape)
labels = list(labels)
for i in set(sc.labels_):
    print(str(i)+': '+str(labels.count(i)))

(562,)
0: 16
1: 8
2: 5
3: 15
4: 197
5: 6
6: 19
7: 17
8: 23
9: 12
10: 22
11: 10
12: 9
13: 7
14: 6
15: 12
16: 12
17: 13
18: 8
19: 14
20: 10
21: 15
22: 13
23: 9
24: 22
25: 12
26: 20
27: 12
28: 11
29: 7


In [30]:
cluster_new = []
for i in range(30):
    temp_sentences = []
    pos_sent = []
    neg_sent = []
    #print('Cluster '+str(i)+':')
    for k, l in enumerate(new_sentences):
        if sc.labels_[k] == i:
            temp_sentences.append(k)
    for sent in temp_sentences:
        if ratings[new_indi_sent_rev[sent]] >= 3.0:
            pos_sent.append(sent)
        else:
            neg_sent.append(sent)
    cluster_new.append(pos_sent)
    cluster_new.append(neg_sent)

In [31]:
print(len(cluster_new))

60


In [32]:
print(cluster_new[:30])

[[5, 43, 97, 111, 127, 146, 181, 191, 211, 290, 291, 301, 436, 449, 488, 498], [], [131, 132, 133, 239, 253, 254, 268, 522], [], [128, 136, 145, 207, 502], [], [78, 109, 118, 144, 154, 208, 235, 277, 282, 333, 354, 393, 409, 497, 520], [], [0, 2, 6, 7, 8, 9, 11, 12, 23, 24, 26, 29, 34, 37, 40, 41, 42, 44, 47, 48, 49, 52, 53, 56, 57, 59, 61, 68, 69, 73, 74, 75, 85, 87, 88, 94, 100, 115, 116, 120, 121, 123, 125, 130, 138, 142, 147, 149, 150, 153, 161, 170, 171, 173, 174, 176, 178, 180, 182, 189, 190, 192, 198, 201, 202, 206, 210, 215, 216, 218, 219, 227, 228, 230, 231, 237, 238, 241, 242, 244, 245, 248, 251, 252, 257, 258, 260, 263, 264, 265, 269, 276, 279, 283, 285, 287, 288, 289, 292, 297, 299, 302, 309, 310, 313, 315, 318, 321, 324, 325, 327, 328, 329, 338, 340, 343, 344, 349, 350, 351, 362, 367, 368, 372, 377, 378, 385, 387, 399, 400, 417, 419, 422, 423, 430, 433, 438, 440, 441, 442, 443, 446, 447, 450, 451, 452, 454, 456, 458, 460, 461, 463, 467, 468, 469, 474, 475, 479, 484, 487, 4

In [33]:
for i in range(30):
    print('Cluster '+str(i)+'_pos:')
    cnt = 1
    for j in cluster_new[2*i]:
        print(str(cnt)+'. '+new_sentences[j])
        cnt += 1
    print()
    cnt = 1
    print('Cluster '+str(i)+'_neg:')
    for j in cluster_new[2*i+1]:
        print(str(cnt)+'. '+new_sentences[j])
        cnt += 1
    print()

Cluster 0_pos:
1. I just threw these bastards arround the house on every drawer on every corner, everywhere
2. I have taken these overseas for storage
3. I personally don't mind  using them, I live with a few people in my house
4. I have several friends who are servers at my favorite restaurant
5. Perhaps, they just can't take the owner!Anyway, I got these in bulk
6. I have to buy a bunch of these for the office since they keep disappearing
7. I have the little twerps! And I don't even have to be mean about it like I do with the endless bathroom
8. I am one of those people who are constantly misplacing pens all over the house
9. I bought a couple of these for my smaller ambulance corp
10. I buy a big box of pens
11. keep some at home, in kitchen, by computer, books, in my bag, a few to give
12. I used them at work
13. I bought a pack of blue pens at an office supply place
14. I keep them around the house for many different uses
15. I bought a box of these to keep in our kitchen drawer


1. I bought this pens to be used as &#34;expendables&#34; actually,
2. I'm sure every one has used normal bic pens before, they are exactly the same as the one you normally used
3. my pens tend to get lost  quite often
4. Indeed, as another reviewer has stated, these pens have been around for almost ever
5. three packs of these pens with me as invariably, sure as the sun rises, there will be at least one
6. This is a pen we have used for several years
7. 60 pens in one box for a low price! I haven't used all the pens so far, but what I've used
8. five stars for the price, i got them very cheap;  with four  kids that use pens often 
9. The best pen I have ever bought, still use today, my coworkers steal these all the time
10. i always use bic pens because i know they will work
11. These BICs are my favorite pens to use, even if they aren't too fancy
12. I needed pens for work as mine are always being stolen
13. Didn't like it at first but had to adjust after using so many 'bad' pens
14.

In [34]:
print(cluster_new)

[[5, 43, 97, 111, 127, 146, 181, 191, 211, 290, 291, 301, 436, 449, 488, 498], [], [131, 132, 133, 239, 253, 254, 268, 522], [], [128, 136, 145, 207, 502], [], [78, 109, 118, 144, 154, 208, 235, 277, 282, 333, 354, 393, 409, 497, 520], [], [0, 2, 6, 7, 8, 9, 11, 12, 23, 24, 26, 29, 34, 37, 40, 41, 42, 44, 47, 48, 49, 52, 53, 56, 57, 59, 61, 68, 69, 73, 74, 75, 85, 87, 88, 94, 100, 115, 116, 120, 121, 123, 125, 130, 138, 142, 147, 149, 150, 153, 161, 170, 171, 173, 174, 176, 178, 180, 182, 189, 190, 192, 198, 201, 202, 206, 210, 215, 216, 218, 219, 227, 228, 230, 231, 237, 238, 241, 242, 244, 245, 248, 251, 252, 257, 258, 260, 263, 264, 265, 269, 276, 279, 283, 285, 287, 288, 289, 292, 297, 299, 302, 309, 310, 313, 315, 318, 321, 324, 325, 327, 328, 329, 338, 340, 343, 344, 349, 350, 351, 362, 367, 368, 372, 377, 378, 385, 387, 399, 400, 417, 419, 422, 423, 430, 433, 438, 440, 441, 442, 443, 446, 447, 450, 451, 452, 454, 456, 458, 460, 461, 463, 467, 468, 469, 474, 475, 479, 484, 487, 4

In [35]:
print(new_indi_sent_rev)

[0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 29, 29, 29, 29, 29, 29, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, 43, 44, 44, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 

In [36]:
print(len(new_sentences), len(new_indi_sent_rev))

562 562


In [37]:
matrix = []
for i in range(rev_num+1):
    matrix_row = []
    for clusters in cluster_new:
        flag = 0
        for sent_ind in clusters:
            if new_indi_sent_rev[sent_ind] == i:
                matrix_row.append('1')
                flag = 1
                break
        if flag == 0:
            matrix_row.append('0')
    matrix.append(matrix_row)

In [38]:
print(rev_num)

118


In [39]:
print(len(matrix))

119


In [40]:
for i in matrix:
    print(len(i))

60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60


In [41]:
print(matrix[5])

['0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0']


In [42]:
f = open('conjunction_OfficeProd13_matrix.csv', 'w')
for i in range(len(matrix)):
    f.write(ids[i].strip()+',')
    f.write(','.join(matrix[i]))
    f.write('\n')
f.close()