### Spark Initialization

In [1]:
import findspark
findspark.init()

In [2]:
import sys
import copy
import csv
import numpy as np

from string import atoi
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
conf = SparkConf().setAppName("ContentBased")
conf = conf.setMaster("local[*]")

In [5]:
sc  = SparkContext(conf=conf)

### Load Train and Test Data

In [12]:
trainData = sc.textFile("/Users/lakshya/Desktop/INF-553/Project/las_vegas_review_with_text_50_lemma_train.txt",use_unicode=False)
testData = sc.textFile("/Users/lakshya/Desktop/INF-553/Project/las_vegas_review_with_text_50_lemma_test.txt",use_unicode=False)

In [13]:
train_rdd = trainData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), float(x[2])))
test_rdd = testData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), float(x[2])))

Calculate average rating of users and products

In [14]:
avg_rating = train_rdd.map(lambda x: (x[0][0], x[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).map(lambda x: (x[0], sum(x[1])/len(x[1])))
prod_rating = train_rdd.map(lambda x: (x[0][1], x[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).map(lambda x: (x[0], sum(x[1])/len(x[1])))

In [15]:
train_temp = trainData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), x[2]))

### Load product category data

In [9]:
data = sc.textFile("/Users/lakshya/Desktop/INF-553/Project/pitts_bus_20_attributes_categories.txt",use_unicode=False)

In [10]:
data.take(6)

['"f2FfutZhb4F-m1Ob0EdYaw","asian fusion caterers fast food chinese restaurants vegetarian food delivery services event planning  services food  alcohol_full_bar ambience_casual bikeparking businessacceptscreditcards businessparking_street caters goodforkids goodformeal_dinner hastv noiselevel_average restaurantsattire_casual restaurantsdelivery restaurantsgoodforgroups restaurantspricerange2_2 restaurantstableservice restaurantstakeout wheelchairaccessible "',
 '"qfcdMhm1Ff28JHVpHca20g","pizza restaurants  businessacceptscreditcards restaurantsattire_casual restaurantspricerange2_2 restaurantstakeout "',
 '"wHUru-79ExNRanJtwZEYeA",""',
 '"rreIYrzI9U052p7nN_qogA","sushi bars food thai poke japanese restaurants   ambience_casual bikeparking businessacceptscreditcards caters goodforkids goodformeal_lunch goodformeal_dinner noiselevel_average restaurantsattire_casual restaurantspricerange2_2 restaurantstakeout "',
 '"eva56motCJcevOwKzyQO1g","cafes hotels  travel swimming pools travel serv

In [11]:
train_data = data.mapPartitions(lambda x: csv.reader(x)).map(lambda x: (x[0], x[1]))

In [12]:
train_data.take(3)

[('f2FfutZhb4F-m1Ob0EdYaw',
  'asian fusion caterers fast food chinese restaurants vegetarian food delivery services event planning  services food  alcohol_full_bar ambience_casual bikeparking businessacceptscreditcards businessparking_street caters goodforkids goodformeal_dinner hastv noiselevel_average restaurantsattire_casual restaurantsdelivery restaurantsgoodforgroups restaurantspricerange2_2 restaurantstableservice restaurantstakeout wheelchairaccessible '),
 ('qfcdMhm1Ff28JHVpHca20g',
  'pizza restaurants  businessacceptscreditcards restaurantsattire_casual restaurantspricerange2_2 restaurantstakeout '),
 ('wHUru-79ExNRanJtwZEYeA', '')]

### Collect user data from train data (User, Product, Rating)

In [13]:
userReview = train_data.map(lambda x: (x[0][0], x[0][1], x[1][0][0]))

In [14]:
userReviewCollected = train_temp.map(lambda x: (x[0][0], x[0][1], x[1]))

In [15]:
userReviewCollected.take(5)

[('2ZZNOoMQ7WFvFoxDUFw1wg', 'o__g2Q64FnNc_Q4O70EhbQ', '5'),
 ('3DltS5Wr9MOntnc8StBFBA', 'kkD0tv_e5E6a8kRpLYEcaA', '3'),
 ('7qGe49__QqCpk8ZYOm7W8w', 'gfIVwp0RkM5yMa1u3mbB4Q', '5'),
 ('BTKtAXjPHjH5B3xuT4uiIw', '1LUaZFVMEjodl1tbAGF3sQ', '5'),
 ('7VTgkFaJ49ftsK1PlRDGrQ', 'Ul6JwluSTm12PVDIqnNaTg', '4')]

### Collect product data (Product, Category Text)

In [16]:
prodReview = train_data.map(lambda x: (x[0], x[1])).groupByKey().mapValues(list)

In [17]:
prodReviewCollected = prodReview.map(lambda x: (x[0], x[1][0].split(" ")))

In [18]:
prodReviewCollected.take(5)

[('MvlQo4bev1eqp1q0HYOLHg',
  ['professional',
   'services',
   'performing',
   'arts',
   'arts',
   '',
   'entertainment',
   'fitness',
   '',
   'instruction',
   'dance',
   'studios',
   'active',
   'life',
   'education',
   '',
   'bikeparking',
   'businessparking_street',
   'byappointmentonly',
   'goodforkids',
   'wheelchairaccessible',
   '']),
 ('jemRU9DvmS0WGZuZ-k3jCQ',
  ['desserts',
   'food',
   'shaved',
   'ice',
   'ice',
   'cream',
   '',
   'frozen',
   'yogurt',
   '',
   'bikeparking',
   'businessacceptscreditcards',
   'businessparking_lot',
   'caters',
   'restaurantspricerange2_1',
   'restaurantstakeout',
   '']),
 ('PuMpFTKS6gY_e31UP5YVnw',
  ['restaurants',
   'american_new',
   '',
   'bikeparking',
   'businessacceptscreditcards',
   'goodforkids',
   'hastv',
   'restaurantsattire_casual',
   'restaurantsgoodforgroups',
   'restaurantspricerange2_1',
   'restaurantstakeout',
   '']),
 ('5S8KaaAIjqo2bJcVlMNW5w',
  ['chinese',
   'restaurants',
 

### Doc2Vec vector creation

In [20]:
from collections import namedtuple
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

  % self._get_c_name())


Convert user and product rdd to pandas dataframe

In [21]:
spark = SparkSession(sc)

userPandas = userReviewCollected.toDF().toPandas()
prodPandas = prodReviewCollected.toDF().toPandas()

Create Doc2Vec on product category text

In [22]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(prodPandas['_2'])]

In [23]:
documents

[TaggedDocument(words=[u'professional', u'services', u'performing', u'arts', u'arts', u'', u'entertainment', u'fitness', u'', u'instruction', u'dance', u'studios', u'active', u'life', u'education', u'', u'bikeparking', u'businessparking_street', u'byappointmentonly', u'goodforkids', u'wheelchairaccessible', u''], tags=[0]),
 TaggedDocument(words=[u'desserts', u'food', u'shaved', u'ice', u'ice', u'cream', u'', u'frozen', u'yogurt', u'', u'bikeparking', u'businessacceptscreditcards', u'businessparking_lot', u'caters', u'restaurantspricerange2_1', u'restaurantstakeout', u''], tags=[1]),
 TaggedDocument(words=[u'restaurants', u'american_new', u'', u'bikeparking', u'businessacceptscreditcards', u'goodforkids', u'hastv', u'restaurantsattire_casual', u'restaurantsgoodforgroups', u'restaurantspricerange2_1', u'restaurantstakeout', u''], tags=[2]),
 TaggedDocument(words=[u'chinese', u'restaurants', u'', u'businessacceptscreditcards', u'restaurantsattire_casual', u'restaurantsdelivery', u'restau

In [24]:
model = Doc2Vec(documents, size = 2000, min_count = 1, workers=4)



In [25]:
feature_vectors = []
for i in range(0, len(model.docvecs)):
    feature_vectors.append(model.docvecs[i])

In [26]:
feature_vectors[0]

array([  4.88698170e-05,   4.56643105e-03,  -8.95946752e-03, ...,
         3.77150462e-03,   2.69041746e-03,   2.43058079e-03], dtype=float32)

Add Doc2Vec vectors to product dataframe

In [27]:
prodPandas['Vector'] = feature_vectors

In [28]:
prodPandas

Unnamed: 0,_1,_2,Vector
0,MvlQo4bev1eqp1q0HYOLHg,"[professional, services, performing, arts, art...","[4.88698e-05, 0.00456643, -0.00895947, 0.00443..."
1,jemRU9DvmS0WGZuZ-k3jCQ,"[desserts, food, shaved, ice, ice, cream, , fr...","[0.000120094, 0.000102407, -0.000914564, 0.000..."
2,PuMpFTKS6gY_e31UP5YVnw,"[restaurants, american_new, , bikeparking, bus...","[0.000102436, -9.47991e-05, -0.000457199, 0.00..."
3,5S8KaaAIjqo2bJcVlMNW5w,"[chinese, restaurants, , businessacceptscredit...","[0.000440745, 0.00282641, -0.00590869, 0.00263..."
4,xZfPdAYeimiruXuGR4nSUA,"[specialty, food, health, markets, vegetarian,...","[0.000231943, 0.00124948, -0.00241711, 0.00141..."
5,8A4ck_lvMLYZDrDt7w8GVA,[],"[7.38954e-05, -0.000146097, 0.0002251, 4.58524..."
6,P2uPCEBP2VZsApnvEHCgag,"[indian, restaurants, , , ambience_casual, bik...","[0.000234017, -0.00104063, 0.00171847, -0.0003..."
7,DwmAsemahJywywhuqv5J1Q,"[szechuan, restaurants, chinese, , , ambience_...","[0.000558957, 0.00167981, -0.00421754, 0.00225..."
8,71ONxJtmDNDiJAtYHK-0RA,"[american_traditional, restaurants, diners, br...","[0.00022496, 0.00418685, -0.00889159, 0.004455..."
9,_7uEyBADd8S14rp2Kzez5g,"[bars, nightlife, dive, bars, beer, bar, , alc...","[-1.90256e-05, 0.000235834, -0.00054637, -1.51..."


Product category text not needed

In [29]:
del prodPandas['_2']

### Weighted Linear Combination of product vectors for users

In [30]:
userPandas['Vector'] = [[] for _ in range(len(userPandas))]

In [31]:
userPandas['_3'] = userPandas['_3'].astype(float)

In [32]:
for index, row in userPandas.iterrows():
    vector = np.array(prodPandas.loc[prodPandas['_1'] == row['_2'], 'Vector'].values[0])
    rating = row['_3']
    userPandas.at[index,'Vector'] = rating*vector

In [33]:
userPandas

Unnamed: 0,_1,_2,_3,Vector
0,2ZZNOoMQ7WFvFoxDUFw1wg,o__g2Q64FnNc_Q4O70EhbQ,5.0,"[0.00128346, 0.00441323, -0.00742467, 0.002237..."
1,3DltS5Wr9MOntnc8StBFBA,kkD0tv_e5E6a8kRpLYEcaA,3.0,"[0.00144684, 0.012156, -0.0245771, 0.0120083, ..."
2,7qGe49__QqCpk8ZYOm7W8w,gfIVwp0RkM5yMa1u3mbB4Q,5.0,"[0.00292295, 0.0178243, -0.0360004, 0.0197503,..."
3,BTKtAXjPHjH5B3xuT4uiIw,1LUaZFVMEjodl1tbAGF3sQ,5.0,"[-0.000455438, -0.0105197, 0.0185627, -0.00910..."
4,7VTgkFaJ49ftsK1PlRDGrQ,Ul6JwluSTm12PVDIqnNaTg,4.0,"[0.00225828, 0.0338968, -0.0701648, 0.0331309,..."
5,67ci-zI-Z6ABxAxTJRsd_Q,8L8QlR4XJeAblTKCqoDP0A,3.0,"[0.000480425, 0.00600562, -0.012642, 0.0057782..."
6,6Ki3bAL0wx9ymbdJqbSWMA,WO3P5BpPHYB-zbDgJZOuCw,3.0,"[-0.000283978, 0.00914403, -0.0169175, 0.00624..."
7,2i0A5OoKyiwskqvnkGq7cA,3dyO9LrC9x-qJ2ZpbL9FZA,4.0,"[-0.000767936, -0.00599715, 0.0113835, -0.0064..."
8,YhkTKKMo9UQmdcxx_umxPA,T8-13dD8mV2qUI53wsUL6g,3.0,"[0.00033604, 0.00189871, -0.0020801, 0.0018249..."
9,HdEM0xeldw4QAweECyOwcQ,AKQbrvRBZvU5kB9Ut4gVkg,4.0,"[-0.00060014, -0.00767391, 0.0170831, -0.00967..."


Product and rating column not needed

In [34]:
del userPandas['_2']
del userPandas['_3']

Linear combination of feature vectors

In [35]:
userPandas = userPandas.groupby(['_1']).sum()

Normalize the user feature vectors

In [36]:
from sklearn.preprocessing import Normalizer

In [37]:
for index, row in userPandas.iterrows():
    vector = np.array(row['Vector']).reshape(1, -1)
    transformer = Normalizer().fit(vector)
    userPandas.at[index,'Vector'] = transformer.transform(vector)

In [38]:
userPandas

Unnamed: 0_level_0,Vector
_1,Unnamed: 1_level_1
-0-hVEpwWEcJLJoGq3rE3g,"[[0.00148065, 0.0324586, -0.0668861, 0.0312036..."
-2OB54nQ6FsGLUM-R1KXnA,"[[0.00320728, 0.0325715, -0.0677208, 0.0324862..."
-ARdx8hOcEWlMDjzwLYZ_g,"[[0.00192666, 0.0325239, -0.0675018, 0.0314879..."
-Pk25bOBsvemFaWKDBVBzA,"[[0.00306555, 0.0309148, -0.0663251, 0.0338803..."
-Q2wBtscwW6JOqlBndji4A,"[[0.00318649, 0.0317414, -0.0662989, 0.0325583..."
-Q4bjWlbxmb1yKP4U7OODg,"[[0.00213934, 0.0322081, -0.0655944, 0.0322375..."
-SDx-d5jppC4OBBosLVpYw,"[[0.00391337, 0.0313437, -0.067462, 0.0337848,..."
-XgVXGJnOnW0kQEol6O3Pg,"[[0.00236541, 0.0322361, -0.0677664, 0.0315283..."
-Y6tXYPYqeVy37-L5p0rMw,"[[0.00226388, 0.0323635, -0.0676386, 0.0315482..."
-a873HRQxWRRobMNT4xOKg,"[[0.00321547, 0.0321675, -0.0688119, 0.0343713..."


### Create user numpy matrix from feature vectors

In [39]:
user_matrix = np.zeros((len(userPandas), len(feature_vectors[0])))
idx = 0
for index, row in userPandas.iterrows():
    vector = np.array(row['Vector'])[0]
    user_matrix[idx] = vector
    idx += 1
    

In [40]:
user_matrix

array([[ 0.00148065,  0.03245856, -0.06688613, ...,  0.03354246,
         0.01020854,  0.01578787],
       [ 0.00320728,  0.03257146, -0.06772078, ...,  0.03674885,
         0.00774733,  0.01670618],
       [ 0.00192666,  0.03252386, -0.06750184, ...,  0.03476782,
         0.00971361,  0.016352  ],
       ..., 
       [ 0.00299245,  0.03215313, -0.06672696, ...,  0.03495213,
         0.00914505,  0.01648291],
       [ 0.00297377,  0.03121844, -0.06433573, ...,  0.03310052,
         0.01194891,  0.01614382],
       [ 0.00197157,  0.03206934, -0.06816911, ...,  0.03474172,
         0.01033008,  0.01558064]])

### Create product numpy matrix from feature vectors

In [41]:
prod_matrix = np.zeros((len(prodPandas), len(feature_vectors[0])))
idx = 0
for index, row in prodPandas.iterrows():
    vector = np.array(row['Vector'])
    prod_matrix[idx] = vector
    idx += 1

In [42]:
prod_matrix

array([[  4.88698170e-05,   4.56643105e-03,  -8.95946752e-03, ...,
          3.77150462e-03,   2.69041746e-03,   2.43058079e-03],
       [  1.20094468e-04,   1.02407139e-04,  -9.14564065e-04, ...,
          1.29539054e-04,   9.94612579e-04,   1.75034671e-04],
       [  1.02435581e-04,  -9.47990920e-05,  -4.57198767e-04, ...,
          5.67398733e-04,  -7.08605803e-05,   1.42638135e-04],
       ..., 
       [  9.30358365e-05,   1.16564543e-03,  -2.47844425e-03, ...,
          1.14233803e-03,   4.04149061e-04,   6.05971087e-04],
       [  2.77659448e-04,   2.02748086e-03,  -3.74884508e-03, ...,
          2.04749987e-03,   4.27909981e-04,   1.08973309e-03],
       [ -5.32553713e-06,   6.22295018e-04,  -1.44430564e-03, ...,
          2.33934639e-04,   6.85379666e-04,   5.06914163e-04]])

### Compute cosine similarity by taking dot product

In [43]:
similarity_matrix = np.dot(user_matrix, prod_matrix.T)

In [44]:
similarity_matrix.shape

(987, 3192)

### Flatten similarity matrix to related with user and products

In [45]:
prod = prodPandas['_1'].values
user = userPandas.index.values

In [46]:
zf = similarity_matrix.flatten()
xr = np.repeat(user, prod.size)
yt = np.tile(prod, user.size)
d = np.stack((xr, yt, zf), axis=-1)

In [47]:
d.shape

(3150504, 3)

### Convert similarity matrix to RDD

In [56]:
similarity_rdd = sc.parallelize(d)

In [57]:
similarity_rdd.take(5)

[array([u'-0-hVEpwWEcJLJoGq3rE3g', u'MvlQo4bev1eqp1q0HYOLHg',
        0.1528778716527085], dtype=object),
 array([u'-0-hVEpwWEcJLJoGq3rE3g', u'jemRU9DvmS0WGZuZ-k3jCQ',
        0.01739900908886617], dtype=object),
 array([u'-0-hVEpwWEcJLJoGq3rE3g', u'PuMpFTKS6gY_e31UP5YVnw',
        0.0029408639507235374], dtype=object),
 array([u'-0-hVEpwWEcJLJoGq3rE3g', u'5S8KaaAIjqo2bJcVlMNW5w',
        0.09041876155946578], dtype=object),
 array([u'-0-hVEpwWEcJLJoGq3rE3g', u'xZfPdAYeimiruXuGR4nSUA',
        0.03727177868981802], dtype=object)]

### Group Users 

In [58]:
userGrouped = similarity_rdd.map(lambda x: (x[0], (x[1], float(x[2])))).groupByKey().mapValues(lambda x: list(x))

In [59]:
userGrouped.take(5)

[(u'AKqMnr1bEFIZDG_zQzX8sA',
  [(u'MvlQo4bev1eqp1q0HYOLHg', 0.1516278485824682),
   (u'jemRU9DvmS0WGZuZ-k3jCQ', 0.016685529844555094),
   (u'PuMpFTKS6gY_e31UP5YVnw', 0.0032236407810345754),
   (u'5S8KaaAIjqo2bJcVlMNW5w', 0.09023455631609228),
   (u'xZfPdAYeimiruXuGR4nSUA', 0.03726517347830521),
   (u'8A4ck_lvMLYZDrDt7w8GVA', -4.024295451033922e-06),
   (u'P2uPCEBP2VZsApnvEHCgag', -0.028187789448053144),
   (u'DwmAsemahJywywhuqv5J1Q', 0.057660761128509215),
   (u'71ONxJtmDNDiJAtYHK-0RA', 0.13259926855943885),
   (u'_7uEyBADd8S14rp2Kzez5g', 0.003965955234553252),
   (u'BsLyCu_lkEdkG42qmvZlnQ', 0.0348799024083129),
   (u'h5XqYtEmRuQ6yI0IyG6oNQ', 0.07303770956382614),
   (u'7rqpu7hwyFTEBZOZoNv7iA', 0.14546872656702217),
   (u'LkPQmCF3k6iDqBLY7bsE5Q', -0.015651806614492964),
   (u'rKh_Nl5edIB9AevqnDmO6g', 0.10497266561165483),
   (u'lLALzZxsJfd45g_u9Huerg', 0.03247569234543673),
   (u'CY5S01UNJIH5bXHE9Ex_9Q', 0.06985862247842031),
   (u'Een2yret0IwJUpHLwcxtRg', 0.14353518728271045),
   (u't

### Sort businesses for each user on the basis of similarity value

In [71]:
similaritySorted = userGrouped.map(lambda x: (x[0], sorted(x[1], key=lambda y: y[1], reverse=True)))

In [None]:
similaritySorted.take(2)

### Take top N recommendations

In [75]:
numKeep = 300

In [86]:
topProds = similaritySorted.map(lambda x: (x[0], x[1][:numKeep])).mapValues(lambda x: [idx[0] for idx in x])

In [87]:
topProds.take(2)

[(u'AKqMnr1bEFIZDG_zQzX8sA',
  [u'eva56motCJcevOwKzyQO1g',
   u'BzuNAM9vuJXQx2sTbKqaJw',
   u'0cILAS0WDttKm8kf-o_CKQ',
   u'5XFe2938xyKtSR6Y--_7tA',
   u'UzbBrP-NJZ4bM1jWpGaNoQ',
   u'2dNjePYVp2AVei4ZoSDaaA',
   u'u0Oi2frKE1P6lW4J1y59mw',
   u'fFvCQoOEb4IW-FqpTO2Z-A',
   u'ODTI9W7fHMKNWZ3g8VcFUg',
   u'qpfbMbWB_ROqe35MIO_amw',
   u'-BuZrOMqgO4OU_UKYSddAg',
   u'VAi90YmGh93IjkdE9v87DQ',
   u'w-3cua8wSEjFImF3pUR2xg',
   u'ihsaBetrKcBSAO4pqqL4vA',
   u'_vzNvnj8ob0hAk2QXEv3Mw',
   u'0wca-OLqtMV8Ebp-W2du0g',
   u'_KgLof61QcylkZkSFle23g',
   u'GUcCURVCIue9QhkEVDOMZg',
   u'ph0xYDrzfnRH3KrP-5_TYQ',
   u'efSbCWuU0FJbLmPC5CDfdg',
   u'm59upQj5qUn-termmtxTYQ',
   u'CdnraUBoA5EqIoXwUXf4zw',
   u'yZ_wpRr4Svw8YRZ9voCgIw',
   u'GDXftPS1_fTJUtZkaEUDhA',
   u'cdwJYqlp_NjS1w5Tz4J5dw',
   u'74gDxr6rs_8tUTcS1vkr0Q',
   u'zuz4w_uDWT5xC0EXOquhHQ',
   u'A7PtzjX_4h10E7EMMNXn2A',
   u'1XWpUatzGk0Cgn6C6VDaWg',
   u'l7e71T_WozJ6r8VrY7M94w',
   u'vgGijxITEbgF44fkG-lGJw',
   u'pQpeWdPNW26-Q43ZqOhjqg',
   u'kuJSCQ

In [88]:
topProds = topProds.collect()

In [90]:
topProds[0]

(u'AKqMnr1bEFIZDG_zQzX8sA',
 [u'eva56motCJcevOwKzyQO1g',
  u'BzuNAM9vuJXQx2sTbKqaJw',
  u'0cILAS0WDttKm8kf-o_CKQ',
  u'5XFe2938xyKtSR6Y--_7tA',
  u'UzbBrP-NJZ4bM1jWpGaNoQ',
  u'2dNjePYVp2AVei4ZoSDaaA',
  u'u0Oi2frKE1P6lW4J1y59mw',
  u'fFvCQoOEb4IW-FqpTO2Z-A',
  u'ODTI9W7fHMKNWZ3g8VcFUg',
  u'qpfbMbWB_ROqe35MIO_amw',
  u'-BuZrOMqgO4OU_UKYSddAg',
  u'VAi90YmGh93IjkdE9v87DQ',
  u'w-3cua8wSEjFImF3pUR2xg',
  u'ihsaBetrKcBSAO4pqqL4vA',
  u'_vzNvnj8ob0hAk2QXEv3Mw',
  u'0wca-OLqtMV8Ebp-W2du0g',
  u'_KgLof61QcylkZkSFle23g',
  u'GUcCURVCIue9QhkEVDOMZg',
  u'ph0xYDrzfnRH3KrP-5_TYQ',
  u'efSbCWuU0FJbLmPC5CDfdg',
  u'm59upQj5qUn-termmtxTYQ',
  u'CdnraUBoA5EqIoXwUXf4zw',
  u'yZ_wpRr4Svw8YRZ9voCgIw',
  u'GDXftPS1_fTJUtZkaEUDhA',
  u'cdwJYqlp_NjS1w5Tz4J5dw',
  u'74gDxr6rs_8tUTcS1vkr0Q',
  u'zuz4w_uDWT5xC0EXOquhHQ',
  u'A7PtzjX_4h10E7EMMNXn2A',
  u'1XWpUatzGk0Cgn6C6VDaWg',
  u'l7e71T_WozJ6r8VrY7M94w',
  u'vgGijxITEbgF44fkG-lGJw',
  u'pQpeWdPNW26-Q43ZqOhjqg',
  u'kuJSCQ-bAySEYI3nfu3C7g',
  u'3ncW0rqy9EQ

### Save similarity values

In [91]:
with open('/Users/lakshya/Desktop/INF-553/Project/CategoryBasedSorted.txt', 'w') as f:
    for item in topProds:
        f.write(str(item[0])+",")
        prodList = ""
        for prod in item[1]:
            prodList += str(prod)+","
        prodList = prodList[:-1]
        f.write(prodList+"\n")