### Spark Initialization

In [1]:
import findspark
findspark.init()

In [2]:
import sys
import copy
import csv

from string import atoi
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
import numpy as np

In [4]:
conf = SparkConf().setAppName("ContentBased")
conf = conf.setMaster("local[*]")

In [5]:
sc  = SparkContext(conf=conf)

### Train, Test and Output files

In [6]:
train_file = "/Users/lakshya/Desktop/INF-553/Project/pittsburgh_review_with_text_20_res_lemma_data_train.txt"
test_file = "/Users/lakshya/Desktop/INF-553/Project/pittsburgh_review_with_text_20_res_lemma_data_test.txt"
train_output = '/Users/lakshya/Desktop/INF-553/Project/Pittsburgh_TextBased_TFIDF_train_predictions.txt'
test_output = '/Users/lakshya/Desktop/INF-553/Project/Pittsburgh_TextBased_TFIDF_test_predictions.txt'

### Load Train and Test Data

In [7]:
trainData = sc.textFile(train_file,use_unicode=False)
testData = sc.textFile(test_file,use_unicode=False)

In [8]:
train_rdd = trainData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), float(x[2])))
test_rdd = testData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), float(x[2])))

In [9]:
avg_rating = train_rdd.map(lambda x: (x[0][0], x[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).map(lambda x: (x[0], sum(x[1])/len(x[1])))
prod_rating = train_rdd.map(lambda x: (x[0][1], x[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).map(lambda x: (x[0], sum(x[1])/len(x[1])))

In [10]:
avg_rating = avg_rating.map(lambda x: ((x[0]), x[1]))

In [11]:
avg_rating.take(5)

[('AKqMnr1bEFIZDG_zQzX8sA', 3.45),
 ('S1cjSFKcS5NVc3o1MkfpwA', 3.574074074074074),
 ('bSvNU2vABlaBi1ooF4KNJg', 3.3684210526315788),
 ('8ye6-7YYMsp0YizCLyrxcA', 4.184210526315789),
 ('oIN1CjzbCCol4kPueU9WzQ', 3.5555555555555554)]

In [12]:
train_temp = trainData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), 1))

### Load review data

In [13]:
data = sc.textFile(train_file,use_unicode=False)

In [14]:
data.take(6)

['1VVHf1BvtGC0aSCCIjQyiA,K5jY2W5Q3eNnwssV5UZtow,4,2016-11-16,2,2,2,past sunday one several time ive spirit its always eccentric fun time first music performance second hang recent visit sundays bingo bango spirit know pizza good drink also awesome last time get chard margarita time get tomatillo margarita hot ciders its always pleasant surprise see whats menu food drink drink little pricey drawback one coolest things spirit atmosphere its always super strange positive way really never know expect bingo bango definitely family appropriate its fun activity friends maybe even date doesnt mind something ordinary',
 'QYKexxaOJQlseGWmc6soRg,rzByiKaj-bLeLz-zKNBQdg,2,2015-04-13,0,0,0,old cramp build lot enough employees staff keep demand cause long wait time',
 '-ARdx8hOcEWlMDjzwLYZ_g,3cbsPfoUUrysf-M8FI_0IA,4,2014-03-24,6,4,3,live long world without donut menu dont know group nine din three varieties donuts include lemon lavender chocolate espresso zeppolli amaze pepper donut concoction ever m

In [15]:
train_data = data.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), x[7])).join(train_temp)

In [16]:
train_data = train_data.map(lambda x: (x[0][0], x[0][1], x[1][0]))

### Collect user data from train data (User, Review)

In [17]:
userReview = train_data.map(lambda x: (x[0], x[2])).groupByKey().mapValues(list)

In [18]:
userReviewCollected = userReview.map(lambda x: (x[0], " ".join(x[1])))

In [19]:
userReviewCollected.take(5)

[('AKqMnr1bEFIZDG_zQzX8sA',
  'place breathe fresh air find hot dog fantastic buns toppings fresh homemade fresh cut french fry hot crispy staff pleasant friendly chicago dog plain dog relish ketchup right point fry yummmmmm little small inside serve purpose mood fresh fry great hot dog get sure definitely go back typical hibachi fair best worst middle road steak do perfectly veggies really mushy overcook rice little bland also first time go restaurant saw television show travel channel think go wasnt disappoint local restaurant pittsburgh want great steak great atmosphere favor definitely go place im fan chain restaurants always look locally own restaraunt locally own thumb definitely worth trip bread sandwich fantastic price reasonable its quaint little spot grab bite like b\xc3\xa1nh m\xc3\xac favor stop sure wish place still open find close try go must unreasonable rule university place pretty good food eat many time mostly close im go bite overprice make lose star always clean sta

### Collect product data from train data (Product, Review)

In [20]:
prodReview = train_data.map(lambda x: (x[1], x[2])).groupByKey().mapValues(list)

In [21]:
prodReviewCollected = prodReview.map(lambda x: (x[0], " ".join(x[1])))

In [22]:
prodReviewCollected.take(5)

[('5REYrZfsX3m4E3FTwovp5Q',
  'try first time last night pretty good one thing really annoy restaurant see review verde entire restaurant fill reservations can not even seat walk 2 people restaurant empty really avocado corn ceviche mojo criollo nigiri avocado crab delicious mojo criollo favorite everything taste super fresh flavorful serve good size date share wine meh would think place thats like tapas wine bar wine would better order frontera sauvignon blanc often buy liquor store slo wine hm lame hate pay 9 glass wine restaurant easily buy 10 liquor store seem justify least slo bottle service good nice time would come back absolutely love tacos havent try anything else menu taco addiction simply put food delicious however portion extremely small order leave place full youll spend pretty penny wouldnt recommend place youre big eater would terrific spot go show light snack drink food nicely present tapas style really nothing remarkable dont go dinner often would impress presentation 

### TF-IDF vector creation

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

  _nan_object_mask = _nan_object_array != _nan_object_array


In [24]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1,1),min_df=0, norm='l2')

Convert user and product rdd to pandas dataframe

In [25]:
spark = SparkSession(sc)

userPandas = userReviewCollected.toDF().toPandas()
prodPandas = prodReviewCollected.toDF().toPandas()

Create TF-IDF vectors on user review text

In [26]:
tfidf_user = tf.fit_transform(userPandas['_2'])

In [27]:
tfidf_user.shape

(987, 26685)

Create TF-IDF vectors on product review text

In [28]:
tfidf_prod = tf.fit_transform(prodPandas['_2'])

In [29]:
tfidf_prod.shape

(3098, 26685)

Add TF-IDF vectors to product dataframe

In [30]:
prodPandas['Vector'] = tfidf_prod.toarray().tolist()

In [31]:
prodPandas.head()

Unnamed: 0,_1,_2,Vector
0,5REYrZfsX3m4E3FTwovp5Q,try first time last night pretty good one thin...,"[0.0048004686407, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
1,HWrbZS1mxVRj2Y2VwMmDMg,oh man word can not describe excite bakeshop i...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,MvlQo4bev1eqp1q0HYOLHg,first dance class probably decade probably hav...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,X9Bql7RrPU5Mab5-hJsI8A,2nd visit promise first time order feature bur...,"[0.0224731737297, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
4,owO2UkNKk9qrWWd_PTYLDA,feel like ive random experience place regulars...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Add TF-IDF vectors to user dataframe

In [32]:
userPandas['Vector'] = tfidf_user.toarray().tolist()

In [33]:
userPandas.head()

Unnamed: 0,_1,_2,Vector
0,AKqMnr1bEFIZDG_zQzX8sA,place breathe fresh air find hot dog fantastic...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,hg71CGHy9bwpgY8cGVVg4w,wife come lunch walk buy chocolate moose look ...,"[0.031725336392, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,S1cjSFKcS5NVc3o1MkfpwA,would think vegan friendly hipster restaurant ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,eYnS0WVYYykN_gRuMUzoOw,stop pittsburgh home garden show quick drink c...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,A0ssDikuj0SQr9mVqnh91Q,sit bar good drink bunch beer options rum cour...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Review text not needed

In [34]:
del prodPandas['_2']
del userPandas['_2']

### Create user numpy matrix from feature vectors

In [35]:
user_matrix = np.zeros((len(userPandas), tfidf_prod.shape[1]))
idx = 0
for index, row in userPandas.iterrows():
    vector = np.array(row['Vector'])
    user_matrix[idx] = vector
    idx += 1

In [36]:
user_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.03172534,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

### Create product numpy matrix from feature vectors

In [37]:
prod_matrix = np.zeros((len(prodPandas), tfidf_prod.shape[1]))
idx = 0
for index, row in prodPandas.iterrows():
    vector = np.array(row['Vector'])
    prod_matrix[idx] = vector
    idx += 1

In [38]:
prod_matrix

array([[ 0.00480047,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

### Compute cosine similarity by taking dot product

In [39]:
similarity_matrix = np.dot(user_matrix, prod_matrix.T)

In [40]:
similarity_matrix.shape

(987, 3098)

### Flatten similarity matrix to relate with user and products

In [41]:
prod = prodPandas['_1'].values
user = userPandas['_1'].values

In [42]:
zf = similarity_matrix.flatten()
xr = np.repeat(user, prod.size)
yt = np.tile(prod, user.size)
d = np.stack((xr, yt, zf), axis=-1)

In [43]:
d.shape

(3057726, 3)

### Convert similarity matrix to RDD

In [44]:
similarity_rdd = sc.parallelize(d)

In [45]:
similarity_rdd.take(5)

[array([u'AKqMnr1bEFIZDG_zQzX8sA', u'5REYrZfsX3m4E3FTwovp5Q',
        0.2687392953363509], dtype=object),
 array([u'AKqMnr1bEFIZDG_zQzX8sA', u'HWrbZS1mxVRj2Y2VwMmDMg',
        0.10418909020180221], dtype=object),
 array([u'AKqMnr1bEFIZDG_zQzX8sA', u'MvlQo4bev1eqp1q0HYOLHg',
        0.09500990959959338], dtype=object),
 array([u'AKqMnr1bEFIZDG_zQzX8sA', u'X9Bql7RrPU5Mab5-hJsI8A',
        0.24884022230440347], dtype=object),
 array([u'AKqMnr1bEFIZDG_zQzX8sA', u'owO2UkNKk9qrWWd_PTYLDA',
        0.15894637714087928], dtype=object)]

### Use Train and Test Data for predictions

In [46]:
trainData.take(5)

['1VVHf1BvtGC0aSCCIjQyiA,K5jY2W5Q3eNnwssV5UZtow,4,2016-11-16,2,2,2,past sunday one several time ive spirit its always eccentric fun time first music performance second hang recent visit sundays bingo bango spirit know pizza good drink also awesome last time get chard margarita time get tomatillo margarita hot ciders its always pleasant surprise see whats menu food drink drink little pricey drawback one coolest things spirit atmosphere its always super strange positive way really never know expect bingo bango definitely family appropriate its fun activity friends maybe even date doesnt mind something ordinary',
 'QYKexxaOJQlseGWmc6soRg,rzByiKaj-bLeLz-zKNBQdg,2,2015-04-13,0,0,0,old cramp build lot enough employees staff keep demand cause long wait time',
 '-ARdx8hOcEWlMDjzwLYZ_g,3cbsPfoUUrysf-M8FI_0IA,4,2014-03-24,6,4,3,live long world without donut menu dont know group nine din three varieties donuts include lemon lavender chocolate espresso zeppolli amaze pepper donut concoction ever m

Create key on (user, product)

In [47]:
similarity_rdd = similarity_rdd.map(lambda x: ((x[0], x[1]), float(x[2])))

In [None]:
similarity_rdd.take(5)

Join similarity matrix with train and test RDD to take only similarity values for training and testing

In [48]:
train = similarity_rdd.join(train_rdd)
test = similarity_rdd.join(test_rdd)

In [50]:
test.take(5)

[((u'nzl2KyGKLtV7j8QEfOhC-w', u'TU2oECyx4VhCnzRzGzOlTg'),
  (0.12674026709190792, 4.0)),
 ((u'JeIVRtZiwUnSP9qR4AmfEQ', u'WZDQqw960DaMWDyPbtRFtA'),
  (0.3785895597715922, 4.0)),
 ((u'g3V76Ja0XgWS1rqx0gxL_A', u'a384anuGRsOe5IXclb3cNQ'),
  (0.20300787250682253, 4.0)),
 ((u'3DltS5Wr9MOntnc8StBFBA', u'GjX2WgchvvS-JAWBmzkLyQ'),
  (0.22445823160425513, 5.0)),
 ((u'GMKoemATfrXg1deaXxt2jA', u'gldPX9ANF5Nic0N7igu2og'),
  (0.34381705401172574, 5.0))]

Convert RDD to List

In [51]:
train_ratings = train.collect()
test_ratings = test.collect()

In [52]:
test_rdd.take(5)

[(('JiPMk9WmbJu-VfTRAKpZpw', 'PdDpIGwBZoTYzOVasT-WuA'), 4.0),
 (('2wKnvn68eWybc7ID-7UQmQ', 'khRo2a5OaIjumox-tkg3GA'), 4.0),
 (('LsWpfxWjLQcazDqnZ_A62g', 'D_pwairtGGR0V_w2xx5XeA'), 2.0),
 (('0N9bSCmoJMoGmR0EldzjQg', '3iaOYhNoc6XL935MqnxJSQ'), 5.0),
 (('-hietrA8M58asfpyJkCyiA', 'O1ird5yRyuDFnOmYu90OoA'), 4.0)]

In [53]:
train_ratings

[((u'hHqH_E9FCI_B6WubV0jPYA', u'ZNdV9ytExuxPTXSN8i2xhw'),
  (0.37943312535887336, 3.0)),
 ((u'V0pP_PQnWdtyKpF-pifiaw', u'Fpm3WvqtrAg2ueh_4pz7iA'),
  (0.45619927921914505, 4.0)),
 ((u'IKnLl7SbuP0u6HS34jwHhw', u'guQww9yGHP7rRTea6zTnDg'),
  (0.3851705191882032, 3.0)),
 ((u'M9MXoSsb193m1g0QmmUYBQ', u'-i3pCgQi_Y9NiSSWs6G7bw'),
  (0.378031883588193, 5.0)),
 ((u'BkMqpJikNc3r5itc-ui6ww', u'x8WI_GkeGHGJCXggDm8flg'),
  (0.28922679482520175, 3.0)),
 ((u'135DbbQnr3BEkQbBzZ9T1A', u'CTiWs36A_sX3eahdqrYxUQ'),
  (0.2934744296807221, 5.0)),
 ((u'rF-MrPztxHEm6KJe0JUJIQ', u'w2tr0PA0b3tg8pId0kjM-A'),
  (0.1446343636646645, 3.0)),
 ((u'vHHjaq9pSuwq8uCrQ85qlw', u'_R1jBQQieKpNGMBqmrLRyA'),
  (0.5158261523776708, 2.0)),
 ((u'0N9bSCmoJMoGmR0EldzjQg', u'F9IJ3EddoyW89O0DFikPTQ'),
  (0.24306505259504227, 1.0)),
 ((u'1ZPnQs-tdvbX8ROjtnzcEg', u'OXrFWgoz533T8tMRemkiww'),
  (0.4862571237598709, 5.0)),
 ((u'PZNMPWCViVX8JLsn10MSnQ', u'MNmQ1s-zdjEgm3bdRs7iEw'),
  (0.29663516440401483, 4.0)),
 ((u'tR12WeWha2DGhUrKcvTttQ'

In [54]:
train.take(5)

[((u'hHqH_E9FCI_B6WubV0jPYA', u'ZNdV9ytExuxPTXSN8i2xhw'),
  (0.37943312535887336, 3.0)),
 ((u'V0pP_PQnWdtyKpF-pifiaw', u'Fpm3WvqtrAg2ueh_4pz7iA'),
  (0.45619927921914505, 4.0)),
 ((u'IKnLl7SbuP0u6HS34jwHhw', u'guQww9yGHP7rRTea6zTnDg'),
  (0.3851705191882032, 3.0)),
 ((u'M9MXoSsb193m1g0QmmUYBQ', u'-i3pCgQi_Y9NiSSWs6G7bw'),
  (0.378031883588193, 5.0)),
 ((u'BkMqpJikNc3r5itc-ui6ww', u'x8WI_GkeGHGJCXggDm8flg'),
  (0.28922679482520175, 3.0))]

### Training Regressor on similarity values

In [60]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

Convert data to numpy array for regressor

In [61]:
X_train = []
y_train = []
for ratings in train_ratings:
    X_train.append(ratings[1][0])
    y_train.append(ratings[1][1])

X_train = np.array(X_train)
X_train = X_train.reshape(-1,1)

In [62]:
X_test = []
y_test = []
for ratings in test_ratings:
    X_test.append(ratings[1][0])
    y_test.append(ratings[1][1])

X_test = np.array(X_test)
X_test = X_test.reshape(-1,1)

In [63]:
X_test

array([[ 0.12674027],
       [ 0.37858956],
       [ 0.20300787],
       ..., 
       [ 0.25466329],
       [ 0.18442728],
       [ 0.29656706]])

Train regressor on training data and make prediction on Test data<br>
Computed Mean Squared Error on predicted values

In [71]:
forest = RandomForestRegressor(max_depth=1, n_estimators=20)

rs = GradientBoostingRegressor(loss='ls', learning_rate=0.005, n_estimators=2)

reg = LinearRegression()

rs.fit(X_train, y_train)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

expected = y_test
predicted = rs.predict(X_test)

train_expected = y_train
train_predicted = rs.predict(X_train)

print("Training:\n%s" % np.sqrt(mean_squared_error(train_expected, train_predicted)))
print("Result:\n%s" % np.sqrt(mean_squared_error(expected, predicted)))

Training:
1.03204199666
Result:
1.03786353237


### Computed Ratings on missing values

In [72]:
missing_test = test_rdd.subtractByKey(test)

In [73]:
missing_ratings_user = missing_test.map(lambda x: ((x[0][0]), (x[0][1], x[1]))).join(avg_rating).map(lambda x: ((x[0], x[1][0][0]), (x[1][1], x[1][0][1])))

In [74]:
missing_ratings_user.take(5)

[(('70sSlkooEgL_TEjWDQbr3A', 'afXMX5llxcMFzbaPaBBm6A'), (3.12, 4.0)),
 (('pr8_C12oHakeNB4ZPp_dig', '3ZcxnR9YkDVRqqkDJMRWBg'), (4.2, 5.0)),
 (('dz8CFWEWuR_4S1zlZhWCMQ', 'rKh_Nl5edIB9AevqnDmO6g'),
  (3.7291666666666665, 3.0)),
 (('dz8CFWEWuR_4S1zlZhWCMQ', '_VYUU5HPLYasd-xdKLimNA'),
  (3.7291666666666665, 3.0)),
 (('z4MQzyewTRzSoStg0NwL-w', 'lvZOJWiwNymeBhOAgoy11w'),
  (3.911764705882353, 4.0))]

Make predictions on test data using the trained regressor

In [75]:
predictions = test.map(lambda x: ((x[0]), (rs.predict(np.array(x[1][0]).reshape(1,-1))[0], x[1][1])))

In [76]:
final_predictions = predictions.union(missing_ratings_user)

In [68]:
final_predictions.count()

12496

In [69]:
mse = final_predictions.map(lambda x: (x[1][0]-x[1][1])**2)

In [70]:
np.sqrt(mse.mean())

1.0307912822424812

In [77]:
final_predictions = final_predictions.collect()

In [120]:
final_predictions

[((u'4wp4XI9AxKNqJima-xahlg', u'nc5uuDeM3EA9WJycGDeg1w'),
  (3.7741745257888799, 4.0)),
 ((u'nzl2KyGKLtV7j8QEfOhC-w', u'TU2oECyx4VhCnzRzGzOlTg'),
  (3.8827074360642819, 4.0)),
 ((u'135DbbQnr3BEkQbBzZ9T1A', u'CTiWs36A_sX3eahdqrYxUQ'),
  (3.8827074360642819, 5.0)),
 ((u'0N9bSCmoJMoGmR0EldzjQg', u'F9IJ3EddoyW89O0DFikPTQ'),
  (3.8827074360642819, 1.0)),
 ((u'1ZPnQs-tdvbX8ROjtnzcEg', u'OXrFWgoz533T8tMRemkiww'),
  (3.7713046737918883, 5.0)),
 ((u'b4ZbEEDbCPT6pVT4ImJU4w', u'Hdnx6cZBo0JfZopnQDWVYg'),
  (3.8827074360642819, 4.0)),
 ((u'mA3tALTzmDunDRctGEu-wA', u'lqHk6vmnSoTaZwMLSfq1MA'),
  (3.7741745257888799, 3.0)),
 ((u'396srm0Kn4gdjRQ1-aPrmQ', u'X2f2s37_cFKo4xpzGFqk1w'),
  (3.8802269059218282, 3.0)),
 ((u'3gwqM0K5iPDugcy75Qal5A', u'ddouZi236BpT15DzXjRHiA'),
  (3.8827074360642819, 3.0)),
 ((u'fmzIm7RxEdii5Jz44PtO7g', u'r6V0rVP_N2afc_BQNzfClA'),
  (3.8827074360642819, 5.0)),
 ((u'GMKoemATfrXg1deaXxt2jA', u'gldPX9ANF5Nic0N7igu2og'),
  (3.8284588855671542, 5.0)),
 ((u'ACwBMSJzgW6vOvV7vOrk8Q', u'

### Save predictions file

In [125]:
with open('/Users/lakshya/Desktop/INF-553/Project/TextBasedPredictions.txt', 'w') as f:
    for item in final_predictions:
        f.write(str(item[0][0])+","+str(item[0][1])+","+str(item[1][0])+"\n")

### Making predictions on training data for Hybrid model

In [71]:
train_predict = train.map(lambda x: ((x[0]), (forest.predict(np.array(x[1][0]).reshape(1,-1))[0], x[1][1])))

In [72]:
train_predict.take(5)

[((u'hHqH_E9FCI_B6WubV0jPYA', u'ZNdV9ytExuxPTXSN8i2xhw'),
  (3.8284588855671542, 3.0)),
 ((u'V0pP_PQnWdtyKpF-pifiaw', u'Fpm3WvqtrAg2ueh_4pz7iA'),
  (3.7713046737918883, 4.0)),
 ((u'IKnLl7SbuP0u6HS34jwHhw', u'guQww9yGHP7rRTea6zTnDg'),
  (3.7741745257888799, 3.0)),
 ((u'M9MXoSsb193m1g0QmmUYBQ', u'-i3pCgQi_Y9NiSSWs6G7bw'),
  (3.7741745257888799, 5.0)),
 ((u'JeIVRtZiwUnSP9qR4AmfEQ', u'WZDQqw960DaMWDyPbtRFtA'),
  (3.7741745257888799, 4.0))]

In [73]:
train_predictions = train_predict.map(lambda x: (x[1][0]-x[1][1])**2)

In [74]:
np.sqrt(train_predictions.mean())

1.0323727636048268

In [75]:
train_predict = train_predict.collect()

In [76]:
with open('/Users/lakshya/Desktop/INF-553/Project/Train_TextBasedPredictions.txt', 'w') as f:
    for item in train_predict:
        f.write(str(item[0][0])+","+str(item[0][1])+","+str(item[1][0])+"\n")