In [1]:
from __future__ import division
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating
import time
import json
import sys
from math import sqrt
from pyspark.sql.types import *

In [2]:
import findspark
findspark.init()

import pyspark
findspark.find()

from pyspark import SparkContext
sc = SparkContext()

In [3]:
def mapDict(vals):
        return_dict = {}
        for val in vals:
            return_dict[val[0]] = val[1]
        return return_dict

def findAvg(vals):
        '''Find average value for each row'''
        count = 0
        tot = 0
        for item, rating in vals.items():
            tot += rating
            count += 1
        average = float(tot) / count
        
        for item, rating in vals.items():
            vals[item] = rating - average
        vals['row_avg'] = average
        return vals

In [4]:
train = sc.textFile('./Train-New.csv')
test = sc.textFile('./Test-New.csv')

In [5]:
train_header = train.first()
train_filtered = train.filter(lambda row: row != train_header)
test_header = test.first()
test_filtered = test.filter(lambda row: row != test_header)

In [6]:
train_rdd = train_filtered.map(lambda x: x.split(',')).map(lambda line_split: (line_split[0], (line_split[1], float(line_split[2]))))
test_rdd = test_filtered.map(lambda x: x.split(',')).map(lambda line_split: (line_split[0], line_split[1], float(line_split[2])))


In [9]:
train_rdd.take(10)

[('273418', ('0671521438', 7.0)),
 ('267444', ('0345391055', 8.0)),
 ('182838', ('0929264576', 10.0)),
 ('36554', ('0679450378', 9.0)),
 ('82720', ('0394572122', 9.0)),
 ('259812', ('0446611212', 3.0)),
 ('98499', ('1550135201', 5.0)),
 ('60083', ('0739417096', 9.0)),
 ('225610', ('014029628X', 8.0)),
 ('6840', ('0440202086', 7.0))]

In [10]:
training_group = train_rdd.groupByKey()
training_group = train_rdd.groupByKey().mapValues(list)
training_group_dict = training_group.mapValues(mapDict)
training_group_dict_avg = training_group_dict.mapValues(findAvg)

In [14]:
training_group_dict_avg.take(10)

[('273418', {'0671521438': 0.5, '1551665484': -0.5, 'row_avg': 6.5}),
 ('259812', {'0446611212': 0.0, 'row_avg': 3.0}),
 ('60083',
  {'0739417096': 0.6666666666666661,
   '0446519790': -0.3333333333333339,
   '0505524643': -0.3333333333333339,
   'row_avg': 8.333333333333334}),
 ('273483',
  {'1551667517': -0.666666666666667,
   '0446679593': 0.33333333333333304,
   '0451193261': 0.33333333333333304,
   'row_avg': 7.666666666666667}),
 ('184513',
  {'0553561367': -1.9411764705882355,
   '0375504036': 1.0588235294117645,
   '0451400828': 1.0588235294117645,
   '0671019759': 0.0588235294117645,
   '0345387651': 2.0588235294117645,
   '0671663208': -1.9411764705882355,
   '0380782340': 0.0588235294117645,
   '0446606324': 0.0588235294117645,
   '0061042935': 1.0588235294117645,
   '0679437452': 0.0588235294117645,
   '0440220602': -0.9411764705882355,
   '0345365933': 0.0588235294117645,
   '0312134517': 0.0588235294117645,
   '0821766570': 0.0588235294117645,
   '0440162645': 0.058823529

In [11]:
training_group_dict_one = training_group_dict_avg.map(lambda x: (1, x))
training_one_reduce = training_group_dict_one.groupByKey().mapValues(list).map(lambda x: x[1])

In [12]:
training_one_reduce

PythonRDD[31] at RDD at PythonRDD.scala:53

In [13]:
training_data_compile = training_one_reduce.collect()
training_data_compile = training_data_compile[0]
training_compile_dict = {}

In [15]:
for i in range(len(training_data_compile)):
        training_compile_dict[training_data_compile[i][0]] = training_data_compile[i][1]

In [16]:
item_to_user_pre = train_rdd.map(lambda x: (x[1][0], x[0])).groupByKey().mapValues(list)
item_to_user_compile = item_to_user_pre.collect()
item_to_user_dict = {}

In [17]:
for i in range(len(item_to_user_compile)):
        item_to_user_dict[item_to_user_compile[i][0]] = set(item_to_user_compile[i][1])

In [18]:
test_data = test_rdd.collect()
RMSE_tmp = 0
tmp_result = []
pearson_threshold = 0.3
random_pred = 0
upper_limit = 150
lower_limit = 15

In [19]:
for test in test_data:
        '''Get all the rows corresponding to cur user and item of test dataset'''
        cur_user, cur_item = test[0], test[1]
        filtered_train = {}
        if cur_item not in training_compile_dict or cur_user not in item_to_user_dict:
            '''If it's an unseen business id, assign some random prediction'''
            prediction = 5
            random_pred += 1
        else:
            '''We want to attach row corresponding to current item'''
            filtered_train[cur_user] = training_compile_dict[cur_user]
            cur_user_info = filtered_train[cur_user]
            
            '''Get a list of user_id who contains the current item'''
            row_set = item_to_user_dict[cur_item]
            for row in row_set:
                if len(training_compile_dict[row]) < upper_limit:
                    filtered_train[row] = training_compile_dict[row]
                
            '''Compute Pearson for each row and add to the final result if Pearson
            passes the threshold value'''
            predict_num = 0
            predict_den = 0
            for user, item_list in filtered_train.items():
                if user != cur_user:
                    num = 0
                    den1 = 0
                    den2 = 0
                    for item, rating in item_list.items():
                        if item in cur_user_info and item != cur_item and item != 'row_avg':
                            num += rating * cur_user_info[item]
                            den1 += rating**2
                            den2 += (cur_user_info[item])**2
                    denom = sqrt(den1) * sqrt(den2)
                    if num == 0 or denom == 0:
                        pearson = 0
                    else:
                        pearson = float(num) / denom
                    if pearson > pearson_threshold:
                        predict_num += (filtered_train[user][cur_item]) * pearson
                        predict_den += abs(pearson)
            if predict_num == 0 or predict_den == 0:
                prediction = cur_user_info['row_avg']
            else:
                prediction = cur_user_info['row_avg'] + float(predict_num) / predict_den
                prediction = (prediction + cur_user_info['row_avg']) / 2.0 
        '''Save the results which consists of user_id, business_id, ground truth and predicted'''
        tmp_result.append((test, prediction))
        '''Compile results for final MSE computation'''
        RMSE_tmp += (test[2] - prediction)**2

In [20]:
RMSE = sqrt(RMSE_tmp / len(test_data))

In [21]:
print("RMSE for user-user model: ", RMSE)

RMSE for user-user model:  3.2025806492887643
