In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
import pandas as pd
from sklearn import linear_model
import csv
import ast
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
import warnings
# Suppress every warning for the rest of the session: no category, message or
# module filter is given, so nothing is exempt.
# NOTE(review): this also hides deprecation and library warnings — confirm that
# is intended rather than silencing one specific known warning.
warnings.filterwarnings("ignore")

In [3]:
# Read RAW_recipes.csv into a DataFrame; the path is relative to the notebook's
# working directory. Its 'id' column is the join key used in the merge below.
allRecipes = pd.read_csv('CSE158-assignment_2/RAW_recipes.csv')  

In [4]:
# Read RAW_interactions.csv into a DataFrame; the path is relative to the
# notebook's working directory. Later cells read its 'user_id', 'recipe_id',
# 'rating' and 'review' columns from the merged frame.
interactions = pd.read_csv('CSE158-assignment_2/RAW_interactions.csv')

In [5]:
# Join recipes to interactions on recipes.id == interactions.recipe_id.
# The join is an outer one, so a row with no partner on the other side is kept,
# with missing values in the other table's columns.
dataset = allRecipes.merge(
    interactions,
    how='outer',
    left_on='id',
    right_on='recipe_id',
)

In [6]:
# "<user_id>,<recipe_id>" as one string per row; later cells split it on ','
# and parse both halves back with int().
dataset['user_recipe_pair'] = dataset['user_id'].astype(str) + ',' + dataset['recipe_id'].astype(str)
# Rating text immediately followed by the review text, with no separator.
# NOTE(review): rows whose 'review' is missing presumably come out as NaN here — confirm.
dataset['rating_review'] = dataset['rating'].astype(str) + dataset['review']

In [7]:
# Fixed seed so the train / validation / test partition, and every metric
# computed from it, is identical after Restart Kernel -> Run All.
RANDOM_SEED = 42

# Hold out 30% of the rows for testing ...
train, test = train_test_split(dataset, test_size=0.3, shuffle=True, random_state=RANDOM_SEED)
# ... then split 20% of the remainder (14% of all rows) off as the validation
# set, leaving 56% of the rows for training.
train, valid = train_test_split(train, test_size=0.2, random_state=RANDOM_SEED)

In [8]:
# Lookup tables built from the training split only.
usersPerItem = defaultdict(set)     # recipe_id -> {user_id, ...}
itemsPerUser = defaultdict(set)     # user_id -> {recipe_id, ...}
reviewsPerUser = defaultdict(list)  # user_id -> [(rating, recipe_id), ...]
reviewsPerItem = defaultdict(list)  # recipe_id -> [(rating, user_id), ...]
itemNames = {}                      # not populated in this cell
ratingDict = {}                     # (user_id, recipe_id) -> rating

# A single pass over the training rows fills every table. The ids are read
# straight from their own columns (coerced to plain ints) instead of being
# parsed back out of the "user,recipe" string column.
for user, item, rating in zip(train['user_id'], train['recipe_id'], train['rating']):
    user, item = int(user), int(item)
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append((rating, item))
    reviewsPerItem[item].append((rating, user))
    # If a (user, recipe) pair occurs more than once, the last rating wins.
    ratingDict[(user, item)] = rating

In [9]:
# Mean training rating given by each user, over the recipes that user rated.
userAverages = {
    user: sum(ratingDict[(user, item)] for item in items) / len(items)
    for user, items in itemsPerUser.items()
}

# Mean training rating received by each recipe, over the users who rated it.
itemAverages = {
    item: sum(ratingDict[(user, item)] for user in users) / len(users)
    for item, users in usersPerItem.items()
}

# Global mean of the 'rating' column of the training split.
trainRatings = list(train['rating'])
ratingMean = sum(trainRatings) / len(trainRatings)

In [10]:
# Constant-prediction baselines, one entry per row of the split they score.
n_valid = len(valid)
n_test = len(test)

# Baseline 1: always predict the global training mean.
alwaysPredictMean_valid = [ratingMean for _ in range(n_valid)]
alwaysPredictMean_test = [ratingMean for _ in range(n_test)]

# Baseline 2: always predict a rating of 5.
alwaysPredictFive_valid = [5 for _ in range(n_valid)]
alwaysPredictFive_test = [5 for _ in range(n_test)]

In [11]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity of two sets.

    The similarity is the size of the intersection divided by the size of
    the union. When the union is empty (both sets are empty) the result is
    0 instead of a division by zero.
    """
    combined = s1.union(s2)
    if not combined:
        return 0
    shared = s1.intersection(s2)
    return len(shared) / len(combined)

In [12]:
def predictRating(user, item):
    """Predict the rating ``user`` would give ``item``.

    The prediction is the item's mean training rating plus a
    similarity-weighted average of how far the user's other training ratings
    sit from the means of the items they were given to:

        itemAverages[item] + sum(sim_j * (r_j - itemAverages[j])) / sum(sim_j)

    where ``j`` ranges over the other items the user rated and ``sim_j`` is
    the Jaccard similarity between the sets of users who rated ``item`` and
    ``j``.

    Reads the module-level ``reviewsPerUser``, ``usersPerItem``,
    ``itemAverages`` and ``ratingMean`` and does not modify any of them.

    Returns:
        The prediction, or ``ratingMean`` when the similarities sum to zero.
        That fallback covers users and items with no training data.
    """
    # Use .get() rather than indexing. The lookup tables are defaultdicts, so
    # indexing with an unseen user or item would insert an empty entry into
    # the training indexes. Those empty entries break any later recomputation
    # of the per-item averages (division by zero).
    targetUsers = usersPerItem.get(item, set())

    deviations = []
    similarities = []
    for rating, otherItem in reviewsPerUser.get(user, ()):
        if otherItem == item:
            # Never use the user's rating of the item being predicted.
            continue
        deviations.append(rating - itemAverages[otherItem])
        similarities.append(Jaccard(targetUsers, usersPerItem.get(otherItem, set())))

    totalSimilarity = sum(similarities)
    if totalSimilarity > 0:
        weightedDeviation = sum(d * s for d, s in zip(deviations, similarities))
        return itemAverages[item] + weightedDeviation / totalSimilarity
    return ratingMean

In [23]:
# Score every (user_id, recipe_id) pair of the validation and test splits
# with the collaborative-filtering predictor.
valid_preds = list(map(predictRating, valid['user_id'], valid['recipe_id']))
test_preds = list(map(predictRating, test['user_id'], test['recipe_id']))

In [24]:
def MSE(y, ypred):
    """Return the mean squared error between two equally long sequences.

    Elements are paired positionally; the squared differences are summed and
    divided by the number of pairs.
    """
    squared_errors = []
    for actual, predicted in zip(y, ypred):
        residual = actual - predicted
        squared_errors.append(residual ** 2)
    return sum(squared_errors) / len(squared_errors)

In [25]:
# Ground-truth ratings of the held-out splits, as plain Python lists.
valid_labels = valid['rating'].tolist()
test_labels = test['rating'].tolist()

In [26]:
# MSE is declared as MSE(y, ypred), so the ground-truth labels go first and
# the predictions second. Squared error is symmetric, so the values are the
# same either way, but the call sites now match the signature.

# Validation split: mean baseline, always-5 baseline, collaborative filtering.
mse_mean_valid = MSE(valid_labels, alwaysPredictMean_valid)
mse_five_valid = MSE(valid_labels, alwaysPredictFive_valid)
mse_valid = MSE(valid_labels, valid_preds)

# Test split: same three predictors.
mse_mean_test = MSE(test_labels, alwaysPredictMean_test)
mse_five_test = MSE(test_labels, alwaysPredictFive_test)
mse_test = MSE(test_labels, test_preds)

In [27]:
# Validation MSE, in order: always-predict-mean baseline, always-predict-5
# baseline, predictRating model.
mse_mean_valid, mse_five_valid, mse_valid

(1.6014862900359228, 1.946338909494613, 1.73969657501988)

In [28]:
# Test MSE, in order: always-predict-mean baseline, always-predict-5 baseline,
# predictRating model.
mse_mean_test, mse_five_test, mse_test

(1.6012984760164963, 1.9490802476222435, 1.7451196987108)

In [29]:
# Snap each prediction to the nearest integer with the builtin round().
round_valid = list(map(round, valid_preds))
round_test = list(map(round, test_preds))

In [30]:
# MSE of the rounded predictions. Labels go first and predictions second to
# match the MSE(y, ypred) signature; the value is the same in either order.
mse_round_valid = MSE(valid_labels, round_valid)
mse_round_test = MSE(test_labels, round_test)

In [31]:
# MSE of the rounded predictions: validation split, then test split.
mse_round_valid, mse_round_test

(1.7702293543259404, 1.7720621351678338)