In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
import pandas as pd
from sklearn import linear_model
import csv
import ast
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Silence every warning for the rest of the session: an "ignore" filter with
# no message/category/module restriction matches all warnings.
# NOTE(review): this also hides any warnings raised by the model fitting
# further down — confirm that is intended.
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the recipe table from CSV; the path is relative to the working directory.
allRecipes = pd.read_csv('CSE158-assignment_2/RAW_recipes.csv')  

In [4]:
# Load the user/recipe interaction table from CSV; the path is relative to the working directory.
interactions = pd.read_csv('CSE158-assignment_2/RAW_interactions.csv')

In [5]:
# Join recipes to interactions on the recipe id (allRecipes.id == interactions.recipe_id).
# how='outer' keeps rows that have no match on the other side; those rows
# would carry NaN in the other table's columns.
# NOTE(review): later cells cast the user and recipe ids to int, which presumes
# no unmatched rows exist — confirm, or switch to an inner join.
dataset = pd.merge(allRecipes, interactions, how='outer', left_on='id', right_on='recipe_id')

In [6]:
# Composite string key "<user_id>,<recipe_id>" for every interaction row.
dataset['user_recipe_pair'] = dataset['user_id'].astype(str) + ',' + dataset['recipe_id'].astype(str)

In [7]:
# Replace missing review text with the empty string.
dataset['review'] = dataset['review'].fillna('')

In [8]:
# Hold out 30% of the interactions for testing. The split is shuffled, so the
# seed is pinned: without it every kernel restart produces different
# train/test rows and none of the reported MSE values can be reproduced.
SPLIT_SEED = 42
train, test = train_test_split(dataset, test_size=0.3, shuffle=True, random_state=SPLIT_SEED)
# train, valid = train_test_split(train, test_size=0.2, shuffle=True)

In [9]:
# Give every distinct user and recipe a dense integer index, numbered in
# order of first appearance in the merged dataset.
userIDs = {}
itemIDs = {}

for uid, rid in zip(dataset['user_id'], dataset['recipe_id']):
    # setdefault leaves an existing index untouched; the default is evaluated
    # before insertion, so a new key receives the current dictionary size.
    userIDs.setdefault(uid, len(userIDs))
    itemIDs.setdefault(rid, len(itemIDs))

nUsers = len(userIDs)
nItems = len(itemIDs)

In [10]:
# Empty sparse design matrix: one row per interaction, and nUsers + nItems
# columns (filled in element by element in a later cell).
# NOTE(review): only scipy.optimize is imported explicitly at the top of the
# notebook; scipy.sparse being reachable here relies on it having been loaded
# as a side effect — consider importing it explicitly.
X = scipy.sparse.lil_matrix((dataset.shape[0], nUsers + nItems))

In [11]:
# Rating of every interaction, in dataset row order, as a NumPy array.
y = np.array(dataset['rating'])

In [12]:
# Fill the one-hot design matrix: row i gets a 1 in its user's column and a 1
# in its recipe's column (recipe columns start after the nUsers user columns).
# Iterating the two id columns directly avoids materialising a full row Series
# twice per interaction via dataset.loc[i], and no longer depends on the
# frame's index labels being exactly 0..n-1.
for i, (user_id, recipe_id) in enumerate(zip(dataset['user_id'], dataset['recipe_id'])):
    user = userIDs[user_id]
    item = itemIDs[recipe_id]
    X[i, user] = 1 # One-hot encoding of user
    X[i, nUsers + item] = 1 # One-hot encoding of item

In [13]:
# from fastFM import als
# fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
# fm.fit(X_train, y_train)
# y_pred = fm.predict(X_test)
# MSE(y_pred, y_test)

In [14]:
# Lookup tables built from the training split only.
usersPerItem = defaultdict(set)     # recipe id -> set of user ids that rated it
itemsPerUser = defaultdict(set)     # user id -> set of recipe ids they rated
reviewsPerUser = defaultdict(list)  # user id -> [(rating, recipe id), ...]
reviewsPerItem = defaultdict(list)  # recipe id -> [(rating, user id), ...]
itemNames = {}
ratingDict = {}                     # (user id, recipe id) -> rating

# A single explicit pass over the training rows. This replaces list
# comprehensions that were run purely for their side effects (each one built
# and discarded a list of None as long as the training set) and that re-parsed
# the "user,recipe" string several times per row.
for user, item, rating in zip(train['user_id'], train['recipe_id'], train['rating']):
    # Plain ints as keys, matching what parsing the pair string produced.
    user, item = int(user), int(item)
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append((rating, item))
    reviewsPerItem[item].append((rating, user))
    # If a pair occurs more than once, the last rating wins.
    ratingDict[(user, item)] = rating

In [15]:
# Mean training rating per user, per recipe, and over all training rows.
userAverages = {}
for uid, rated_items in itemsPerUser.items():
    user_ratings = [ratingDict[(uid, rid)] for rid in rated_items]
    userAverages[uid] = sum(user_ratings) / len(user_ratings)

itemAverages = {}
for rid, raters in usersPerItem.items():
    item_ratings = [ratingDict[(uid, rid)] for uid in raters]
    itemAverages[rid] = sum(item_ratings) / len(item_ratings)

# Global mean of the training ratings.
all_train_ratings = list(train['rating'])
ratingMean = sum(all_train_ratings) / len(all_train_ratings)

In [16]:
# alwaysPredictMean_valid = [ratingMean] * valid.shape[0]
# alwaysPredictFive_valid = [5] * valid.shape[0]

# Constant baselines for the test split: always predict the training mean,
# or always predict 5.
n_test_rows = test.shape[0]
alwaysPredictMean_test = [ratingMean for _ in range(n_test_rows)]
alwaysPredictFive_test = [5 for _ in range(n_test_rows)]

In [17]:
def MSE(y, ypred):
    """Mean squared difference between two equally ordered sequences.

    Pairs are formed with zip, so the comparison stops at the shorter input.
    Raises ZeroDivisionError when no pairs are produced.
    """
    squared_errors = []
    for left, right in zip(y, ypred):
        squared_errors.append((left - right) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [18]:
def Jaccard(s1, s2):
    """Jaccard index |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Returns 0 when the union is empty (both inputs empty).
    """
    shared = s1.intersection(s2)
    combined = s1.union(s2)
    if not combined:
        return 0
    return len(shared) / len(combined)

In [19]:
def predictRating(user, item):
    """Predict a rating with item-to-item collaborative filtering.

    The estimate is the recipe's training average plus the similarity-weighted
    mean of how far `user` rated each of their other recipes above or below
    that recipe's own average. Similarity between two recipes is the Jaccard
    index of the sets of users who rated them.

    Args:
        user: user id, as used in the keys of reviewsPerUser.
        item: recipe id, as used in the keys of usersPerItem / itemAverages.

    Returns:
        The predicted rating, or the global training mean `ratingMean` when
        the user has no other rated recipe with positive similarity to `item`
        (this includes users or recipes that never appear in training).
    """
    # Use .get() instead of indexing: both tables are defaultdicts, so indexing
    # with an unseen id would silently insert an empty entry. Such empty
    # entries in usersPerItem make a later recomputation of the per-recipe
    # averages divide by zero.
    no_users = set()
    target_users = usersPerItem.get(item, no_users)  # loop-invariant, looked up once

    deviations = []
    similarities = []
    for rating, i2 in reviewsPerUser.get(user, ()):
        if i2 == item: continue  # never use the pair being predicted
        deviations.append(rating - itemAverages[i2])
        similarities.append(Jaccard(target_users, usersPerItem.get(i2, no_users)))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weighted = sum(dev * sim for dev, sim in zip(deviations, similarities))
        return itemAverages[item] + weighted / total_similarity
    return ratingMean

In [20]:
#valid_preds = [predictRating(user, recipe) for user, recipe in zip(valid['user_id'], valid['recipe_id'])]
# Similarity-based prediction for every (user, recipe) pair in the test split.
test_preds = [predictRating(user, recipe) for user, recipe in zip(test['user_id'], test['recipe_id'])]

In [21]:
# Ground-truth ratings of each split as plain Python lists.
train_labels = list(train['rating'])
# valid_labels = list(valid['rating'])
test_labels = list(test['rating'])

In [22]:
# mse_mean_valid = MSE(alwaysPredictMean_valid, valid_labels)
# mse_five_valid = MSE(alwaysPredictFive_valid, valid_labels)
# mse_valid = MSE(valid_preds, valid_labels)

# Test-set MSE of the two constant baselines and of the similarity predictor.
# Predictions are passed first and labels second; MSE squares the difference,
# so the argument order does not change the value.
mse_mean_test = MSE(alwaysPredictMean_test, test_labels)
mse_five_test = MSE(alwaysPredictFive_test, test_labels)
mse_test = MSE(test_preds, test_labels)

In [23]:
# mse_mean_valid, mse_five_valid, mse_valid

In [24]:
# Display the three test MSEs: mean baseline, always-5 baseline, similarity predictor.
# In the output recorded with this notebook the similarity predictor scores
# worse than the mean baseline.
mse_mean_test, mse_five_test, mse_test

(1.598506386225117, 1.9444704469387215, 1.7458351615777992)

In [25]:
# round_valid = [round(i) for i in valid_preds]
# round_test = [round(i) for i in test_preds]

In [26]:
# mse_round_valid = MSE(round_valid, valid_labels)
# mse_round_test = MSE(round_test, test_labels)

In [27]:
# mse_round_valid, mse_round_test

In [28]:
# Parse each ingredient list from its Python-literal text into a real list.
# Values that are not strings are passed through unchanged, so re-running
# this cell (when the column already holds lists) no longer raises inside
# ast.literal_eval, and non-string missing values are left as they are.
dataset['ingredients'] = dataset['ingredients'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [29]:
# from collections import Counter
# ing_counts = Counter([ing for ing_lst in dataset['ingredients'] for ing in ing_lst])
# ing_counts = ing_counts.most_common(1000)
# ings = [ing for ing, count in ing_counts]
# ingsIdx = dict(zip(ings, range(len(ings))))
# ings[:10]

In [30]:
# reviewtext_wordCount = defaultdict(int)
# punctuation = set(string.punctuation)
# for d in train['review']:
#     r = ''.join([c for c in d.lower() if not c in punctuation])
#     for w in r.split():
#         reviewtext_wordCount[w] += 1

# reviewtext_counts = [(reviewtext_wordCount[w], w) for w in reviewtext_wordCount]
# reviewtext_counts.sort(reverse=True)

# reviewtext_words = [x[1] for x in reviewtext_counts[:1000]]

# reviewtext_wordId = dict(zip(reviewtext_words, range(len(reviewtext_words))))
# reviewtext_wordSet = set(reviewtext_words)

In [31]:
# Vocabulary: the 1000 most frequent unigrams and bigrams in the training
# reviews, after lower-casing and stripping punctuation.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for review_text in train['review']:
    cleaned = ''.join(ch for ch in review_text.lower() if ch not in punctuation)
    unigrams = cleaned.split()
    bigrams = [first + ' ' + second for first, second in zip(unigrams, unigrams[1:])]
    for term in unigrams + bigrams:
        wordCount[term] += 1

# Rank by (count, term), both descending, so ties on count are broken by the
# term itself.
counts = sorted(((n, term) for term, n in wordCount.items()), reverse=True)

mostCommon = [term for _, term in counts[:1000]]

# Column index of each vocabulary term, plus a set for membership tests.
wordId = {term: idx for idx, term in enumerate(mostCommon)}
wordSet = set(mostCommon)

In [32]:
def bow_review(datum):
    """Bag-of-words count vector for one review text.

    The text is lower-cased, stripped of the characters in `punctuation` and
    split on whitespace; every unigram and adjacent-word bigram found in
    `wordSet` increments the slot given by `wordId`. A constant 1 is appended
    as the last element.
    """
    cleaned = ''.join(filter(lambda ch: ch not in punctuation, datum.lower()))
    tokens = cleaned.split()
    pairs = [left + ' ' + right for left, right in zip(tokens, tokens[1:])]

    feat = [0 for _ in range(len(wordSet))]
    for term in tokens + pairs:
        if term not in wordSet:
            continue
        feat[wordId[term]] += 1
    return feat + [1]  # trailing offset term

In [33]:
def feature(d):
    """Build the feature vector for one interaction row.

    Layout: [similarity-based rating prediction, review length in characters,
    *bow_review(review), 1].

    Args:
        d: mapping-like row providing 'user_id', 'recipe_id' and 'review'.

    Returns:
        A list of numbers. A review that is not a string (None, NaN) is
        treated as the empty string for both the length and the bag-of-words
        features; previously only the length was guarded and the bag-of-words
        step failed on such values.
    """
    review = d['review']
    if not isinstance(review, str):
        review = ''

    feat = [predictRating(d['user_id'], d['recipe_id']), len(review)]
    feat += bow_review(review)
    # bow_review already ends its vector with a constant 1, so this second
    # constant is redundant; it is kept so the feature width stays unchanged.
    feat.append(1)
    return feat

In [34]:
# Feature vector for every training row, fetched one row at a time by position.
X_train = [feature(train.iloc[i]) for i in range(train.shape[0])]

In [None]:
# X_valid = [feature(valid.iloc[i]) for i in range(valid.shape[0])]
# Feature vector for every test row, fetched one row at a time by position.
X_test = [feature(test.iloc[i]) for i in range(test.shape[0])]

In [None]:
# Treat each distinct rating value as a class and fit a logistic-regression
# classifier on the training features, then predict a rating class for every
# test row.
# NOTE(review): the solver's default iteration limit is used and warnings are
# suppressed at the top of the notebook, so a failure to converge would go
# unnoticed — confirm convergence.
mod_log = linear_model.LogisticRegression(class_weight='balanced')
mod_log.fit(X_train, train_labels)
# preds_valid = mod_log.predict(X_valid)
preds_test = mod_log.predict(X_test)

In [None]:
# MSE of the predicted rating classes on the training rows themselves (in-sample fit).
preds_train = mod_log.predict(X_train)
mse_log_train = MSE(preds_train, train_labels)
mse_log_train

In [None]:
# mse_log_valid = MSE(preds_valid, valid_labels)
# MSE of the predicted rating classes on the held-out test split.
mse_log_test = MSE(preds_test, test_labels)
mse_log_test

In [None]:
# mse_log_valid, mse_log_test