In [13]:
import numpy as np
import sklearn
import scipy.optimize
import pandas as pd
import math
import random
from collections import defaultdict
import gzip

In [2]:
# Import users and items

def readGz(f):
  """Yield one parsed record per line of the gzip-compressed file ``f``.

  Each line is evaluated as a Python expression and the resulting object is
  yielded. The file is opened lazily (on the first ``next()``) and is closed
  as soon as the generator is exhausted, closed, or garbage-collected.

  SECURITY NOTE: ``eval`` executes whatever code the file contains. Only use
  this on trusted data; if the records are plain literals,
  ``ast.literal_eval`` would be a safer parser.
  """
  # The context manager guarantees the gzip handle is released; the original
  # bare ``gzip.open`` in the for-statement never closed it explicitly.
  with gzip.open(f) as fh:
    for l in fh:
      yield eval(l)
# Parse train.json.gz once and derive every per-record list from that single
# pass; the file was previously decompressed and evaluated four separate times.
data = list(readGz("train.json.gz"))

# Parallel lists: position k in each list refers to record k of ``data``.
users = [record['reviewerID'] for record in data]
items = [record['itemID'] for record in data]
category = [record['categories'] for record in data]

In [3]:
def split_data_ml100k(data, num_users, num_items,
                      split_mode='random', test_ratio=0.2):
    """Split the dataset in random mode or seq-aware mode.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction table. In 'seq-aware' mode the first four columns are read
        positionally as (user, item, rating, timestamp). In random mode the
        columns are not inspected.
    num_users, num_items : int
        Accepted for interface compatibility; neither mode needs them.
    split_mode : str
        'seq-aware' holds out each user's latest interaction; any other value
        selects the random split.
    test_ratio : float
        Expected fraction of rows sent to the test set (random mode only).

    Returns
    -------
    (train_data, test_data)
        Two DataFrames. In 'seq-aware' mode they are rebuilt from tuples, so
        their columns are labelled 0..3; in random mode they are row subsets
        of ``data``.
    """
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            # line[0] is the index; the timestamp lives in the fourth column.
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))
            # Remember the most recent interaction seen so far for this user.
            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)
        # Iterate over the users actually present instead of assuming the IDs
        # are the integers 1..num_users (which raised KeyError otherwise).
        for u in sorted(train_items):
            train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
        test_data = [(key, *value) for key, value in test_items.items()]
        # Set membership keeps the held-out filter linear in the data size.
        held_out = set(test_data)
        train_data = [item for item in train_list if item not in held_out]
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        # Each row independently lands in train with probability 1 - test_ratio.
        # Uses the global NumPy RNG: seed it beforehand for a repeatable split.
        keep = np.random.uniform(0, 1, len(data)) < 1 - test_ratio
        train_data, test_data = data[keep], data[~keep]
    return train_data, test_data

In [4]:
# Turn the parsed records into a table so they can be split with boolean
# masks. Note that this rebinds ``data`` from a list to a DataFrame.
data = pd.DataFrame(data)

In [5]:
# Single-column frames (the column is labelled 0) holding every reviewer ID
# and every item ID, one row per record.
users_df = pd.DataFrame(users)
items_df = pd.DataFrame(items)

# Number of distinct reviewers / items across all records.
num_users = len(users_df[0].unique())
num_items = len(items_df[0].unique())

In [6]:
# Random hold-out split: each row goes to the test set with probability 0.2.
# The global NumPy RNG is not seeded in this cell, so the split can differ
# between runs.
train_data, test_data = split_data_ml100k(
    data,
    num_users,
    num_items,
    split_mode='random',
    test_ratio=0.2,
)

In [7]:
# Pull the columns used downstream out of each split as pandas Series.
users_train = train_data['reviewerID']
users_test = test_data['reviewerID']

items_train = train_data['itemID']
items_test = test_data['itemID']

category_train = train_data['categories']
category_test = test_data['categories']

In [8]:
# Category profiles built from the training split only:
#   user_diction[u] -> distinct sub-categories of the items u interacted with
#   item_diction[i] -> distinct sub-categories attached to item i
# Values are lists without duplicates, in first-seen order.
user_diction = defaultdict(list)
item_diction = defaultdict(list)

for u, i, c in zip(users_train, items_train, category_train):
    user_cats, item_cats = user_diction[u], item_diction[i]
    # ``c`` is iterated as a collection of rows, each row a collection of
    # sub-category labels.
    for row in c:
        for subcat in row:
            # Append only labels that are NOT recorded yet. The previous test
            # was inverted (``in``), so nothing was ever appended and every
            # profile stayed empty.
            if subcat not in user_cats:
                user_cats.append(subcat)
            if subcat not in item_cats:
                item_cats.append(subcat)

# Candidate similarity thresholds to compare; the range may need adjusting later.
threshold = list(np.arange(0.182, 0.19, 0.001))

In [9]:
#jaccard similarity formula
def jaccard_similarity(set1,set2):
    """Return the Jaccard similarity |set1 & set2| / |set1 | set2| as a float.

    Parameters
    ----------
    set1 : set
        First collection; must provide ``intersection`` and ``len``.
    set2 : set
        Second collection.

    Returns
    -------
    float
        A value in [0, 1]. Two empty sets share nothing, so their similarity
        is defined here as 0.0 rather than raising ZeroDivisionError.
    """
    common = set1.intersection(set2)
    # Inclusion-exclusion gives the size of the union without building it.
    union_size = len(set1) + len(set2) - len(common)
    if union_size == 0:
        return 0.0
    return float(len(common)) / union_size

In [10]:
# Check whether different thresholds make a difference on the held-out pairs.
#
# The Jaccard score of a (user, item) pair does not depend on the threshold,
# so score every pair once instead of once per threshold.
test_similarity = []
for u, i in zip(users_test, items_test):
    test_i = set(item_diction[i])
    test_u = set(user_diction[u])
    if len(test_u) != 0:
        test_similarity.append(jaccard_similarity(test_i, test_u))
    else:
        # No category profile for this user: mark it so it is always predicted 1.
        test_similarity.append(None)

# Share of held-out pairs predicted 1 for each threshold. If every held-out
# pair is a real purchase (TODO confirm against the data), this is the recall
# on positives.
positive_rate = {}
for k in threshold:
    predict = [1 if (score is None or score >= k) else 0
               for score in test_similarity]
    positive_rate[k] = sum(predict) / len(predict) if predict else float('nan')

# As before, ``predict`` ends up holding the predictions for the last threshold.
positive_rate

In [11]:
# Popularity baseline: count how often each item occurs in train.json.gz.
businessCount = defaultdict(int)
totalPurchases = 0

for record in readGz("train.json.gz"):
    # The reviewer ID is looked up but not used by this baseline.
    user, business = record['reviewerID'], record['itemID']
    businessCount[business] = businessCount[business] + 1
    totalPurchases = totalPurchases + 1

# (count, item) pairs, highest count first; ties are ordered by item ID,
# descending.
mostPopular = sorted(
    ((occurrences, item) for item, occurrences in businessCount.items()),
    reverse=True,
)

# Collect the most popular items until together they account for more than
# half of all records.
return1 = set()
count = 0
for occurrences, item in mostPopular:
    count = count + occurrences
    return1.add(item)
    if count > totalPurchases / 2:
        break

In [12]:
# Minimum user/item category Jaccard similarity for predicting a purchase.
SIMILARITY_THRESHOLD = 0.182

# Both files are managed by ``with`` so they are closed even if a line fails
# to parse. The input is opened first so that a missing pairs file does not
# leave behind a truncated predictions file.
with open("pairs_Purchase.txt") as pairs, \
        open("predictions_Purchase.txt", 'w') as predictions:
    for l in pairs:
        if l.startswith("reviewerID"):
            # Header line: copy it through unchanged.
            predictions.write(l)
            continue
        if not l.strip():
            # Ignore blank lines (e.g. a trailing newline at end of file).
            continue
        u, i = l.strip().split('-')
        item_cat = set(item_diction[i])  # converting to sets for jaccard
        user_cat = set(user_diction[u])  # converting to sets for jaccard

        # Similarity only counts when the user has a category profile.
        similar = (len(user_cat) != 0 and
                   jaccard_similarity(item_cat, user_cat) >= SIMILARITY_THRESHOLD)
        # Predict a purchase if the categories match well enough or the item
        # belongs to the popular set.
        label = "1" if (similar or i in return1) else "0"
        predictions.write(u + '-' + i + "," + label + "\n")