In [1]:
import gzip
from collections import defaultdict
import random
import scipy.optimize


In [2]:
# Input file: gzip-compressed, tab-separated reviews (opened with gzip and
# split on tabs in the cells below).
path = "amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"

In [3]:
# Open the compressed file as decoded text so lines come back as str, not bytes.
f = gzip.open(path, mode='rt', encoding='utf-8')

In [4]:
# The first line of the file holds the tab-separated column names.
header = f.readline().strip().split('\t')

In [5]:
# Accumulates one dict per review row (filled by the parsing loop below).
dataset = []

In [6]:
# Parse every remaining line of the TSV into a dict keyed by the header
# column names, converting the rating/vote columns from str to int.
try:
    for line in f:
        fields = line.strip().split('\t')
        d = dict(zip(header, fields))
        d["star_rating"] = int(d["star_rating"])
        d["helpful_votes"] = int(d["helpful_votes"])
        d["total_votes"] = int(d["total_votes"])
        dataset.append(d)
finally:
    # The handle is fully consumed at this point; close it so the gzip
    # stream is released even if a malformed row raises while parsing.
    f.close()


In [7]:
# Display the first parsed record to inspect the available fields.
dataset[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [8]:
# Two independent set-valued indexes: item id -> users, and user id -> items.
users_per_item, items_per_user = (defaultdict(set) for _ in range(2))

In [9]:
# Lookup from product_id to product_title (filled by the loop below).
item_names = {}

In [10]:
# Build both interaction indexes (item -> users, user -> items) and the
# product_id -> product_title lookup in one pass over the reviews.
for d in dataset:
    user, item = d['customer_id'], d['product_id']
    users_per_item[item].add(user)
    # mostSimilarFast gathers its candidates from items_per_user, so this
    # index has to be filled as well; otherwise it never finds any candidate.
    items_per_user[user].add(item)
    item_names[item] = d['product_title']


In [11]:
def jaccard(s1, s2):
    """Return the Jaccard similarity of two sets.

    The similarity is len(s1 & s2) / len(s1 | s2), a float in [0, 1].

    Args:
        s1: A set.
        s2: A set (any iterable accepted by set.union / set.intersection).

    Returns:
        The similarity as a float. Two empty sets have an empty union; in
        that case 0.0 is returned instead of raising ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # Both inputs are empty: nothing is shared, so report no similarity.
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom



In [12]:
def mostSimilar(i):
    """Return the 10 items most similar to item `i` by Jaccard similarity.

    Every other item in `users_per_item` is scored by comparing its user set
    with the user set of `i`.

    Args:
        i: Item id to query. An id with no recorded users is treated as
            having an empty user set (and is not inserted into the index).

    Returns:
        Up to 10 (similarity, item_id) tuples, sorted in descending order
        (ties are ordered by item id, descending).
    """
    # .get avoids inserting an empty entry for unknown ids into the defaultdict.
    users = users_per_item.get(i, set())
    similarities = []
    for i2, other_users in users_per_item.items():
        if i2 == i:
            continue
        # Compare against the *other item's* user set, not the whole index.
        sim = jaccard(users, other_users)
        similarities.append((sim, i2))
    similarities.sort(reverse=True)
    return similarities[:10]

In [13]:
# Display the third record; its product is used as the example query below.
dataset[2]

{'marketplace': 'US',
 'customer_id': '6111003',
 'review_id': 'RIZR67JKUDBI0',
 'product_id': 'B0006VMBHI',
 'product_parent': '603261968',
 'product_title': 'AudioQuest LP record clean brush',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'Y',
 'review_headline': 'Three Stars',
 'review_body': 'removes dust. does not clean',
 'review_date': '2015-08-31'}

In [14]:
# Product id of the example record, used as the query item for the lookups below.
query = dataset[2]['product_id']

In [15]:
# mostSimilar(query)

In [16]:
# Show the title of the query product.
item_names[query]

'AudioQuest LP record clean brush'

In [None]:
# Map each (similarity, item_id) result to the item's title.
[item_names[item_id] for _sim, item_id in mostSimilar(query)]

In [None]:
def mostSimilarFast(i):
    """Return the 10 items most similar to item `i`, scoring only candidates.

    Only items that share at least one user with `i` are scored; any other
    item has an empty intersection with `i` and therefore similarity 0.
    Relies on `items_per_user` being populated alongside `users_per_item`.

    Args:
        i: Item id to query. An id with no recorded users yields [].

    Returns:
        Up to 10 (similarity, item_id) tuples, sorted in descending order
        (ties are ordered by item id, descending).
    """
    # .get avoids inserting empty entries into the defaultdict indexes.
    users = users_per_item.get(i, set())
    candidateItems = set()
    for u in users:
        # In-place update instead of rebuilding the set with union() each time.
        candidateItems.update(items_per_user.get(u, ()))
    candidateItems.discard(i)  # never recommend the query item itself
    similarities = [(jaccard(users, users_per_item[i2]), i2) for i2 in candidateItems]
    similarities.sort(reverse=True)
    return similarities[:10]

In [None]:
# Run the candidate-restricted search for the query item.
mostSimilarFast(query)