In [636]:
import random
from sklearn import linear_model
from matplotlib import pyplot as plt
from collections import defaultdict
import gzip
import numpy as np
from sklearn import metrics
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [637]:
def assertFloat(x):
    """Check that ``x`` can be converted to a built-in ``float``.

    ``float(x)`` raises on its own (``ValueError`` / ``TypeError``) when the
    value is not convertible, so a malformed answer fails loudly here.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Check that ``items`` has exactly ``N`` entries, each convertible to ``float``."""
    assert len(items) == N
    converted_types = list(map(lambda value: type(float(value)), items))
    assert converted_types == [float] * N

In [638]:
# Collects the answer to each question under the keys 'Q1', 'Q2', ...; written to disk in the last cell.
answers = {}

In [639]:
def parseData(fname):
    """Lazily yield one parsed record per line of ``fname``.

    Args:
        fname: Path of a text file holding one Python literal per line.

    Yields:
        The object obtained by evaluating each line.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or discarded) instead of leaking it for the rest of the session.
    with open(fname) as handle:
        for l in handle:
            # SECURITY NOTE(review): eval() executes whatever the line contains.
            # Only use this on trusted files; ast.literal_eval would be the safe
            # choice if every line is a plain literal -- confirm before switching.
            yield eval(l)

In [640]:
# Materialise every record in memory so the list can be shuffled and sliced below.
data = list(parseData("data/beer_50000.json"))

In [641]:
# Fixed seed so the in-place shuffle (and therefore the positional split that follows) is reproducible.
random.seed(0)
random.shuffle(data)

In [642]:
# Positional split of the shuffled records: first 25,000 train, next 12,500 validation, remainder test.
dataTrain, dataValid, dataTest = data[:25000], data[25000:37500], data[37500:]

In [643]:
# Binary target per split: True when the record's 'beer/ABV' value exceeds 7.
yTrain, yValid, yTest = (
    [d['beer/ABV'] > 7 for d in split]
    for split in (dataTrain, dataValid, dataTest)
)

In [644]:
# Number of reviews per beer style, counted over the full dataset (train, validation and test together).
categoryCounts = defaultdict(int)
for d in data:
    categoryCounts[d['beer/style']] += 1

In [645]:
# Keep only the styles with more than 1000 reviews; every other style gets an all-zero one-hot block.
categories = [style for style, count in categoryCounts.items() if count > 1000]

In [646]:
# Map each retained style to its column index in the one-hot encoding.
catID = {style: index for index, style in enumerate(categories)}

In [647]:
# Longest review text (in characters) in the training split; used to scale the length feature.
# default=0 reproduces the starting value when the split is empty.
max_len = max((len(datum['review/text']) for datum in dataTrain), default=0)

In [648]:
def feat(d, includeCat = True, includeReview = True, includeLength = True):
    """Build the feature vector for one beer review.

    The vector always starts with a constant 1 (bias term); the optional
    groups are appended in a fixed order.

    Args:
        d: Review record (dict). Reads 'beer/style', the five 'review/*'
            rating fields and 'review/text', depending on the flags.
        includeCat: Append a one-hot encoding of the beer style over the
            module-level ``catID`` mapping (all zeros for styles not in it).
        includeReview: Append the aroma, overall, appearance, taste and
            palate ratings, in that order.
        includeLength: Append the review text length divided by the
            module-level ``max_len``.

    Returns:
        list: ``[1]`` followed by the selected feature groups.
    """
    output_feat = [1]

    if includeCat:
        cat_feat = [0]*len(catID)
        # Dictionary lookup instead of scanning the `categories` list for every
        # record; catID has exactly the retained styles as its keys.
        cat_index = catID.get(d['beer/style'])
        if cat_index is not None:
            cat_feat[cat_index] = 1
        output_feat += cat_feat

    if includeReview:
        output_feat += [
            d['review/aroma'],
            d['review/overall'],
            d['review/appearance'],
            d['review/taste'],
            d['review/palate'],
        ]

    if includeLength:
        # Scaled by the longest training review so the feature is on a scale
        # comparable with the ratings.
        output_feat.append(len(d['review/text']) / max_len)

    return output_feat

In [649]:
# Sanity check: comparing two equal-length arrays with == is element-wise, and
# summing the resulting booleans counts the matching positions (2 of 3 here).

test = np.array([1, 1, 2])
test1 = np.array([1, 1, 3])
sum(test==test1)

2

In [650]:
# Sanity check: full feature vectors (bias, style one-hot, five ratings, scaled length)
# for the first five training reviews.

[feat(datum, True, True, True) for datum in dataTrain[:5]]

[[1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  4.0,
  4.0,
  4.0,
  4.0,
  4.0,
  0.06333403939843253],
 [1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2.0,
  5.0,
  4.0,
  3.0,
  3.0,
  0.07371319635670409],
 [1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3.0,
  4.5,
  4.0,
  4.5,
  5.0,
  0.10760432111840712],
 [1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3.0,
  3.0,
  2.5,
  3.0,
  2.5,
  0.08112687989832662],
 [1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  4.0,
  4.5,
  4.0,
  4.0,
  3.0,
  0.13408176233848762]]

In [651]:
def _balancedErrorRate(model, dataSplit, labels, includeCat, includeReview, includeLength):
    """Return the balanced error rate (BER) of ``model`` on one data split."""
    split_feat = [feat(datum, includeCat, includeReview, includeLength) for datum in dataSplit]
    preds = model.predict(split_feat)
    # labels=[False, True] pins the 2x2 layout, so the unpacking below still
    # works if a split (or the predictions) happens to contain a single class.
    tn, fp, fn, tp = metrics.confusion_matrix(labels, preds, labels=[False, True]).ravel()
    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)
    return 0.5*(fpr + (1 - tpr))


def pipeline(reg, includeCat = True, includeReview = True, includeLength = True):
    """Fit a class-balanced logistic regression and report its BER.

    The model is fitted on the module-level ``dataTrain`` / ``yTrain`` and
    evaluated on ``dataValid`` / ``yValid`` and ``dataTest`` / ``yTest``.

    Args:
        reg: Value passed to ``LogisticRegression`` as ``C``.
        includeCat, includeReview, includeLength: Feature-group switches
            forwarded unchanged to ``feat``.

    Returns:
        tuple: ``(fitted model, validation BER, test BER)``.
    """
    features = [feat(datum, includeCat, includeReview, includeLength) for datum in dataTrain]
    # fit_intercept=False because feat() already supplies a constant-1 column.
    # A separate name keeps the `reg` argument from being overwritten by the model.
    model = linear_model.LogisticRegression(C=reg, fit_intercept=False, class_weight='balanced').fit(features, yTrain)

    valid_ber = _balancedErrorRate(model, dataValid, yValid, includeCat, includeReview, includeLength)
    test_ber = _balancedErrorRate(model, dataTest, yTest, includeCat, includeReview, includeLength)

    return model, valid_ber, test_ber

In [652]:
### Question 1

In [653]:
# Q1: style one-hot features only, C = 10.
mod, validBER, testBER = pipeline(10, includeCat=True, includeReview=False, includeLength=False)

In [654]:
# Stored as [validation BER, test BER].
answers['Q1'] = [validBER, testBER]

In [655]:
# Format check: exactly two float-convertible entries.
assertFloatList(answers['Q1'], 2)

In [657]:
### Question 2

In [658]:
# Q2: all three feature groups, C = 10.
mod, validBER, testBER = pipeline(10, includeCat=True, includeReview=True, includeLength=True)

In [659]:
# Stored as [validation BER, test BER].
answers['Q2'] = [validBER, testBER]

In [660]:
# Format check: exactly two float-convertible entries.
assertFloatList(answers['Q2'], 2)

In [662]:
### Question 3

In [663]:
# Sweep the regularisation value passed as C and report the validation BER for each;
# index [1] of pipeline()'s return value is the validation BER.
for c in [0.001, 0.01, 0.1, 1, 10]:
    validation_ber = pipeline(c, True, True, True)[1]
    print(f'c = {c} - valid BER = {validation_ber}\n')

c = 0.001 - valid BER = 0.15432897254842565

c = 0.01 - valid BER = 0.1482544084665771

c = 0.1 - valid BER = 0.1423838526493911

c = 1 - valid BER = 0.14184454647401384

c = 10 - valid BER = 0.14164748339047073



In [664]:
# Chosen by hand from the sweep printed above: c = 10 gave the lowest validation BER.
# NOTE(review): this is not derived programmatically; update it if the sweep changes.
bestC = 10

In [665]:
# Refit with the selected value; using bestC (not a repeated literal) keeps this
# cell consistent with the choice recorded in answers['Q3'].
mod, validBER, testBER = pipeline(bestC, True, True, True)

In [666]:
# Stored as [selected C, validation BER, test BER].
answers['Q3'] = [bestC, validBER, testBER]

In [667]:
# Format check: exactly three float-convertible entries.
assertFloatList(answers['Q3'], 3)

In [669]:
### Question 4

In [670]:
# Sanity check: with includeReview=False the five rating columns disappear,
# leaving bias + style one-hot + scaled length.

[feat(datum, True, False, True) for datum in dataTrain[:5]]

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.06333403939843253],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.07371319635670409],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.10760432111840712],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.08112687989832662],
 [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.13408176233848762]]

In [671]:
# Ablation: drop the style one-hot features (C = 1).
mod, validBER, testBER_noCat = pipeline(1, includeCat=False, includeReview=True, includeLength=True)

In [672]:
# Ablation: drop the five rating features (C = 1).
mod, validBER, testBER_noReview = pipeline(1, includeCat=True, includeReview=False, includeLength=True)

In [673]:
# Ablation: drop the review-length feature (C = 1).
mod, validBER, testBER_noLength = pipeline(1, includeCat=True, includeReview=True, includeLength=False)

In [674]:
# Test BER with each feature group removed in turn: [no category, no ratings, no length].
answers['Q4'] = [testBER_noCat, testBER_noReview, testBER_noLength]

In [675]:
# Format check: exactly three float-convertible entries.
assertFloatList(answers['Q4'], 3)

In [677]:
### Question 5

In [678]:
path = "data/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
# Text-mode gzip stream; it stays open so the loading cell can iterate the remaining rows.
f = gzip.open(path, mode='rt', encoding="utf8")

# The first line holds the tab-separated column names.
header = f.readline().strip().split('\t')

In [679]:
# Display the column names parsed from the header line.
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [680]:
dataset = []

# (customer_id, product_id) pairs already loaded; only the first review seen
# for each pair is kept.
pairsSeen = set()

# `f` is the gzip handle opened in the previous cell, positioned just past the
# header line. try/finally closes it once the rows are consumed (or on error),
# so the handle is not leaked for the rest of the kernel session.
try:
    for line in f:
        # NOTE(review): strip() also removes trailing tabs, so a row whose last
        # columns are empty would yield fewer fields than `header`, and zip()
        # would silently omit those keys.
        fields = line.strip().split('\t')
        d = dict(zip(header, fields))
        ui = (d['customer_id'], d['product_id'])
        if ui in pairsSeen:
            print("Skipping duplicate user/item:", ui)
            continue
        pairsSeen.add(ui)
        # Numeric columns arrive as text; convert the ones used later.
        d['star_rating'] = int(d['star_rating'])
        d['helpful_votes'] = int(d['helpful_votes'])
        d['total_votes'] = int(d['total_votes'])
        dataset.append(d)
finally:
    f.close()

Skipping duplicate user/item: ('46953315', 'B00QM3CNN6')
Skipping duplicate user/item: ('31616428', 'B0026RB0G8')
Skipping duplicate user/item: ('47240912', 'B008I653SC')
Skipping duplicate user/item: ('14503091', 'B003FRMRC4')
Skipping duplicate user/item: ('38538360', 'B00HVLUR86')
Skipping duplicate user/item: ('43448024', 'B00HVLUR86')
Skipping duplicate user/item: ('51525270', 'B00HVLUR86')
Skipping duplicate user/item: ('20652160', 'B004OU2IQG')
Skipping duplicate user/item: ('10964440', 'B00HVLUR86')
Skipping duplicate user/item: ('20043677', 'B00HVLUR86')
Skipping duplicate user/item: ('44796499', 'B00HVLUSGM')
Skipping duplicate user/item: ('29066899', 'B0002CZSYO')
Skipping duplicate user/item: ('10385056', 'B004OU2IQG')
Skipping duplicate user/item: ('1658551', 'B00HVLURL8')
Skipping duplicate user/item: ('907433', 'B00N9Q2E5G')
Skipping duplicate user/item: ('39412969', 'B00HVLUR86')
Skipping duplicate user/item: ('4901688', 'B00HVLUR86')
Skipping duplicate user/item: ('234

In [681]:
# Positional 90% / 10% split of the reviews in file order (no shuffle).
# NOTE: this rebinds dataTrain / dataTest, which held the beer-review splits that
# pipeline() reads; re-running the earlier pipeline cells after this point would
# mix the two datasets.
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]

In [682]:
# Inspect one parsed record to confirm the field names and the integer conversions.
dataset[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [683]:
# Lookup tables built from the FULL dataset (train and test together).
# reviewsPerUser, usersPerItem and itemAverages are rebuilt from the training
# split only in a later cell, before the rating predictor is defined.

usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {} # Maps an item id to its product title
ratingDict = {} # To retrieve a rating for a specific user/item pair
reviewsPerUser = defaultdict(list) # Maps a user to the full review records they wrote

for d in dataset:
    user, item = d['customer_id'], d['product_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    itemNames[item] = d['product_title']
    ratingDict[(user, item)] = d['star_rating']
    reviewsPerUser[user].append(d)

In [684]:
# Mean star rating per user and per item, from the lookup tables built above.
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    ratings = [ratingDict[(u, i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(ratings) / len(ratings)
    
for i in usersPerItem:
    ratings = [ratingDict[(u, i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(ratings) / len(ratings)

# Global mean over every stored (user, item) rating.
ratingMean = np.mean(list(ratingDict.values()))

In [685]:
# Sanity check: the mean star rating computed directly from the records
# should equal ratingMean (shown in the next cell).

sum([d['star_rating'] for d in dataset]) / len(dataset)

4.251083571824148

In [686]:
# Display the global mean computed from ratingDict for comparison with the direct computation.
ratingMean

4.251083571824148

In [687]:
def Jaccard(s1, s2):
    """Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2|; 0 when both sets are empty."""
    union_size = len(s1.union(s2))
    if not union_size:
        return 0
    shared = len(s1.intersection(s2))
    return shared / union_size

In [688]:
# Sanity check: union() counts each distinct element once ({1, 2, 3, 5} -> 4).

test = set([1, 2, 3])
test1 = set([1, 5 ,3])
len(test.union(test1))

4

In [689]:
def mostSimilar(i, N):
    """Return the ``N`` items most similar to item ``i``.

    Similarity is the Jaccard similarity between the sets of users who rated
    each item, read from the module-level ``usersPerItem``.

    Args:
        i: Query item id.
        N: Maximum number of results.

    Returns:
        list: Up to ``N`` ``(similarity, item id)`` tuples in descending
        similarity; ties keep ``usersPerItem`` iteration order. The query
        item itself is excluded.
    """
    # .get() instead of indexing: usersPerItem is a defaultdict, so indexing an
    # unknown item would silently insert an empty entry for it.
    users = usersPerItem.get(i, set())
    similarities = [
        (Jaccard(users, others), i2)
        for i2, others in usersPerItem.items()
        if i2 != i
    ]
    similarities.sort(key = lambda x: x[0], reverse=True)
    return similarities[:N]


In [690]:
# Product id used as the Q5 similarity query.
query = 'B00KCHRKD6'

In [691]:
# Ten most similar items as (similarity, item id) tuples, using the full-dataset usersPerItem built above.
ms = mostSimilar(query, 10)

In [692]:
# Stored as the list of (similarity, item id) tuples.
answers['Q5'] = ms

In [694]:
# Format check on the similarity values only: exactly ten float-convertible entries.
assertFloatList([m[0] for m in ms], 10)

In [695]:
### Question 6

In [696]:
def MSE(y, ypred):
    """Mean of the squared element-wise differences between ``y`` and ``ypred``.

    Pairs are formed with ``zip``, so extra elements of the longer sequence are
    ignored; an empty input raises ``ZeroDivisionError``.
    """
    total = 0
    count = 0
    for left, right in zip(y, ypred):
        total += (left - right)**2
        count += 1
    return total / count

In [697]:
# Confirm that the Q5 query item has a title entry.
'B00KCHRKD6' in itemNames

True

In [698]:
# Inspect the first training record (the training split starts at the first dataset row).
dataTrain[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [699]:
# Rebuild the tables used by the rating predictor from the TRAINING split only,
# so no test review contributes to the neighbourhoods or the item means.
# itemsPerUser, userAverages, itemNames and ratingDict are not rebuilt here.
reviewsPerUser = defaultdict(list)
usersPerItem = defaultdict(set)
itemAverages = {}

for d in dataTrain:
    user, item = d['customer_id'], d['product_id']
    reviewsPerUser[user].append(d)
    usersPerItem[item].add(user)

# ratingDict still covers the full dataset, but only (user, item) pairs taken
# from the training-only usersPerItem are looked up here.
for i in usersPerItem:
    itemAverages[i] = np.mean([ratingDict[(u, i)] for u in usersPerItem[i]])


In [700]:
# Mean star rating over the training split; the fallback prediction for items with no training reviews.
averageRating = np.mean([d['star_rating'] for d in dataTrain])

In [701]:
def predictRating(user,item):
    """Predict ``user``'s rating of ``item`` with item-to-item collaborative filtering.

    The prediction is the item's mean rating plus the Jaccard-weighted average
    of the user's deviations from the mean on the other items they reviewed.
    Reads the module-level ``itemAverages``, ``averageRating``,
    ``reviewsPerUser`` and ``usersPerItem``.

    Returns:
        ``averageRating`` when the item has no entry in ``itemAverages``;
        the item's mean when the user has no other review with positive
        similarity; otherwise the similarity-weighted prediction.
    """
    if item not in itemAverages:
        return averageRating

    ratings = []
    similarities = []
    # .get() instead of indexing: reviewsPerUser is a defaultdict, so indexing
    # would insert an empty list for every user that has no stored reviews.
    for r in reviewsPerUser.get(user, []):
        item2 = r['product_id']
        if item2 == item:
            continue
        # Deviation of this review from the other item's mean rating.
        ratings.append(r['star_rating'] - itemAverages[item2])
        # Similarity between the target item and the other item.
        similarities.append(Jaccard(usersPerItem[item], usersPerItem[item2]))

    totalSimilarity = sum(similarities)
    if totalSimilarity > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings, similarities)]
        return itemAverages[item] + (sum(weightedRatings) / totalSimilarity)
    return itemAverages[item]
    

In [702]:
# test = set([1])
# test1 = set([])
# ans = test.union(test1)
# len(ans) is 1 -- the union is empty only when both sets are empty

In [703]:
# Baseline predictions: the training mean rating repeated once per test review.
alwaysPredictMean = [averageRating]*len(dataTest)

In [704]:
# Sanity check: the test split should hold roughly 10% of the records.

len(alwaysPredictMean) / len(dataset)

0.10000033162106324

In [705]:
# Similarity-based prediction for every test (user, item) pair.
simPredictions = [predictRating(d['customer_id'], d['product_id']) for d in dataTest]

In [706]:
# Star ratings of the test reviews, in the same order as the predictions.
labels = [d['star_rating'] for d in dataTest]

In [707]:
# Test MSE of the similarity-based predictor; MSE is symmetric, so the argument order does not matter.
answers['Q6'] = MSE(simPredictions, labels)

In [708]:
# Format check: the answer must be float-convertible.
assertFloat(answers['Q6'])

In [709]:
# Display everything recorded so far (Q1 through Q6).
answers

{'Q1': [0.16130237168160536, 0.16078380246088317],
 'Q2': [0.14164748339047073, 0.14297185466520057],
 'Q3': [10, 0.14164748339047073, 0.14297185466520057],
 'Q4': [0.3141591951591849, 0.16102212535390079, 0.14691521312454542],
 'Q5': [(0.015228426395939087, 'B00H7NFDKA'),
  (0.014492753623188406, 'B00QKVV3HC'),
  (0.014492753623188406, 'B00GXRMD7W'),
  (0.014084507042253521, 'B00H7ILRRI'),
  (0.014084507042253521, 'B0057RUMPO'),
  (0.014084507042253521, 'B000B6DTYW'),
  (0.013888888888888888, 'B00L2708TI'),
  (0.013513513513513514, 'B009Z1KKWI'),
  (0.013333333333333334, 'B000VYINCW'),
  (0.013333333333333334, 'B003F2BDZQ')],
 'Q6': 1.7165666373341593}

In [710]:
### Question 7 - incorporate time-weight collaborative filtering
### (simple temporal dynamics feature for recommender systems)

In [711]:
# Probe: the 'review_date' strings parse directly into pandas Timestamps.
pd.to_datetime(dataset[9999]['review_date'])

Timestamp('2015-08-21 00:00:00')

In [712]:
# Probe: subtracting two Timestamps gives a Timedelta between the two review dates.
delta = pd.to_datetime(dataset[0]['review_date']) - pd.to_datetime(dataset[9999]['review_date'])

In [713]:
# Whole days in the Timedelta; this is the unit the recency weight is built from.
delta.days

10

In [714]:
# Review date string for every (customer_id, product_id) pair in the full dataset.
reviewDates = {
    (d['customer_id'], d['product_id']): d['review_date']
    for d in dataset
}

In [715]:
# test = [pd.to_datetime(d['review_date']) for d in dataset]

In [716]:
# min(test)

In [717]:
# max(test) 

In [718]:
# abs(min(test)- max(test)).days / 365

In [719]:
def predictRating_q7(user,item, decayRate=1/8):
    """Predict ``user``'s rating of ``item`` with time-weighted collaborative filtering.

    Works like the plain similarity-based predictor, but each of the user's
    other reviews is additionally weighted by ``exp(-decayRate * t)``, where
    ``t`` is the time in years (days / 365) between that review and the
    user's review of ``item``. Reads the module-level ``itemAverages``,
    ``averageRating``, ``reviewsPerUser``, ``usersPerItem`` and ``reviewDates``.

    Args:
        user: Customer id.
        item: Product id; ``reviewDates`` must hold a date for ``(user, item)``
            whenever the item has an entry in ``itemAverages``.
        decayRate: Decay constant of the recency weight. The default of 1/8
            matches the value that was previously hard-coded.

    Returns:
        ``averageRating`` when the item has no entry in ``itemAverages``;
        the item's mean when the total weight is not positive; otherwise
        the weighted prediction.
    """
    if item not in itemAverages:
        return averageRating

    itemTimestamp = pd.to_datetime(reviewDates[(user, item)])

    weightedDeviations = []
    weights = []
    # .get() instead of indexing: reviewsPerUser is a defaultdict, so indexing
    # would insert an empty list for every user that has no stored reviews.
    for r in reviewsPerUser.get(user, []):
        item2 = r['product_id']
        if item2 == item:
            continue
        deviation = r['star_rating'] - itemAverages[item2]

        # Similarity between the target item and the other item.
        similarity = Jaccard(usersPerItem[item], usersPerItem[item2])

        # Recency weight from the gap between the two reviews by this user.
        item2Timestamp = pd.to_datetime(reviewDates[(user, item2)])
        timeDelta = abs(itemTimestamp - item2Timestamp)
        recency = np.exp(-decayRate*(timeDelta.days/365))

        weightedDeviations.append((deviation*similarity)*recency)
        weights.append(similarity*recency)

    # The total weight is computed once and reused as both guard and denominator.
    totalWeight = sum(weights)
    if totalWeight > 0:
        return itemAverages[item] + (sum(weightedDeviations) / totalWeight)
    return itemAverages[item]

In [720]:
# Time-weighted prediction for every test (user, item) pair.
preds_q7 = [predictRating_q7(d['customer_id'], d['product_id']) for d in dataTest]

In [721]:
# Mean squared difference between the time-weighted and the plain similarity predictions,
# i.e. how much the recency weighting changes the outputs.
MSE(preds_q7, simPredictions)

1.5200635692769324e-05

In [722]:
# Star ratings of the test reviews (same values as `labels` above).
actual = [d['star_rating'] for d in dataTest]

In [723]:
# Test MSE of the time-weighted predictor.
itsMSE = MSE(preds_q7, actual)

In [724]:
# Stored as [free-text description of the temporal weighting, test MSE].
answers['Q7'] = [
    (
        "I chose t(u, j) to represent the duration between when user u"
        " reviewed item i and when they reviewed item j, in years, which is then"
        " plugged into a decay function with lambda = -1/8. Graphically, this lambda"
        " parameter choice adequately captures the range of duration, or doesn't converge"
        " to zero too early or late."
    ),
    itsMSE,
]

In [725]:
# Format check on the numeric part only (index 1); index 0 is the description string.
assertFloat(answers['Q7'][1])

In [726]:
# Final review of every recorded answer before writing the file.
answers

{'Q1': [0.16130237168160536, 0.16078380246088317],
 'Q2': [0.14164748339047073, 0.14297185466520057],
 'Q3': [10, 0.14164748339047073, 0.14297185466520057],
 'Q4': [0.3141591951591849, 0.16102212535390079, 0.14691521312454542],
 'Q5': [(0.015228426395939087, 'B00H7NFDKA'),
  (0.014492753623188406, 'B00QKVV3HC'),
  (0.014492753623188406, 'B00GXRMD7W'),
  (0.014084507042253521, 'B00H7ILRRI'),
  (0.014084507042253521, 'B0057RUMPO'),
  (0.014084507042253521, 'B000B6DTYW'),
  (0.013888888888888888, 'B00L2708TI'),
  (0.013513513513513514, 'B009Z1KKWI'),
  (0.013333333333333334, 'B000VYINCW'),
  (0.013333333333333334, 'B003F2BDZQ')],
 'Q6': 1.7165666373341593,
 'Q7': ["I chose t(u, j) to represent the duration between when user u reviewed item i and when they reviewed item j, in years, which is then plugged into a decay function with lambda = -1/8. Graphically, this lambda parameter choice adequately captures the range of duration, or doesn't converge to zero too early or late.",
  1.71656266

In [727]:
# The context manager flushes and closes the file even if the write raises.
with open("answers_hw2.txt", 'w') as f:
    f.write(str(answers) + '\n')