In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import spatial

In [2]:
reviews = pd.read_csv("reviews.txt", header=None)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
reviews.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,reviewerID,asin,reviewerName,helpful,unixReviewTime,reviewText,overall,reviewTime,summary
1,A1F6404F1VG29J,B000F83SZQ,Avidreader,"[0, 0]",1399248000,I enjoy vintage books and movies so I enjoyed ...,5.0,"05 5, 2014",Nice vintage story
2,AN0N05A9LIJEQ,B000F83SZQ,critters,"[2, 2]",1388966400,This book is a reissue of an old one; the auth...,4.0,"01 6, 2014",Different...
3,A795DMNCJILA6,B000F83SZQ,dot,"[2, 2]",1396569600,This was a fairly interesting read. It had ol...,4.0,"04 4, 2014",Oldie
4,A1FV0SX13TWVXQ,B000F83SZQ,"Elaine H. Turley ""Montana Songbird""","[1, 1]",1392768000,I'd never read any of the Amy Brewster mysteri...,5.0,"02 19, 2014",I really liked it.


In [3]:
# Create two dictionaries:
# (1) Map product -> list of reviews for that product
# (2) Map reviewer -> list of products that user has reviewed

products_to_reviewers = dict()
reviewers_to_products = dict()

for review in reviews.itertuples():
    reviewerID = review[1]
    product = review[2]
    helpful = review[4]
    rating = review[7]
    
    if reviewerID in reviewers_to_products:
        products_list = reviewers_to_products[reviewerID]
        products_list.append((product, helpful, rating))
        reviewers_to_products[reviewerID] = products_list
    else:
        reviewers_to_products[reviewerID] = [(product, helpful, rating)]
        
    if product in products_to_reviewers:
        reviews_list = products_to_reviewers[product]
        reviews_list.append(reviewerID)
        products_to_reviewers[product] = reviews_list
    else:
        products_to_reviewers[product] = [reviewerID]


In [5]:
print products_to_reviewers.keys()[0:5]

['B005AYSN8M', 'B00J6S89AA', 'B00GA664GC', 'B00DVPOZXE', 'B00B0QIS98']


In [8]:
print reviewers_to_products.keys()[0]

A3MH40TK0FRBYG


In [4]:
# Create similarity matrix
# Each row is a user
# Each column is a product
# Each entry in the matrix is the score the user gave that product, or 0 if the user didn't review that product

reviewers = reviewers_to_products.keys()[:1000]
products = products_to_reviewers.keys()

n_reviewers = len(reviewers)
n_products = len(products)

sim_matrix = np.zeros((n_reviewers, n_products))

for i,reviewer in enumerate(reviewers):
    for j,product in enumerate(products):
        if reviewer not in products_to_reviewers[product]:
            sim_matrix[i, j] = 0
        else:
            reviewer_products = reviewers_to_products[reviewer]
            split = zip(*reviewer_products)
            product_index = (split[0]).index(product)
            rating = split[2][product_index]
            sim_matrix[i, j] = rating

In [10]:
# Example - similarity score between two users
d1 = sim_matrix[0,:]
similarities = np.zeros(sim_matrix.shape[0] - 1)

for i,d2 in enumerate(sim_matrix[1:]):    
    similarities[i] = 1 - spatial.distance.cosine(d1,d2)

m = np.argmax(similarities)
print m
print similarities[m]

703
0.11396057646


In [5]:
# Returns the IDs and associated similarity scores of the top 10 most similar users (in descending sorted order)
def similar_users(userID, sim_matrix, reviewers):
    
    # Index of the userID in the similarity matrix
    user_index = reviewers.index(userID)
    
    # The vector representing the passed-in user's product reviews
    user_reviews = sim_matrix[user_index,:]
    similarities = np.zeros(sim_matrix.shape[0])
    for i,user2_reviews in enumerate(sim_matrix):
        similarities[i] = 1 - spatial.distance.cosine(user_reviews,user2_reviews)
    
    # Indices of the top 10 similarities
    ind = np.argpartition(similarities, -10)[-10:]
    ind = ind[np.argsort(similarities[ind])[::-1][:len(similarities)]]
    scores = np.array([similarities[i] for i in ind])
    return ind,scores

In [12]:
similar_users('A3MH40TK0FRBYG', sim_matrix, reviewers)

(array([  0, 704, 907, 926, 754, 825, 942, 370, 638, 505], dtype=int64),
 array([ 1.        ,  0.11396058,  0.10322795,  0.09690942,  0.08867022,
         0.08112739,  0.06619034,  0.05730871,  0.05380701,  0.04847506]))

In [54]:
# Return product recommendations for a user
def product_recs(userID, sim_matrix, reviewers, reviewer_to_products_dict):
    ind_sim_users,sim_score = similar_users(userID, sim_matrix, reviewers)
    
    # products the passed-in user has already reviewed
    already_reviewed = np.array(zip(*reviewer_to_products_dict[userID])[0]).flatten()
    
    # for each of the top 5 most similar users, get the products that the passed-in user hasn't reviewed
    new_products = []  
    for ind in ind_sim_users[1:6]: # the 0th index is the passed-in user, so skip over it
        reviewer_id = reviewers[ind]
        other_products = np.array(zip(*reviewer_to_products_dict[reviewer_id])[0]).flatten()
        unique = set(other_products) - set(already_reviewed)
        
        both_reviewed = []
        for item in already_reviewed:
            if item in other_products:
                both_reviewed.append(item)
  
        # RMSE proxy score
        SSE = []
        for item in both_reviewed:
            item_ind = other_products.tolist().index(item)
            other_rating = zip(*reviewer_to_products_dict[reviewer_id])[2][item_ind]
            recommendee_rating = zip(*reviewer_to_products_dict[userID])[2][item_ind]
            
            SSE.append((other_rating-recommendee_rating)**2)
        
        print "Items with confidence score of",np.sqrt(np.sum(SSE)/float(len(both_reviewed)))/float(len(both_reviewed))
        
        # Only recommend products that were positively rated by the other user
        for item in unique:
            item_ind = other_products.tolist().index(item)
            rating = zip(*reviewer_to_products_dict[reviewer_id])[2][item_ind]
            
            if rating >= 4:
                new_products.append(item)
                print "\t" + item
        
    return new_products

In [55]:
recs = product_recs('A3MH40TK0FRBYG', sim_matrix, reviewers, reviewers_to_products)
print "Number of recommendations:",len(recs)
print recs

Items with confidence score of 0.0
	B00EWTJJWA
	B00J1Q5A3G
	B00HNFL9FU
	B00FIT71IW
	B00HY3W7F2
	B00IFE9B6C
Items with confidence score of 0.707106781187
	B00KA0AGJK
	B00BEQP450
	B00AJUV7AI
	B00BI4PNHC
	B009W9QTVY
	B00GT0NLZQ
	B0064I72LC
	B00KLBHX44
	B00B2TF8PU
	B00AKH5X4G
	B00JDYC5OI
	B008Y6B4S4
	B00KFUJX58
	B00KYETHVA
	B00ES473BS
	B00I7X5ML4
	B00CBP5WA0
	B0092MTG70
	B00CIJYPX4
	B00B97XOJM
	B00K9ZLW3K
	B00JMEPIP2
	B00GVEYW1W
Items with confidence score of 0.0
	B00GQ0OF8G
	B00DK40J36
	B00FG5OCUI
	B00KFVKIL0
	B00HHCA47I
	B00FUMPU1C
	B007NK64H8
	B00DTFKFY4
	B00J8UC600
Items with confidence score of 1.0
	B006Z1GKXO
	B009XGD2DY
	B00CC68FLG
	B00I48KN16
	B00E7IWEFU
	B009XGIHES
	B005VFXLIW
	B00HH1JVCI
Items with confidence score of 1.0
	B00B8376JG
	B006QQRH3A
	B00IECLA3C
	B00B6JFNZ6
	B005347N30
	B00CGFGS3A
	B007QPOT2C
	B00COW2FYQ
	B00IHCC5F6
	B00ENVYVHA
Number of recommendations: 56
['B00EWTJJWA', 'B00J1Q5A3G', 'B00HNFL9FU', 'B00FIT71IW', 'B00HY3W7F2', 'B00IFE9B6C', 'B00KA0AGJK', 'B00BEQP450',