In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.decomposition import PCA
%matplotlib inline
import random
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split

## Overview

Baseline models studied:

- Always Predict K (K = 1,2,3,4,5)
- Always Predict Average Y Train (Average Y Train tend to be around 4.4)
- ObviAlgo: Predict Average(Avg Rating for Item i, Avg Raing for User i)

Evaluation Metric: MSE

I additionally did basic collaborative filterting models using the below links instructions

http://www.salemmarafi.com/code/collaborative-filtering-with-python/

## Read The Data

In [3]:
df = pd.read_csv('datasets/reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [45]:
df.shape

(982619, 9)

In [4]:
# --- Start Item Based Recommendations --- #
# Drop any column named "user"
df = df.drop('Unnamed: 0', 1)

In [95]:
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


## Clean Data

In [96]:
df_rate = df.drop(['helpful','reviewText','reviewTime','reviewerName','summary','unixReviewTime'],1)
df_rate.head()

Unnamed: 0,asin,overall,reviewerID
0,B000F83SZQ,5,A1F6404F1VG29J
1,B000F83SZQ,4,AN0N05A9LIJEQ
2,B000F83SZQ,4,A795DMNCJILA6
3,B000F83SZQ,5,A1FV0SX13TWVXQ
4,B000F83SZQ,4,A3SPTOKDG7WBLN


In [16]:
df_rate.to_csv('datasets/reviews_ratingsOnly.csv')

In [17]:
items = list(set(df_rate.asin))
len(items)

61934

## Eval Simple Models (Using Full Dataset)

In [36]:
from sklearn.metrics import mean_squared_error
from math import sqrt


In [37]:
def RMSE (y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))

In [97]:
y = df_rate.values[:,1]

In [98]:
y_avg = np.mean(y)

In [99]:
y_avg

4.347801131466011

In [60]:
def alwaysPredict(length, val):
    return np.ones(length)*float(val)

In [61]:
allAvg = alwaysPredict(len(y),y_avg)

In [73]:
rmseAvg =RMSE(y, allAvg)

In [69]:
all5 = alwaysPredict(len(y),5)

In [70]:
all4 = alwaysPredict(len(y),4)

In [74]:
rmse5 =RMSE(y, all5)

In [76]:
rmse4 =RMSE(y, all4)

In [77]:
rmse3 = RMSE(y,alwaysPredict(len(y),3))
rmse2 = RMSE(y,alwaysPredict(len(y),2))
rmse1 = RMSE(y,alwaysPredict(len(y),1))

In [112]:
print "Always Predict 5:     %0.2f" %rmse5
print "Always Predict 4:     %0.2f" %rmse4
print "Always Predict 3:     %0.2f" %rmse3
print "Always Predict 2:     %0.2f" %rmse2
print "Always Predict 1:     %0.2f" %rmse1
print "Always Predict Population Avg:     %0.2f" %rmseAvg

Always Predict 5:     1.08
Always Predict 4:     0.99
Always Predict 3:     1.67
Always Predict 2:     2.57
Always Predict 1:     3.52
Always Predict Population Avg:     0.90


## Create Train and Test Data

In [101]:
x = df_rate.values[:,[0,2]]

In [102]:
x

array([['B000F83SZQ', 'A1F6404F1VG29J'],
       ['B000F83SZQ', 'AN0N05A9LIJEQ'],
       ['B000F83SZQ', 'A795DMNCJILA6'],
       ..., 
       ['B00M13FNSS', 'A20KO0BPMNREJL'],
       ['B00M13FNSS', 'A1BQO66R6OLCCW'],
       ['B00M13FNSS', 'A2NRGE3CSFY2TQ']], dtype=object)

In [103]:
x.shape

(982619, 2)

In [107]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

In [119]:
x_train

array([['B001892DIO', 'A1XP9NSCHRNB20'],
       ['B00CHU9PSO', 'A33HEX7SSKNY7W'],
       ['B00DC4XO1I', 'A2IUE8ERXIZ6YB'],
       ..., 
       ['B0064A96OG', 'AOQWLP1S4VXMD'],
       ['B00FK1ZRXY', 'A1I5LTLTT2EZYN'],
       ['B005UPRUBW', 'A2YFUGQI0PKWN8']], dtype=object)

## Simple Averaging Baseline Model

ObviAlgo: Predict Average(Avg Rating for Item i, Avg Raing for User i)

In [None]:
yAvgTrain = np.mean(y_train)

In [108]:
item_train = list(set(x_train[:,0]))
len(item_train)

61430

In [110]:
item_train[0:5]

['B005AYSN8M', 'B00J6S89AA', 'B00GA664GC', 'B00DVPOZXE', 'B00G7067FY']

In [111]:
user_train = list(set(x_train[:,1]))
print len(user_train)

67464

In [159]:
## Create Train Dictionary for
## Key: Item i ---> Val: Average Rating

obviItemDict = {}

nItemTrain =len(item_train)
#for i in range(nItemTrain):
for i in range(1000):
    obviItemDict[item_train[i]] = np.mean(y_train[x_train[:,0]==item_train[i]])
    

In [162]:
obviItemDict

{'B005AYSN8M': 3.5,
 'B00J6S89AA': 3.2,
 'B00GA664GC': 4.25,
 'B00DVPOZXE': 3.933333333333333,
 'B00G7067FY': 4.714285714285714,
 'B00H9WCJR4': 4.75,
 'B00957T4K6': 4.857142857142857,
 'B00CCYWTEC': 4.428571428571429,
 'B0066MINXC': 5.0,
 'B00BK9X4US': 3.2,
 'B0051AD2AY': 4.380952380952381,
 'B00BBZXMFI': 4.285714285714286,
 'B003AQBBQS': 4.5,
 'B00JAF9CTQ': 5.0,
 'B003V8BSIG': 3.8,
 'B002IC1KGG': 3.6666666666666665,
 'B00HBUA35E': 4.0,
 'B00B0JP38Y': 4.0,
 'B00J59Y1TS': 5.0,
 'B005NB295K': 5.0,
 'B00KFDVTKM': 3.75,
 'B00CO73GA8': 4.535714285714286,
 'B00HE1NYWE': 5.0,
 'B003SX0Y3Y': 2.8181818181818183,
 'B00ID9X3AY': 4.916666666666667,
 'B00BPD0V50': 4.538461538461538,
 'B007ZL3RZW': 4.333333333333333,
 'B00E8OLJY0': 3.6,
 'B00A56BJTU': 4.0,
 'B00H5TD038': 4.5,
 'B00FT694AC': 4.0,
 'B003XRF1CO': 4.4,
 'B00BM8MCVE': 4.75,
 'B009E7TX08': 4.25,
 'B00971BN5E': 3.6818181818181817,
 'B00BGI7JTU': 4.666666666666667,
 'B0050CL1MY': 4.25,
 'B00BTRMMS6': 4.5,
 'B00HMSXEO2': 4.8,
 'B007OLTI06': 

In [163]:
import csv

In [164]:
with open('obviItemDict.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in obviItemDict.items():
        writer.writerow([key, value])

In [128]:
## Create Train Dictionary for
## Key: User J ---> Val: Average Rating

obviUserDict = {}
nUserTrain =len(user_train)
for i in range(nUserTrain):
#for i in range(1000):
    obviUserDict[user_train[i]] = np.mean(y_train[x_train[:,1]==user_train[i]])
    

In [137]:
x_train.shape

(491309, 2)

In [2]:
## Given X_Test = [item i, user j]
## Y_Predict = wItem* (Avg Rating Item i) + wUser* (Avg Rating User j)
## Where wItem and wUser are defaulted to .5
## If either item i or user j aren't in train data, just use available average
## If both are missing, use total Train avg. rating

def obviPredict(x_test, wItem = .5):
    predict = np.zeros(len(x_test))
    nPredict =x_test.shape[0]
    wUser = float(1 - wItem)
    countHere = 0
    
    for i in range(nPredict):
        itemAvg= None
        userAvg = None
        if x_test[i,0] in obviItemDict:
            itemAvg = obviItemDict[x_test[i,0]]
        if x_test[i,1] in obviUserDict:
            userAvg = obviUserDict[x_test[i,1]]
        
        if itemAvg == None and userAvg == None:
            predict[i] = yAvgTrain
        elif itemAvg == None:
            predict[i] = userAvg
        elif userAvg == None:
            countHere += 1
            predict[i] = itemAvg
        else:
            predict[i] = itemAvg *wItem + userAvg*wUser
        
    print countHere
    
    return predict

In [None]:
obviPredictions = obviPredict(x_test)

In [167]:
rmseObvi = RMSE(y_test,obviPredictions)

## Baseline Model Scores

In [168]:
rmseObvi

0.9414037903986995

In [157]:
rmse5 = RMSE(y_test,alwaysPredict(len(y_test),5))
rmse4 = RMSE(y_test,alwaysPredict(len(y_test),4))
rmse3 = RMSE(y_test,alwaysPredict(len(y_test),3))
rmse2 = RMSE(y_test,alwaysPredict(len(y_test),2))
rmse1 = RMSE(y_test,alwaysPredict(len(y_test),1))
rmseAvg = RMSE(y_test,alwaysPredict(len(y_test),yAvgTrain))

In [151]:

print "Always Predict 5:     %0.2f" %rmse5
print "Always Predict 4:     %0.2f" %rmse4
print "Always Predict 3:     %0.2f" %rmse3
print "Always Predict 2:     %0.2f" %rmse2
print "Always Predict 1:     %0.2f" %rmse1
print "Always Predict Train Avg:     %0.2f" %rmseAvg

Always Predict 5:     1.15
Always Predict 4:     1.01
Always Predict 3:     1.65
Always Predict 2:     2.54
Always Predict 1:     3.48
Always Predict Train Avg:     0.95


## Basic Collaborative Filtering

I followed the below instructions

http://www.salemmarafi.com/code/collaborative-filtering-with-python/

It worked for a small subset, but was too slow for the full dataset

In [3]:
# --- Import Libraries --- #
 
import pandas as pd
from scipy.spatial.distance import cosine
 
# --- Read Data --- #
data = pd.read_csv('datasets/reviews5_1000.csv')

In [4]:
# --- Start Item Based Recommendations --- #
# Drop any column named "user"
data_amazon = data.drop('user', 1)

In [5]:
data.head()

Unnamed: 0,user,B00B66PUX4,B0051AD2AY,B002IC1KGG,B0015YEQ6O,B004RQ84R4,B00BXJ25GE,B00C1GTQ2Y,B00GJ3CGCC,B008LMT8AW,...,B009AZ3D8C,B00IKZ7U4G,B004TMB0PE,B00F5EPITO,B000UMZMP6,B002VFPS4U,B0015YEQ6Y,B001IWL1ZM,B00GWTYX9M,B00457VJM2
0,A1F6404F1VG29J,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A1FV0SX13TWVXQ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A1UG4Q4D3OAH3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,A1ZT7WV0ZUA0OJ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A3H8PE1UFK04JZ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
data_amazon.head()

Unnamed: 0,B00B66PUX4,B0051AD2AY,B002IC1KGG,B0015YEQ6O,B004RQ84R4,B00BXJ25GE,B00C1GTQ2Y,B00GJ3CGCC,B008LMT8AW,B00971BN5E,...,B009AZ3D8C,B00IKZ7U4G,B004TMB0PE,B00F5EPITO,B000UMZMP6,B002VFPS4U,B0015YEQ6Y,B001IWL1ZM,B00GWTYX9M,B00457VJM2
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Create a placeholder dataframe listing item vs. item
data_ibs = pd.DataFrame(index=data_amazon.columns,columns=data_amazon.columns)
 

In [None]:
# Lets fill in those empty spaces with cosine similarities
# Loop through the columns
for i in range(0,len(data_ibs.columns)) :
    # Loop through the columns for each column
    for j in range(0,len(data_ibs.columns)) :
        # Fill in placeholder with cosine similarities
        data_ibs.ix[i,j] = 1-cosine(data_amazon.ix[:,i],data_amazon.ix[:,j])

# Create a placeholder items for closes neighbours to an item
data_neighbours = pd.DataFrame(index=data_ibs.columns,columns=[range(1,11)])
 
# Loop through our similarity dataframe and fill in neighbouring item names
for i in range(0,len(data_ibs.columns)):
    data_neighbours.ix[i,:10] = data_ibs.ix[0:,i].order(ascending=False)[:10].index

# --- End Item Based Recommendations --- #
 
# --- Start User Based Recommendations --- #
 
# Helper function to get similarity scores
def getScore(history, similarities):
    return sum(history*similarities)/sum(similarities)
 
# Create a place holder matrix for similarities, and fill in the user name column
data_sims = pd.DataFrame(index=data.index,columns=data.columns)
data_sims.ix[:,:1] = data.ix[:,:1]
 
#Loop through all rows, skip the user column, and fill with similarity scores
for i in range(0,len(data_sims.index)):
    for j in range(1,len(data_sims.columns)):
        user = data_sims.index[i]
        product = data_sims.columns[j]
 
        if data.ix[i][j] == 1:
            data_sims.ix[i][j] = 0
        else:
            product_top_names = data_neighbours.ix[product][1:10]
            product_top_sims = data_ibs.ix[product].order(ascending=False)[1:10]
            user_purchases = data_amazon.ix[user,product_top_names]
 
            data_sims.ix[i][j] = getScore(user_purchases,product_top_sims)

    
# Get the top songs
data_recommend = pd.DataFrame(index=data_sims.index, columns=['user','1','2','3','4','5','6'])
data_recommend.ix[0:,0] = data_sims.ix[:,0]
 
# Instead of top song scores, we want to see names
for i in range(0,len(data_sims.index)):
    data_recommend.ix[i,1:] = data_sims.ix[i,:].order(ascending=False).ix[1:7,].index.transpose()

# Print a sample
print data_recommend.ix[:10,:4]