In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.cluster import KMeans
%matplotlib inline

In [3]:
reviews = pd.read_csv('Reviews.csv')
unique_users = reviews.UserId.unique()
unique_products = reviews.ProductId.unique()

## Baseline Model:
- Simple Averaging: A simple baseline model is to use the average of all other scores of a particular product to predict how a user will rate the product. If there is no other data on the product we guess 3.
- Content based recommendation

### 1. Simple Averaging

In [5]:
#Uses average score to predict how a user will score, for the first 10000 observations
def compute_baseline_error():
    l1score = []
    misclassified = []

    for i in  range(10000):
        correct = 0 
        score = reviews.ix[i,:].Score
        product = reviews.ix[i,:].ProductId
        ID = reviews.ix[i,:].Id
        other_reviews = reviews[(reviews.ProductId == product) & (reviews.Id != ID)].Score.values
        if len(other_reviews) == 0:
            guess = 3 
        else:
            guess = other_reviews.mean()
        
        offby = guess - score
        if int(guess+0.5) == int(score):
            correct = 1
        
        l1score.append(offby)
        misclassified.append(correct)
    return l1score,misclassified

In [6]:
import time 
start = time.time()
l,m = compute_baseline_error()
end = time.time()
print end - start

323.726979971


In [8]:
print "Fraction Correctly Classified: ", sum(m)/float(len(m))

Fraction Correctly Classified:  0.3287


We can also relabel good (4,5) or bad (1,2,3) scores as 1 and 0 and see how well this does, guessing 1 if there's no data (justify later but people are probably more likely to give positive reviews than negative ones). 

In [15]:
def modified_baseline_error():
    misclassified = []

    for i in  range(10000):
        correct = 0 
        score = reviews.ix[i,:].Score
        product = reviews.ix[i,:].ProductId
        ID = reviews.ix[i,:].Id
        other_reviews = reviews[(reviews.ProductId == product) & (reviews.Id != ID)].Score.values
        if len(other_reviews) == 0:
            guess = 5 
        else:
            guess = other_reviews.mean()
        
        if guess >= 3.5:
            new_guess = 1
        else:
            new_guess = 0
        if score >=3.5:
            new_score = 1
        else:
            new_score = 0
                
        if new_guess == new_score:
            misclassified.append(1)
        else:
            misclassified.append(0)
    return misclassified

In [16]:
import time 
start = time.time()
m = modified_baseline_error()
end = time.time()
print end - start

312.308862925


In [20]:
print 'Fraction Correctly Classified: ', sum(m)/float(len(m))

Fraction Correctly Classified:  0.7487


### 2. Content Based Recommendation

In [4]:
#distances between items
from scipy.spatial.distance import cosine
from scipy.cluster.vq import vq,kmeans,whiten

In [48]:
#can't train on too large of a subset of the data on my computer, use first 10000

reviews1 = reviews.ix[0:10000].dropna()
vectorizer = CountVectorizer(stop_words= 'english', min_df=4)
text = reviews1['Text'].values + ' ' + reviews1['Summary'].values
vectorizer.fit(text)
x = vectorizer.fit_transform(text)
x = x.toarray( )
features = vectorizer.get_feature_names() 
features = np.array(features)

In [41]:
#input: product ID
#output: sparse vector based on vectorizer

def to_vector(product_id):
    temp = reviews[reviews.ProductId == product_id]
    text = list(temp['Text'].values + temp['Summary'].values)
    new_string = np.array([''.join(text)]).reshape(1,)
    return vectorizer.transform(new_string).toarray()
    

In [6]:
def similarity(product1,product2):
    vector1 = vectorizer.transform(product1)
    vector2 = vectorizer.transform(product2)
    #do some cosine thing -- > closer to 1 is closer in distance
    #might need to transform to lower dimensional space

In [7]:
#One approach to reduce dimensionality is k means clustering, similarity can be computed through
#distance between histograms

def make_cluster(k,vector):
    bagofwords= kmeans(vector,n)
    return bagofwords[0]
def hist(vector,cluster):
    p = vq(vector,cluster)
    labels = p[0]
    hist,bins = np.histogram(labels,bins=np.arange(-.5,200,1))
    return hist

In [8]:
#Only use the first 1000 products for now - too slow
df_products = pd.DataFrame(index = unique_products[0:100], columns = unique_products[0:100])

In [42]:
a = to_vector(unique_products[2])

In [50]:
cluster= make_cluster(50,)

array([[0, 0, 0, ..., 0, 0, 0]])

In [17]:
x.shape

(10001, 6469)

In [49]:
reviews.head(10)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...
