In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.cluster import KMeans
%matplotlib inline

In [3]:
reviews = pd.read_csv('Reviews.csv')
unique_users = reviews.UserId.unique()
unique_products = reviews.ProductId.unique()

## Baseline Model:
- Simple Averaging: A simple baseline model is to use the average of all other scores of a particular product to predict how a user will rate the product. If there is no other data on the product we guess 3.
- Content based recommendation

### 1. Simple Averaging

In [5]:
#Uses average score to predict how a user will score, for the first 10000 observations
def compute_baseline_error():
    l1score = []
    misclassified = []

    for i in  range(10000):
        correct = 0 
        score = reviews.ix[i,:].Score
        product = reviews.ix[i,:].ProductId
        ID = reviews.ix[i,:].Id
        other_reviews = reviews[(reviews.ProductId == product) & (reviews.Id != ID)].Score.values
        if len(other_reviews) == 0:
            guess = 3 
        else:
            guess = other_reviews.mean()
        
        offby = guess - score
        if int(guess+0.5) == int(score):
            correct = 1
        
        l1score.append(offby)
        misclassified.append(correct)
    return l1score,misclassified

In [6]:
import time 
start = time.time()
l,m = compute_baseline_error()
end = time.time()
print end - start

323.726979971


In [8]:
print "Fraction Correctly Classified: ", sum(m)/float(len(m))

Fraction Correctly Classified:  0.3287


We can also relabel good (4,5) or bad (1,2,3) scores as 1 and 0 and see how well this does, guessing 1 if there's no data (justify later but people are probably more likely to give positive reviews than negative ones). 

In [15]:
def modified_baseline_error():
    misclassified = []

    for i in  range(10000):
        correct = 0 
        score = reviews.ix[i,:].Score
        product = reviews.ix[i,:].ProductId
        ID = reviews.ix[i,:].Id
        other_reviews = reviews[(reviews.ProductId == product) & (reviews.Id != ID)].Score.values
        if len(other_reviews) == 0:
            guess = 5 
        else:
            guess = other_reviews.mean()
        
        if guess >= 3.5:
            new_guess = 1
        else:
            new_guess = 0
        if score >=3.5:
            new_score = 1
        else:
            new_score = 0
                
        if new_guess == new_score:
            misclassified.append(1)
        else:
            misclassified.append(0)
    return misclassified

In [16]:
import time 
start = time.time()
m = modified_baseline_error()
end = time.time()
print end - start

312.308862925


In [20]:
print 'Fraction Correctly Classified: ', sum(m)/float(len(m))

Fraction Correctly Classified:  0.7487


### 2. Content Based Recommendation

In [34]:
#Useful Packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.spatial.distance import cosine
from scipy.cluster.vq import vq,kmeans,whiten
import time

Create a new dataframe with unique products and their associated text:

In [37]:
#whole thing takes a long time(>30minutes), use first 1000 products
start = time.time()
unique_prod_reviews = map(lambda x: to_words(x),unique_products[0:1000])
end = time.time()
print end - start

29.7132220268


In [39]:
product_df = pd.DataFrame()
product_df['ProductId'] = unique_products[0:1000]
product_df['Text'] = unique_prod_reviews
product_df.head()

Unnamed: 0,ProductId,Text
0,B001E4KFG0,[I have bought several of the Vitality canned ...
1,B00813GRG4,[Product arrived labeled as Jumbo Salted Peanu...
2,B000LQOCH0,[This is a confection that has been around a f...
3,B000UA0QIQ,[If you are looking for the secret ingredient ...
4,B006K2ZZ7K,[Great taffy at a great price. There was a wi...


Use SKLearns implementation of Tf-Idf on the data

In [40]:
a = np.array(unique_prod_reviews)
a = a.reshape(len(a),)
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(a)

In [41]:
tfidf_matrix.toarray()
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_similarities

array([[  1.00000000e+00,   5.36110174e-03,   0.00000000e+00, ...,
          8.28603989e-03,   5.19533133e-03,   5.02833995e-03],
       [  5.36110174e-03,   1.00000000e+00,   0.00000000e+00, ...,
          5.29899866e-03,   0.00000000e+00,   1.78627910e-03],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  8.28603989e-03,   5.29899866e-03,   0.00000000e+00, ...,
          1.00000000e+00,   8.30201976e-03,   7.89411311e-04],
       [  5.19533133e-03,   0.00000000e+00,   0.00000000e+00, ...,
          8.30201976e-03,   1.00000000e+00,   4.70532436e-03],
       [  5.02833995e-03,   1.78627910e-03,   0.00000000e+00, ...,
          7.89411311e-04,   4.70532436e-03,   1.00000000e+00]])

Above is the TF-IDF Matrix for the first 1000 products. Write a function to return 8 most similar items given a product ID as input.

In [42]:
#takes a product name in the unique products data and returns the 8 most similar products
#input: Product ID, Product Dataframe, Similarity Matrix
#output: Array of 8 nearest items in the database using cosine similarity

def return_nearest(product,cosine_similarities,product_df):
    idx = product_df[product_df.ProductId == product].index[0]
    similar_indices = cosine_similarities[idx].argsort()[:-10:-1]
    similar_items = [(cosine_similarities[idx][i], product_df['ProductId'][i]) for i in similar_indices]
    return similar_items[1:]


In [43]:
return_nearest('B00813GRG4',cosine_similarities,product_df)

[(0.060071873611170931, 'B000CMFMG8'),
 (0.052140838382313696, 'B0047LRBX2'),
 (0.043432096349825311, 'B001EO6BCC'),
 (0.037325003722012799, 'B000G6RPMY'),
 (0.026276191344016604, 'B001FA1L7U'),
 (0.025318462964514555, 'B004AVYUOW'),
 (0.025057186536542934, 'B001HOUGFC'),
 (0.023818763987623219, 'B000YT5DBS')]

In [53]:
reviews[(reviews.ProductId == 'B00813GRG4') | 
            (reviews.ProductId == 'B000CMFMG8') |
           (reviews.ProductId == 'B0047LRBX2') | 
           (reviews.ProductId == 'B001EO6BCC') |
           (reviews.ProductId == 'B000G6RPMY') |
           (reviews.ProductId == 'B001FA1L7U') |
           (reviews.ProductId == 'B004AVYUOW')]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
51,52,B000G6RPMY,A3S5KJDA6ED2PS,Mike Kaser,4,4,5,1243900800,HOT! And good! Came back for more :),Got a free package of these with a bottle of b...
52,53,B000G6RPMY,A9L6L5H9BPEBO,Edwin C. Pauzer,1,1,4,1348876800,You'll go nuts over Ass-Kickin' Peanuts.,This wasn't in stock the last time I looked. I...
53,54,B000G6RPMY,AQ9DWWYP2KJCQ,"Roel Trevino ""protomex""",0,0,3,1278028800,not ass kickin,we're used to spicy foods down here in south t...
384,385,B004AVYUOW,A1ZZMVHSLQSE7R,Murdy,1,1,5,1325376000,The Best,"If you are a peanut lover, these are for you. ..."
1295,1296,B001FA1L7U,A3CP4FB1CZCRW3,A. Simonian,2,2,5,1181001600,u talked me into it mr. beer nut,I was just messing around on amazon when I ran...
1296,1297,B001FA1L7U,A1NXGRAQROM9AL,Michael W. Plum,2,2,5,1165968000,mmmmmmmm...beer...nuts!,i used to eat beer nuts all the time as a kid....
1297,1298,B001FA1L7U,A3AEB0UGBHU44M,jim k,1,1,5,1348099200,jgk likes beer nuts,great product! wish they were still available ...
1298,1299,B001FA1L7U,AECDRQB9XVYB4,"T. E. Lawson ""filmflam""",1,1,5,1318809600,Just as I remembered.,Some reviewers must have a short memory. I've ...
1299,1300,B001FA1L7U,A26IDKJAMW7IH8,RICHARD M WING,1,1,4,1309737600,Mild Sweet And Salty Blend,After a second purchase of a 12-oz Beer Nuts P...


Stuff after this is useless:

In [5]:
#can't train on too large of a subset of the data on my computer, use first 10000

reviews1 = reviews.ix[0:10000].dropna()
vectorizer = CountVectorizer(stop_words= 'english', min_df=4)
text = reviews1['Text'].values + ' ' + reviews1['Summary'].values
vectorizer.fit(text)
x = vectorizer.fit_transform(text)
x = x.toarray( )
features = vectorizer.get_feature_names() 
features = np.array(features)

In [6]:
#input: product ID
#output: sparse vector based on vectorizer

def to_vector(product_id):
    temp = reviews[reviews.ProductId == product_id]
    text = list(temp['Text'].values + ' ' + temp['Summary'].values)
    new_string = np.array([''.join(text)]).reshape(1,)
    return vectorizer.transform(new_string).toarray()

def to_words(product_id):
    temp = reviews[reviews.ProductId == product_id]
    text = list(temp['Text'].astype(str).values + ' ' + temp['Summary'].astype(str).values)
    return np.array([''.join(text)]).reshape(1,)



    

In [6]:
def similarity(product1,product2):
    vector1 = vectorizer.transform(product1)
    vector2 = vectorizer.transform(product2)
    #do some cosine thing -- > closer to 1 is closer in distance
    #might need to transform to lower dimensional space

In [7]:
#One approach to reduce dimensionality is k means clustering, similarity can be computed through
#distance between histograms

def make_cluster(k,vector):
    bagofwords= kmeans(vector,n)
    return bagofwords[0]
def hist(vector,cluster):
    p = vq(vector,cluster)
    labels = p[0]
    hist,bins = np.histogram(labels,bins=np.arange(-.5,200,1))
    return hist

In [8]:
#Only use the first 1000 products for now - too slow
df_products = pd.DataFrame(index = unique_products[0:100], columns = unique_products[0:100])

In [42]:
a = to_vector(unique_products[2])

In [50]:
cluster= make_cluster(50,)

array([[0, 0, 0, ..., 0, 0, 0]])

In [17]:
x.shape

(10001, 6469)