# Building a Content-Based Recommender System

Paul Lim

## Libraries

In [5]:
# Main imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.externals import joblib
from sklearn import pipeline, feature_selection, decomposition
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import DBSCAN, AgglomerativeClustering, Birch
from sklearn.neighbors import NearestNeighbors, LSHForest

# NLP 
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import spacy
import gensim
from gensim import models
from gensim.models import word2vec
import snowballstemmer

# Misc.
import re
import datetime
import time
import logging
import math

% matplotlib inline

sns.set_style("white")
sns.set_style('ticks')
sns.set_style({'xtick.direction': u'in', 'ytick.direction': u'in'})
sns.set_style({'legend.frameon': True})

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



## Functions/Classes

In [6]:
class DataframeToSeriesTransformer(BaseEstimator, TransformerMixin):
        
    def __init__(self, col=None):
        self.col = col
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.col:
            print("DTST: ", X[self.col].shape)
            return X[self.col]
        else:
            return X
        
class SeparateFeaturesTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_cols=None, text_cols=None):
        self.num_cols = num_cols
        self.text_cols = text_cols
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.num_cols:
            print("SFT: ", X.loc[:, self.num_cols].shape)
            return X.loc[:, self.num_cols]
        elif self.text_cols:
            print("SFT: ", X.loc[:, self.text_cols].shape)
            return X.loc[:, self.text_cols]
        else:
            return X
        
class WilsonAverageTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_col=None, biz_list=None):
        self.num_col = num_col
        self.biz_list = biz_list
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.num_col and self.biz_list.all():
            scores = get_average_rating(X, self.biz_list)
            
            X_avg = pd.DataFrame({'average': scores})
            print("WAT: ", X_avg.shape)
            return X_avg
        else:
            return X
        
class CleanTextTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, text_col=None):
        self.text_col = text_col
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):

        X_list = X.loc[:, self.text_col].tolist()
        
        if self.text_col:
            df = pd.DataFrame()
            clean_review_list = []
            
            for review in X_list:
                clean_review = ''
                
                for word in TextBlob(review).words:
                    clean_review += word.lemmatize() + ' '
                        
                clean_review_list.append(clean_review)
                        
            df['clean_reviews'] = clean_review_list
            print("CTT: ", df.shape)
            return df
        else:
            return X
        
class DensifyTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = pd.DataFrame(X.toarray())
        print("DT: ", df.shape)
        return df
    
class SentimentTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, text_col=None):
        self.text_col = text_col
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.text_col:
            df = pd.DataFrame()
            sum_pol_list = []
            sum_sub_list = []

            for doc in X.loc[:, self.text_col]:
                sum_pol = 0
                sum_sub = 0
                doc_blob = TextBlob(doc)

                for sent in doc_blob.sentences:
                    sum_pol += sent.sentiment[0]
                    sum_sub += sent.sentiment[1]

                sum_pol_list.append(sum_pol)
                sum_sub_list.append(sum_sub)

            df['pol'] = sum_pol_list
            df['sub'] = sum_sub_list
            df['clean_reviews'] = X.loc[:, self.text_col] # Need to keep the clean reviews for the W2V transformer.
            print("ST: ", df.shape)
            return df
        else:
            return X

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, text_col=None, w2v=None):
        self.text_col = text_col
        self.w2v = w2v
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.text_col:
            avg_w2v_list = []
            
            for review in X.loc[:, self.text_col]:
                avg_w2v = np.zeros(300)
                count = 0
                
                for word in review:
                    try:
                        avg_w2v += w2v.word_vec(word)
                        count += 1
                    except Exception:
                        continue

                avg_w2v_list.append(avg_w2v/count)
            df = pd.DataFrame(avg_w2v_list)
#             print(df.head())
            print("W2V: ", df.shape)
            return df
        else:
            return X
        
class ToDataFrameTransformer(BaseEstimator, TransformerMixin):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        df = pd.DataFrame(X)
#         print(df.head())
        print("TDFT: ", df.shape)
        return df
        
class DropTextTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, text_col=None):
        self.text_col = text_col
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.text_col:
            df = X.drop(self.text_col, axis=1)
            print("DTT: ", df.shape)
            return df

## Content-Based Recommender System

### This recommender is based on Yelp reviews on cafes near the San Francisco Bay Area

#### Load the data

##### For local computer

In [7]:
df_best = joblib.load('../data/df_best')

##### For AWS

In [3]:
df_best = joblib.load('/home/plim0793/fletcher/df_best')

In [6]:
df_best.shape

(4155, 303)

#### Separate the rating and name columns from the 300 dimensional space

In [8]:
df_rn = df_best[['name', 'rating', 'reviews']]

In [9]:
df_rn['name'] = df_rn['name'].apply(lambda x: re.sub('[0-9]*_', '', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [10]:
df_rn.head()

Unnamed: 0,name,rating,reviews
0,FourBarrelCoffee,5.0,"Best coffee, hands down. Parking is a challeng..."
1,FourBarrelCoffee,5.0,One of the best almond milk lattes Ive ever ha...
2,FourBarrelCoffee,4.0,Great hole-in-the-wall coffee spot. Good coffe...
3,FourBarrelCoffee,4.0,"Gibraltar ReviewThe Pour:Very good, beans are ..."
4,FourBarrelCoffee,4.0,TOP notch coffee and the most AMAZING fresh pa...


In [11]:
df_300 = df_best[[i for i in range(1,301)]]

In [12]:
arr_300 = np.array(df_300)

#### Load the Google word2vec model

##### For local computer

In [13]:
# ONLY RUN ONCE AT THE START OF THE KERNEL
w2v = models.KeyedVectors.load_word2vec_format("~/Documents/GoogleNews-vectors-negative300.bin.gz",binary=True)

2017-06-01 16:47:03,016 : INFO : loading projection weights from ~/Documents/GoogleNews-vectors-negative300.bin.gz
2017-06-01 16:50:28,675 : INFO : loaded (3000000, 300) matrix from ~/Documents/GoogleNews-vectors-negative300.bin.gz


##### For AWS

In [None]:
# ONLY RUN ONCE AT THE START OF THE KERNEL
w2v = models.KeyedVectors.load_word2vec_format("/home/plim0793/GoogleNews-vectors-negative300.bin.gz",binary=True)

2017-05-31 02:30:50,891 : INFO : loading projection weights from /home/plim0793/GoogleNews-vectors-negative300.bin.gz


#### Train the LSH Forest algorithm

In [14]:
lsh = LSHForest(n_neighbors=5, n_estimators=50)

In [15]:
lsh.fit(df_300)

LSHForest(min_hash_match=4, n_candidates=50, n_estimators=50, n_neighbors=5,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=None)

#### Provide a sample input

In [16]:
sample_input = ["good coffee and quiet setting and fast wifi"]
sample_df = pd.DataFrame(sample_input, columns=["sample"])

#### Load the pipeline to fit and transform the sample input

In [17]:
pipe_sample = Pipeline([
                        ('split_text', SeparateFeaturesTransformer(text_cols=['sample'])),
                        ('clean', CleanTextTransformer('sample')),
                        ('sentiment', SentimentTransformer(text_col='clean_reviews')),
                        ('vectorize', Word2VecTransformer(text_col='clean_reviews', w2v=w2v))
                                            ])

In [18]:
sample_transform = pipe_sample.fit_transform(sample_df)

SFT:  (1, 1)
CTT:  (1, 1)
ST:  (1, 3)
W2V:  (1, 300)


#### Get the 20 nearest reviews for a sample input

In [19]:
distances, indices = lsh.kneighbors(sample_transform, n_neighbors=20)

In [20]:
def get_nearest(indices, distances, df):
    df_temp = df.loc[indices, ['rating','name','reviews']]
    df_temp['dist'] = distances
    df_temp = df_temp.sort_values(['dist'], ascending=False)
    df_temp = df_temp.drop_duplicates()
    df_temp = df_temp.reset_index()
    return df_temp
        

In [21]:
df_sample_rec = get_nearest(indices[0], distances[0], df_rn)
df_sample_rec

Unnamed: 0,index,rating,name,reviews,dist
0,346,4.0,Beanery,This neighborhood is filled with amazingly goo...,0.011694
1,3794,2.0,CaffeStrada,Coffee tastes lousy. If it weren't for the ni...,0.011595
2,1239,4.0,DarwinCafe,"Great lunch spot, though line gets long right ...",0.011468
3,3560,3.0,Joey&Pat’sItalianBakery&Cafe,"I had a ground beef/croissanty thing, it was g...",0.011299
4,1402,4.0,CafeCoco,I don't give a shit if you think the food is m...,0.011167
5,1729,4.0,RitualCoffeeRoasters,Damn good coffee! That's the most important t...,0.011157
6,3185,4.0,Starbucks,Don't get hung up on it being the big bad chai...,0.010904
7,804,5.0,TaylorStreetCoffeeShop,Breakfast - eggs Benedict was good. Portion si...,0.010894
8,176,2.0,farm:table,Disappointed. Insisted to the hubs we go here ...,0.010868
9,3672,4.0,TheManor,Thorough impressed with the renovations and ne...,0.010865


In [22]:
for i in range(len(df_sample_rec)):
    print(df_sample_rec.loc[i, 'name'])
    print("NUMBER " + str(i) + ": ", df_sample_rec.loc[i, 'reviews'])
    print("\n")
    

Beanery
NUMBER 0:  This neighborhood is filled with amazingly good coffee options. Loved their latte. It makes waiting for the Muni so much better!


CaffeStrada
NUMBER 1:  Coffee tastes lousy.  If it weren't for the nice outdoor setting, I'd give it one star.  Problem is, not much good coffee in the vicinity.  Americano so bad, almost made me a tea drinker!


DarwinCafe
NUMBER 2:  Great lunch spot, though line gets long right after noon and there is limited seating on premise. Beautiful salads and somewhat limited selection of hearty sandwiches w side salad. Cold smoked trout w horseradish aioli was great, though more like tuna salad sandwich than fish filets that I was expecting.


Joey&Pat’sItalianBakery&Cafe
NUMBER 3:  I had a ground beef/croissanty thing, it was good.  Strong hot coffee.  It was weird using the bathroom with the back door left open, but nobody came in so it's all good.  A lot of flies inside, that I wasn't so thrilled with.  The staff were very nice.  I wouldn't m