# Building a Recommender System

Paul Lim

## Libraries

In [1]:
# Main imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.externals import joblib
from sklearn import pipeline, feature_selection, decomposition
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import DBSCAN, AgglomerativeClustering, Birch
from sklearn.neighbors import NearestNeighbors, LSHForest

# NLP 
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import spacy
import gensim
from gensim import models
from gensim.models import word2vec
import snowballstemmer

# Misc.
import re
import datetime
import time
import logging
import math

% matplotlib inline

sns.set_style("white")
sns.set_style('ticks')
sns.set_style({'xtick.direction': u'in', 'ytick.direction': u'in'})
sns.set_style({'legend.frameon': True})

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



## Functions/Classes

In [2]:
class DataframeToSeriesTransformer(BaseEstimator, TransformerMixin):
        
    def __init__(self, col=None):
        self.col = col
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.col:
            print("DTST: ", X[self.col].shape)
            return X[self.col]
        else:
            return X
        
class SeparateFeaturesTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_cols=None, text_cols=None):
        self.num_cols = num_cols
        self.text_cols = text_cols
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.num_cols:
            print("SFT: ", X.loc[:, self.num_cols].shape)
            return X.loc[:, self.num_cols]
        elif self.text_cols:
            print("SFT: ", X.loc[:, self.text_cols].shape)
            return X.loc[:, self.text_cols]
        else:
            return X
        
class WilsonAverageTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_col=None, biz_list=None):
        self.num_col = num_col
        self.biz_list = biz_list
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.num_col and self.biz_list.all():
            scores = get_average_rating(X, self.biz_list)
            
            X_avg = pd.DataFrame({'average': scores})
            print("WAT: ", X_avg.shape)
            return X_avg
        else:
            return X
        
class CleanTextTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, text_col=None):
        self.text_col = text_col
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
#         NLP = spacy.load('en')
        stemmer = snowballstemmer.EnglishStemmer()
        stop = stopwords.words('english')
        stop_list = stemmer.stemWords(stop)
        stop_list = set(stop_list)
        stop = set(stop + list(stop_list))
        
        if self.text_col:
            df = pd.DataFrame()
            clean_review_list = []
            
            for review in X.loc[:, self.text_col]:
                clean_review = ''
                
                for word in TextBlob(review).words:
                    if word not in stop:
                        clean_review += word.lemmatize() + ' '
                        
#                 clean_review = NLP(clean_review)
                clean_review_list.append(clean_review)
                        
            df['clean_reviews'] = clean_review_list
            print("CTT: ", df.shape)
            return df
        else:
            return X
        
class DensifyTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = pd.DataFrame(X.toarray())
        print("DT: ", df.shape)
        return df
    
class SentimentTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, text_col=None):
        self.text_col = text_col
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.text_col:
            df = pd.DataFrame()
            sum_pol_list = []
            sum_sub_list = []

            for doc in X.loc[:, self.text_col]:
                sum_pol = 0
                sum_sub = 0
                doc_blob = TextBlob(doc)

                for sent in doc_blob.sentences:
                    sum_pol += sent.sentiment[0]
                    sum_sub += sent.sentiment[1]

                sum_pol_list.append(sum_pol)
                sum_sub_list.append(sum_sub)

            df['pol'] = sum_pol_list
            df['sub'] = sum_sub_list
            df['clean_reviews'] = X.loc[:, self.text_col] # Need to keep the clean reviews for the W2V transformer.
            print("ST: ", df.shape)
            return df
        else:
            return X

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, text_col=None, w2v=None):
        self.text_col = text_col
        self.w2v = w2v
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.text_col:
            avg_w2v_list = []
            
            for review in X.loc[:, self.text_col]:
                avg_w2v = np.zeros(300)
                count = 0
                
                for word in review:
                    try:
                        avg_w2v += w2v.word_vec(word)
                        count += 1
                    except Exception:
                        continue

                avg_w2v_list.append(avg_w2v/count)
            df = pd.DataFrame(avg_w2v_list)
#             print(df.head())
            print("W2V: ", df.shape)
            return df
        else:
            return X
        
class ToDataFrameTransformer(BaseEstimator, TransformerMixin):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        df = pd.DataFrame(X)
#         print(df.head())
        print("TDFT: ", df.shape)
        return df
        
class DropTextTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, text_col=None):
        self.text_col = text_col
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.text_col:
            df = X.drop(self.text_col, axis=1)
            print("DTT: ", df.shape)
            return df

## Content-Based Recommender System

### This recommender is based on Yelp reviews on cafes near the San Francisco Bay Area

#### Load the data

In [3]:
df_best = joblib.load('../data/df_best')

In [4]:
df_best.head()

Unnamed: 0,rating,1,2,3,4,5,6,7,8,9,...,293,294,295,296,297,298,299,300,name,reviews
0,5.0,-0.166373,0.111432,0.013181,0.141403,-0.054315,0.017994,-0.113104,-0.062868,-0.057131,...,-0.069553,0.108498,-0.047816,-0.141465,-0.113119,-0.018519,-0.138836,0.15715,0_FourBarrelCoffee,"Best coffee, hands down. Parking is a challeng..."
1,5.0,-0.148131,0.117487,-0.012955,0.132629,-0.041712,0.029891,-0.095817,-0.047179,-0.046625,...,-0.09776,0.089792,-0.02001,-0.140758,-0.084806,-0.013744,-0.10325,0.151,0_FourBarrelCoffee,One of the best almond milk lattes Ive ever ha...
2,4.0,-0.154828,0.109273,0.015791,0.144656,-0.028327,0.035225,-0.102963,-0.042544,-0.056152,...,-0.088661,0.104102,-0.032073,-0.141803,-0.107568,-0.010066,-0.140037,0.155104,0_FourBarrelCoffee,Great hole-in-the-wall coffee spot. Good coffe...
3,4.0,-0.15634,0.107694,0.012915,0.135845,-0.043747,0.031081,-0.09774,-0.05629,-0.047788,...,-0.107758,0.086701,-0.024822,-0.147303,-0.095368,-0.013405,-0.107973,0.154647,0_FourBarrelCoffee,"Gibraltar ReviewThe Pour:Very good, beans are ..."
4,4.0,-0.155028,0.115164,0.00439,0.137599,-0.037805,0.018079,-0.096963,-0.055715,-0.029595,...,-0.099597,0.092129,-0.03301,-0.150503,-0.09004,-0.010341,-0.111718,0.151157,0_FourBarrelCoffee,TOP notch coffee and the most AMAZING fresh pa...


#### Separate the rating and name columns from the 300 dimensional space

In [5]:
df_rn = df_best[['name', 'rating', 'reviews']]

In [6]:
df_300 = df_best[[i for i in range(1,301)]]

In [7]:
arr_300 = np.array(df_300)

#### Load the Google word2vec model

In [8]:
# ONLY RUN ONCE AT THE START OF THE KERNEL
w2v = models.KeyedVectors.load_word2vec_format("~/Documents/GoogleNews-vectors-negative300.bin.gz",binary=True)

2017-05-28 15:11:03,844 : INFO : loading projection weights from ~/Documents/GoogleNews-vectors-negative300.bin.gz
2017-05-28 15:14:04,434 : INFO : loaded (3000000, 300) matrix from ~/Documents/GoogleNews-vectors-negative300.bin.gz


#### Load the pipeline to fit and transform the training data

In [9]:
pipe_w2v = Pipeline([
                    ('combined_features', FeatureUnion([

                        ('num_feat', SeparateFeaturesTransformer(num_cols=['rating'])),
                        ('text_feat', Pipeline([

                            ('split_text', SeparateFeaturesTransformer(text_cols=['reviews'])),
                            ('clean', CleanTextTransformer('reviews')),
                            ('sentiment', SentimentTransformer(text_col='clean_reviews')),
                            ('vectorize', Word2VecTransformer(text_col='clean_reviews', w2v=w2v))
                                                ]))
                                                        ]))
                    ])

#### Train the LSH Forest algorithm

In [10]:
lsh = LSHForest(n_neighbors=5, n_estimators=50)

In [11]:
lsh.fit(df_300)

LSHForest(min_hash_match=4, n_candidates=50, n_estimators=50, n_neighbors=5,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=None)

#### Provide a sample input

In [51]:
sample_input = ["good coffee and quiet setting and fast wifi"]
sample_df = pd.DataFrame(sample_input, columns=["sample"])

#### Load the pipeline to fit and transform the sample input

In [52]:
pipe_sample = Pipeline([
                        ('split_text', SeparateFeaturesTransformer(text_cols=['sample'])),
                        ('clean', CleanTextTransformer('sample')),
                        ('sentiment', SentimentTransformer(text_col='clean_reviews')),
                        ('vectorize', Word2VecTransformer(text_col='clean_reviews', w2v=w2v))
                                            ])

In [53]:
sample_transform = pipe_sample.fit_transform(sample_df)

SFT:  (1, 1)
CTT:  (1, 1)
ST:  (1, 3)
W2V:  (1, 300)


#### Get the 20 nearest reviews for a sample input

In [54]:
distances, indices = lsh.kneighbors(sample_transform, n_neighbors=20)

In [55]:
def get_nearest(indices, distances, df):
    df_temp = df.loc[indices, ['rating','name','reviews']]
    df_temp['dist'] = distances
    df_temp = df_temp.sort_values(['dist'], ascending=False)
    df_temp = df_temp.reset_index()
    return df_temp
        

In [56]:
df_sample_rec = get_nearest(indices[0], distances[0], df_rn)
df_sample_rec

Unnamed: 0,index,rating,name,reviews,dist
0,3942,1.0,80_Joy’sPlace,"This place is like a furnace, it's super hot. ...",0.014844
1,3794,2.0,783_CaffeStrada,Coffee tastes lousy. If it weren't for the ni...,0.014754
2,1275,5.0,32_CafeStJorge,"Cafe St. Jorge has the nicest, most edible, mo...",0.014655
3,2641,4.0,576_Olea,Delicious food and interesting wine list. But ...,0.014651
4,1239,4.0,322_DarwinCafe,"Great lunch spot, though line gets long right ...",0.014594
5,16,5.0,102_BlueBottleCoffee,I love coffee but i admit i always stick to th...,0.014589
6,2989,4.0,638_BioCafe,"We had the salmon, basil and cheese sandwich. ...",0.014466
7,640,5.0,215_PhilzCoffee,"Quick, efficient, friendly, clean, and delicio...",0.013842
8,372,4.0,167_ParamoCoffeeCompany,"In an area that is full of coffee options, I g...",0.013731
9,3364,5.0,705_AllStarDonuts,Best sugar donuts I've had in my entire life h...,0.013667


In [57]:
for i in range(len(df_sample_rec)):
    print(df_sample_rec.loc[i, 'name'])
    print("NUMBER " + str(i) + ": ", df_sample_rec.loc[i, 'reviews'])
    print("\n")
    

80_Joy’sPlace
NUMBER 0:  This place is like a furnace, it's super hot. I thought I ordered a matcha latte, but got a regular almond milk latte, which is fine, but the coffee sucks. It's sour/bitter and just not tasteful. They do have food, but I am definitely not feeling this place at all.


783_CaffeStrada
NUMBER 1:  Coffee tastes lousy.  If it weren't for the nice outdoor setting, I'd give it one star.  Problem is, not much good coffee in the vicinity.  Americano so bad, almost made me a tea drinker!


32_CafeStJorge
NUMBER 2:  Cafe St. Jorge has the nicest, most edible, most original cafe menu in the city. Everything is real honest food, not gross gut bombs or the generic cafe fair of old quiche and bagels. I'm picky- I don't want big gross carb fests but I also don't want rabbit food. Think fava bean purée on sliced baguettes with fresh lemon, olive oil, sea salt and tuna (or was it sardines? I dunno but it was filling and delicious), big chia bowls with dried fruit, nuts and fresh

# Scale up to use the entire dataset. This can be done on AWS.