# Building a Recommender System

Paul Lim

## Libraries

In [44]:
# Main imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.externals import joblib
from sklearn import pipeline, feature_selection, decomposition
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import DBSCAN, AgglomerativeClustering, Birch
from sklearn.neighbors import NearestNeighbors, LSHForest

# NLP 
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import spacy
import gensim
from gensim import models
from gensim.models import word2vec
import snowballstemmer

# Misc.
import re
import datetime
import time
import logging
import math

# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
% matplotlib inline

# Seaborn styling: the "white" and 'ticks' presets are applied one after the
# other, then two dict overrides set inward-pointing ticks and a framed legend.
sns.set_style("white")
sns.set_style('ticks')
sns.set_style({'xtick.direction': u'in', 'ytick.direction': u'in'})
sns.set_style({'legend.frameon': True})

# Timestamped INFO-level logging; the "<time> : INFO : ..." lines echoed further
# down in this notebook follow this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Functions/Classes

In [25]:
class DataframeToSeriesTransformer(BaseEstimator, TransformerMixin):
    """Select ``col`` from the incoming frame via ``X[col]``.

    With no column configured (``col`` falsy) the input is handed back
    unchanged, so the step can be left in a pipeline as a no-op.
    """

    def __init__(self, col=None):
        self.col = col

    def fit(self, x, y=None):
        """Stateless step: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[self.col]`` (printing its shape), or ``X`` when ``col`` is unset."""
        if not self.col:
            return X

        selected = X[self.col]
        print("DTST: ", selected.shape)
        return selected
        
class SeparateFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Slice a frame down to either its numeric or its text columns.

    ``num_cols`` takes precedence over ``text_cols`` when both are given;
    when neither is set the input is returned untouched.
    """

    def __init__(self, num_cols=None, text_cols=None):
        self.num_cols = num_cols
        self.text_cols = text_cols

    def fit(self, x, y=None):
        """Stateless step: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X.loc[:, <selected columns>]`` and print the resulting shape."""
        # Same precedence as an if/elif chain: numeric first, then text.
        wanted = self.num_cols or self.text_cols
        if not wanted:
            return X

        subset = X.loc[:, wanted]
        print("SFT: ", subset.shape)
        return subset
        
class WilsonAverageTransformer(BaseEstimator, TransformerMixin):
    """Replace the input with a one-column frame of per-business average scores.

    Parameters
    ----------
    num_col : optional
        Only used as an on/off switch: the averaging runs when it is truthy.
        It is not forwarded to the scoring helper.
    biz_list : optional
        Collection of businesses passed to ``get_average_rating``. It must
        expose ``.all()`` (e.g. a NumPy array or pandas Series).

    The scores come from ``get_average_rating(X, biz_list)``, a helper that is
    defined elsewhere in the notebook.
    NOTE(review): presumably it returns Wilson-adjusted averages, one per
    business -- confirm against the helper's definition.
    """

    def __init__(self, num_col=None, biz_list=None):
        self.num_col = num_col
        self.biz_list = biz_list

    def fit(self, x, y=None):
        """Stateless step: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a frame with a single ``'average'`` column, or ``X`` unchanged.

        ``X`` is passed through when ``num_col`` is unset, when ``biz_list`` is
        missing, or when ``biz_list.all()`` is falsy.
        """
        # The default biz_list=None used to crash on ``None.all()``; treat a
        # missing list like the other "not configured" cases and pass X through.
        if not self.num_col or self.biz_list is None:
            return X
        if not self.biz_list.all():
            return X

        scores = get_average_rating(X, self.biz_list)

        X_avg = pd.DataFrame({'average': scores})
        print("WAT: ", X_avg.shape)
        return X_avg
        
class CleanTextTransformer(BaseEstimator, TransformerMixin):
    """Strip stop words from each review and lemmatize what remains.

    The output is a new single-column frame, ``'clean_reviews'``, holding one
    space-joined string per input row (each kept word is followed by a space).
    When ``text_col`` is falsy the input is returned unchanged.
    """

    def __init__(self, text_col=None):
        self.text_col = text_col

    def fit(self, x, y=None):
        """Stateless step: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a ``'clean_reviews'`` frame built from ``X[text_col]``."""
        # Stop-word set = NLTK's English list plus the stemmed form of each
        # entry. Built before the text_col check, as in the original flow.
        english_stop = stopwords.words('english')
        stemmed_stop = snowballstemmer.EnglishStemmer().stemWords(english_stop)
        stop = set(english_stop) | set(stemmed_stop)

        if not self.text_col:
            return X

        clean_review_list = [
            ''.join(
                token.lemmatize() + ' '
                for token in TextBlob(review).words
                if token not in stop
            )
            for review in X.loc[:, self.text_col]
        ]

        df = pd.DataFrame()
        df['clean_reviews'] = clean_review_list
        print("CTT: ", df.shape)
        return df
        
class DensifyTransformer(BaseEstimator, TransformerMixin):
    """Turn any input exposing ``.toarray()`` into a dense DataFrame."""

    def fit(self, X, y=None):
        """Stateless step: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``pd.DataFrame(X.toarray())`` and print its shape."""
        dense = X.toarray()
        frame = pd.DataFrame(dense)
        print("DT: ", frame.shape)
        return frame
    
class SentimentTransformer(BaseEstimator, TransformerMixin):
    """Add summed TextBlob sentiment scores for every document.

    For each row of ``X[text_col]`` the per-sentence sentiment tuples are
    summed component-wise: element 0 goes to ``'pol'`` and element 1 to
    ``'sub'``. The original text is carried along as ``'clean_reviews'``.
    When ``text_col`` is falsy the input is returned unchanged.
    """

    def __init__(self, text_col=None):
        self.text_col = text_col

    def fit(self, x, y=None):
        """Stateless step: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a frame with columns ``'pol'``, ``'sub'`` and ``'clean_reviews'``.

        The result always has a fresh 0..n-1 index; rows are in the same
        order as ``X``.
        """
        if not self.text_col:
            return X

        texts = X.loc[:, self.text_col]
        sum_pol_list = []
        sum_sub_list = []

        for doc in texts:
            sentiments = [sent.sentiment for sent in TextBlob(doc).sentences]
            sum_pol_list.append(sum(s[0] for s in sentiments))
            sum_sub_list.append(sum(s[1] for s in sentiments))

        df = pd.DataFrame()
        df['pol'] = sum_pol_list
        df['sub'] = sum_sub_list
        # Need to keep the clean reviews for the W2V transformer.
        # Assign positionally (plain list): assigning the Series itself makes
        # pandas align on index labels, which yields NaN or misplaced text
        # whenever X does not carry a default 0..n-1 index.
        df['clean_reviews'] = list(texts)
        print("ST: ", df.shape)
        return df

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Embed each document as the mean of its words' word2vec vectors.

    Parameters
    ----------
    text_col : optional
        Column of ``X`` holding the documents. Each entry may be a string
        (split on whitespace) or an already tokenized iterable of words.
        When falsy, ``transform`` returns ``X`` unchanged.
    w2v : optional
        Word-vector model exposing ``word_vec(word)`` and raising ``KeyError``
        for unknown words (e.g. a gensim ``KeyedVectors``). Required whenever
        ``text_col`` is set.
    """

    def __init__(self, text_col=None, w2v=None):
        self.text_col = text_col
        self.w2v = w2v

    def fit(self, x, y=None):
        """Stateless step: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return one row of averaged word vectors per document.

        Documents without any in-vocabulary word get an all-zero vector
        instead of the NaN row a 0/0 division would produce.

        Raises
        ------
        ValueError
            If ``text_col`` is set but no ``w2v`` model was supplied.
        """
        if not self.text_col:
            return X

        # Use the model handed to the constructor. The earlier code read a
        # notebook-global ``w2v`` and silently swallowed the resulting
        # NameError when that global did not exist.
        model = self.w2v
        if model is None:
            raise ValueError(
                "Word2VecTransformer needs a word2vec model: pass w2v=<model>."
            )

        # Take the dimensionality from the model; 300 matches the previously
        # hard-coded size and is kept as the fallback.
        dim = getattr(model, 'vector_size', 300)

        avg_w2v_list = []
        for review in X.loc[:, self.text_col]:
            # Iterating a str yields single characters, so split it into
            # words first; pre-tokenized input is used as-is.
            tokens = review.split() if isinstance(review, str) else review

            total = np.zeros(dim)
            count = 0
            for word in tokens:
                try:
                    vector = model.word_vec(word)
                except KeyError:
                    # Out-of-vocabulary word: skip it.
                    continue
                total += vector
                count += 1

            avg_w2v_list.append(total / count if count else total)

        df = pd.DataFrame(avg_w2v_list)
        print("W2V: ", df.shape)
        return df
        
class ToDataFrameTransformer(BaseEstimator, TransformerMixin):
    """Wrap whatever the previous step produced in a ``pandas.DataFrame``."""

    def fit(self, x, y=None):
        """Stateless step: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``pd.DataFrame(X)`` and print its shape."""
        frame = pd.DataFrame(X)
        print("TDFT: ", frame.shape)
        return frame
        
class DropTextTransformer(BaseEstimator, TransformerMixin):
    """Remove the text column(s) named by ``text_col`` from the frame.

    When ``text_col`` is falsy the input is returned unchanged, matching the
    pass-through behaviour of the other transformers in this notebook.
    """

    def __init__(self, text_col=None):
        self.text_col = text_col

    def fit(self, x, y=None):
        """Stateless step: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X`` without ``text_col`` (printing the new shape)."""
        if not self.text_col:
            # Without this branch the method fell off the end and returned
            # None, which breaks any downstream pipeline step.
            return X

        df = X.drop(self.text_col, axis=1)
        print("DTT: ", df.shape)
        return df

## Content-Based Recommender System

### This recommender is based on Yelp reviews of cafes in the San Francisco Bay Area

#### Load the data

In [27]:
# Load the DataFrame previously persisted with joblib. As the preview below
# shows, it holds a 'rating' column, columns 1..300 and a 'name' column.
df_best = joblib.load('../data/df_best')

In [28]:
# Preview the first five rows.
df_best.head()

Unnamed: 0,rating,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,300,name
0,4.0,-0.171138,0.110859,0.007889,0.148174,-0.032908,0.011566,-0.097138,-0.047231,-0.034002,...,-0.023892,-0.113161,0.081303,-0.028848,-0.163551,-0.096561,-0.006837,-0.09654,0.170566,0_FourBarrelCoffee
1,5.0,-0.148333,0.119471,0.00899,0.129535,-0.040324,0.018529,-0.11037,-0.052895,-0.063144,...,-0.020208,-0.116654,0.091738,-0.02603,-0.148781,-0.113272,-0.010141,-0.115695,0.14518,0_FourBarrelCoffee
2,4.0,-0.165882,0.112039,0.012008,0.131343,-0.039475,0.021509,-0.101088,-0.057847,-0.044514,...,-0.006879,-0.099389,0.09493,-0.028523,-0.151354,-0.106891,-0.015555,-0.115064,0.156558,0_FourBarrelCoffee
3,2.0,-0.182368,0.110774,0.00286,0.140167,-0.051075,0.016756,-0.095475,-0.046202,-0.047613,...,-0.014586,-0.103274,0.082766,-0.03393,-0.157451,-0.109556,-0.020041,-0.106313,0.154082,0_FourBarrelCoffee
4,5.0,-0.159966,0.105777,0.004223,0.13247,-0.050917,0.013202,-0.092117,-0.073106,-0.037257,...,0.003893,-0.098995,0.088994,-0.037172,-0.156656,-0.102626,-0.017717,-0.094929,0.141094,0_FourBarrelCoffee


#### Separate the rating and name columns from the 300-dimensional vector columns

In [33]:
# Metadata only: business name and star rating for every row.
df_rn = df_best.loc[:, ['name', 'rating']]

In [35]:
# The vector columns are labelled with the integers 1..300.
df_300 = df_best[list(range(1, 301))]

In [37]:
# Plain NumPy copy of the 300 vector columns.
# NOTE(review): the cells visible below fit and query on df_300, not arr_300 --
# confirm whether this array is still needed.
arr_300 = np.array(df_300)

#### Load the Google word2vec model

In [26]:
# ONLY RUN ONCE AT THE START OF THE KERNEL
# w2v = models.KeyedVectors.load_word2vec_format("~/Documents/GoogleNews-vectors-negative300.bin.gz",binary=True)

2017-05-26 18:26:41,968 : INFO : loading projection weights from ~/Documents/GoogleNews-vectors-negative300.bin.gz
2017-05-26 18:29:29,513 : INFO : loaded (3000000, 300) matrix from ~/Documents/GoogleNews-vectors-negative300.bin.gz


#### Build the feature pipeline

In [29]:
# Feature pipeline: the numeric 'rating' column is passed through as-is, while
# the 'reviews' text is cleaned, sentiment-scored and embedded with word2vec.
# Requires the `w2v` model from the loading cell above to be in scope.
pipe_w2v = Pipeline([
    ('combined_features', FeatureUnion([
        # Numeric branch.
        ('num_feat', SeparateFeaturesTransformer(num_cols=['rating'])),
        # Text branch.
        ('text_feat', Pipeline([
            ('split_text', SeparateFeaturesTransformer(text_cols=['reviews'])),
            ('clean', CleanTextTransformer(text_col='reviews')),
            ('sentiment', SentimentTransformer(text_col='clean_reviews')),
            ('vectorize', Word2VecTransformer(text_col='clean_reviews', w2v=w2v)),
        ])),
    ])),
])

#### Set the model

In [32]:
# Density-based clustering model; only configured here, not fitted in this cell.
db = DBSCAN(eps=0.5, min_samples=10, metric="euclidean")

#### Train the LSH Forest algorithm

In [47]:
# LSH Forest estimator: 50 estimators, 5 neighbours returned by default.
# NOTE(review): LSHForest was deprecated in scikit-learn 0.19 and removed in
# 0.21 -- confirm the pinned scikit-learn version before re-running.
lsh = LSHForest(n_neighbors=5, n_estimators=50)

In [48]:
# Fit the forest on the 300 vector columns (one row per review).
lsh.fit(df_300)

LSHForest(min_hash_match=4, n_candidates=50, n_estimators=50, n_neighbors=5,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=None)

In [50]:
# Query 100 neighbours for every fitted row (overriding the n_neighbors=5 set at
# construction). The query set is the training data itself, so a row can show
# up among its own neighbours -- e.g. indices[0] below starts with 0.
distances, indices = lsh.kneighbors(df_300, n_neighbors=100)

#### Save the distances and indices so that the code doesn't have to be rerun each time

In [54]:
# Persist both arrays with joblib. Each dump call returns the list of files it
# wrote, which is why the cell echoes the path of the last one.
joblib.dump(distances, '../data/dist')
joblib.dump(indices, '../data/indices')

['../data/indices']

In [58]:
# Sanity check: one row per query row, one column per requested neighbour.
distances.shape

(4155, 100)

In [62]:
# Row positions of the 100 neighbours returned for the first review.
indices[0]

array([   0,   17, 3820, 2155, 3025,  694, 2350,  544,  203, 1935, 3485,
       2590,  611, 1984, 1392, 2904, 3719, 3551, 2370, 3749, 1195, 2326,
         48,  781, 3275, 2119, 3056, 2430, 2606,  612,  539, 3641, 2286,
       1567, 2869, 2785, 3440,  108,  303, 3748, 3511,  790, 1432, 3648,
       3646, 2450, 2342,  739, 1788,  507,  452, 3405, 1220, 1479,   16,
       2183, 2124,  653, 3055, 2537, 4118,  302, 2455, 3967, 1248, 3973,
       1143, 2420, 3277, 3063, 1435, 3459, 2158,  247,  359, 1721, 3010,
       1851, 1850, 2835, 3136,  242, 2019, 3562, 3656, 2185,  796,  795,
       3854, 2768,  721, 1124, 1523, 4019, 1173, 3416,  737, 1222, 3258,
       3607])

In [64]:
# Preview the first five rows again, for reference next to the neighbour indices.
df_best.head()

Unnamed: 0,rating,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,300,name
0,4.0,-0.171138,0.110859,0.007889,0.148174,-0.032908,0.011566,-0.097138,-0.047231,-0.034002,...,-0.023892,-0.113161,0.081303,-0.028848,-0.163551,-0.096561,-0.006837,-0.09654,0.170566,0_FourBarrelCoffee
1,5.0,-0.148333,0.119471,0.00899,0.129535,-0.040324,0.018529,-0.11037,-0.052895,-0.063144,...,-0.020208,-0.116654,0.091738,-0.02603,-0.148781,-0.113272,-0.010141,-0.115695,0.14518,0_FourBarrelCoffee
2,4.0,-0.165882,0.112039,0.012008,0.131343,-0.039475,0.021509,-0.101088,-0.057847,-0.044514,...,-0.006879,-0.099389,0.09493,-0.028523,-0.151354,-0.106891,-0.015555,-0.115064,0.156558,0_FourBarrelCoffee
3,2.0,-0.182368,0.110774,0.00286,0.140167,-0.051075,0.016756,-0.095475,-0.046202,-0.047613,...,-0.014586,-0.103274,0.082766,-0.03393,-0.157451,-0.109556,-0.020041,-0.106313,0.154082,0_FourBarrelCoffee
4,5.0,-0.159966,0.105777,0.004223,0.13247,-0.050917,0.013202,-0.092117,-0.073106,-0.037257,...,0.003893,-0.098995,0.088994,-0.037172,-0.156656,-0.102626,-0.017717,-0.094929,0.141094,0_FourBarrelCoffee
