# Collaborative Filter Recommendation System

Paul Lim

## Libraries

In [46]:
# Main imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# sklearn
from sklearn.pipeline import make_pipeline
from sklearn import pipeline, feature_selection, decomposition
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cluster import DBSCAN, AgglomerativeClustering, Birch
from sklearn.decomposition import PCA, NMF
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils.extmath import randomized_svd

# Misc.
import re
import datetime
import time
import logging
import math
import json

% matplotlib inline

sns.set_style("white")
sns.set_style('ticks')
sns.set_style({'xtick.direction': u'in', 'ytick.direction': u'in'})
sns.set_style({'legend.frameon': True})

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Functions

In [2]:
def extract_reviews_json(file, nth=1, limit=100):
    """Load selected fields of a JSON-lines review file into a DataFrame.

    Every line of ``file`` is expected to hold one JSON object. Only the
    first ``limit`` lines are scanned, and of those only every ``nth`` line
    (line 0, ``nth``, ``2 * nth``, ...) is parsed and kept.

    Parameters
    ----------
    file : str or path-like
        Path to the JSON-lines file.
    nth : int, default 1
        Sampling step; ``1`` keeps every scanned line.
    limit : int or None, default 100
        Maximum number of lines to scan from the start of the file.
        ``None`` scans the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per kept record with the columns ``user_id``,
        ``business_id``, ``stars``, ``useful``, ``funny`` and ``cool``
        (always present, even when no record was kept).

    Raises
    ------
    KeyError
        If a kept record lacks one of the fields above.
    json.JSONDecodeError
        If a kept, non-blank line is not valid JSON.
    """
    columns = ['user_id', 'business_id', 'stars', 'useful', 'funny', 'cool']
    rows = []

    # Be explicit about the encoding rather than depending on the locale.
    with open(file, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            # Stop before processing line `limit`, so that at most `limit`
            # lines are ever scanned.
            if limit is not None and line_no >= limit:
                break
            # Blank lines still count towards `limit` and the `nth` stride,
            # but carry no record to parse.
            if line_no % nth != 0 or not line.strip():
                continue
            review_entry = json.loads(line)
            rows.append([review_entry[col] for col in columns])

    # Build the frame once instead of assigning column by column.
    return pd.DataFrame(rows, columns=columns)

def extract_business_names(file, nth=1, limit=100):
    """Load business location and name fields from a JSON-lines file.

    Every line of ``file`` is expected to hold one JSON object. Only the
    first ``limit`` lines are scanned, and of those only every ``nth`` line
    (line 0, ``nth``, ``2 * nth``, ...) is parsed and kept.

    Parameters
    ----------
    file : str or path-like
        Path to the JSON-lines file.
    nth : int, default 1
        Sampling step; ``1`` keeps every scanned line.
    limit : int or None, default 100
        Maximum number of lines to scan from the start of the file.
        ``None`` scans the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per kept record with the columns ``city``, ``state``,
        ``name`` and ``encrypt``; ``encrypt`` holds the record's
        ``business_id`` value. The columns are always present, even when no
        record was kept.

    Raises
    ------
    KeyError
        If a kept record lacks ``city``, ``state``, ``name`` or
        ``business_id``.
    json.JSONDecodeError
        If a kept, non-blank line is not valid JSON.
    """
    rows = []

    # Be explicit about the encoding rather than depending on the locale.
    with open(file, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            # Stop before processing line `limit`, so that at most `limit`
            # lines are ever scanned.
            if limit is not None and line_no >= limit:
                break
            # Blank lines still count towards `limit` and the `nth` stride,
            # but carry no record to parse.
            if line_no % nth != 0 or not line.strip():
                continue
            business_entry = json.loads(line)
            rows.append((business_entry['city'],
                         business_entry['state'],
                         business_entry['name'],
                         business_entry['business_id']))

    # Build the frame once instead of assigning column by column.
    return pd.DataFrame(rows, columns=['city', 'state', 'name', 'encrypt'])

## Creating the recommender system

### Load in review data and convert to a dataframe (up to ~2M reviews)

In [3]:
# Read the review dump: nth=1 keeps every scanned line, and limit=2000000
# caps how far into the file the scan goes.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df_reviews = extract_reviews_json("/home/plim0793/yelp_academic_dataset_review.json", nth=1, limit=2000000)

In [4]:
# Read the business dump, keeping every scanned line (nth=1).
# limit=144072 presumably matches the number of businesses in the file (the
# state counts displayed for df_names add up to that figure) - verify if the
# dataset version changes.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df_names = extract_business_names("/home/plim0793/yelp_academic_dataset_business.json", nth=1, limit=144072)

In [5]:
# Number of businesses per state/region code, most frequent first.
df_names['state'].value_counts()

AZ     43492
NV     28214
ON     24507
NC     10177
OH      9966
PA      8091
QC      6668
WI      3899
EDH     3539
BW      2905
IL      1556
SC       498
MLN      191
HLD      172
FIF       72
ELN       36
WLN       34
NI        20
NY        13
ESX       11
SCB        3
FAL        1
STG        1
NTH        1
FLN        1
NLK        1
PKN        1
KHL        1
VT         1
Name: state, dtype: int64

#### Focus on just businesses from OH

In [48]:
# Restrict the business table to rows whose state code is 'OH'.
is_ohio = df_names['state'] == 'OH'
df_OH = df_names.loc[is_ohio]

In [49]:
# Attach business details to each review and keep only reviews of businesses
# present in df_OH. An inner join filters directly, rather than a left join
# that first builds a NaN-padded row for every unmatched review only to drop
# it again. 'encrypt' duplicates 'business_id' after the join, so it is
# removed; the final dropna() still discards any row with a missing field.
df_tot = (
    df_reviews
    .merge(df_OH, how='inner', left_on='business_id', right_on='encrypt')
    .drop('encrypt', axis=1)
    .dropna()
)

In [50]:
# (rows, columns) of the merged review/business frame.
df_tot.shape

(97648, 9)

### Create the pivot table

In [51]:
# Mean star rating for every (business name, user) pair, then move user_id
# from the row index into the columns: one row per business name, one column
# per user, NaN where that user has no review for that name.
df_wide = df_tot.pivot_table(
    values=['stars'],
    index=['name', 'user_id'],
    aggfunc=np.mean,
).unstack()
df_wide.shape

(4886, 36573)

In [52]:
# Shuffle the business rows (frac=1 keeps every row). The seed is fixed so
# the row order, and every preview that depends on it, is the same on each run.
df_wide_sample = df_wide.sample(frac=1, random_state=0)

In [53]:
# Shuffling must not change the dimensions; compare with df_wide.shape.
df_wide_sample.shape

(4886, 36573)

In [77]:
# Preview the top-left 5x5 corner by position. `.ix` has been removed from
# pandas; `.iloc` is the explicit positional indexer.
df_wide_sample.iloc[:5, :5]

Unnamed: 0_level_0,stars,stars,stars,stars,stars
user_id,--Awhttr0FhVud5prVpVlA,--J8UruLD_xvVuI1lMAxpA,--_EpULz-cjQit4npXy1ng,--agAy0vRYwG6WqbInorfg,--amZ_cR9Zgu4B1RrAHJyA
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Delta Computers Inc,2.5,2.5,2.5,2.5,2.5
Parma Auto Body,2.5,2.5,2.5,2.5,2.5
Skyview Lodge Event Center,2.5,2.5,2.5,2.5,2.5
Sweet Melissa,2.5,2.5,2.5,2.5,2.5
Dish Disposal,2.5,2.5,2.5,2.5,2.5


### Fill the null values with 2.5 (midpoint between 0 and 5)

In [55]:
# Replace every missing (business, user) rating with the 2.5 placeholder so
# the matrix has no NaNs left.
df_wide_sample = df_wide_sample.fillna(2.5)

### Get user matrix and business matrix

In [57]:
# Rank-10 truncated SVD of the business x user rating matrix.
# - `.values` hands scikit-learn the plain ndarray it documents as input
#   rather than a DataFrame.
# - `random_state` is pinned because the algorithm is randomized; without a
#   seed the factors (and everything derived from them) can change per run.
U, sig, VT = randomized_svd(df_wide_sample.values,
                            n_components=10,
                            n_iter=5,
                            random_state=0)

In [60]:
# U has one row per business and one column per retained component;
# VT has one row per component and one column per user.
print("Business in User Space: ", U.shape)
print("User in Business Space: ", VT.shape)

Business in User Space:  (4886, 10)
User in Business Space:  (10, 36573)


### Calculate cosine similarities and convert to dataframe

In [61]:
# Pairwise cosine similarity between the rows of U (one row per business).
# Despite the name `dists`, larger values mean MORE similar businesses.
dists = cosine_similarity(U)

In [62]:
# Average of all entries of the similarity matrix (diagonal included).
np.mean(dists)

0.73743438255103022

In [63]:
# Wrap the similarity matrix in a DataFrame labelled on both axes with the
# business names, in the same row order that was fed to the SVD.
df_dists = pd.DataFrame(
    dists,
    index=df_wide_sample.index,
    columns=df_wide_sample.index,
)

In [74]:
# Preview the top-left 5x5 corner by position. `.ix` has been removed from
# pandas; `.iloc` is the explicit positional indexer.
df_dists.iloc[:5, :5]

name,Delta Computers Inc,Parma Auto Body,Skyview Lodge Event Center,Sweet Melissa,Dish Disposal
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Delta Computers Inc,1.0,0.972145,0.959831,0.08113,0.971435
Parma Auto Body,0.972145,1.0,0.987774,-0.101072,0.999429
Skyview Lodge Event Center,0.959831,0.987774,1.0,-0.10351,0.991503
Sweet Melissa,0.08113,-0.101072,-0.10351,1.0,-0.100075
Dish Disposal,0.971435,0.999429,0.991503,-0.100075,1.0


### Try a sample input

In [76]:
# Seed businesses to recommend from; each entry must match a column label of
# df_dists exactly.
food = ['LongHorn Steakhouse', "Flury's Cafe"]

In [69]:
# Score every business by its summed similarity to the seed businesses.
# The vectorized row sum replaces a per-row Python lambda and gives the same
# totals.
food_sum = df_dists[food].sum(axis=1)

#### Top 10 recommendations

In [75]:
# Ten businesses with the highest summed similarity to the seeds.
food_sum.sort_values(ascending=False).head(10)

name
Hibachi Japan Steak House                   1.803817
Sugo Modern Italian Bistro                  1.802863
Ceedo's Eatery                              1.789440
The Natatorium Health and Fitness Center    1.781428
Mark & Philly's                             1.778921
Luca's New York Style Pizza                 1.775471
Great Lakes Baking Company                  1.773838
Sugar-Luv Confections                       1.772519
Corner Cup Coffeehouse                      1.767198
Rinky Dink Family Fun Center                1.747650
dtype: float64

In [78]:
# Persist the similarity frame to disk with joblib for later reuse.
# NOTE(review): the target is an absolute, machine-specific path, and
# `sklearn.externals.joblib` is no longer shipped by recent scikit-learn
# releases - the standalone `joblib` package is the replacement.
joblib.dump(df_dists, '/home/plim0793/fletcher/df_dists')

['/home/plim0793/fletcher/df_dists']