# Collaborative Filtering Recommendation System

Paul Lim

## Libraries

In [1]:
# Main imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# sklearn
from sklearn.pipeline import make_pipeline
from sklearn import pipeline, feature_selection, decomposition
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cluster import DBSCAN, AgglomerativeClustering, Birch
from sklearn.decomposition import PCA, NMF
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

# Misc.
import re
import datetime
import time
import logging
import math
import json

% matplotlib inline

sns.set_style("white")
sns.set_style('ticks')
sns.set_style({'xtick.direction': u'in', 'ytick.direction': u'in'})
sns.set_style({'legend.frameon': True})

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Functions

In [2]:
def extract_reviews_json(file, nth=1, limit=100):
    """Load selected fields of a line-delimited JSON review file into a DataFrame.

    Each line of ``file`` is parsed as one JSON object. Only the first
    ``limit`` lines are scanned, and of those every ``nth`` line (starting
    with the first one) is kept.

    Parameters
    ----------
    file : str
        Path to a file containing one JSON object per line.
    nth : int, default 1
        Keep every ``nth`` scanned line; 1 keeps all of them.
    limit : int, default 100
        Maximum number of lines to scan from the start of the file.

    Returns
    -------
    pandas.DataFrame
        One row per kept line with the columns ``user_id``, ``business_id``,
        ``stars``, ``useful``, ``funny`` and ``cool`` (in that order).

    Raises
    ------
    KeyError
        If a kept record lacks one of the expected fields.
    """
    columns = ['user_id', 'business_id', 'stars', 'useful', 'funny', 'cool']
    records = []

    with open(file) as f:
        for line_no, line in enumerate(f):
            # Check the limit *before* parsing. The earlier version tested
            # `count > limit` after processing, which let limit + 2 lines in.
            if line_no >= limit:
                break
            if line_no % nth == 0:
                review_entry = json.loads(line)
                records.append([review_entry[col] for col in columns])

    # Build the frame once from the collected rows; an empty selection still
    # yields a frame with the expected columns.
    return pd.DataFrame(records, columns=columns)

def extract_business_names(file, nth=1, limit=100):
    """Load location and name fields of a line-delimited JSON business file.

    Each line of ``file`` is parsed as one JSON object. Only the first
    ``limit`` lines are scanned, and of those every ``nth`` line (starting
    with the first one) is kept.

    Parameters
    ----------
    file : str
        Path to a file containing one JSON object per line.
    nth : int, default 1
        Keep every ``nth`` scanned line; 1 keeps all of them.
    limit : int, default 100
        Maximum number of lines to scan from the start of the file.

    Returns
    -------
    pandas.DataFrame
        One row per kept line with the columns ``city``, ``state``, ``name``
        and ``encrypt``; ``encrypt`` holds the record's ``business_id``.

    Raises
    ------
    KeyError
        If a kept record lacks one of the expected fields.
    """
    # Output column -> key in the JSON record, in output column order.
    field_map = [
        ('city', 'city'),
        ('state', 'state'),
        ('name', 'name'),
        ('encrypt', 'business_id'),
    ]
    records = []

    with open(file) as f:
        for line_no, line in enumerate(f):
            # Check the limit *before* parsing. The earlier version tested
            # `count > limit` after processing, which let limit + 2 lines in.
            if line_no >= limit:
                break
            if line_no % nth == 0:
                business_entry = json.loads(line)
                records.append([business_entry[key] for _, key in field_map])

    # Build the frame once from the collected rows; an empty selection still
    # yields a frame with the expected columns.
    return pd.DataFrame(records, columns=[col for col, _ in field_map])

## Creating the recommender system

### Load in review data and convert to a dataframe (~1M reviews)

In [3]:
# Load the review file, keeping every line up to a 2,000,000-line scan limit.
reviews_path = "/home/plim0793/yelp_academic_dataset_review.json"
df_reviews = extract_reviews_json(reviews_path, nth=1, limit=2000000)

In [4]:
# Load the business file, keeping every line up to a 144,072-line scan limit.
businesses_path = "/home/plim0793/yelp_academic_dataset_business.json"
df_names = extract_business_names(businesses_path, nth=1, limit=144072)

In [5]:
# Number of businesses per state/province code, most frequent first.
df_names['state'].value_counts()

AZ     43492
NV     28214
ON     24507
NC     10177
OH      9966
PA      8091
QC      6668
WI      3899
EDH     3539
BW      2905
IL      1556
SC       498
MLN      191
HLD      172
FIF       72
ELN       36
WLN       34
NI        20
NY        13
ESX       11
SCB        3
FAL        1
STG        1
NTH        1
FLN        1
NLK        1
PKN        1
KHL        1
VT         1
Name: state, dtype: int64

#### Restrict the analysis to businesses in Arizona (AZ)

In [5]:
# Keep only the rows whose state code is 'AZ'.
is_arizona = df_names['state'] == 'AZ'
df_AZ = df_names.loc[is_arizona]

In [6]:
# Attach business details to each review. Reviews with no matching AZ
# business come out of the left join with nulls and are then removed by
# dropna(), as is any other row containing a null.
df_tot = (
    df_reviews
    .merge(df_AZ, how='left', left_on='business_id', right_on='encrypt')
    .drop('encrypt', axis=1)   # duplicate of business_id after the join
    .dropna()
)

In [7]:
# (rows, columns) of the merged review/business table.
df_tot.shape

(602358, 9)

### Create the pivot table

In [8]:
# Mean star rating per (business name, user), then unstack user_id into
# columns: one row per business name, one ('stars', user_id) column per user.
# NOTE(review): the recorded run of this cell ended in MemoryError; the dense
# name x user table is presumably too large for the machine used -- confirm
# before re-running on the full data.
df_wide = pd.pivot_table(df_tot, values=['stars'],
                                index=['name', 'user_id'],
                                aggfunc=np.mean).unstack()
df_wide.shape

MemoryError: 

In [None]:
# Random half of the rows of df_wide. The fixed seed makes the sample
# reproducible across kernel restarts.
df_wide_sample = df_wide.sample(frac=0.5, random_state=42)

In [10]:
# Peek at the top-left 5 x 5 corner of the wide table. `.iloc` selects by
# position; the deprecated `.ix` indexer is no longer available in current
# pandas.
df_wide.iloc[0:5, 0:5]

Unnamed: 0_level_0,stars,stars,stars,stars,stars
user_id,---94vtJ_5o_nikEs6hUjg,---cu1hq55BP9DWVXXKHZg,--1av6NdbEbMiuBr7Aup9A,--3WaS23LcIXtxyFULJHTA,--3oMd6gjXpAzhjLBrsVCQ
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
$100 Substance Abuse Evaluation,,,,,
.99 Cent Pizza Place,,,,,
1 800-Flowers,,,,,
1 Epic Nails & Spa,,,,,
1 on 1 Technologies,,,,,


### Fill the null values with 2.5 (midpoint between 0 and 5)

In [13]:
# Replace every missing (business, user) rating with the constant 2.5.
# NOTE(review): the similarity values recorded further down are all ~0.999;
# presumably the constant fill dominates every row -- worth confirming whether
# a different fill (or a sparse representation) is more appropriate.
df_wide = df_wide.fillna(2.5)

MemoryError: 

### Calculate pairwise cosine similarities between businesses and convert to a DataFrame

In [54]:
# Pairwise cosine similarity between the rows of df_wide (one row per
# business name).
dists = cosine_similarity(df_wide)

In [55]:
# Wrap the similarity matrix in a DataFrame labelled on both axes by the
# row labels of df_wide.
business_labels = df_wide.index
df_dists = pd.DataFrame(dists, index=business_labels, columns=business_labels)

In [56]:
# Show the top-left 10 x 10 corner of the similarity table. `.iloc` selects
# by position; the deprecated `.ix` indexer is no longer available in current
# pandas.
df_dists.iloc[:10, :10]

name,.99 Cent Pizza Place,10 Factory Fitness Center,1800-Plumbing of Arizona,1st Bank,1st Class Collision,2 B Wireless,2-Save Auto Glass,201 West,2015 NAILS & SPA,24 Hour Glass & Door Repair
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
.99 Cent Pizza Place,1.0,0.999766,0.999708,0.999822,0.999602,0.999803,0.999784,0.999843,0.999813,0.99984
10 Factory Fitness Center,0.999766,1.0,0.999712,0.999827,0.999607,0.999807,0.999789,0.999854,0.999818,0.999844
1800-Plumbing of Arizona,0.999708,0.999712,1.0,0.999768,0.999549,0.999749,0.999731,0.999796,0.999759,0.999786
1st Bank,0.999822,0.999827,0.999768,1.0,0.999663,0.999864,0.999845,0.999911,0.999874,0.9999
1st Class Collision,0.999602,0.999607,0.999549,0.999663,1.0,0.999644,0.999625,0.999691,0.999654,0.99968
2 B Wireless,0.999803,0.999807,0.999749,0.999864,0.999644,1.0,0.999826,0.999891,0.999854,0.999881
2-Save Auto Glass,0.999784,0.999789,0.999731,0.999845,0.999625,0.999826,1.0,0.999873,0.999836,0.999862
201 West,0.999843,0.999854,0.999796,0.999911,0.999691,0.999891,0.999873,1.0,0.999901,0.999928
2015 NAILS & SPA,0.999813,0.999818,0.999759,0.999874,0.999654,0.999854,0.999836,0.999901,1.0,0.999891
24 Hour Glass & Door Repair,0.99984,0.999844,0.999786,0.9999,0.99968,0.999881,0.999862,0.999928,0.999891,1.0


In [57]:
# Businesses to base the recommendation on.
food = ['Taco Bell']

# Similarity of the first few businesses to each selected one.
df_dists.loc[:, food].head()

name,Taco Bell
name,Unnamed: 1_level_1
.99 Cent Pizza Place,0.999369
10 Factory Fitness Center,0.999374
1800-Plumbing of Arizona,0.999315
1st Bank,0.99943
1st Class Collision,0.99921


In [58]:
# Total similarity of every business to the selected ones: sum across the
# selected columns for each row. The vectorised sum replaces a per-row
# Python lambda and gives the same values.
food_sum = df_dists[food].sum(axis=1)

In [59]:
# Display the summed similarity score per business.
food_sum

name
.99 Cent Pizza Place                                      0.999369
10 Factory Fitness Center                                 0.999374
1800-Plumbing of Arizona                                  0.999315
1st Bank                                                  0.999430
1st Class Collision                                       0.999210
2 B Wireless                                              0.999410
2-Save Auto Glass                                         0.999392
201 West                                                  0.999458
2015 NAILS & SPA                                          0.999421
24 Hour Glass & Door Repair                               0.999447
3 Margaritas                                              0.999409
3 Oceans Entertainment                                    0.999259
34 Easy St                                                0.999439
360 Physical Therapy                                      0.999374
3rd Avenue Grill                                         