# Subsetting Data

### Import Libraries

In [2]:
import csv
import os
import pandas as pd
from time import time
import warnings

# For reading from Postgres
from sqlalchemy import create_engine

warnings.filterwarnings('ignore')

### Set filepaths

In [57]:
raw_data_directory     = os.path.join('..', 'data', 'raw')
interim_data_directory = os.path.join('..', 'data', 'interim')

review_filepath            = os.path.join(raw_data_directory, 'yelp_academic_dataset_review.csv')
business_filepath          = os.path.join(raw_data_directory, 'yelp_academic_dataset_business.csv')
restaurant_review_filepath = os.path.join(interim_data_directory, 'restaurant_review.csv')
restaurant_filepath        = os.path.join(interim_data_directory, 'restaurant.csv')

### Load Data

In [58]:
%%time 
review_df = pd.read_csv(restaurant_review_filepath)

CPU times: user 25.6 s, sys: 5.15 s, total: 30.8 s
Wall time: 32.8 s


In [59]:
review_df.head()

Unnamed: 0,date,stars,text,review_id,business_id,business_name
0,2011-02-25,2,The pizza was okay. Not the best I've had. I p...,x7mDIiDB3jEiPGPHOmDzyw,iCQpiavjjPzJ5_3gPD5Ebg,Secret Pizza
1,2012-11-13,5,I love this place! My fiance And I go here atl...,dDl8zu1vWPdKGihJrwQbpw,pomGBqfbxcqPv14c3XH-ZQ,Leticia's Mexican Cocina
2,2014-10-23,1,Terrible. Dry corn bread. Rib tips were all fa...,LZp4UX5zK3e-c5ZGSeo3kA,jtQARsP6P-LbkyjbO1qNGg,H&H BBQ Plus 2
3,2011-02-25,2,Back in 2005-2007 this place was my FAVORITE t...,Er4NBWCmCD4nM8_p1GRdow,elqbBhBfElMNSrjFqW3now,Pin Kaow Thai Restaurant
4,2014-09-05,5,Delicious healthy food. The steak is amazing. ...,jsDu6QEJHbwP2Blom1PLCA,Ums3gaP2qM3W1XcA5r6SsQ,Braddah's Island Style


In [6]:
restaurant_df = pd.read_csv(restaurant_filepath)

In [8]:
restaurant_df.head()

Unnamed: 0,name,business_id,stars,review_count,categories,longitude,latitude,postal_code,city,state
0,Minhas Micro Brewery,Apn5Q_b6Nz61Tq4XzPdf9A,4.0,24,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",-114.031675,51.091813,T2E 6L6,Calgary,AB
1,CK'S BBQ & Catering,AjEbIBw6ZFfln7ePHha9PA,4.5,3,"Chicken Wings, Burgers, Caterers, Street Vendo...",-114.939821,35.960734,89002,Henderson,NV
2,La Bastringue,O8S5hYJ1SMc8fA4QBtVujA,4.0,5,"Breakfast & Brunch, Restaurants, French, Sandw...",-73.5993,45.540503,H2G 1K7,Montréal,QC
3,Thai One On,6OuOZAok8ikONMS_T3EzXg,2.0,7,"Restaurants, Thai",-79.632763,43.712946,L4T 1A8,Mississauga,ON
4,Filiberto's Mexican Food,8-NRKkPY1UiFXW20WXKiXg,2.5,40,"Mexican, Restaurants",-112.341302,33.448106,85323,Avondale,AZ


In [5]:
mon_ami_gabi_id = '4JNXUYY8wbaaDmk3BPzlWw'

### Selecting a restaurant using PostgreSQL

In [4]:
public_ip = 'localhost'
username = 'postgres'
password = 'password'
port = '5432'
database = 'yelp'

# Construct database URL from environment variables
uri = f'postgresql://{username}:{password}@{public_ip}:{port}/{database}'

# Connection to Postgres database
engine = create_engine(uri)

In [20]:
%%time

SQL = f'''
SELECT date, stars, text, review_id, business_name
FROM reviews
WHERE business_id = '{mon_ami_gabi_id}'
  AND text ILIKE '%%onion soup%%'
'''

reviews = pd.read_sql(SQL, con = engine)

CPU times: user 8.91 ms, sys: 4.42 ms, total: 13.3 ms
Wall time: 29.4 s


In [21]:
reviews.head()

Unnamed: 0,date,stars,text,review_id,business_name
0,2015-02-10,2,Other than being right across the Fountains of...,uczUlWIWuO-KzoUiLhICNw,Mon Ami Gabi
1,2014-12-03,5,Where to begin! Now our dining experience her...,QoY3L_d_axTcMn68pI8zxQ,Mon Ami Gabi
2,2017-04-24,3,French onion soup was watery with little taste...,185E0cpQpDRUO4JRGu3fXQ,Mon Ami Gabi
3,2008-05-16,4,This place gets an extra star just for their c...,-RR6sy7KqQ7J_x5hQxEUkA,Mon Ami Gabi
4,2010-12-04,4,Charming resturant that looks like it would be...,nth_q-GqOy_Ly8sxsREIwA,Mon Ami Gabi


In [22]:
reviews.shape

(868, 5)

### Building my own search engine 

In [124]:
class SearchCriteria:
    '''
    query (any): value to search for
    column (str): column name of dataframe to search
    exact (bool): True: return exact matches
                  False: return results that contains the query
    '''
    
    query = None
    column = None
    exact = None
    
    def __init__(self, query, column, exact = False):
        
        assert type(exact) is bool, 'TypeError: exact must be boolean.'
        assert type(column) is str, 'TypeError: column must be string.'
        
        self.query = query
        self.column = column
        self.exact = exact
        
        
    def summarize(self):
        
        print(f'Query: {self.query}\nColumn: {self.column}\nExact: {self.exact}\n')
        
    

In [21]:
def searchCriteria(query, column, exact = False):
    '''
    Returns a dictionary for a single query criteria.
    '''
    if type(exact) is not bool:
        print('Error: Exact must be True or False.')
        return None
    
    return {
        'query' : query,
        'column' : column,
        'exact' : exact
    }

### TODO: Optimize search_reviews function
Function takes 63 seconds vs   
Iterative Masking takes 0.48 seconds

In [43]:
def search_reviews(df, queries):
    '''
    Returns a subset of df that matches a list of SearchCriteria.
    
    df: Dataframe
    queries: 
    
    Use the SearchCriteria class to build a list of queries.
    '''
    supermask = True

    for query in queries:
        if query.exact:
            mask = df[query.column] == query.query
        else:
            mask = df[query.column].str.contains(query.query, case = False)
        supermask = supermask & mask

    return df[supermask]                


### Select Mon Ami Gabi reviews

In [60]:
mon_ami_gabi_reviews = review_df[review_df['business_id'] == mon_ami_gabi_id]

In [61]:
mon_ami_gabi_reviews.shape

(7968, 6)

[Mon Ami Gabi's Yelp Page](https://www.yelp.com/biz/mon-ami-gabi-las-vegas-2)

In [62]:
mon_ami_gabi_reviews.to_csv('../data/interim/mon_ami_gabi_reviews.csv', index = False)

In [63]:
mon_ami_gabi_reviews.head()

Unnamed: 0,date,stars,text,review_id,business_id,business_name
213,2012-06-10,4,I booked a table here for brunch and it did no...,wl8BO_I-is-JaMwMW5c_gQ,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi
407,2012-01-20,4,Came here for lunch after a long night of part...,cf9RrqHY9eQ9M53OPyXLtg,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi
1028,2017-05-10,5,Loved the fried goat cheese in tomato sauce al...,BvmhSQ6WFm2Jxu01G8OpdQ,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi
1311,2014-05-03,5,"Love the outdoor atmosphere. Price was right, ...",IoKp9n1489XohTV_-EJ0IQ,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi
1612,2014-06-04,5,Best steak in Vegas. Best mashed potatoes in V...,7YNmSq7Lb1zi4SUKXaSjfg,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi


### Select reviews from Mon Ami Gabi that mention a certain menu item

#### Select Mon Ami Gabi reviews that mention French `onion soup`

I tried subsetting using three methods: 
1. A search function for reusable code
2. Iterative masking: `df1 = df[mask1]`, `df2 = df1[mask2]`, etc.
3. Consecutive masking: `df[mask1 & mask2 & ...]`

Iterative masking performed the fastest.

In [64]:
queries = []
queries.append(SearchCriteria(mon_ami_gabi_id, 'business_id'))
queries.append(SearchCriteria('onion soup', 'text'))

In [85]:
# Using search_reviews function
t0 = time()

onion_soup_reviews = search_reviews(mon_ami_gabi_reviews, queries)

t1 = np.round (time()-t0, 4)

print(f'Found {onion_soup_reviews.shape[0]} results in {t1} seconds')

Found 868 results in 0.1076 seconds


In [86]:
%%time
mon_ami_gabi_mask = review_df['business_id'] == mon_ami_gabi_id
onion_soup_mask = mon_ami_gabi_reviews['text'].str.contains('onion soup', case = False)


CPU times: user 320 ms, sys: 3.99 ms, total: 324 ms
Wall time: 323 ms


In [97]:
# Searching Mon Ami Gabi AND Onion Soup
t0 = time()

onion_soup_reviews = review_df[mon_ami_gabi_mask & onion_soup_mask]

t1 = np.round(time() - t0, 4)

print(f'Found {onion_soup_reviews.shape[0]} results in {t1} seconds')

Found 868 results in 0.6099 seconds


In [90]:
# Searching Onion Soup in Mon Ami Gabi data
t0 = time()

onion_soup_reviews = mon_ami_gabi_reviews[onion_soup_mask]

t1 = np.round(time() - t0, 4)

print(f'Found {onion_soup_reviews.shape[0]} results in {t1} seconds')

Found 868 results in 0.0021 seconds


In [92]:
# Searching Mon Ami Gabi, then searching Onion Soup
t0 = time()

mon_ami_gabi = review_df[mon_ami_gabi_mask]
onion_soup_reviews = mon_ami_gabi[onion_soup_mask]

t1 = np.round(time() - t0, 4)

print(f'Found {onion_soup_reviews.shape[0]} results in {t1} seconds')

Found 868 results in 0.0123 seconds


In [99]:
# chain masking
# Searching Mon Ami Gabi, then searching Onion Soup
t0 = time()

onion_soup_reviews = review_df[mon_ami_gabi_mask][onion_soup_mask]

t1 = np.round(time() - t0, 4)

print(f'Found {onion_soup_reviews.shape[0]} results in {t1} seconds')

Found 868 results in 0.0111 seconds


Chain masking seems to be the fastest method. However it requires the entire dataset to be loaded into memory, which is not scalable. 

In [93]:
onion_soup_reviews.shape

(868, 6)

In [94]:
onion_soup_reviews.head()

Unnamed: 0,date,stars,text,review_id,business_id,business_name
9215,2015-02-10,2,Other than being right across the Fountains of...,uczUlWIWuO-KzoUiLhICNw,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi
20080,2017-04-24,3,French onion soup was watery with little taste...,185E0cpQpDRUO4JRGu3fXQ,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi
23257,2014-12-03,5,Where to begin! Now our dining experience her...,QoY3L_d_axTcMn68pI8zxQ,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi
24424,2010-12-04,4,Charming resturant that looks like it would be...,nth_q-GqOy_Ly8sxsREIwA,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi
37580,2010-07-11,5,This review is long overdue! I have been eat...,l0Lm7Dx69s6aH7a-5dwKDg,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi


#### Save Onion Soup reviews

In [95]:
onion_soup_reviews.to_csv('../data/interim/onion_soup_reviews.csv', index = False)

#### Select Mon Ami Gabi reviews that  mention eggs `benedict`

In [100]:
t0 = time()

eggs_benedict_reviews = mon_ami_gabi_reviews[mon_ami_gabi_reviews['text'].str.contains('benedict', case = False)]
t1 = np.round(time() - t0, 2)

print(f'Found {eggs_benedict_reviews.shape[0]} results in {t1} seconds')

Found 610 results in 4.47 seconds


#### Save Eggs Benedict reviews

In [108]:
eggs_benedict_reviews.to_csv('../data/interim/eggs_benedict_reviews.csv', index = False)

### Select restaurants with the most reviews

In [111]:
popular_restaurants = restaurant_df.nlargest(1000, 'review_count')
popular_restaurants.head(10)

Unnamed: 0,name,business_id,stars,review_count,categories,longitude,latitude,postal_code,city,state
42044,Mon Ami Gabi,4JNXUYY8wbaaDmk3BPzlWw,4.0,7968,"Steakhouses, Breakfast & Brunch, Restaurants, ...",-115.172581,36.112827,89109,Las Vegas,NV
56089,Bacchanal Buffet,RESDUcs7fIiihp38-d6_6g,4.0,7866,"Sandwiches, Buffets, Breakfast & Brunch, Food,...",-115.176222,36.116113,89109,Las Vegas,NV
19340,Wicked Spoon,K7lWdNUhCbcnEvI0NhGewg,3.5,6446,"Buffets, Restaurants, Breakfast & Brunch",-115.176155,36.10955,89109,Las Vegas,NV
56981,Gordon Ramsay BurGR,cYwJA2A6I12KNkm2rtXd5g,4.0,5472,"Burgers, American (Traditional), Restaurants",-115.172169,36.110724,89109,Las Vegas,NV
51619,Hash House A Go Go,f4x1YBxkLrZg652xt2KR5g,4.0,5382,"Breakfast & Brunch, American (New), Restaurants",-115.17158,36.118181,89109,Las Vegas,NV
53840,Earl of Sandwich,DkYS3arLOhA8si5uUEmHOw,4.5,4981,"Sandwiches, Wraps, Food, Caterers, Restaurants...",-115.171869,36.109443,89109,Las Vegas,NV
55021,The Buffet,2weQS-RnoOBhb1KsHKyoSQ,3.5,4240,"Restaurants, Buffets",-115.16559,36.126887,89109,Las Vegas,NV
41313,The Buffet at Bellagio,ujHiaprwCQ5ewziu0Vi9rw,3.5,4091,"Buffets, American (New), Restaurants",-115.17689,36.11322,89109,Las Vegas,NV
54814,Secret Pizza,iCQpiavjjPzJ5_3gPD5Ebg,4.0,4078,"Pizza, Restaurants",-115.174212,36.109837,89109,Las Vegas,NV
8939,Lotus of Siam,KskYqH1Bi7Z_61pH6Om8pg,4.0,3975,"Wine Bars, Nightlife, Restaurants, Seafood, Ca...",-115.141891,36.143664,89104,Las Vegas,NV


Vegas loves their restaurants.

#### Save 1000 most reviewed restaurants

In [107]:
popular_restaurants.to_csv('../data/interim/popular_restaurants.csv')

### Select restaurants with the lowest stars

In [127]:
worst_restaurants = popular_restaurants.nsmallest(1000, 'stars')

worst_restaurants.head(10)

Unnamed: 0,name,business_id,stars,review_count,categories,longitude,latitude,postal_code,city,state
51023,Sam Woo BBQ Restaurant,DVfCbJhJUDWRlUfrKzaKOA,2.0,532,"Noodles, Chinese, Restaurants, Barbeque",-115.19598,36.125381,89102,Las Vegas,NV
12464,The Buffet At TI,7EZ4Eu7YJ1ltRCC5jXFJrQ,2.5,748,"American (Traditional), Asian Fusion, Sushi Ba...",-115.171991,36.124829,89109,Las Vegas,NV
54474,Paradise Garden Buffet,_kb6GT4qawjwq47OsQ52xw,2.5,497,"Buffets, Restaurants",-115.171394,36.11569,89109,Las Vegas,NV
53340,Beijing Noodle No. 9,U9aA5H13y7t9xWnoQslV0Q,2.5,827,"Restaurants, Chinese",-115.175067,36.116165,89109,Las Vegas,NV
32302,MGM Grand Buffet,-U7tvCtaraTQ9b0zBhpBMA,2.5,1096,"Restaurants, Buffets, American (Traditional)",-115.171778,36.102091,89109,Las Vegas,NV
56805,Hakkasan Nightclub,X8c23dur0ll2D9XTu-I8Qg,2.5,1720,"Restaurants, Bars, Nightlife, Chinese, Dance C...",-115.172452,36.101375,89109,Las Vegas,NV
37954,The Buffet at Luxor,ABJjxuO6oh5D9R48-eAUdQ,2.5,747,"Arts & Entertainment, Buffets, Casinos, Restau...",-115.175835,36.095492,89109,Las Vegas,NV
30331,Riviera Hotel & Casino,5Zc41a446gV3K_o7CDs69Q,2.5,809,"Hotels & Travel, Hotels, Arts & Entertainment,...",-115.162176,36.135164,89109,Las Vegas,NV
13127,Saddle Ranch Chop House,IyVdd_IqwUtzQDTxw2W9qw,2.5,520,"Steakhouses, Restaurants, Bars, Nightlife",-112.260762,33.532952,85305,Glendale,AZ
47263,Blondies Sports Bar & Grill,4X0KWUPcD2EkHN83aa4cXg,2.5,543,"American (Traditional), Bars, Sports Bars, Res...",-115.169562,36.110526,89109,Las Vegas,NV


### Trade offs

**Speed comparison:**  
PostgreSQL: `29.4 s`  
Loading dataset into memory: `32.8 s`
My custom search engine: `0.1076 s`  
Chain masking: `0.0111 s`  

Verdict: SQL is the way to go. Loading the entire dataset into memory is not scalable. The query takes marginally less time if you're only doing one query. Ideally, use a database system that caches queries, such as Google BigQuery.