# Subsetting Data

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from time import time
import warnings

warnings.filterwarnings('ignore')

### Load Data

In [3]:
t0 = time()
review_df = pd.read_csv('../data/interim/restaurant_reviews.csv')
t1 = time() - t0
print(f'Loaded in {np.round(t1, 2)} seconds')

Loaded in 29.65 seconds


In [6]:
restaurant_df = pd.read_csv('../data/interim/restaurants.csv')

In [92]:
review_df.head()

Unnamed: 0,text,cool,funny,review_id,date,stars,business_id,useful,user_id,business_name
0,the pizza was okay. not the best i've had. i prefer biaggio's on flamingo / fort apache. the che...,0.0,0.0,x7mDIiDB3jEiPGPHOmDzyw,2011-02-25,2.0,iCQpiavjjPzJ5_3gPD5Ebg,0.0,msQe1u7Z_XuqjGoqhB0J5g,Secret Pizza
1,i love this place! my fiance and i go here atleast once a week. the portions are huge! food is a...,0.0,0.0,dDl8zu1vWPdKGihJrwQbpw,2012-11-13,5.0,pomGBqfbxcqPv14c3XH-ZQ,0.0,msQe1u7Z_XuqjGoqhB0J5g,Leticia's Mexican Cocina
2,terrible. dry corn bread. rib tips were all fat and mushy and had no flavor. if you want bbq in ...,1.0,1.0,LZp4UX5zK3e-c5ZGSeo3kA,2014-10-23,1.0,jtQARsP6P-LbkyjbO1qNGg,3.0,msQe1u7Z_XuqjGoqhB0J5g,H&H BBQ Plus 2
3,back in 2005-2007 this place was my favorite thai place ever. i'd go here alllll the time. i nev...,0.0,0.0,Er4NBWCmCD4nM8_p1GRdow,2011-02-25,2.0,elqbBhBfElMNSrjFqW3now,2.0,msQe1u7Z_XuqjGoqhB0J5g,Pin Kaow Thai Restaurant
4,delicious healthy food. the steak is amazing. fish and pork are awesome too. service is above an...,0.0,0.0,jsDu6QEJHbwP2Blom1PLCA,2014-09-05,5.0,Ums3gaP2qM3W1XcA5r6SsQ,0.0,msQe1u7Z_XuqjGoqhB0J5g,Braddah's Island Style


In [90]:
review_df.to_csv('../data/interim/restaurant_reviews.csv', index = False)

In [91]:
restaurant_df.head()

Unnamed: 0,name,business_id,stars,review_count,categories,longitude,latitude,postal_code,city,state
0,Minhas Micro Brewery,Apn5Q_b6Nz61Tq4XzPdf9A,4.0,24,"Tours, Breweries, Pizza, Restaurants, Food, Hotels & Travel",-114.031675,51.091813,T2E 6L6,Calgary,AB
1,CK'S BBQ & Catering,AjEbIBw6ZFfln7ePHha9PA,4.5,3,"Chicken Wings, Burgers, Caterers, Street Vendors, Barbeque, Food Trucks, Food, Restaurants, Even...",-114.939821,35.960734,89002,Henderson,NV
2,La Bastringue,O8S5hYJ1SMc8fA4QBtVujA,4.0,5,"Breakfast & Brunch, Restaurants, French, Sandwiches, Cafes",-73.5993,45.540503,H2G 1K7,Montréal,QC
3,Thai One On,6OuOZAok8ikONMS_T3EzXg,2.0,7,"Restaurants, Thai",-79.632763,43.712946,L4T 1A8,Mississauga,ON
4,Filiberto's Mexican Food,8-NRKkPY1UiFXW20WXKiXg,2.5,40,"Mexican, Restaurants",-112.341302,33.448106,85323,Avondale,AZ


### TODO: Build list of menu items for _one_ restaurant, list of spelling variations for each menu item

In [29]:
def searchCriteria(query, column, exact = False):
    '''
    Returns a dictionary for a single query criteria.
    '''
    if type(exact) is not bool:
        print('Error: Exact must be True or False.')
        return None
    
    return {
        'query' : query,
        'column' : column,
        'exact' : exact
    }

### TODO: Optimize search_reviews function
Function takes 63 seconds vs   
Iterative Masking takes 0.48 seconds

In [30]:
def search_reviews(df, queries):
    '''
    Returns a subset of df given a list of queries structured like so:
    
    queries = [{'query' : 'onion soup',
                'column' : 'text', 
                'exact' : False},
               {'query' : '4JNXUYY8wbaaDmk3BPzlWw',
                'column' : 'business_id',
                'exact' : True}]
    Use searchCriteria() to build a list of queries.
    '''
    supermask = True

    for query in queries:
        if query['exact']:
            mask = df[query['column']] == query['query']
        else:
            mask = df[query['column']].str.contains(query['query'], case = False)
        supermask = supermask & mask

    return df[supermask]                


### Select Mon Ami Gabi reviews

In [4]:
mon_ami_gabi_id = '4JNXUYY8wbaaDmk3BPzlWw'

mon_ami_gabi_reviews = review_df[review_df['business_id'] == mon_ami_gabi_id]

In [5]:
mon_ami_gabi_reviews.shape

(7968, 10)

[Mon Ami Gabi's Yelp Page](https://www.yelp.com/biz/mon-ami-gabi-las-vegas-2)

In [6]:
mon_ami_gabi_reviews.to_csv('../data/interim/mon_ami_gabi_reviews.csv', index = False)

In [7]:
mon_ami_gabi_reviews.head()

Unnamed: 0,text,cool,funny,review_id,date,stars,business_id,useful,user_id,business_name
213,"I booked a table here for brunch and it did not disappoint, it was a great experience and more r...",0.0,0.0,wl8BO_I-is-JaMwMW5c_gQ,2012-06-10,4.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,fo4mpUqgXL2mJqALc9AvbA,Mon Ami Gabi
407,"Came here for lunch after a long night of partying. I'm a huge fan of French food, and had defi...",0.0,0.0,cf9RrqHY9eQ9M53OPyXLtg,2012-01-20,4.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,TVvTtXwPXsvrg2KJGoOUTg,Mon Ami Gabi
1028,Loved the fried goat cheese in tomato sauce along with dogfish 60 minutes IPA. Very nice view of...,0.0,0.0,BvmhSQ6WFm2Jxu01G8OpdQ,2017-05-10,5.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,etbAVunw-4kwr6VTRweZpA,Mon Ami Gabi
1311,"Love the outdoor atmosphere. Price was right, service exceptional and the food tasted fantastic",0.0,0.0,IoKp9n1489XohTV_-EJ0IQ,2014-05-03,5.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,vKXux2Xx3xcicTgYZoR0pg,Mon Ami Gabi
1612,Best steak in Vegas. Best mashed potatoes in Vegas. Best French restaurant in Vegas. MAKE MAKE ...,0.0,0.0,7YNmSq7Lb1zi4SUKXaSjfg,2014-06-04,5.0,4JNXUYY8wbaaDmk3BPzlWw,3.0,e3s1x4LLqfSkRTWDy_-Urg,Mon Ami Gabi


### Select reviews from Mon Ami Gabi that mention a certain menu item

#### Select Mon Ami Gabi reviews that mention French `onion soup`

I tried subsetting using three methods: 
1. A search function for reusable code
2. Iterative masking: `df1 = df[mask1]`, `df2 = df1[mask2]`, etc.
3. Consecutive masking: `df[mask1 & mask2 & ...]`

Iterative masking performed the fastest.

In [31]:
queries = []
queries.append(searchCriteria(mon_ami_gabi_id, 'business_id'))
queries.append(searchCriteria('onion soup', 'text'))

In [12]:
queries

[{'query': '4JNXUYY8wbaaDmk3BPzlWw', 'column': 'business_id', 'exact': False},
 {'query': 'onion soup', 'column': 'text', 'exact': False}]

In [74]:
# Using search_reviews function
t0 = time()

onion_soup_reviews = search_reviews(mon_ami_gabi_reviews, queries)
t1 = np.round(time() - t0, 2)

print(f'Found {onion_soup_reviews.shape[0]} results in {t1} seconds')

Found 868 results in 0.1 seconds


In [75]:
# Searching Mon Ami Gabi AND Onion Soup
t0 = time()

mon_ami_gabi_mask = review_df['business_id'] == mon_ami_gabi_id
onion_soup_mask = mon_ami_gabi['text'].str.contains('onion soup', case = False)

onion_soup_reviews = review_df[mon_ami_gabi_mask & onion_soup_mask]

t1 = np.round(time() - t0, 2)

print(f'Found {onion_soup_reviews.shape[0]} results in {t1} seconds')

Found 868 results in 0.97 seconds


In [8]:
# Searching Onion Soup in Mon Ami Gabi data
t0 = time()

onion_soup_reviews = mon_ami_gabi_reviews[mon_ami_gabi_reviews['text'].str.contains('onion soup', case = False)]
t1 = np.round(time() - t0, 2)

print(f'Found {onion_soup_reviews.shape[0]} results in {t1} seconds')

Found 868 results in 0.08 seconds


In [94]:
# Searching Mon Ami Gabi, then searching Onion Soup
t0 = time()

mon_ami_gabi_mask = review_df['business_id'] == mon_ami_gabi_id
onion_soup_mask = mon_ami_gabi['text'].str.contains('onion soup', case = False)

mon_ami_gabi = review_df[mon_ami_gabi_mask]
onion_soup_reviews = mon_ami_gabi[onion_soup_mask]

t1 = np.round(time() - t0, 2)

print(f'Found {onion_soup_reviews.shape[0]} results in {t1} seconds')

Found 868 results in 0.31 seconds


In [95]:
onion_soup_reviews.shape

(868, 10)

In [9]:
onion_soup_reviews.head()

Unnamed: 0,text,cool,funny,review_id,date,stars,business_id,useful,user_id,business_name
9212,"Other than being right across the Fountains of Bellagio, I'm not quite sure what the hype is abo...",1.0,2.0,uczUlWIWuO-KzoUiLhICNw,2015-02-10,2.0,4JNXUYY8wbaaDmk3BPzlWw,3.0,9zuYkm3k4_9KjE1PC8EPfg,Mon Ami Gabi
20072,French onion soup was watery with little taste. We sent it back and were refunded the cost. Th...,0.0,0.0,185E0cpQpDRUO4JRGu3fXQ,2017-04-24,3.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,EYiYLS0ZHDKGJSb1IKcpwg,Mon Ami Gabi
23249,Where to begin! Now our dining experience here was not a common one - we were treated to dinner...,0.0,0.0,QoY3L_d_axTcMn68pI8zxQ,2014-12-03,5.0,4JNXUYY8wbaaDmk3BPzlWw,1.0,mp3Xy-w2isyLjEN91xOeGQ,Mon Ami Gabi
24415,Charming resturant that looks like it would be heavily overpriced. Been here twice now and reall...,0.0,0.0,nth_q-GqOy_Ly8sxsREIwA,2010-12-04,4.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,M4g64KUEia1qgcn-qNlYsw,Mon Ami Gabi
37563,"This review is long overdue! I have been eating here for years, it is always on my ""must have""...",0.0,0.0,l0Lm7Dx69s6aH7a-5dwKDg,2010-07-11,5.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,pQAUyBorkc1ZOxmV-uJ02w,Mon Ami Gabi


#### Save Onion Soup reviews

In [10]:
onion_soup_reviews.to_csv('../data/interim/onion_soup_reviews.csv', index = False)

#### Select Mon Ami Gabi reviews that  mention eggs `benedict`

In [12]:
t0 = time()

eggs_benedict_reviews = mon_ami_gabi_reviews[mon_ami_gabi_reviews['text'].str.contains('benedict', case = False)]
t1 = np.round(time() - t0, 2)

print(f'Found {eggs_benedict_reviews.shape[0]} results in {t1} seconds')

Found 610 results in 0.08 seconds


#### Save Eggs Benedict reviews

In [13]:
eggs_benedict_reviews.to_csv('../data/interim/eggs_benedict_reviews.csv', index = False)

### Select restaurants with the most reviews

In [43]:
popular_restaurants = restaurant_df.nlargest(1000, 'review_count')

Vegas loves their restaurants.

#### Save 1000 most reviewed restaurants

In [46]:
popular_restaurants.to_csv('../data/interim/popular_restaurants.csv')


### Select restaurants with the lowest stars

In [22]:
worst_restaurants = popular_restaurants.nsmallest(1000, 'stars')

In [24]:
worst_restaurants.head(10)

Unnamed: 0,name,business_id,stars,review_count,categories,longitude,latitude,postal_code,city,state
51143,Sam Woo BBQ Restaurant,DVfCbJhJUDWRlUfrKzaKOA,2.0,532,"Noodles, Chinese, Restaurants, Barbeque",-115.19598,36.125381,89102,Las Vegas,NV
12483,The Buffet At TI,7EZ4Eu7YJ1ltRCC5jXFJrQ,2.5,748,"American (Traditional), Asian Fusion, Sushi Ba...",-115.171991,36.124829,89109,Las Vegas,NV
54600,Paradise Garden Buffet,_kb6GT4qawjwq47OsQ52xw,2.5,497,"Buffets, Restaurants",-115.171394,36.11569,89109,Las Vegas,NV
53464,Beijing Noodle No. 9,U9aA5H13y7t9xWnoQslV0Q,2.5,827,"Restaurants, Chinese",-115.175067,36.116165,89109,Las Vegas,NV
32371,MGM Grand Buffet,-U7tvCtaraTQ9b0zBhpBMA,2.5,1096,"Restaurants, Buffets, American (Traditional)",-115.171778,36.102091,89109,Las Vegas,NV
56940,Hakkasan Nightclub,X8c23dur0ll2D9XTu-I8Qg,2.5,1720,"Restaurants, Bars, Nightlife, Chinese, Dance C...",-115.172452,36.101375,89109,Las Vegas,NV
38038,The Buffet at Luxor,ABJjxuO6oh5D9R48-eAUdQ,2.5,747,"Arts & Entertainment, Buffets, Casinos, Restau...",-115.175835,36.095492,89109,Las Vegas,NV
30391,Riviera Hotel & Casino,5Zc41a446gV3K_o7CDs69Q,2.5,809,"Hotels & Travel, Hotels, Arts & Entertainment,...",-115.162176,36.135164,89109,Las Vegas,NV
13147,Saddle Ranch Chop House,IyVdd_IqwUtzQDTxw2W9qw,2.5,520,"Steakhouses, Restaurants, Bars, Nightlife",-112.260762,33.532952,85305,Glendale,AZ
47366,Blondies Sports Bar & Grill,4X0KWUPcD2EkHN83aa4cXg,2.5,543,"American (Traditional), Bars, Sports Bars, Res...",-115.169562,36.110526,89109,Las Vegas,NV


### Drop infrequent rows

In [2]:
def drop_less_than(df, col, n):
    '''
    Drops rows where value count of col is less than n.
    '''
    return df[df.groupby(col)[col].transform('count').ge(n)]