# Cleaning Data

## Table of Contents

1. Select restaurants in Business data
1. Select restaurant reviews in Review data
1. Inspect and clean Review data
1. Save clean review data
1. Clean and save Restaurant data



### Import Libraries

In [44]:
import csv
import os
import pandas as pd
from time import time

# For detecting language of document
from langdetect import detect, DetectorFactory 

# Fix langdetect's seed so language detection gives consistent results across runs
DetectorFactory.seed = 42 

### Set filepaths

In [16]:
# Everything lives under ../data: untouched inputs in raw/, derived files in interim/
_data_root = os.path.join('..', 'data')
raw_data_directory = os.path.join(_data_root, 'raw')
interim_data_directory = os.path.join(_data_root, 'interim')

# Inputs (raw Yelp academic dataset exports)
review_filepath = os.path.join(
    raw_data_directory, 'yelp_academic_dataset_review.csv')
business_filepath = os.path.join(
    raw_data_directory, 'yelp_academic_dataset_business.csv')

# Outputs (restaurant-only subsets)
restaurant_review_filepath = os.path.join(
    interim_data_directory, 'restaurant_review.csv')
restaurant_filepath = os.path.join(
    interim_data_directory, 'restaurant.csv')

### Find restaurants in Business data
Restaurants are businesses with "Restaurants" in the `categories` column.

In [17]:
%%time

# Copy every business row whose `categories` value mentions "Restaurants" from the
# raw business CSV into the interim restaurant CSV (all columns preserved), and
# collect the business_id of each copied row.
restaurant_ids = set()

# The input is opened first so that a missing/unreadable source fails before the
# output file is truncated. newline='' on the output handle lets the csv module
# control line endings itself (avoids an extra '\r' per row on platforms that
# translate '\n' on write).
with open(business_filepath, encoding = 'utf_8') as f_in:
    with open(restaurant_filepath, mode = 'w', encoding = 'utf_8', newline = '') as f_out:

        reader = csv.DictReader(f_in)
        columns = reader.fieldnames

        writer = csv.DictWriter(f_out, fieldnames = columns)
        writer.writeheader()

        # iterate through each line in the file
        for row in reader:

            # DictReader yields None for fields missing from a short row, so a
            # missing or empty categories value is treated as "not a restaurant".
            # This replaces a bare `except:` that also swallowed unrelated errors
            # (including KeyboardInterrupt).
            categories = row.get('categories') or ''
            if 'Restaurants' not in categories:
                continue

            # remember the id and copy the row to the restaurant file
            restaurant_ids.add(row['business_id'])
            writer.writerow(row)

# Freeze the id set: it is only used for membership tests from here on
restaurant_ids = frozenset(restaurant_ids)

print (f'Found {len(restaurant_ids)} restaurants.')

Found 57173 restaurants.
CPU times: user 6.58 s, sys: 237 ms, total: 6.82 s
Wall time: 6.88 s


### Find restaurant reviews from Review data and save to separate file

In [38]:
%%time

# Change to True to run this code block. Estimated runtime: 3 minutes
run = False

if run:
    num_reviews = 0

    # Open the full review CSV first: if it cannot be opened, the existing
    # restaurant review file is left untouched instead of being truncated.
    # (The path variable defined in the filepath cell is `review_filepath`;
    # the previously used name `review_csv_filepath` does not exist.)
    with open(review_filepath, encoding = 'utf_8') as f_in:

        # newline='' lets the csv module control line endings on write
        with open(restaurant_review_filepath, 'w', encoding = 'utf_8', newline = '') as f_out:
            # Instantiate reader
            reader = csv.DictReader(f_in)

            # Get column names
            columns = reader.fieldnames

            # Instantiate writer
            writer = csv.DictWriter(f_out, fieldnames = columns)

            # Write column names
            writer.writeheader()

            # Loop through all reviews
            for row in reader:
                # Skip reviews that are not about a restaurant
                if row['business_id'] not in restaurant_ids:
                    continue

                # Write row
                writer.writerow(row)
                num_reviews += 1

    print (f'Found {num_reviews} restaurant reviews.')


Found 3654797 restaurant reviews.
CPU times: user 2min 10s, sys: 5.59 s, total: 2min 15s
Wall time: 2min 17s


In [15]:
# Load the restaurant-only reviews through the shared path variable rather than a
# duplicated hard-coded path, so the read and write locations cannot drift apart.
review_df = pd.read_csv(restaurant_review_filepath)
review_df.head()

Unnamed: 0,text,cool,funny,review_id,date,stars,business_id,useful,user_id
0,The pizza was okay. Not the best I've had. I p...,0,0,x7mDIiDB3jEiPGPHOmDzyw,2011-02-25,2,iCQpiavjjPzJ5_3gPD5Ebg,0,msQe1u7Z_XuqjGoqhB0J5g
1,I love this place! My fiance And I go here atl...,0,0,dDl8zu1vWPdKGihJrwQbpw,2012-11-13,5,pomGBqfbxcqPv14c3XH-ZQ,0,msQe1u7Z_XuqjGoqhB0J5g
2,Terrible. Dry corn bread. Rib tips were all fa...,1,1,LZp4UX5zK3e-c5ZGSeo3kA,2014-10-23,1,jtQARsP6P-LbkyjbO1qNGg,3,msQe1u7Z_XuqjGoqhB0J5g
3,Back in 2005-2007 this place was my FAVORITE t...,0,0,Er4NBWCmCD4nM8_p1GRdow,2011-02-25,2,elqbBhBfElMNSrjFqW3now,2,msQe1u7Z_XuqjGoqhB0J5g
4,Delicious healthy food. The steak is amazing. ...,0,0,jsDu6QEJHbwP2Blom1PLCA,2014-09-05,5,Ums3gaP2qM3W1XcA5r6SsQ,0,msQe1u7Z_XuqjGoqhB0J5g


### Get most reviewed restaurants

In [10]:
# business_df was not created by any earlier cell, so this cell failed on a fresh
# kernel. NOTE(review): loading it from the raw business CSV is an assumption based
# on the recorded row indices (up to ~188k); confirm this is the intended source.
business_df = pd.read_csv(business_filepath)

# Five businesses with the highest review_count
business_df.sort_values(by = 'review_count', ascending = False)[['business_id', 'name', 'review_count']].head()


Unnamed: 0,business_id,name,review_count
137635,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi,7968
185167,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,7866
62723,K7lWdNUhCbcnEvI0NhGewg,Wicked Spoon,6446
188309,cYwJA2A6I12KNkm2rtXd5g,Gordon Ramsay BurGR,5472
170129,f4x1YBxkLrZg652xt2KR5g,Hash House A Go Go,5382


#### Cross reference with review data

In [7]:
# Count reviews per business_id in the review data and show the five most frequent
# ids, to cross-reference with the review_count column of the business data.
review_df['business_id'].value_counts().head()

4JNXUYY8wbaaDmk3BPzlWw    7968
RESDUcs7fIiihp38-d6_6g    7861
K7lWdNUhCbcnEvI0NhGewg    6447
cYwJA2A6I12KNkm2rtXd5g    5472
f4x1YBxkLrZg652xt2KR5g    5382
Name: business_id, dtype: int64

In [8]:
# (rows, columns) of the review frame.
# NOTE(review): the recorded output reports 5,996,996 rows while the filtering cell
# reported 3,654,797 restaurant reviews; the saved outputs appear to come from
# different runs. Re-run top to bottom to confirm.
review_df.shape

(5996996, 9)

### Check for missing data

In [9]:
# Number of missing values in each column of the review data
review_df.isna().sum()

text           1
cool           0
funny          0
review_id      0
date           0
stars          0
business_id    0
useful         0
user_id        0
dtype: int64

### Cleaning Review Data

#### Drop rows with missing reviews

In [7]:
# Drop reviews whose text is missing. dropna(subset=...) returns a new frame, which
# avoids inplace mutation and matches how missing restaurant rows are dropped later
# in this notebook.
review_df = review_df.dropna(subset = ['text'])

In [10]:
# Sanity check: no review with missing text should remain after the drop above
assert review_df['text'].notna().all(), 'review_df still contains rows with missing text'

#### Add actual `business_name` to Review Data

In [11]:
# Lookup table: business_id -> business name
business_dict = business_df.set_index('business_id')['name'].to_dict()


In [12]:
# Time the lookup: t0 is the start timestamp and t1 ends up holding the elapsed
# seconds (t1 is not displayed in this cell).
t0 = time()
# Add a business_name column by mapping each review's business_id through business_dict
review_df['business_name'] = review_df['business_id'].map(business_dict)
t1 = time() - t0

In [23]:
# Preview the first rows, now including the business_name column
review_df.head()

Unnamed: 0,text,cool,funny,review_id,date,stars,business_id,useful,user_id,business_name
0,The pizza was okay. Not the best I've had. I p...,0,0,x7mDIiDB3jEiPGPHOmDzyw,2011-02-25,2,iCQpiavjjPzJ5_3gPD5Ebg,0,msQe1u7Z_XuqjGoqhB0J5g,Secret Pizza
1,I love this place! My fiance And I go here atl...,0,0,dDl8zu1vWPdKGihJrwQbpw,2012-11-13,5,pomGBqfbxcqPv14c3XH-ZQ,0,msQe1u7Z_XuqjGoqhB0J5g,Leticia's Mexican Cocina
2,Terrible. Dry corn bread. Rib tips were all fa...,1,1,LZp4UX5zK3e-c5ZGSeo3kA,2014-10-23,1,jtQARsP6P-LbkyjbO1qNGg,3,msQe1u7Z_XuqjGoqhB0J5g,H&H BBQ Plus 2
3,Back in 2005-2007 this place was my FAVORITE t...,0,0,Er4NBWCmCD4nM8_p1GRdow,2011-02-25,2,elqbBhBfElMNSrjFqW3now,2,msQe1u7Z_XuqjGoqhB0J5g,Pin Kaow Thai Restaurant
4,Delicious healthy food. The steak is amazing. ...,0,0,jsDu6QEJHbwP2Blom1PLCA,2014-09-05,5,Ums3gaP2qM3W1XcA5r6SsQ,0,msQe1u7Z_XuqjGoqhB0J5g,Braddah's Island Style


#### Remove `\n` characters

In [15]:
# Replace every newline in the review text with a space. The vectorized string
# accessor does a literal (non-regex) replacement and passes missing values through
# instead of raising, unlike calling str.replace per row via apply.
review_df['text'] = review_df['text'].str.replace('\n', ' ', regex = False)

In [16]:
# Verification: select any review whose text still contains a newline
has_newline = review_df['text'].str.contains('\n')
review_df[has_newline]

Unnamed: 0,text,cool,funny,review_id,date,stars,business_id,useful,user_id,business_name


#### Drop Irrelevant Columns

In [17]:
# Column pruning is currently disabled, so every column is kept.
# Uncomment the line below to keep only the listed columns:
# review_df = review_df[['date', 'stars', 'text', 'review_id', 'business_id', 'business_name']]

In [18]:
# Preview the first rows of the review data after the cleaning steps
review_df.head()

Unnamed: 0,text,cool,funny,review_id,date,stars,business_id,useful,user_id,business_name
0,The pizza was okay. Not the best I've had. I p...,0,0,x7mDIiDB3jEiPGPHOmDzyw,2011-02-25,2,iCQpiavjjPzJ5_3gPD5Ebg,0,msQe1u7Z_XuqjGoqhB0J5g,Secret Pizza
1,I love this place! My fiance And I go here atl...,0,0,dDl8zu1vWPdKGihJrwQbpw,2012-11-13,5,pomGBqfbxcqPv14c3XH-ZQ,0,msQe1u7Z_XuqjGoqhB0J5g,Leticia's Mexican Cocina
2,Terrible. Dry corn bread. Rib tips were all fa...,1,1,LZp4UX5zK3e-c5ZGSeo3kA,2014-10-23,1,jtQARsP6P-LbkyjbO1qNGg,3,msQe1u7Z_XuqjGoqhB0J5g,H&H BBQ Plus 2
3,Back in 2005-2007 this place was my FAVORITE t...,0,0,Er4NBWCmCD4nM8_p1GRdow,2011-02-25,2,elqbBhBfElMNSrjFqW3now,2,msQe1u7Z_XuqjGoqhB0J5g,Pin Kaow Thai Restaurant
4,Delicious healthy food. The steak is amazing. ...,0,0,jsDu6QEJHbwP2Blom1PLCA,2014-09-05,5,Ums3gaP2qM3W1XcA5r6SsQ,0,msQe1u7Z_XuqjGoqhB0J5g,Braddah's Island Style


### Save clean review data

In [22]:
# Write the cleaned reviews back to the interim restaurant review file (without the
# index column). The cleaned frame is `review_df`; the previously referenced
# `restaurant_review_df` is not defined anywhere in this notebook.
review_df.to_csv(restaurant_review_filepath, index = False)

### Load restaurant data

In [39]:
# Load the interim restaurant file written by the business-filtering cell
restaurant_df = pd.read_csv(restaurant_filepath)

In [40]:
# Preview the first rows of the restaurant data
restaurant_df.head()

Unnamed: 0,name,business_id,stars,review_count,categories,longitude,latitude,postal_code,city,state
0,Minhas Micro Brewery,Apn5Q_b6Nz61Tq4XzPdf9A,4.0,24,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",-114.031675,51.091813,T2E 6L6,Calgary,AB
1,CK'S BBQ & Catering,AjEbIBw6ZFfln7ePHha9PA,4.5,3,"Chicken Wings, Burgers, Caterers, Street Vendo...",-114.939821,35.960734,89002,Henderson,NV
2,La Bastringue,O8S5hYJ1SMc8fA4QBtVujA,4.0,5,"Breakfast & Brunch, Restaurants, French, Sandw...",-73.5993,45.540503,H2G 1K7,Montréal,QC
3,Thai One On,6OuOZAok8ikONMS_T3EzXg,2.0,7,"Restaurants, Thai",-79.632763,43.712946,L4T 1A8,Mississauga,ON
4,Filiberto's Mexican Food,8-NRKkPY1UiFXW20WXKiXg,2.5,40,"Mexican, Restaurants",-112.341302,33.448106,85323,Avondale,AZ


In [41]:
# (rows, columns) of the restaurant frame
restaurant_df.shape

(57056, 10)

In [42]:
# Column names available in the restaurant data
restaurant_df.columns

Index(['name', 'business_id', 'stars', 'review_count', 'categories',
       'longitude', 'latitude', 'postal_code', 'city', 'state'],
      dtype='object')

### Cleaning Restaurant Data

#### Drop rows with missing `postal_code` or `city`

In [25]:
# Fraction of restaurants with no postal code (mean of the boolean missing mask)
restaurant_df['postal_code'].isna().mean()

0.002028929739562381

In [26]:
# Fraction of restaurants with no city (mean of the boolean missing mask)
restaurant_df['city'].isna().mean()

3.498154723383415e-05

About 0.2% of restaurants are missing a postal code, and about 0.003% are missing a city.

In [27]:
# Keep only restaurants that have both a postal code and a city
has_location = restaurant_df[['postal_code', 'city']].notna().all(axis = 1)
restaurant_df = restaurant_df[has_location]

In [28]:
# Number of restaurants remaining after dropping rows with missing postal_code or city
len(restaurant_df)

57056

#### Keep relevant columns

In [29]:
# Columns to keep, in output order
columns = [
    'name', 'business_id', 'stars', 'review_count', 'categories',
    'longitude', 'latitude', 'postal_code', 'city', 'state',
]

# Restrict the restaurant frame to exactly those columns
restaurant_df = restaurant_df.loc[:, columns]


In [30]:
# Preview the first rows after restricting to the kept columns
restaurant_df.head()

Unnamed: 0,name,business_id,stars,review_count,categories,longitude,latitude,postal_code,city,state
0,Minhas Micro Brewery,Apn5Q_b6Nz61Tq4XzPdf9A,4.0,24,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",-114.031675,51.091813,T2E 6L6,Calgary,AB
1,CK'S BBQ & Catering,AjEbIBw6ZFfln7ePHha9PA,4.5,3,"Chicken Wings, Burgers, Caterers, Street Vendo...",-114.939821,35.960734,89002,Henderson,NV
2,La Bastringue,O8S5hYJ1SMc8fA4QBtVujA,4.0,5,"Breakfast & Brunch, Restaurants, French, Sandw...",-73.5993,45.540503,H2G 1K7,Montréal,QC
3,Thai One On,6OuOZAok8ikONMS_T3EzXg,2.0,7,"Restaurants, Thai",-79.632763,43.712946,L4T 1A8,Mississauga,ON
4,Filiberto's Mexican Food,8-NRKkPY1UiFXW20WXKiXg,2.5,40,"Mexican, Restaurants",-112.341302,33.448106,85323,Avondale,AZ


#### Last check for missing values

In [31]:
# Number of missing values remaining in each column of the restaurant data
restaurant_df.isna().sum()

name            0
business_id     0
stars           0
review_count    0
categories      0
longitude       0
latitude        0
postal_code     0
city            0
state           0
dtype: int64

### Save clean restaurant data

In [32]:
# Write the cleaned restaurant data (without the index column) to restaurant_filepath,
# the same file it was loaded from, so the interim file is overwritten in place.
restaurant_df.to_csv(restaurant_filepath, index = False)