In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import csv

In [2]:
# Location of the raw Yelp review dump, relative to this notebook.
# (The business JSON path below is kept for reference only and is unused here.)
# business_json_path = './yelp_dataset/yelp_dataset_data/yelp_academic_dataset_business.json'
review_json_path = (
    '../yelp_dataset/yelp_dataset_data/'
    'yelp_academic_dataset_review.json'
)

In [3]:
# Read the business table from its CSV export into a DataFrame.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# looks like a saved row index read back as data -- consider index_col=0 once
# it is confirmed nothing downstream relies on that column.
business_df = pd.read_csv(filepath_or_buffer='../Data/business.csv')

In [4]:
# Preview the first five business rows to sanity-check the load.
business_df.head(n=5)

Unnamed: 0,business_id,name,city,state,postal_code,latitude,longitude,stars,review_count,attributes,...,cat_Nicaraguan,cat_Georgian,cat_Czech/Slovakian,cat_Sardinian,cat_PubFood,cat_FoodTrucks,cat_WineTours,cat_FoodTours,cat_Bistros,cat_Drive-ThruBars
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,Boulder,CO,80302,40.017544,-105.283348,4.0,86,"{'RestaurantsTableService': 'True', 'WiFi': ""u...",...,0,0,0,0,0,0,0,0,0,0
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,Portland,OR,97218,45.588906,-122.593331,4.0,126,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...",...,0,0,0,0,0,0,0,0,0,0
2,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...",...,0,0,0,0,0,0,0,0,0,0
3,jFYIsSb7r1QeESVUnXPHBw,Boxwood Biscuit,Columbus,OH,43206,39.947007,-82.997471,4.5,11,,...,0,0,0,0,0,0,0,0,0,0
4,rYs_1pNB_RMtn5WQh55QDA,Chautauqua General Store,Boulder,CO,80302,39.998449,-105.281006,3.5,5,"{'BikeParking': 'True', 'RestaurantsTakeOut': ...",...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Dimensions of the business table as (rows, columns).
business_df.shape

(43010, 199)

In [5]:
# (Disabled) One-off export of a review dataframe to CSV; `df_r` is not defined in the cells shown here.
# df_r.to_csv('./Data/review.csv')

In [6]:
# Open the review file (one JSON object per line) as a chunked reader.
# Because `chunksize` is given, `review` is iterated chunk by chunk later on
# instead of holding the whole file in memory at once.
size = 1_000_000  # number of review rows per chunk

# Explicit column types so ids, dates and text stay strings and counts stay ints.
review_dtypes = {
    'review_id': str,
    'user_id': str,
    'business_id': str,
    'stars': int,
    'date': str,
    'text': str,
    'useful': int,
    'funny': int,
    'cool': int,
}

review = pd.read_json(
    review_json_path,
    lines=True,
    dtype=review_dtypes,
    chunksize=size,
)

In [16]:
# Code below is inspired by: 
# https://towardsdatascience.com/converting-yelp-dataset-to-csv-using-pandas-2a4c8f03bd88

In [9]:
# Attach the business columns to every review, one chunk at a time.
#
# `review` is the chunked reader created with pd.read_json(..., chunksize=size)
# and `business_df` is the business table loaded from CSV. Each chunk is
# inner-merged on `business_id`, so only reviews whose business appears in
# `business_df` are kept; the per-chunk results are then combined into `df`.

chunk_list = []
for chunk_review in review:
    # Rename the review's own rating so it does not collide with the
    # business-level `stars` column during the merge.
    chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})

    # Inner merge keeps only the reviews that belong to a business in business_df.
    chunk_merged = pd.merge(business_df, chunk_review, on='business_id', how='inner')

    # Progress feedback. The denominator is the real number of reviews in this
    # chunk, because the final chunk can hold fewer than `size` rows.
    print(f'{len(chunk_merged)} out of {len(chunk_review):,} related reviews')
    chunk_list.append(chunk_merged)

# The reader is consumed by the loop above, so re-running this cell without
# re-creating `review` produces no chunks. Fail with a clear message instead of
# the generic error pd.concat raises for an empty list.
if not chunk_list:
    raise ValueError(
        'No review chunks were read; re-run the cell that creates `review` '
        'before running this cell again.'
    )

# Stack the per-chunk results into one dataframe with a fresh 0..n-1 index.
df = pd.concat(chunk_list, ignore_index=True)

559527 out of 1,000,000 related reviews
559212 out of 1,000,000 related reviews
551117 out of 1,000,000 related reviews
353463 out of 1,000,000 related reviews


In [12]:
# Remove the user_id column because it is not needed.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run is a no-op instead of a KeyError.
df = df.drop(columns='user_id', errors='ignore')

In [13]:
# Preview the first five merged rows (business columns followed by review columns).
df.head(n=5)

Unnamed: 0,business_id,name,city,state,postal_code,latitude,longitude,stars,review_count,attributes,...,cat_FoodTours,cat_Bistros,cat_Drive-ThruBars,review_id,review_stars,useful,funny,cool,text,date
0,ci3ehWCkRPBnU27vwiU3Zg,The Back Porch,Longwood,FL,32779,28.688764,-81.404833,4.0,58,"{'BikeParking': 'True', 'Corkage': 'True', 'Bu...",...,0,0,0,7G-Jl-N2wQ1tqZqCtEJfRA,5,1,1,1,I have wanted to try this place for months and...,2020-08-22 06:07:17
1,ci3ehWCkRPBnU27vwiU3Zg,The Back Porch,Longwood,FL,32779,28.688764,-81.404833,4.0,58,"{'BikeParking': 'True', 'Corkage': 'True', 'Bu...",...,0,0,0,WIIPkelCNAVZ6b4IPkqphA,4,1,1,1,"Phew. A breath of fresh air, but still the sa...",2019-09-14 18:21:59
2,ci3ehWCkRPBnU27vwiU3Zg,The Back Porch,Longwood,FL,32779,28.688764,-81.404833,4.0,58,"{'BikeParking': 'True', 'Corkage': 'True', 'Bu...",...,0,0,0,tzSzAT50gEFzBkq8J4pFZw,5,1,0,1,Love this place. I got the early riser. And a ...,2019-11-05 19:50:55
3,ci3ehWCkRPBnU27vwiU3Zg,The Back Porch,Longwood,FL,32779,28.688764,-81.404833,4.0,58,"{'BikeParking': 'True', 'Corkage': 'True', 'Bu...",...,0,0,0,-DOjzYTxd5IkC9i8bRXQKw,5,3,0,1,Yum! This place is new and hopping! Loved the ...,2019-08-19 20:50:18
4,ci3ehWCkRPBnU27vwiU3Zg,The Back Porch,Longwood,FL,32779,28.688764,-81.404833,4.0,58,"{'BikeParking': 'True', 'Corkage': 'True', 'Bu...",...,0,0,0,caGzuMtdsYA9r973pONiVQ,3,0,0,0,Great concept but food just doesn't live up to...,2020-01-18 21:28:34


In [14]:
# Dimensions of the merged table as (rows, columns).
df.shape

(2023319, 206)

In [15]:
# Write the merged business/review table into the ../Data folder,
# leaving the dataframe's row index out of the file.
csv_name = 'yelp_food_business_reviews.csv'
df.to_csv(path_or_buf=f'../Data/{csv_name}', index=False)