# Dataset Creation

In [1]:
import gzip
import json
import pandas as pd

In [2]:
def parse(path):
    """Lazily yield one decoded JSON object per line of a gzipped JSON-lines file.

    Parameters
    ----------
    path : str
        Path to a ``.json.gz`` file holding one JSON document per line,
        encoded as UTF-8.

    Yields
    ------
    object
        The value decoded from each non-blank line (a dict for the dataset
        files used in this notebook).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with gzip.open(path, 'rt', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; json.loads would raise on them, so skip them instead.
            if not line.strip():
                continue
            yield json.loads(line)

Our dataset contains Google Maps review information (ratings, text, images, etc.), business metadata (address, geographical info, descriptions, category information, price, open hours, and MISC info), and links (related businesses) up to Sep 2021 in Massachusetts, US. The metadata is in `meta-Massachusetts.json.gz`, while the reviews themselves are in `review-Massachusetts.json.gz`. The dataset covers all businesses (not just restaurants), so we extract the IDs of restaurants in Boston (our focus area) from `meta-Massachusetts.json.gz` and join them with `review-Massachusetts.json.gz` to keep only the relevant reviews.

In [3]:
# Collect the fields we need from every business in the metadata file.
ids, names, addresses, categories = [], [], [], []

for business in parse("dataset_materials/meta-Massachusetts.json.gz"):
    ids.append(business.get("gmap_id", ""))
    names.append(business.get("name", ""))
    addresses.append(business.get("address", ""))

    # A list of categories is flattened into one comma-separated string;
    # any other value (e.g. null) is stored unchanged.
    raw_category = business.get("category", [])
    if isinstance(raw_category, list):
        raw_category = ", ".join(raw_category)
    categories.append(raw_category)

# Keep only businesses whose category mentions "restaurant" and whose address
# mentions "Boston" (both case-insensitive), one row per business Id.
# NOTE(review): the sample output shows Address values that start with the
# business name, so the "Boston" filter presumably also keeps businesses whose
# *name* contains "Boston" -- confirm that this is intended.
metadata_df = (
    pd.DataFrame({
        "Id": ids,
        "Name": names,
        "Address": addresses,
        "Category": categories,
    })
    # Missing values become empty strings so the text filters below can run.
    .fillna({"Category": "", "Address": ""})
    .loc[lambda df: df["Category"].str.contains("restaurant", case=False)]
    .loc[lambda df: df["Address"].str.contains("Boston", case=False)]
    .drop_duplicates(subset=["Id"])
)

metadata_df

Unnamed: 0,Id,Name,Address,Category
34,0x89e3706a62ba248f:0xdd2878d145d53400,La Sanghita Cafe,"La Sanghita Cafe, 389 Maverick St, Boston, MA ...",Restaurant
172,0x89e37bae6173982b:0x2b2156422703d5cb,Mi Casa Tu Casa,"Mi Casa Tu Casa, 299 Hancock St, Boston, MA 02125","Latin American restaurant, Guatemalan restaurant"
278,0x89e379c581008a4f:0x8d5e88d475f0c4c8,BOSTON KABOB COMPANY,"BOSTON KABOB COMPANY, 164 Brighton Ave, Allsto...","Middle Eastern restaurant, Sandwich shop"
279,0x89e37a0fb3130c95:0xd327bb0537dda8e,Qdoba,"Qdoba, 800 Boylston St, Boston, MA 02199","Mexican restaurant, Fast food restaurant, Vege..."
611,0x89e37fd5ed928c57:0xa0333b777aae0bfa,Fair Nutrition,"Fair Nutrition, 57 Fairmount Ave, Boston, MA 0...",Health food restaurant
...,...,...,...,...
92475,0x89e37a0dd4b22005:0x1d298662a3c2c1b2,Luke's Lobster Back Bay,"Luke's Lobster Back Bay, 75 Exeter St, Boston,...","Seafood restaurant, American restaurant, Cater..."
92495,0x89e3703d22d93cbb:0x5a6aed56fd3bbb16,Beer Works,"Beer Works, 300 Terminal Dr, Boston, MA 02128","Restaurant, Bar"
92505,0x89e379f5e3d9fe1b:0xa265a864b0f05b6a,Tasty Burger,"Tasty Burger, 1301 Boylston St, Boston, MA 02215","Hamburger restaurant, American restaurant, Fas..."
92508,0x89e37a7828570e3d:0x65a59e5e04b797bf,Hong Kong Eatery,"Hong Kong Eatery, 79 Harrison Ave, Boston, MA ...","Cantonese restaurant, Asian restaurant, Noodle..."


In [4]:
# Ids of the Boston restaurants selected from the metadata; the set copy gives
# constant-time membership checks inside the loop below.
valid_ids = metadata_df['Id'].tolist()
valid_id_set = set(valid_ids)

# The review file holds reviews for every business in the state, so filter
# while streaming instead of materialising all of them in memory first.
# `positions` records each kept review's position in the file so that the
# resulting index matches what filtering the full frame would have produced.
positions, ids, reviews = [], [], []

for position, item in enumerate(parse("dataset_materials/review-Massachusetts.json.gz")):
    gmap_id = item.get("gmap_id", "")
    if gmap_id not in valid_id_set:
        continue
    positions.append(position)
    ids.append(gmap_id)
    reviews.append(item.get("text", ""))

reviews_df = pd.DataFrame(
    {
        "Id": ids,
        "Review": reviews
    },
    index=positions,
)

# Reviews whose text is null carry nothing to analyse, so drop them.
reviews_df = reviews_df.dropna(subset=['Review'])
reviews_df

Unnamed: 0,Id,Review
469,0x89e3706a62ba248f:0xdd2878d145d53400,"Amazing food, decent price, healthy, friendly ..."
470,0x89e3706a62ba248f:0xdd2878d145d53400,"The food and ambiance are great, and there is ..."
471,0x89e3706a62ba248f:0xdd2878d145d53400,Went here with a big group of vegans and veget...
472,0x89e3706a62ba248f:0xdd2878d145d53400,"This place is so awesome. Great food, super he..."
473,0x89e3706a62ba248f:0xdd2878d145d53400,"Great location in East Boston, nice place. exc..."
...,...,...
10433465,0x89e37a86be93faff:0x5577fb629400bf46,(Translated by Google) Handsome\n\n(Original)\...
10433466,0x89e37a86be93faff:0x5577fb629400bf46,(Translated by Google) Very pretty\n\n(Origina...
10433467,0x89e37a86be93faff:0x5577fb629400bf46,(Translated by Google) Cool park with events\n...
10433468,0x89e37a86be93faff:0x5577fb629400bf46,(Translated by Google) Rxcellent\n\n(Original)...


In [5]:
# Attach each review's restaurant name via the shared Id, then discard the Id
# so that only the (Review, Name) pair remains.
merged_df = (
    reviews_df
    .merge(metadata_df[['Id', 'Name']], on='Id', how='left')
    .drop(columns='Id')
)
merged_df

Unnamed: 0,Review,Name
0,"Amazing food, decent price, healthy, friendly ...",La Sanghita Cafe
1,"The food and ambiance are great, and there is ...",La Sanghita Cafe
2,Went here with a big group of vegans and veget...,La Sanghita Cafe
3,"This place is so awesome. Great food, super he...",La Sanghita Cafe
4,"Great location in East Boston, nice place. exc...",La Sanghita Cafe
...,...,...
395324,(Translated by Google) Handsome\n\n(Original)\...,The Lawn On D
395325,(Translated by Google) Very pretty\n\n(Origina...,The Lawn On D
395326,(Translated by Google) Cool park with events\n...,The Lawn On D
395327,(Translated by Google) Rxcellent\n\n(Original)...,The Lawn On D


Some reviews were translated by Google Translate. Below, we clean those reviews to only include the English translation.

In [6]:
def clean_translated_reviews(input_string):
    """Return only the English part of a review.

    Translated reviews look like
    ``"(Translated by Google) <english>\\n\\n(Original)\\n<source text>"``.
    Everything from the ``(Original)`` marker onwards is discarded and the
    ``(Translated by Google)`` prefix is removed. Reviews without those
    markers are returned unchanged apart from surrounding whitespace being
    stripped.

    Parameters
    ----------
    input_string : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review text.
    """
    original_index = input_string.find("(Original)")
    # str.find returns -1 when the marker is absent; slicing with [:-1] would
    # then silently drop the last character of every untranslated review.
    if original_index != -1:
        input_string = input_string[:original_index]
    cleaned_string = input_string.replace("(Translated by Google)", "").strip()
    return cleaned_string

# Keep only the English text of each review, then flatten any remaining line
# breaks into single spaces.
merged_df['Review'] = (
    merged_df['Review']
    .map(clean_translated_reviews)
    .map(lambda text: text.replace("\n", " "))
)
merged_df

Unnamed: 0,Review,Name
0,"Amazing food, decent price, healthy, friendly ...",La Sanghita Cafe
1,"The food and ambiance are great, and there is ...",La Sanghita Cafe
2,Went here with a big group of vegans and veget...,La Sanghita Cafe
3,"This place is so awesome. Great food, super he...",La Sanghita Cafe
4,"Great location in East Boston, nice place. exc...",La Sanghita Cafe
...,...,...
395324,Handsome,The Lawn On D
395325,Very pretty,The Lawn On D
395326,Cool park with events,The Lawn On D
395327,Rxcellent,The Lawn On D


In [7]:
# Persist the cleaned (Review, Name) pairs for later use.
output_csv = 'datasets/extracted_db.csv'
merged_df.to_csv(output_csv)