In [1]:
import re
import json
import string
from pathlib import Path

from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator, TabularDataset
from IPython.display import clear_output
import matplotlib.pyplot as plt

In [2]:
# --- Filesystem layout ------------------------------------------------------
DATA = Path("data")
YELP_DIR = Path("data/yelp_dataset/")
REVIEWS_FOLDER = Path("data")/"reviews"

# --- Names of the Yelp dump files and the category label to filter on -------
BUSINESS_FILE = "yelp_academic_dataset_business.json"
REVIEWS_FILE = "yelp_academic_dataset_review.json"
RESTAURANT = "restaurant"

# Regex alternation matching a run of any single whitespace character except
# the plain space (string.whitespace[0] is " ", hence the [1:] slice).
RE_WHITESPACE = r"|".join(f"{ws_char}+" for ws_char in string.whitespace[1:])

In [3]:
def get_total_rows(input_file):
    """Count the lines left in ``input_file`` and rewind it to the start.

    Args:
        input_file: An open, seekable file object that yields lines when
            iterated.

    Returns:
        int: Number of lines read from the current position to the end.
    """
    line_count = 0
    for line_count, _ in enumerate(input_file, start=1):
        pass
    # Rewind so the caller can iterate over the file again from the top.
    input_file.seek(0)
    return line_count

In [4]:
def get_business_id_by_label(path, label):
    """Collect the ids of every business whose categories mention ``label``.

    The file at ``path`` is read as JSON Lines (one JSON document per line).
    Lines that are not valid JSON are reported on stdout and skipped; lines
    that decode to something other than a JSON object are skipped as well.

    Args:
        path: Location of the business file.
        label: Text to look for inside each business's ``categories`` string.
            The test is a case-insensitive substring match.

    Returns:
        set: The ``business_id`` values of all matching businesses.
    """
    # Categories are lower-cased before the comparison, so the label must be
    # lower-cased too; otherwise a label such as "Restaurant" never matches.
    label = label.lower()
    business_ids = set()

    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding (which is not UTF-8 on every system).
    with open(path, "r", encoding="utf-8") as jfile:
        total_rows = get_total_rows(jfile)

        for raw_line in tqdm(jfile, total=total_rows):
            try:
                business = json.loads(raw_line)
            except json.JSONDecodeError as ex:
                print("JSONDecodeError", ex)
                continue

            # A line may be valid JSON without being an object (e.g. `null`);
            # such a value has no `.get` and carries no business record.
            if not isinstance(business, dict):
                continue

            categories = business.get("categories", "")
            business_id = business.get("business_id")
            if categories and business_id and (label in categories.lower()):
                business_ids.add(business_id)

    return business_ids

In [5]:
def _normalize_review_text(text):
    """Collapse every run of whitespace in ``text`` into a single space."""
    return re.sub(r" +", " ", re.sub(RE_WHITESPACE, " ", text))


def write_reviews_by_business_id(review_path, output_folder, business_ids, max_length=250):
    """Split short reviews of the selected businesses into good/bad files.

    The file at ``review_path`` is read as JSON Lines. A review is kept when
    all of the following hold:

    * its ``business_id`` is in ``business_ids``;
    * its ``stars`` value is a whole number: 5 goes to ``good_review.json``,
      1 or 2 goes to ``bad_review.json``, anything else is ignored;
    * its ``text`` consists only of ``string.printable`` characters;
    * the whitespace-normalized text is at most ``max_length`` characters.

    Each kept review is written as one JSON object per line with the keys
    ``review_id`` and ``text``. Lines that are not valid JSON are reported on
    stdout and skipped.

    Args:
        review_path: Location of the review file.
        output_folder: Directory that receives ``good_review.json`` and
            ``bad_review.json``; it is created if it does not exist.
        business_ids: Container of business ids whose reviews are kept.
        max_length: Maximum length of the normalized review text.
    """
    printable_set = set(string.printable)

    # Opening the output files fails if the folder is missing, so create it.
    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding.
    with open(review_path, "r", encoding="utf-8") as review_file,\
         open(output_folder/"bad_review.json", "w", encoding="utf-8") as bad_review,\
         open(output_folder/"good_review.json", "w", encoding="utf-8") as good_review:

        total_rows = get_total_rows(review_file)

        for raw_line in tqdm(review_file, total=total_rows):
            try:
                review = json.loads(raw_line)
            except json.JSONDecodeError as ex:
                print("JSONDecodeError", ex)
                continue

            # A line may be valid JSON without being an object (e.g. `null`).
            if not isinstance(review, dict):
                continue

            review_id = review.get("review_id")
            business_id = review.get("business_id")
            stars = review.get("stars")
            text = review.get("text")

            if not (review_id and text and isinstance(text, str)):
                continue
            if not (business_id and business_id in business_ids):
                continue

            # `stars` may be decoded as int or float depending on how it is
            # written in the file. int.is_integer() only exists on
            # Python >= 3.12, so go through float() to support both.
            if isinstance(stars, bool) or not isinstance(stars, (int, float)):
                continue
            if not float(stars).is_integer():
                continue
            stars = int(stars)

            if stars == 5:
                target = good_review
            elif stars in {1, 2}:
                target = bad_review
            else:
                continue

            if not set(text).issubset(printable_set):
                continue

            cleaned = _normalize_review_text(text)
            if len(cleaned) <= max_length:
                target.write(json.dumps({"review_id": review_id, "text": cleaned}))
                target.write("\n")

_________

In [7]:
# Ids of the businesses whose categories mention the RESTAURANT label;
# reads the business file located at YELP_DIR/BUSINESS_FILE.
business_ids = get_business_id_by_label(
    path=YELP_DIR / BUSINESS_FILE,
    label=RESTAURANT,
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/yelp_dataset/yelp_academic_dataset_business.json'

In [8]:
# Filter the reviews of the selected businesses into the good/bad
# JSON Lines files inside REVIEWS_FOLDER.
write_reviews_by_business_id(
    review_path=YELP_DIR / REVIEWS_FILE,
    output_folder=REVIEWS_FOLDER,
    business_ids=business_ids,
)

100%|██████████| 6990280/6990280 [04:23<00:00, 26495.97it/s]


In [9]:
# Peek at the first (up to) five records written to the bad-review file.
with open(REVIEWS_FOLDER/"bad_review.json", "r") as input_file:
    for _ in range(5):
        line = input_file.readline()
        if not line:
            # Fewer than five records in the file.
            break
        print(line)

{"review_id": "cCs7yPSyk8NdA-Ufoz_7hw", "text": "The service here has gone down. We used to go here a lot but now we go to another one. The service was slow and forgotten about."}

{"review_id": "22n_UbOS_iqKKR3IrYhNGw", "text": "I went here when if first opened, and have never desired to go back again, I was so very disappointed at the quality and the taste of the food. I would absolutely not recommend dining here."}

{"review_id": "ULzbgT5SPThiMOoo4a_Adw", "text": "Ordered Caramel frappe at Drive thru, BIG MISTAKE! Took 30 min and there were only 2 cars in front of me."}

{"review_id": "y1CYqccQLsn8GGXYS4q6tQ", "text": "They served me a frozen pizza and skunked beer. 'Nuff said."}

{"review_id": "lAjAKzVgkUHcq1aLHzDBDA", "text": "I can walk here and won't go back even if they were the only place open at 2am. I got the basic general tso's and it didn't come close to passing the test. Go to East Cuisine."}



In [10]:
# Peek at the first (up to) five records written to the good-review file.
with open(REVIEWS_FOLDER/"good_review.json", "r") as input_file:
    for _ in range(5):
        line = input_file.readline()
        if not line:
            # Fewer than five records in the file.
            break
        print(line)

{"review_id": "AqPFMleE6RsU23_auESxiA", "text": "Wow! Yummy, different, delicious. Our favorite is the lamb curry and korma. With 10 different kinds of naan!!! Don't let the outside deter you (because we almost changed our minds)...go in and try something new! You'll be glad you did!"}

{"review_id": "_ZeMknuYdlQcUqng_Im3yg", "text": "Amazingly amazing wings and homemade bleu cheese. Had the ribeye: tender, perfectly prepared, delicious. Nice selection of craft beers. Would DEFINITELY recommend checking out this hidden gem."}

{"review_id": "OAhBYw8IQ6wlfw1owXWRWw", "text": "Great place for breakfast! I had the waffle, which was fluffy and perfect, and home fries which were nice and smashed and crunchy. Friendly waitstaff. Will definitely be back!"}

{"review_id": "G_5UczbCBJriUAbxz3J7Tw", "text": "Best thai food in the area. Everything was authentic and delicious. Will definitely be back again and again."}

{"review_id": "-P5E9BYUaK7s3PwBF5oAyg", "text": "First time there and it was e