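This notebook extracts restaurant reviews from the Yelp dataset. We process the data and write it to a CSV file (shape: 3000x1), where each row corresponds to a different restaurant and holds up to 10 of its reviews, concatenated into a single string. For each restaurant the reviews with the highest `useful` vote counts are selected, restaurants are kept only when the concatenated text falls within a fixed character-length range, and non-ASCII characters are removed from the text.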

In [None]:
import json
import csv
import pandas as pd

In [None]:
# minimum review_count a restaurant must have to be kept
NUM_REVIEWS_PER_RESTAURANT = 30

# number of rows (restaurants) kept in the final output
NUM_RESTAURANTS = 3000

# NOTE(review): not referenced by any cell visible in this notebook -- confirm whether it is still needed
USEFUL_THRESHOLD = 2

# inclusive bounds, in characters, on a restaurant's concatenated review text
TOTAL_REVIEWS_MIN_LENGTH = 5000
TOTAL_REVIEWS_MAX_LENGTH = 7500

In [None]:
# Mount Google Drive before touching any path under /content/drive: on a fresh
# runtime the dataset files read below are not reachable until the mount has
# happened, so mounting only at the end of the notebook breaks "Run all".
# NOTE(review): calling mount again when the drive is already mounted is
# expected to be harmless -- confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/drive')

# load businesses (one JSON object per line), convert to data frame
businesses_file = "/content/drive/MyDrive/review-summarizer/yelp_dataset/yelp_academic_dataset_business.json"
businesses_df = pd.read_json(businesses_file, lines=True)
print(businesses_df.shape) # 150346 x 14
businesses_df.head()

In [None]:
print(f"Before: {businesses_df.shape}")

# keep only the businesses whose categories field is present
has_categories = businesses_df['categories'].notnull()
businesses_df = businesses_df[has_categories]
print(f"After categories exists: {businesses_df.shape}")

# keep only the businesses whose categories string contains 'Restaurants'
is_restaurant = businesses_df['categories'].str.contains('Restaurants')
businesses_df = businesses_df[is_restaurant]
print(f"After only restaurants: {businesses_df.shape}")

# keep only the businesses with at least NUM_REVIEWS_PER_RESTAURANT reviews
has_enough_reviews = businesses_df['review_count'] >= NUM_REVIEWS_PER_RESTAURANT
businesses_df = businesses_df[has_enough_reviews]
print(f"After review count restaurants: {businesses_df.shape}")

In [None]:
# drop the business columns that the remaining cells never read
unused_business_columns = [
    'latitude', 'longitude', 'is_open', 'attributes',
    'hours', 'categories', 'review_count',
]
businesses_df = businesses_df.drop(columns=unused_business_columns)
businesses_df.head()

In [None]:
# the review file holds one JSON object per line; it is parsed line by line
# here rather than loaded straight into a data frame
review_file = '/content/drive/MyDrive/review-summarizer/yelp_dataset/yelp_academic_dataset_review.json'

# load json objects into array; the `with` block closes the file handle as
# soon as parsing finishes or fails, instead of leaving it open for the
# rest of the session
review_data_array = []
with open(review_file, 'r', encoding='utf-8') as review_data:
    for line in review_data:
        # skip blank lines (e.g. a trailing newline) rather than crash in json.loads
        if line.strip():
            review_data_array.append(json.loads(line))

print(len(review_data_array))
print(review_data_array[0]) # each entry = 1 json

In [None]:
# build a data frame with one row per parsed review object
reviews_df = pd.DataFrame(data=review_data_array)
print(reviews_df.shape)
reviews_df.head()

In [None]:
# inner-join the reviews with the filtered businesses on business_id,
# so only reviews of the remaining restaurants survive
merged_df = reviews_df.merge(businesses_df, how='inner', on='business_id')
merged_df.head()

In [None]:
# drop every merged column that the remaining cells never read
unused_merged_columns = [
    'review_id', 'user_id', 'stars_x', 'funny', 'cool', 'date',
    'name', 'address', 'city', 'state', 'postal_code', 'stars_y',
]
merged_df = merged_df.drop(columns=unused_merged_columns)
merged_df.head()

In [None]:
# discard every row that contains at least one missing value
merged_df = merged_df.dropna(axis=0, how='any')
merged_df.head()

In [None]:
# order the reviews by business_id, then by useful votes (highest first)
reviews_by_usefulness = merged_df.sort_values(
    by=["business_id", "useful"], ascending=[True, False]
)

# for each business, collect the text of its first 10 reviews into a list
top_reviews_per_business = (
    reviews_by_usefulness
    .groupby("business_id")["text"]
    .apply(lambda texts: texts.head(10).tolist())
)

# discard the business_id index and expose the lists as a one-column frame
merged_df = (
    top_reviews_per_business
    .reset_index(level=0, drop=True)
    .to_frame(name="Top 10 Reviews")
)
merged_df.head()

In [None]:
# turn each restaurant's list of reviews into one space-separated string
merged_df['Top 10 Reviews'] = merged_df['Top 10 Reviews'].map(' '.join)

In [None]:
# record the character length of each concatenated review string
merged_df['Review_Lengths'] = merged_df['Top 10 Reviews'].map(len)
print(merged_df.shape)
merged_df.head()

In [None]:
# filter by min and max threshold (both bounds are inclusive)
length_in_range = merged_df['Review_Lengths'].between(
    TOTAL_REVIEWS_MIN_LENGTH, TOTAL_REVIEWS_MAX_LENGTH
)
filtered_df = merged_df.loc[length_in_range]
print(filtered_df.shape)

# the preview must be the cell's last expression to be rendered; placed
# before the print, its value was silently discarded
filtered_df.head()

In [None]:
# strip every non-ASCII character from the concatenated reviews:
# encoding to ASCII with errors='ignore' drops exactly the characters whose
# code point is 128 or above
ascii_reviews = (
    filtered_df['Top 10 Reviews']
    .str.encode('ascii', errors='ignore')
    .str.decode('ascii')
)

# keep filtered_df a one-column data frame rather than rebinding it to a
# Series, so this cell can be re-run without failing on the column lookup;
# the later .head() and .to_csv(index=False) calls work on the frame as well
filtered_df = ascii_reviews.to_frame(name='Top 10 Reviews')

In [None]:
# keep only the first NUM_RESTAURANTS rows of filtered_df
final_df = filtered_df.iloc[:NUM_RESTAURANTS]
final_df.head()

In [None]:
from google.colab import drive

# mount Google Drive at /content/drive, the location the csv is written to
drive.mount('/content/drive')

# write the final reviews to a csv file, leaving out the index column
output_csv = '/content/drive/MyDrive/review-summarizer/final_reviews.csv'
final_df.to_csv(output_csv, index=False)