In [None]:
# Mount Google Drive at /content/drive so the dataset files referenced by the
# /content/drive/MyDrive/... paths in this notebook are reachable (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# .tar dataset extraction command:
# !tar xvzf '<path to>/yelp_dataset.tar'

# Imports

In [None]:
# IMPORTS

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import pickle

# Data cleaning and prepping helper functions

In [None]:
# DATA CLEANING AND PREPPING FUNCTIONS

# Fetch the NLTK resources this notebook asks for; the stopword corpus has to
# be present before the English stopword list can be read below.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# English stopwords consulted by stopword_removal
stop_words = stopwords.words('english')

# punctuation_removal function
def punctuation_removal(tokens):
    """Return the tokens that are not made up entirely of punctuation.

    A token is dropped when every one of its characters is in
    ``string.punctuation``; this covers single marks such as ``,`` as well
    as multi-character tokens such as ``...``, ``--`` or ``''``. Tokens that
    merely contain punctuation (``n't``, ``well-made``) are kept.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list of the surviving tokens, in their original order.
    """
    punctuation_chars = set(string.punctuation)
    # A plain `token in string.punctuation` is a substring test, which misses
    # tokens like "..." that are not a contiguous slice of that constant, so
    # test character by character instead. Empty tokens are dropped too
    # (all() of an empty sequence is True), as the substring test also did.
    return [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

# stopword_removal function
def stopword_removal(tokens):
    """Return the tokens that do not appear in the module-level ``stop_words``.

    Order is preserved; matching is exact, so callers are expected to have
    normalised the case of the tokens beforehand.
    """
    return [word for word in tokens if word not in stop_words]

def stemm(tokens):
    """Stem every token with NLTK's Porter stemmer.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list with the stemmed form of each token, in the original order.
    """
    # Build the stemmer once per call instead of once per token; constructing
    # it inside the loop repeats identical setup work for every token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

def review_clean(text):
    """Turn one raw review string into a list of cleaned, stemmed tokens.

    The text is lower-cased and tokenized, then passed through
    punctuation_removal, stopword_removal and stemm, in that order.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered)

    # each helper takes a token list and returns a new token list
    for step in (punctuation_removal, stopword_removal, stemm):
        tokens = step(tokens)

    return tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Loading Business and Review Datasets

These notebooks were created in Google Colab, so the file paths point to locations on a mounted Google Drive. If you are not running in Google Colab, replace each file path with the correct location of the corresponding file.

In [None]:
# load business dataset (line-delimited JSON: one business record per line)
business_path = "/content/drive/MyDrive/UMich Milestone II Project/Final_Code_Submission/Dataset/yelp/yelp_academic_dataset_business.json"
df_business = pd.read_json(business_path, lines=True)
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [None]:
# load reviews dataset

# NOTE: it takes a while to load (about 10 mins)

review = "/content/drive/MyDrive/UMich Milestone II Project/Final_Code_Submission/Dataset/yelp/yelp_academic_dataset_review.json"

# Read the line-delimited JSON in chunks of 10,000 records and concatenate
# them in a single call. Concatenating inside the loop copies every
# previously loaded row again for each new chunk, which is quadratic in the
# number of chunks.
chunks = pd.read_json(review, lines=True, chunksize = 10000)
chunk_list = list(chunks)
# pd.concat raises on an empty list, so fall back to an empty frame
reviews_df = pd.concat(chunk_list) if chunk_list else pd.DataFrame()

In [None]:
# preview the first rows of the loaded reviews
reviews_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


# Sampling from Review Dataset: Philadelphia

In [None]:
def get_restaurants(df):
    """Return the rows of ``df`` whose ``categories`` text contains 'Restaurants'.

    Rows with a missing ``categories`` value are treated as non-matching.
    The original index labels of the selected rows are kept.
    """
    is_restaurant = df['categories'].str.contains('Restaurants', na=False)
    return df[is_restaurant]
def get_restaurant_reviews(df, restaurants_df):
    """Return the reviews in ``df`` whose ``business_id`` occurs in ``restaurants_df``.

    The result is re-indexed from 0 and the old index is discarded.
    """
    restaurant_ids = restaurants_df['business_id']
    is_restaurant_review = df['business_id'].isin(restaurant_ids)
    return df.loc[is_restaurant_review].reset_index(drop=True)

In [None]:
# keep only the businesses categorised as restaurants, then only the reviews
# whose business_id belongs to one of those restaurants
restaurants_df = get_restaurants(df_business)
restaurant_reviews_df = get_restaurant_reviews(reviews_df, restaurants_df)

In [None]:
# print the cities, then the states, with the largest total restaurant review
# counts (sum of each restaurant's review_count, top 5 per grouping).
# The state ranking is included so that a fresh run reproduces both tables
# shown in this cell's output.
for level in ('city', 'state'):
    print(restaurants_df.groupby(level)['review_count'].sum().sort_values(ascending=False)[:5])

city
Philadelphia    665732
New Orleans     465982
Nashville       318548
Tampa           293122
Indianapolis    242024
Name: review_count, dtype: int64
state
PA    1060906
FL     763837
LA     544962
TN     423749
MO     341270
Name: review_count, dtype: int64


The city with the largest number of restaurant reviews is Philadelphia, so we will narrow our data down to only restaurants in Philadelphia.

In [None]:
# creates a dataframe of reviews for one city
def get_city_reviews(city, restaurant_reviews_df=restaurant_reviews_df, restaurants_df=restaurants_df):
    """Return the restaurant reviews whose business is located in ``city``.

    ``city`` is compared for exact equality against the ``city`` column of
    ``restaurants_df``. The two dataframe defaults are the notebook-level
    frames as they exist when this function is defined. The result is
    re-indexed from 0.
    """
    in_city = restaurants_df['city'] == city
    city_ids = restaurants_df.loc[in_city, 'business_id']
    belongs_to_city = restaurant_reviews_df['business_id'].isin(city_ids)
    return restaurant_reviews_df.loc[belongs_to_city].reset_index(drop=True)

In [None]:
# get dataframe of reviews for restaurants in Philadelphia
# (the city name must match the 'city' column value exactly)
PH_df = get_city_reviews("Philadelphia")
# save Philadelphia reviews dataframe to json file so the sampled subset can be
# reloaded later without re-reading the full review dataset
PH_df.to_json("/content/drive/MyDrive/UMich Milestone II Project/Final_Code_Submission/Dataset/philadelphia_restaurant_reviews.json")

# Cleaning Philadelphia Dataset

In [None]:
# loading dataset from the location the Philadelphia reviews JSON is written to
# by the sampling step (Dataset/philadelphia_restaurant_reviews.json); reading
# from a different folder would fail, or pick up a stale copy, on a fresh run
philadelphia_df = pd.read_json("/content/drive/MyDrive/UMich Milestone II Project/Final_Code_Submission/Dataset/philadelphia_restaurant_reviews.json")
# replacing accented characters with their base characters: NFKD separates a
# character from its combining marks, and the ASCII encode with errors='ignore'
# then drops every non-ASCII character that remains
philadelphia_df['text'] = philadelphia_df['text'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
# cleaning using data cleaning helper functions (one token list per review)
philadelphia_df['cleaned_text'] = philadelphia_df['text'].apply(review_clean)
philadelphia_df.head()

In [None]:
# save cleaned df to pickle so we don't have to repeat cleaning
# (the file holds both the 'text' and the tokenized 'cleaned_text' columns)
philadelphia_df.to_pickle("/content/drive/MyDrive/UMich Milestone II Project/Final_Code_Submission/Dataset/cleaned_philadelphia_restaurant_reviews.pkl")



---

