In [1]:
# global imports

import pandas as pd
import os
from google.colab import drive
import json
import re
import nltk
import string

In [2]:
# load the dataset to the colab

drive.mount('/content/drive')
dataset_path = '/content/drive/MyDrive/yelp_dataset/'

reviews = dataset_path + "yelp_academic_dataset_review.json"
business = dataset_path + "yelp_academic_dataset_business.json"


dataset = dataset_path + "yelp_data.json"
data_cleaned = dataset_path + 'yelp_clean.json'

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

contraction_mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "might've": "might have",
    "must've": "must have",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'll": "that will",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'd": "what did",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where'll": "where will",
    "where's": "where is",
    "who'd": "who did",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why'll": "why will",
    "why's": "why is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# **Note:** Only for one-time execution

# Condense the dataset to only Restuarents and Food stalls


food_related_items = ['Chicken Wings', 'Bakeries', 'Creperies', 'Asian Fusion', 'Dumplings', 'Chocolatiers & Shops', 'Indian', 'Hot Tub & Pool', 'Ethiopian', 'Hawaiian', 'Poke', 'Soup', 'Vietnamese', 'Waffles', 'Falafel', 'Vegetarian', 'Cheesesteaks', 'Caribbean', 'Pita', 'Sushi Bars', 'Soup', 'Halal', 'Turkish', 'Chinese', 'Japanese Curry', 'Breweries', 'Taiwanese', 'Russian', 'Thai', 'Afghan', 'Tex-Mex', 'Iberian', 'Peruvian', 'Salvadoran', 'Laotian', 'Korean', 'Mexican', 'Dim Sum', 'Hakka', 'Venezuelan', 'Pakistani', 'Malaysian', 'Brazilian', 'Colombian', 'Cajun/Creole', 'Bubble Tea', 'Kebab', 'Trinidadian', 'Cambodian', 'Japanese', 'Tapas Bars', 'Tapas/Small Plates', 'Greek', 'restaurants', 'food']

food_related_business_ids = {}
with open(business, "r") as f:
  for i in f:
    json_data = json.loads(i)
    if json_data["categories"] != None:
      for j in food_related_items:
        if j in json_data["categories"]:
          food_related_business_ids[json_data["business_id"]] = json_data["name"]


def filter_specific_ids(file_name, out_file_name):
    out_file = open(out_file_name, "w+")
    start = True
    with open(file_name, "r") as f:
        out_file.write('[')
        for i in f:
            if json.loads(i)["business_id"] in food_related_business_ids:
              if start:
                  start = False
              else:
                  out_file.write(",")
              data = json.loads(i)
              data["business_name"] = food_related_business_ids[data["business_id"]]
              del data["review_id"]
              del data["user_id"]
              del data["business_id"]
              del data["date"]
              out_file.write(json.dumps(data))
    out_file.write(']')
    out_file.close()


filter_specific_ids(reviews, dataset)

In [4]:
# **Note:** Only for one-time execution

# cleanse the data
dataframe = pd.read_json(dataset)
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

def cleanse_data(d):

  #convert to lower case
  o = d.lower()

  # replace contractions with full forms
  for j in contraction_mapping.keys():
    o = o.replace(j, contraction_mapping[j])

  # replace extra white spaces, special characters
  o = re.sub(r'\s+', ' ', o)
  o = re.sub(r'[^a-zA-Z0-9\s]', '', o)

  # remove all new lines
  o = o.replace("\n", "")

  # remove stop words and lemmatize the tokens
  tokens = o.split()
  cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
  o = ' '.join(cleaned_tokens)

  # remove punctuation
  o = o.translate(str.maketrans('', '', string.punctuation))
  return o

dataframe["text"] = dataframe["text"].apply(cleanse_data)
dataframe.to_json(data_cleaned)

In [6]:
# Run to load the dataset

json_data = pd.read_json(data_cleaned)
print(json_data)

         stars  useful  funny  cool  \
0            5       1      0     1   
1            1       1      2     1   
2            3       0      0     0   
3            4       0      2     0   
4            5       0      0     0   
...        ...     ...    ...   ...   
2693380      3       1      0     0   
2693381      3       2      0     2   
2693382      4       2      0     1   
2693383      1       0      0     0   
2693384      4       3      0     2   

                                                      text  \
0        wow yummy different delicious favorite lamb cu...   
1        long term frequent customer establishment went...   
2        party 6 hibachi waitress brought separate sush...   
3        bun make sonoran dog like snuggie pup first se...   
4        tremendous service big shout douglas complemen...   
...                                                    ...   
2693380  excited food saw unfortunately place close ear...   
2693381  later yelp ive love place 

In [None]:
## Topic modelling