In [15]:
import pandas as pd
import numpy as np
import json
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [13]:


with open('yelp_academic_dataset_business.json', 'r') as file:
    data = [json.loads(line) for line in file]

df = pd.DataFrame(data)
print(df.head())

              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open  \
0  34.426679 -119.711197    5.0             7        0   
1  38.551126  -90.335695    3.0            15        1   
2  32.223236 -110.880452    3.5            22        0   
3  39.9555

In [14]:


with open('yelp_academic_dataset_review.json', 'r') as file:
    data = [json.loads(line) for line in file]

df2 = pd.DataFrame(data)
print(df2.head())

                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   

   stars  useful  funny  cool  \
0    3.0       0      0     0   
1    5.0       1      0     1   
2    3.0       0      0     0   
3    5.0       1      0     1   
4    4.0       1      0     1   

                                                text                 date  
0  If you decide to eat here, just be aware it is...  2018-07-07 22:09:11  
1  I've taken a lot of spin classes over the year...  2012-01-03 15:28:18  
2  Family diner. Had the buffet. Eclectic assortm...  2014-02-05 20:30:30  
3  Wow!  Yummy, different,  delici

In [16]:
# Download stopwords from the Natural Language Toolkit (NLTK)- a Python packaged used for Natural Language Processing (NLP)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Changing names for df and df2

In [28]:
business_data = df
review_data = df2


In [None]:
# Filter for restaurants in California (CA)
ny_restaurants = business_data[
    (business_data['state'] == 'CA')
]

In [30]:
# Merge the restaurant review data with business review data based on business id
merged_data = review_data.merge(ny_restaurants[['business_id', 'name']], on='business_id')


In [31]:
# Build a text pre-processing function to clean and preprocess review text by removing special characters, converting to lowercase, tokenizing, and removing stopwords
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Remove special characters & lowercase
    tokens = text.split()  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)



In [32]:
# Apply the text preprocessing to reviews
merged_data['cleaned_text'] = merged_data['text'].apply(preprocess_text)


In [33]:
# TF-IDF Analysis (Term Frequency-Inverse Document Frequency) identifies important words in reviews
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2)) # Keep top 1000 words, consider unigrams and bigrams
tfidf_matrix = vectorizer.fit_transform(merged_data['cleaned_text']) # Text data into numerical matrix
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()) # Convert to dataframe


In [34]:
# Extract average TF-IDF scores for high rated (4-5 stars) and low rated (1-2 stars) reviews
high_rating_tfidf = tfidf_df[merged_data['stars'] >= 4].mean()
low_rating_tfidf = tfidf_df[merged_data['stars'] <= 2].mean()


In [35]:
# Print the top words in high rated and low rated reviews
print("Top words in high-rated reviews:\n", high_rating_tfidf.sort_values(ascending=False).head(10))
print("Top words in low-rated reviews:\n", low_rating_tfidf.sort_values(ascending=False).head(10))


Top words in high-rated reviews:
 great        0.051274
place        0.037035
food         0.036686
good         0.036530
service      0.029825
best         0.027646
amazing      0.025979
delicious    0.024426
time         0.024176
love         0.023700
dtype: float64
Top words in low-rated reviews:
 food       0.037973
service    0.034107
would      0.032264
place      0.031295
like       0.031230
one        0.030449
get        0.030344
dont       0.028450
us         0.028168
never      0.027722
dtype: float64


In [36]:
# Word2Vec Analysis for word relationships and tokenize each review into a list of words
tokenized_reviews = [text.split() for text in merged_data['cleaned_text']]


In [37]:
# Word2Vec Model to learn word relationships: input tokenized text, size of word dimensions, 5 works before and after, ignore words that appear less than 5 times, and use 4 CPU threads for faster training
word2vec_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=5, workers=4)


In [38]:
# Sentiment Analysis with Valence Aware Dictionary and Sentiment Reasoner (VADER) calculating sentiment polarity scores
analyzer = SentimentIntensityAnalyzer()


In [39]:
# Function to derive sentiment score– returns compound sentiment score for the text
def get_sentiment_score(text):
    return analyzer.polarity_scores(text)['compound']


In [45]:
# Apply VADER sentiment analysis to each review
merged_data['sentiment_score'] = merged_data['text'].apply(get_sentiment_score)


In [46]:
# Compare average sentiment score for each rating
print(merged_data.groupby('stars')['sentiment_score'].mean())


stars
1.0   -0.181314
2.0    0.252823
3.0    0.617274
4.0    0.848550
5.0    0.892009
Name: sentiment_score, dtype: float64


In [47]:
# Save processed data
merged_data.to_csv('nyc_restaurant_reviews_processed.csv', index=False)
