# Analyzing Review Text
Extract the sentiment (positive or negative) and gain insight from Yelp review text. 

In [1]:
import json

json_file_path = 'yelp.json'

# Open and read the JSON file
with open(json_file_path, 'r') as json_file:
    data = json.load(json_file)

stars = [row['stars'] for row in data]
print(data[0]['text'])
print(stars[:2])

I don't know what Dr. Goldberg was like before  moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. You deserve better and they will not be there when you really need

Build a linear model predicting the star rating based on the text reviews. 
Apply the bag-of-words model using the CountVectorizer to produce a feature matrix giving the counts of each word in each review.

In [2]:
# predicting the star rating based on the text reviews
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# convert dict to df
class ToDataFrame(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # This transformer doesn't need to learn anything about the data,
        # so it can just return self without any further processing
        return self
    
    def transform(self, X):
        # Return a pandas data frame from X
        X = pd.DataFrame.from_dict(X)
        return X

#check the function works and give us the right format
to_data_frame = ToDataFrame()
X_t = to_data_frame.fit_transform(data[:5])
print((X_t == pd.DataFrame(data[:5])).all(axis=None))

selector = ColumnTransformer(
    transformers=[('text', to_data_frame, ['text'])
])
expected = np.array([data[0]['text']])

# Check the selector returns just two columns, the latitude and longitude
print((selector.fit_transform(X_t)[0] == expected).all())

True
True


In [3]:
#tune and get hyperparameter for countvectorizer

from sklearn.feature_extraction.text import CountVectorizer

X = data
y = [row['stars'] for row in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

to_data_frame = ToDataFrame()
ridge = Ridge()
text_vectorizer = CountVectorizer()

param_grid = {
    'ct1__text__text_vectorizer__max_df': [0.1, 0.25, 0.5, 0.6],
    'ct1__text__text_vectorizer__min_df': [5,10,20,30]
    
    } 

pipe1 = Pipeline([ 
    ('text_vectorizer', text_vectorizer)])

ct1 = ColumnTransformer([('text', pipe1, 'text')])

pipe2 = Pipeline(
    [('to_data_frame', to_data_frame), 
     ('ct1', ct1), 
     ('ridge', ridge)])

gs = GridSearchCV(pipe2, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, error_score='raise')

gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)



-0.9223282322731482
{'ct1__text__text_vectorizer__max_df': 0.6, 'ct1__text__text_vectorizer__min_df': 30}


In [4]:
# tune and get hyperparameter for ridge regression

# selector = ColumnTransformer(
#     transformers=[('text_vectorizer', text_vectorizer, ['text'])
# ])

# pipe = Pipeline(
#     [('to_data_frame', to_data_frame), 
#      ('select', selector), 
#      ('ridge', ridge)])


# pipe.fit(X_train, y_train)
# pipe.predict(X_train[:20])

# gs = GridSearchCV(pipe2, {'ridge__alpha': np.logspace(-1.0, 1.0, num=7)})
# gs.fit(X_train, y_train)
# print(gs.best_params_)

#{'ridge__alpha': 10.0}

In [5]:
# Put in final hyperparameter and train the model
text_vectorizer = CountVectorizer(max_df=0.6, min_df=30)
ridge = Ridge(alpha=10)

X = data
y = [row['stars'] for row in data]

X_train, X_test, y_train, y_test = train_test_split(data, stars, test_size=0.33)

pipe1 = Pipeline([ 
    ('text_vectorizer', text_vectorizer)])

ct1 = ColumnTransformer([('text', pipe1, 'text')])


pipe2 = Pipeline(
    [('to_data_frame', to_data_frame), 
     ('ct1', ct1), 
     ('ridge', ridge)])

bag_of_words_model = pipe2.fit(X_train, y_train)
bag_of_words_model.predict(X_test)

bag_of_words_model.fit(data, stars)

In [6]:
from sklearn.metrics import mean_squared_error, r2_score
# Predict on the test data
predictions = bag_of_words_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f"MSE: {mse}, RMSE: {rmse}, R² score: {r2}")
# About 47.7% of the variance in the target variable can be explained by the features used in the model

MSE: 0.7450471336743405, RMSE: 0.8631611284542072, R² score: 0.559313122706556


Bigram_model
Consider word pairs and use a vectorizer(TfidfVectorizer) that applies normalization to avoid overfitting.

In [7]:
# idf inverse document frequency: the more this word happens among document, the less important it is, 
# coz just a generic word

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

to_data_frame = ToDataFrame()
ridge = Ridge()
text_vectorizer = CountVectorizer(ngram_range=(2, 2))
tfidf_transformer = TfidfTransformer()  # Add TfidfTransformer

X = data
y = [row['stars'] for row in data]

X_train, X_test, y_train, y_test = train_test_split(data, stars, test_size=0.33)



pipe1 = Pipeline([ 
    ('text_vectorizer', text_vectorizer),
    ('tfidf_transformer', tfidf_transformer)
])

ct1 = ColumnTransformer([('text', pipe1, 'text')])


pipe2 = Pipeline(
    [('to_data_frame', to_data_frame), 
     ('ct1', ct1), 
     ('ridge', ridge)])


bigram_model = pipe2.fit(X_train, y_train)
bigram_model.predict(X_test)

array([5.24013401, 4.26779933, 3.96754028, ..., 2.11079104, 3.84066754,
       2.62935204])

In [8]:
# Predict on the test data
predictions = bigram_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f"MSE: {mse}, RMSE: {rmse}, R² score: {r2}")
# About 63.9% of the variance in the target variable can be explained by the features used in the model, much better!

MSE: 0.6091988671587235, RMSE: 0.7805119263398372, R² score: 0.6399759926056632


Find the top 25 "polarizing words" in the corpus of reviews for both positive and negative.

In [9]:
# Keep only the "most polar" reviews
polar_data = [row for row in data if row.get('stars') == 1 or row.get('stars') == 5]
polar_star = [row['stars'] for row in polar_data]
set(polar_star)

{1, 5}

In [10]:
# Use the naive Bayes model, MultinomialNB, TF-IDF weighting, remove stop words.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
#from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.de.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Filter the collection to only keep one-star and five-star reviews
polar_data = [row for row in data if row.get('stars') == 1 or row.get('stars') == 5]

# Convert the polar data to a DataFrame
polar_df = pd.DataFrame(polar_data)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(polar_df['text'], polar_df['stars'], test_size=0.2, random_state=42)



# Create a pipeline with TF-IDF weighting, removing stop words, and Multinomial Naive Bayes
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(STOP_WORDS))),
    ('clf', MultinomialNB())
])


# Train the model
text_clf.fit(X_train, y_train)

# Get feature names from TF-IDF vectorizer
feature_names = text_clf.named_steps['tfidf'].get_feature_names_out()

# Get the log probabilities from the Multinomial Naive Bayes model
log_probs = text_clf.named_steps['clf'].feature_log_prob_

# Combine feature names with log probabilities
word_log_prob_pairs = zip(feature_names, log_probs[1] - log_probs[0])  # Calculate the difference for positive class

# Sort the pairs by log probability values (ascending for least positive, descending for most positive)
sorted_word_log_probs = sorted(word_log_prob_pairs, key=lambda x: x[1], reverse=True)

# Extract the top 25 most positive and least positive words
most_positive_words = [word for word, _ in sorted_word_log_probs[:25]]
least_positive_words = [word for word, _ in sorted_word_log_probs[-25:]]

# Display the results
print('Most Positive Words:', most_positive_words)
print('Least Positive Words:', least_positive_words)

Most Positive Words: ['perfection', 'delicious', 'fantastic', 'gem', 'yummy', 'impeccable', 'amazing', 'excellent', 'yum', 'delish', 'perfect', 'outstanding', 'awesome', 'refreshing', 'notch', 'incredible', 'wonderful', 'perfectly', 'terrific', 'superb', 'favorite', 'loved', 'heaven', 'favorites', 'divine']
Least Positive Words: ['disgusted', 'lukewarm', 'lousy', 'rancid', 'crooks', 'unfriendly', 'blamed', 'inedible', 'rudest', 'insult', 'disrespectful', 'terrible', 'refund', 'awful', 'disgusting', 'rude', 'tasteless', 'rudely', 'incompetent', 'poisoning', 'horrible', 'unhelpful', 'unprofessional', 'unacceptable', 'worst']


Look over all reviews of restaurants. We want to find collocations --- that is, bigrams that are "special" and appear more often than you'd expect from chance. Find word pairs that are unlikely to occur consecutively based on the underlying probability of their words: Basically find high p(w1w2)/p(w1)p(w2) for Top 100 bigrams 

In [11]:
json_file_path = 'biz_data.json'

# Open and read the JSON file
with open(json_file_path, 'r') as json_file:
    business_data = json.load(json_file)

len(business_data)

37938

In [12]:
restaurants = [business for business in business_data if 'categories' in business and any('restaurants' in category.lower() for category in business['categories'])]
restaurant_ids = [b['business_id'] for b in restaurants]
restaurant_reviews = [row['text'] for row in data if row['business_id'] in restaurant_ids]
len(restaurant_ids) #12876

12876

In [13]:
restaurant_data = [row for row in data if row['business_id'] in restaurant_ids]
restaurant_data[0]

{'votes': {'funny': 6, 'useful': 0, 'cool': 0},
 'user_id': 'ZYaumz29bl9qHpu-KVtMGA',
 'review_id': 'ow1c4Lcl3ObWxDC2yurwjQ',
 'stars': 4,
 'date': '2009-05-04',
 'text': "If you like lot lizards, you'll love the Pine Cone!",
 'type': 'review',
 'business_id': 'JwUE5GmEO-sH1FuwJgKBlQ'}

In [14]:
business_data[0]

{'business_id': 'vcNAWiLM4dR7D2nwwJ7nCA',
 'full_address': '4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018',
 'hours': {'Tuesday': {'close': '17:00', 'open': '08:00'},
  'Friday': {'close': '17:00', 'open': '08:00'},
  'Monday': {'close': '17:00', 'open': '08:00'},
  'Wednesday': {'close': '17:00', 'open': '08:00'},
  'Thursday': {'close': '17:00', 'open': '08:00'}},
 'open': True,
 'categories': ['Doctors', 'Health & Medical'],
 'city': 'Phoenix',
 'review_count': 7,
 'name': 'Eric Goldberg, MD',
 'neighborhoods': [],
 'longitude': -111.983758,
 'state': 'AZ',
 'stars': 3.5,
 'latitude': 33.499313,
 'attributes': {'By Appointment Only': True},
 'type': 'business'}

In [15]:
import pandas as pd
res_df = pd.DataFrame(restaurant_data)
res_df.head()

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id
0,"{'funny': 6, 'useful': 0, 'cool': 0}",ZYaumz29bl9qHpu-KVtMGA,ow1c4Lcl3ObWxDC2yurwjQ,4,2009-05-04,"If you like lot lizards, you'll love the Pine ...",review,JwUE5GmEO-sH1FuwJgKBlQ
1,"{'funny': 0, 'useful': 0, 'cool': 0}",EEYwj6_t1OT5WQGypqEPNg,4iPPOQIo5Mr1NAUPUgCUrQ,4,2011-03-31,Only went here once about a year and a half ag...,review,JwUE5GmEO-sH1FuwJgKBlQ
2,"{'funny': 0, 'useful': 1, 'cool': 0}",MnXcXwr0keJpkIiwuPsOKg,_utPYHIdXeq8CqQ4iYD1bw,3,2012-01-08,Ate a Saturday morning breakfast at the Pine C...,review,JwUE5GmEO-sH1FuwJgKBlQ
3,"{'funny': 0, 'useful': 1, 'cool': 0}",wC8r-m6KHifL6R2i8ok8yg,gksnzyc9jQ9hNXESjvTrQw,3,2012-08-26,This is definitely not your usual truck stop. ...,review,JwUE5GmEO-sH1FuwJgKBlQ
4,"{'funny': 0, 'useful': 0, 'cool': 0}",RvweNJFVkR3ttkWsIBy7nQ,PCa_K6ijV3Tzbp6nouEiJQ,4,2014-03-13,I like this location better than the one near ...,review,JwUE5GmEO-sH1FuwJgKBlQ


In [16]:
# Use CountVectorizer to compute the word and bigram frequencies for the documents in the corpus

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

CV = CountVectorizer(min_df = 10)
count_vector = CV.fit_transform(restaurant_reviews)
print(count_vector.shape)

CV2 = CountVectorizer(ngram_range = (2, 2), min_df = 10)
count_vector2 = CV2.fit_transform(restaurant_reviews)
print(count_vector2.shape)

(143361, 19632)
(143361, 155628)


In [17]:
# Compute the total word frequencies and bigram frequencies across all documents in the corpus 
# Then convert the results into single-column dataframes and then dictionaries

count_vector.toarray()
count_vector.sum(axis = 0)
pd.DataFrame(count_vector.sum(axis = 0), columns = CV.get_feature_names_out())
word_freqs_df = pd.DataFrame(count_vector.sum(axis = 0), columns = CV.get_feature_names_out()).T
word_freqs_df
word_freqs_dict = word_freqs_df.to_dict()[0]
bigram_freqs_df = pd.DataFrame(count_vector2.sum(axis = 0), columns = CV2.get_feature_names_out()).T
bigram_freqs_df
bigram_freqs_dict = bigram_freqs_df.to_dict()[0]

In [18]:
# Compute the total number of words, unique words, and bigrams across all documents; 
# Calculate the uniform word probability as 1/total unique words

CV_all = CountVectorizer()
count_vector_all = CV_all.fit_transform(restaurant_reviews)
total_words = count_vector_all.sum()
total_unique_words = count_vector_all.shape[1]

CV2_all = CountVectorizer(ngram_range = (2, 2))
count_vector2_all = CV2_all.fit_transform(restaurant_reviews)
total_bigrams = count_vector2_all.sum()

print(f"There are totally {total_words} words and {total_bigrams} bigrams in the reviews.  The number of unique words is {total_unique_words}.")

There are totally 17077355 words and 16934005 bigrams in the reviews.  The number of unique words is 87880.


$\dfrac{f}{17077355} \cdot (1 - s) + \dfrac{1}{87880} \cdot s \approx \dfrac{f+30}{17077355}$.

Let $f = 10$, we have:

$\dfrac{10}{17077355} \cdot (1 - s) + \dfrac{1}{87880} \cdot s \approx \dfrac{40}{17077355}$,

$\dfrac{10}{17077355} \cdot (-s) + \dfrac{1}{87880} \cdot s \approx \dfrac{30}{17077355}$,

$\left(\dfrac{1}{87880} - \dfrac{10}{17077355}\right)s \approx \dfrac{30}{17077355}$,

$s \approx 0.16275525810789912$.

In [19]:
smoothing = (30/17077355) / (1/87880 - 10/17077355)
print(smoothing)

uniform_word_prob = 1 / total_unique_words
print(f"Uniform word probability: {uniform_word_prob}")

# Check that each bigram is contain exactly one space
import collections
bigrams = list(bigram_freqs_dict.keys())
space_counts = collections.Counter([sum(ch == ' ' for ch in bigram) for bigram in bigrams])
space_counts
space_counts[1] == len(bigrams)

0.16275525810789912
Uniform word probability: 1.137915339098771e-05


True

In [20]:
# Compute the statistic for each bigram, with the specified smoothing_factor applied

def get_top100(smoothing_factor: float) -> list:
    prob_ratios = []
    
    for i in range(len(bigrams)):
        w1w2 = bigrams[i]
        w1 = bigrams_split[i][0]
        w2 = bigrams_split[i][1]
        
        p_w1w2 = bigram_freqs_dict[w1w2] / total_bigrams
        p_w1 = (word_freqs_dict[w1] / total_words) * (1. - smoothing_factor) + uniform_word_prob * smoothing_factor
        p_w2 = (word_freqs_dict[w2] / total_words) * (1. - smoothing_factor) + uniform_word_prob * smoothing_factor
        
        prob_ratios.append(p_w1w2 / (p_w1 * p_w2))
    
    prob_ratios_df = pd.DataFrame(prob_ratios, index = bigrams)
    prob_ratios_df = prob_ratios_df.sort_values(by = 0, ascending = False)
    
    top100 = list(prob_ratios_df.index[:100])
    
    return top100

In [21]:
bigrams_split = [bigram.split(' ') for bigram in bigrams]

smoothing_factors = [0.16275525810789912, 0.75, 0.85, 0.95, 0.99]
for sf in smoothing_factors:
    print(f"Smoothing factor: {sf}")
    print(f"Top 100 bigrams: {get_top100(sf)}\n")

Smoothing factor: 0.16275525810789912
Top 100 bigrams: ['rula bula', 'knick knacks', 'dac biet', 'ropa vieja', 'feng shui', 'cien agaves', 'gulab jamun', 'himal chuli', 'tammie coe', 'itty bitty', 'riff raff', 'khai hoan', 'roka akor', 'patatas bravas', 'nanay gloria', 'baskin robbins', 'puerto rican', 'reina pepiada', 'chicha morada', 'wal mart', 'dueling pianos', 'hoity toity', 'haricot vert', 'tutti santi', 'hodge podge', 'luc lac', 'lomo saltado', 'bradley ogden', 'nuoc mam', 'valle luna', 'hu tieu', 'alain ducasse', 'vice versa', 'porta alba', 'har gow', 'kao tod', 'pina colada', 'krispy kreme', 'artery clogging', 'pura vida', 'ore ida', 'chino bandido', 'sous vide', 'celine dion', 'holyrood 9a', 'lloyd wright', 'pin kaow', 'harry potter', 'molecular gastronomy', 'ping pang', 'casey moore', 'malai kofta', 'deja vu', 'cochinita pibil', 'aguas frescas', 'kilt lifter', 'lactose intolerant', 'hors oeuvres', 'moscow mule', 'ama ebi', 'yada yada', 'thit nuong', 'womp womp', 'yadda yadda

In [22]:
top100 = get_top100(smoothing) # smoothing = 0.16275525810789912
print(top100)

['rula bula', 'knick knacks', 'dac biet', 'ropa vieja', 'feng shui', 'cien agaves', 'gulab jamun', 'himal chuli', 'tammie coe', 'itty bitty', 'riff raff', 'khai hoan', 'roka akor', 'patatas bravas', 'nanay gloria', 'baskin robbins', 'puerto rican', 'reina pepiada', 'chicha morada', 'wal mart', 'dueling pianos', 'hoity toity', 'haricot vert', 'tutti santi', 'hodge podge', 'luc lac', 'lomo saltado', 'bradley ogden', 'nuoc mam', 'valle luna', 'hu tieu', 'alain ducasse', 'vice versa', 'porta alba', 'har gow', 'kao tod', 'pina colada', 'krispy kreme', 'artery clogging', 'pura vida', 'ore ida', 'chino bandido', 'sous vide', 'celine dion', 'holyrood 9a', 'lloyd wright', 'pin kaow', 'harry potter', 'molecular gastronomy', 'ping pang', 'casey moore', 'malai kofta', 'deja vu', 'cochinita pibil', 'aguas frescas', 'kilt lifter', 'lactose intolerant', 'hors oeuvres', 'moscow mule', 'ama ebi', 'yada yada', 'thit nuong', 'womp womp', 'yadda yadda', 'scantily clad', 'demi glace', 'duct tape', 'lindo m