<a href="https://colab.research.google.com/github/sterlinggutterman/CS3-DS4002/blob/main/CS3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

All code necessary for completing CS3.

## Set Up

In [None]:
# Clone the project repository; the CSV loaded later in this notebook is read
# from inside this checkout (under /content/ -- presumably the Colab working dir).
! git clone https://github.com/sterlinggutterman/CS3-DS4002

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the restaurant review data from the cloned repo, discard the leftover
# 'Unnamed: 0' column, and preview the first rows.
final_df = (
  pd.read_csv('/content/CS3-DS4002/DATA/final_restaurant_df.csv')
  .drop(columns=['Unnamed: 0'])
)
final_df.head()

## EDA

In [None]:
# Print, in order: dimensions (rows, columns), the dtype of each column, and the
# column names -- each followed by a blank line.
for overview in (final_df.shape, final_df.dtypes, final_df.columns):
  print(overview, '\n')
# quick view of the first rows
final_df.head()

In [None]:
# Histogram of the Rating column using 20 bins.
final_df['Rating'].plot.hist(bins=20, title='Rating')

In [None]:
# rating by restaurant: one box per restaurant
final_df = final_df.reset_index(drop=True)
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=final_df, x='Restaurant', y='Rating')
ax.set_title('Rating Distribution by Restaurant')
ax.set_xlabel('Restaurant')
ax.set_ylabel('Rating')
# Tilt the restaurant names and right-align them under their ticks.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# reviews by day of week
# Dates that cannot be parsed are coerced to NaT, so they get no day name and
# are left out of the counts below.
final_df['Date'] = pd.to_datetime(final_df['Date'], format='mixed', errors='coerce')
final_df['DayOfWeek'] = final_df['Date'].dt.day_name()
day_counts = final_df['DayOfWeek'].value_counts()

plt.figure(figsize=(8, 5))
ax = sns.barplot(
  x=day_counts.index,
  y=day_counts.values,
  # Fixed Monday-to-Sunday order instead of the count-sorted order of value_counts.
  order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
)
ax.set_title('Number of Reviews by Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Review Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# rating over time: mean rating for each calendar month (year-month period).
# The grouping previously used yearly periods ('Y'), which contradicted both the
# variable name and the 'Month' axis label and duplicated the yearly chart.
avg_rating_per_month = final_df.groupby(final_df['Date'].dt.to_period('M'))['Rating'].mean()

plt.figure(figsize=(12,5))
avg_rating_per_month.plot(kind='line', marker='o', title='Average Rating Over Time')
plt.xlabel('Month')
plt.ylabel('Average Rating')
plt.show()

In [None]:
# rating by year: mean rating for each calendar year
avg_rating_per_year = final_df.groupby(final_df['Date'].dt.to_period('Y'))['Rating'].mean()

plt.figure(figsize=(12,5))
# Plot the yearly series computed above (the monthly series was plotted here by mistake).
avg_rating_per_year.plot(kind='bar')
plt.title('Average Rating by Year')
plt.xlabel('Year')
plt.ylabel('Average Rating')
plt.show()

In [None]:
# rating by month: mean rating per month number, pooled across all years
avg_rating_per_month = final_df.groupby(final_df['Date'].dt.month)['Rating'].mean()

# Bar chart, one bar per month number
plt.figure(figsize=(12, 5))
ax = avg_rating_per_month.plot.bar()
ax.set_xlabel('Month')
ax.set_ylabel('Average Rating')
ax.set_title('Average Rating by Month')
plt.show()

In [None]:
# sentiment: mean rating and mean sentiment per restaurant, each sorted high to low
from IPython.display import display_html

avg_rating_by_restaurant = final_df.groupby('Restaurant')['Rating'].mean().sort_values(ascending=False)
avg_sentiment_by_restaurant = final_df.groupby('Restaurant')['sentiment'].mean().sort_values(ascending=False)

html1 = avg_rating_by_restaurant.to_frame().to_html()
html2 = avg_sentiment_by_restaurant.to_frame().to_html()

# Two <table> elements placed one after the other render stacked vertically;
# the flex container is what actually lays them out side by side.
html_side_by_side = f"""
<div style="display: flex; gap: 2em; align-items: flex-start;">
        {html1}
        {html2}
</div>
"""

display_html(html_side_by_side, raw=True)

In [None]:
# sentiment by restaurant: plain-text print of the per-restaurant mean sentiment Series
print(avg_sentiment_by_restaurant)

In [None]:
# sentiment vs. rating: mean sentiment score for each distinct Rating value
final_df.groupby('Rating')['sentiment'].mean()

In [None]:
# sentiment correlations: pairwise Series.corr between sentiment, Rating and
# 'Author Reviews' (labelled "Number of Reviews" in the table -- presumably the
# reviewer's total review count; confirm against the data dictionary).
sentiment_rating = final_df['sentiment'].corr(final_df['Rating'])
sentiment_number = final_df['sentiment'].corr(final_df['Author Reviews'])
rating_number = final_df['Rating'].corr(final_df['Author Reviews'])
# Column header fixed: it was misspelled 'Correlaltion' in the displayed table.
correlation = pd.DataFrame({
  'Feature': ['Sentiment vs Rating', 'Sentiment vs Number of Reviews', 'Rating vs Number of Reviews'],
  'Correlation': [sentiment_rating, sentiment_number, rating_number],
})
correlation

In [None]:
# violin plot: horizontal violins of sentiment grouped by Rating, with the
# y-axis direction flipped afterwards
violin_ax = sns.violinplot(data=final_df, x='sentiment', y='Rating', orient='h')
violin_ax.invert_yaxis()
plt.show()

## Regression

In [None]:
import statsmodels.api as sm

# regression analysis: OLS of per-restaurant mean rating on per-restaurant mean sentiment
avg_rating_by_restaurant = final_df.groupby('Restaurant')['Rating'].mean()
avg_sentiment_by_restaurant = final_df.groupby('Restaurant')['sentiment'].mean()
# keys= names the two columns directly while concatenating the Series side by side.
avg_df = pd.concat(
  [avg_rating_by_restaurant, avg_sentiment_by_restaurant],
  axis=1,
  keys=['avg_rating', 'avg_sentiment'],
)

# Column vectors (n, 1): predictor = sentiment, response = rating.
x = avg_df['avg_sentiment'].to_numpy().reshape(-1, 1)
y = avg_df['avg_rating'].to_numpy().reshape(-1, 1)

# add_constant supplies the intercept column used by the fit.
X = sm.add_constant(x)
model = sm.OLS(y, X).fit()
print(model.summary())

# Fitted line evaluated at the observed sentiment values.
beta0, beta1 = model.params[0], model.params[1]
yhat = beta0 + beta1 * x

In [None]:
# regression plot
# Scatter from avg_df (the frame the model was fit on) so each point's sentiment
# and rating are guaranteed to belong to the same restaurant. The standalone
# avg_*_by_restaurant Series are also bound, independently sorted, by an earlier
# cell, and plt.scatter pairs values by position -- so using them here could
# silently mismatch restaurants if cells are run out of order.
plt.scatter(avg_df['avg_sentiment'], avg_df['avg_rating'], label='Data')
plt.xlabel('Average Sentiment')
plt.ylabel('Average Rating')
plt.title('Average Sentiment vs Average Rating')
plt.plot(x, yhat, label='Regression Line')
plt.legend(loc='lower right')
plt.show()

## Word Sentiment

In [None]:
# import new packages
!pip install nltk
!pip install contractions
import nltk

nltk.download('punkt_tab')
nltk.download("vader_lexicon")

from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import contractions

In [17]:
# Split reviews by sentiment score: above 0.05 counts as positive, below -0.05
# as negative; scores in between (bounds included) land in neither frame.
positive_reviews = final_df.loc[final_df['sentiment'] > 0.05].copy()
negative_reviews = final_df.loc[final_df['sentiment'] < -0.05].copy()

In [None]:
# Filler words (and, my, i, but, etc.) to exclude from reviews: fetch NLTK's
# stop-word corpus and keep the English list as a set.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

In [19]:
# fix contractions (don't, etc.)
def fix_contractions(text):
  """Expand contractions in ``text`` via ``contractions.fix``.

  Args:
    text: the review text; non-string values are converted with ``str()``.

  Returns:
    The text with contractions expanded, or ``""`` when ``text`` is missing
    (``None`` or a float NaN).
  """
  # A missing review used to be stringified to the literal "nan", which was then
  # tokenized and counted as a real word. NaN is the only float unequal to itself.
  if text is None or (isinstance(text, float) and text != text):
    return ""
  return contractions.fix(str(text))

In [20]:
# Rewrite the Message column of both frames with contractions expanded.
positive_reviews['Message'] = positive_reviews['Message'].map(fix_contractions)
negative_reviews['Message'] = negative_reviews['Message'].map(fix_contractions)

In [21]:
# extract most popular words
def words(reviews, n=25):
  """Return the ``n`` most frequent meaningful tokens across ``reviews``.

  Each review is lower-cased and split with ``word_tokenize``. A token is
  counted only if it is longer than two characters, is not in the module-level
  ``stop_words`` set, and contains at least one letter.

  Args:
    reviews: iterable of review strings.
    n: how many (word, count) pairs to return.

  Returns:
    List of ``(word, count)`` tuples in ``Counter.most_common`` order.
  """
  counts = Counter()
  for review in reviews:
    counts.update(
        token for token in word_tokenize(review.lower())
        if len(token) > 2
        and token not in stop_words
        # Length alone lets punctuation/number tokens such as "..." or "100"
        # through; require at least one alphabetic character.
        and any(ch.isalpha() for ch in token))
  return counts.most_common(n)

In [22]:
# Top-25 word lists: positive reviews first, then negative reviews.
positive_words, negative_words = (
  words(frame['Message'], n=25)
  for frame in (positive_reviews, negative_reviews)
)

In [None]:
# Print both (word, count) lists, each on its own labelled line.
print(f"common positive words: {positive_words}")
print(f"common negative words: {negative_words}")

In [23]:
# visualize as word cloud
from wordcloud import WordCloud

def wordcloud(words):
  """Draw a word cloud on a white background from word frequencies.

  Args:
    words: anything ``dict()`` accepts, e.g. a list of ``(word, count)``
      pairs; the resulting mapping is passed to
      ``WordCloud.generate_from_frequencies``.
  """
  frequencies = dict(words)
  cloud = WordCloud(background_color='white')
  cloud.generate_from_frequencies(frequencies)
  plt.imshow(cloud)
  # Hide the axes around the rendered image.
  plt.axis('off')
  plt.show()

In [None]:
# positive reviews: word cloud of the most common words in reviews with sentiment above 0.05
wordcloud(positive_words)

In [None]:
# negative reviews: word cloud of the most common words in reviews with sentiment below -0.05
wordcloud(negative_words)