# Data Loading & Sampling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Apply the ggplot theme to every figure drawn in this notebook.
plt.style.use('ggplot')

# Full Yelp review table.
df = pd.read_csv('/content/yelp.csv')

# Reproducible random subset, capped at 2000 rows (fewer if the file is smaller).
sample_size = min(2000, len(df))
reviews = df.sample(n=sample_size, random_state=42)

print("Sampled reviews:", len(reviews))



# Sentiment Model (RoBERTa)

Using a pretrained RoBERTa transformer fine-tuned for sentiment classification. The model outputs probabilities for negative, neutral, and positive tone for each review in the sample.

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

# Identifier of the pretrained sentiment checkpoint. Nothing is interpolated
# into it, so a plain string literal is used rather than an f-string.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Tokenizer and sequence-classification model loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [40]:
def polarity_scores_roberta(text):
    """Score one piece of text with the notebook's sentiment model.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    text : str
        Text to classify. It is tokenized with truncation at 512 tokens.

    Returns
    -------
    dict
        Softmax probabilities stored under the keys ``"neg"``, ``"neu"`` and
        ``"pos"``, taken from logit positions 0, 1 and 2 respectively.
    """
    encoded_text = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    # Inference only: disable gradient tracking so no autograd graph is built
    # for each scored review.
    with torch.no_grad():
        output = model(**encoded_text)
    # The batch holds a single text, so row 0 carries its three class logits.
    logits = output.logits[0].detach().numpy()
    scores = softmax(logits)
    return {
        "neg": scores[0],
        "neu": scores[1],
        "pos": scores[2]
    }

# Cast the raw review text to str and lower-case it before scoring.
reviews["clean_text"] = reviews["text"].astype(str).str.lower()

# One dict of neg/neu/pos probabilities per review.
reviews["sentiment"] = reviews["clean_text"].map(polarity_scores_roberta)

# Pull the positive-class probability out into its own column.
reviews["pos_score"] = reviews["sentiment"].map(lambda scores: scores["pos"])



# Validation

Verified model reliability by comparing predicted positivity
with user star ratings. Higher-rated reviews proved to exhibit
higher positive sentiment.

In [None]:
# Average positive-sentiment score within each star rating.
mean_pos_by_stars = reviews.groupby("stars")["pos_score"].mean()
mean_pos_by_stars


In [None]:
import matplotlib.pyplot as plt

# Histogram of the positive-sentiment scores across the sample, in 20 bins.
ax = reviews["pos_score"].hist(bins=20)
ax.set_title("Distribution of Positive Sentiment")
ax.set_xlabel("Positive Sentiment Score")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# First ten review texts rated 2 stars or lower.
low_star_mask = reviews["stars"] <= 2
reviews.loc[low_star_mask, "text"].head(10)

In [None]:
# A bare expression is only rendered when it is the last line of a cell, so
# this preview of high-star review texts is printed explicitly.
print(reviews[reviews["stars"] >= 4]["text"].head(10))

# Correlation matrix between the star rating and the positive score.
print(
    reviews[["stars", "pos_score"]].corr()
)

# Look up the extreme rows by index label instead of sorting the whole frame
# twice just to read one row each time.
print("MOST NEGATIVE:")
print(reviews.loc[reviews["pos_score"].idxmin(), "text"])

print("\nMOST POSITIVE:")
print(reviews.loc[reviews["pos_score"].idxmax(), "text"])



# Angry Reviews

Selected by filtering for low star ratings (2 stars or fewer).


In [None]:
# Build the mask from `reviews` itself so its index lines up with the frame
# being filtered (a mask taken from the full `df` covers rows that are not in
# the sample). `.copy()` makes `angry` an independent frame, so columns can be
# added to it later without writing into a slice of `reviews`.
angry = reviews[reviews["stars"] <= 2].copy()

print("Angry reviews:", len(angry))
angry["text"].head()


In [None]:
import re

def clean_words(text):
    """Return ``text`` lower-cased with every character other than a-z or whitespace removed."""
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s]", "", lowered)
    return letters_only

# `Series.apply` already returns a new Series, so copying its result adds
# nothing. `assign` builds a fresh frame with the extra column instead of
# writing the column into what may be a slice of `reviews`.
angry = angry.assign(clean=angry["text"].apply(clean_words))


In [None]:
# Reproducible peek at ten review texts from the sample.
reviews["text"].sample(n=10, random_state=42)

In [None]:
# Split each cleaned review on whitespace, then flatten to one token per row.
tokens_per_review = angry["clean"].str.split()
words = tokens_per_review.explode()

# NLTK
Removing common filler words (stopwords) to surface more meaningful reasons why people leave negative reviews.

In [None]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# NLTK's English stopword list, as a set for membership tests.
stop = set(stopwords.words("english"))

# Additional hand-picked words to drop on top of the NLTK list.
extra_stopwords = {
    "food", "place", "like", "one", "get", "go", "back",
    "really", "would", "time", "even", "im", "dont",
    "didnt", "got", "ordered", "order", "restaurant",
    "us", "great", "bad", "better", "could", "ive",
}

# Remove every token that appears in either set in a single pass.
all_stopwords = stop | extra_stopwords
words = words[~words.isin(all_stopwords)]



In [None]:
from collections import Counter

# Tally token frequencies and keep the 20 most frequent (word, count) pairs.
word_counts = Counter(words)
common = word_counts.most_common(20)
common[:10]


In [None]:
import pandas as pd

# Tabulate the (word, count) pairs so they can be plotted by column.
common_df = pd.DataFrame(common, columns=["word", "count"])

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(common_df["word"], common_df["count"])
# barh draws the first row at the bottom; flip the axis so the first row of
# `common_df` ends up at the top.
ax.invert_yaxis()
ax.set_title("Most Frequent Words in Angry Reviews")
# Horizontal bars: the counts run along x and the words are listed along y.
ax.set_xlabel("Frequency")
ax.set_ylabel("Words")
plt.show()
