<a href="https://colab.research.google.com/github/bhavyaB30/Comment-Classifier-for-Apps/blob/main/Comment_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


import numpy as np
import pandas as pd


import os
# Sanity-check that the input data is where the notebook expects it.
# os.walk() only descends into directories: pointed at a file path it yields
# nothing, so the single-CSV case is handled explicitly, and a missing path is
# reported instead of silently printing nothing.
data_path = '/reviews.csv'
if os.path.isdir(data_path):
    for dirname, _, filenames in os.walk(data_path):
        for filename in filenames:
            print(os.path.join(dirname, filename))
elif os.path.isfile(data_path):
    print(data_path)
else:
    print(f'Input data not found: {data_path}')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'ggplot' style sheet globally for the figures drawn below.
plt.style.use('ggplot')

import nltk

In [None]:
# Load the raw reviews from the CSV at the filesystem root.
df = pd.read_csv('/reviews.csv')
# Report the frame's dimensions: first as the raw (rows, columns) tuple,
# then spelled out as samples (rows) and features (columns).
print(df.shape)
print(f'Number of samples: {df.shape[0]}\nNumber of features: {df.shape[1]}')

In [None]:
# Keep only the first 10,000 reviews (presumably to bound the cost of the
# per-review scoring done later).
# .copy() detaches the subset from the full frame, so the column assignments
# in later cells modify an independent DataFrame rather than a slice of the
# original (avoids pandas' SettingWithCopyWarning).
df = df.head(10000).copy()
df

In [None]:
# Drop the two columns that the rest of the notebook does not use.
# Reassigning (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['Time_submitted', 'Reply'], errors='ignore')
df


In [None]:
# Bar chart of how many reviews fall on each star rating, ordered by rating.
rating_counts = df['Rating'].value_counts().sort_index()
ax = rating_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
plt.show()

In [None]:
import nltk
# Fetch the lexicon that the VADER analyzer created below needs.
nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer
# Shared VADER analyzer instance; later cells call sia.polarity_scores(text).
sia = SentimentIntensityAnalyzer()


In [None]:
# Store the VADER score dict of every review in a new "sentiment" column.
review_texts = df["Review"]
df["sentiment"] = review_texts.apply(sia.polarity_scores)
df

In [None]:
# Score every review with VADER, keyed by the review text itself
# (identical review texts therefore share a single entry).
# Iterating the column directly removes the `tqdm` wrapper, which was used
# here without ever being imported (NameError on a fresh kernel), and avoids
# the per-row overhead of DataFrame.iterrows().
res = {text: sia.polarity_scores(text) for text in df['Review']}

# Show the size and a small sample rather than dumping every entry.
print(f'{len(res)} distinct reviews scored')
dict(list(res.items())[:3])

In [None]:
import nltk
# Fetch the NLTK resources requested by this notebook, in the same order.
for resource_name in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource_name)

from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
import string

def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The tag is matched on its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Anything else falls back to noun.

    Args:
        pos_tag: The part-of-speech tag string to translate.

    Returns:
        One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unknown tags are treated as nouns.
    return wordnet.NOUN

def clean_text(text):
    """Normalise a review into a lowercase, space-separated string of content words.

    Steps, in order: lowercase, split on single spaces, strip leading/trailing
    punctuation from each token, drop tokens containing a digit, drop English
    stopwords and empty tokens, then join the survivors with single spaces.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned text as a single string (empty if nothing survives).
    """
    # lowercase, then strip punctuation from both ends of each token
    tokens = [word.strip(string.punctuation) for word in text.lower().split(" ")]
    # remove words with numbers
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # remove stopwords (set lookup instead of scanning the list per token) and
    # tokens left empty by the punctuation strip or by repeated spaces
    stop = set(stopwords.words('english'))
    tokens = [word for word in tokens if word and word not in stop]
    # NOTE(review): get_wordnet_pos and pos_tag are in scope, but no POS tagging
    # or lemmatisation happens here -- confirm whether that step was intended.
    # The previous version fell off the end and returned None, discarding the
    # cleaned tokens; return them as a single string instead.
    return " ".join(tokens)


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Second VADER analyzer instance, used to fill the "sentiments" column that the
# next cell expands into separate score columns.
sid = SentimentIntensityAnalyzer()

vader_scores = df["Review"].apply(sid.polarity_scores)
df["sentiments"] = vader_scores
df

In [None]:
# Turn each score dict in "sentiments" into its own set of columns, then swap
# the dict column for those expanded columns.
expanded_scores = df['sentiments'].apply(pd.Series)
without_dicts = df.drop(['sentiments'], axis=1)
df = pd.concat([without_dicts, expanded_scores], axis=1)
df.head()

In [None]:
# VADER compound score plotted against the star rating.
ax = sns.barplot(data=df, x='Rating', y='compound')
# Title spelling corrected ("Compund" -> "Compound").
ax.set_title('Compound Score by Spotify Star Review')
plt.show()

In [None]:
# One panel per VADER component (positive, neutral, negative), each plotted
# against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for axis, (score_col, panel_title) in zip(axs, panels):
    sns.barplot(data=df, x='Rating', y=score_col, ax=axis)
    axis.set_title(panel_title)
plt.tight_layout()
plt.show()

RoBERTa

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Load the tokenizer and the sequence-classification model from the same checkpoint.
ROBERTA_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_CHECKPOINT)


In [None]:
# Run the model on one review (index label 75) and print its three scores.
example = df['Review'][75]
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
# First element of the model output, first item of the batch -> numpy array,
# then softmax so the values sum to 1.
scores = softmax(output[0][0].detach().numpy())
neg_score, neu_score, pos_score = scores[0], scores[1], scores[2]
scores_dict = dict(
    roberta_neg=neg_score,
    roberta_neu=neu_score,
    roberta_pos=pos_score,
)
print(scores_dict)


In [None]:
def polarity_scores_roberta(example):
    """Score one text with the module-level `tokenizer` and `model`.

    Args:
        example: The text to classify.

    Returns:
        dict with keys 'roberta_neg', 'roberta_neu' and 'roberta_pos', holding
        the softmax-normalised model outputs at positions 0, 1 and 2.
    """
    # Cap the input at 512 tokens (the usual RoBERTa limit) so that long
    # reviews are truncated instead of overflowing the model's input size and
    # failing in the forward pass.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    output = model(**encoded_text)
    # First element of the model output, first (only) item of the batch.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Use a checkpoint that is already fine-tuned for sentiment. A plain base
# language model such as "distilroberta-base" ships without a trained
# classification head, so AutoModelForSequenceClassification would initialise
# one randomly and the pipeline's labels/scores would be meaningless.
# Swap in another sentiment-fine-tuned checkpoint here if a smaller/faster
# model is wanted.
# Output positions as used by this notebook's RoBERTa cells:
# 0 = negative, 1 = neutral, 2 = positive.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"

# NOTE: this rebinds the notebook-level `tokenizer` and `model`; keeping a
# three-label sentiment checkpoint here keeps polarity_scores_roberta() usable
# after this cell has run.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# build sentiment pipeline
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=-1  # CPU
)

# Example: apply truncation with max length
result = sentiment_pipeline(
    "This is a very long review text ..." * 100,
    truncation=True,
    max_length=512   # RoBERTa/DistilRoBERTa usually max 512
)

print(result)


In [None]:
# `res` maps review text -> VADER score dict, so after transposing, the index
# holds the review text. Label it 'Review' and merge on it: it is not a
# rating, and coercing the text to numeric turned every key into NaN, which
# left the merge without a usable key.
# The score columns get a 'vader_' prefix, which is what the pairplot cell
# expects and keeps them from colliding with same-named columns in `df`.
results_df = pd.DataFrame(res).T
results_df = (
    results_df
    .add_prefix('vader_')
    .reset_index()
    .rename(columns={'index': 'Review'})
)
results_df = results_df.merge(df, how='left', on='Review')
# NOTE(review): no visible cell adds roberta_neg/roberta_neu/roberta_pos to
# results_df, yet the pairplot cell reads them -- TODO confirm where they are
# meant to be computed.

In [None]:
# Display the column labels of results_df.
results_df.columns

In [None]:
import numpy as np

# Score columns compared in the pair plot.
# NOTE(review): no visible cell creates the roberta_* columns on results_df --
# confirm they exist before running this cell.
score_columns = ['vader_neg', 'vader_neu', 'vader_pos',
                 'roberta_neg', 'roberta_neu', 'roberta_pos']

# Coerce each score column to a numeric dtype; unparseable values become NaN.
for score_col in score_columns:
    results_df[score_col] = pd.to_numeric(results_df[score_col], errors='coerce')

# Treat +/- infinity as missing values as well.
results_df.replace([np.inf, -np.inf], np.nan, inplace=True)

sns.pairplot(data=results_df, vars=list(score_columns))
plt.show()

In [None]:
# Transformers Pipeline

In [None]:
from transformers import pipeline

# Build the sentiment-analysis pipeline (no model specified).
sent_pipeline = pipeline("sentiment-analysis")

# Classify a single review.
single_review = "Nice app! It will be great if it has a Korean romanization to all Korean songs so I can sing along with it 😊"
single_result = sent_pipeline(single_review)
print(single_result)

# Classify several reviews in one call.
reviews = [
    "I love this app!",
    "This app crashes too often.",
    "It's okay, could be better."
]
batch_results = sent_pipeline(reviews)
print(batch_results)


In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# download once
nltk.download("vader_lexicon")

sia = SentimentIntensityAnalyzer()


def _label_from_compound(score):
    """Map a VADER compound score to 'Positive', 'Negative' or 'Neutral'.

    Scores of at least 0.05 are positive, scores of at most -0.05 are negative,
    and everything in between is neutral.
    """
    if score >= 0.05:
        return "Positive"
    if score <= -0.05:
        return "Negative"
    return "Neutral"


# Interactive demo: classify typed reviews until the user enters 'quit'.
while True:
    # Strip surrounding whitespace so input such as "quit " still exits.
    text = input("Enter a review (or type 'quit' to stop): ").strip()
    if text.lower() == "quit":
        break
    if not text:
        # Nothing to score; prompt again instead of reporting a neutral result.
        continue
    score = sia.polarity_scores(text)["compound"]
    sentiment = _label_from_compound(score)
    print(f"Sentiment → {sentiment} (score={score})\n")


In [None]:
from transformers import pipeline

sent_pipeline = pipeline("sentiment-analysis")

# Interactive demo: classify typed reviews until the user enters 'quit'.
while True:
    # Strip surrounding whitespace so input such as "quit " still exits.
    text = input("Enter a review (or type 'quit' to stop): ").strip()
    if text.lower() == "quit":
        break
    if not text:
        # Nothing to classify; prompt again.
        continue
    # The pipeline returns a list with one result dict per input text.
    result = sent_pipeline(text)[0]
    print(f"Sentiment → {result['label']} (score={result['score']:.4f})\n")
