In [36]:
import pandas as pd
import torch
import re
import nltk
import nltk.data
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import numpy as np
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.sentiment import SentimentIntensityAnalyzer
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import datetime

In [17]:
# Ensure necessary NLTK resources are available
def download_nltk_resources():
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    try:
        nltk.data.find('sentiment/vader_lexicon')
    except LookupError:
        nltk.download('vader_lexicon')

download_nltk_resources()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\count\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [18]:
# Load pre-trained BERT sentiment model
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="model_cache")
model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir="model_cache")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set model to evaluation mode
model.eval()




# Initialize VADER Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

In [29]:
# Function to apply sentiment prediction in batches
def bert_batch_predict(sentences):
    if not all(isinstance(s, str) for s in sentences):
        raise ValueError("All inputs must be strings")

    inputs = tokenizer(sentences, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()
    return probabilities.tolist(), ["Positive" if p > 0.6 else "Negative" if p < 0.4 else "Neutral" for p in probabilities]

In [31]:
# Function to compute VADER sentiment
def vader_sentiment(sentence):
    sentiment_score = sia.polarity_scores(sentence)['compound']
    if sentiment_score >= 0.05:
        return sentiment_score, "Positive"
    elif sentiment_score <= -0.05:
        return sentiment_score, "Negative"
    else:
        return sentiment_score, "Neutral"

In [48]:
print(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S"))

27.Feb 2025 22:17:55


In [50]:
filelist = ['./datasets/__sprint1_filtered_gameplay_review_set',
            './datasets/__sprint1_filtered_functionality_review_set',
            './datasets/__sprint1_filtered_usability_review_set']

set_names = ['gameplay', 'functionality','usability']

for index in range(len(filelist)):
    df = pd.read_csv(filelist[index] + '.csv', encoding='utf-8')
    df = df.dropna(subset=['tokenized_sentence'])
    df['tokenized_sentence'] = df['tokenized_sentence'].astype(str)
    df = df[df['tokenized_sentence'].str.strip() != ""]
    
    # Convert the column to a NumPy array for efficient slicing
    df_sentences_list = df['tokenized_sentence'].values  # Use .values instead of .tolist() for better performance
    
    # Batch processing
    batch_size = 32
    num_batches = int(np.ceil(len(df_sentences_list) / batch_size))  # Compute the number of batches
    
    # Initialize arrays for efficiency
    bert_scores = np.zeros(len(df_sentences_list))
    bert_labels = np.empty(len(df_sentences_list), dtype=object)
    vader_scores = np.zeros(len(df_sentences_list))
    vader_labels = np.empty(len(df_sentences_list), dtype=object)

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(df_sentences_list))
        batch_sentences = df_sentences_list[start_idx:end_idx].tolist()  # Convert NumPy array to list
    
        # DistilBERT Sentiment Prediction
        scores, labels = bert_batch_predict(batch_sentences)  # Pass list to function
        bert_scores[start_idx:end_idx] = scores
        bert_labels[start_idx:end_idx] = labels
    
        # VADER Sentiment Prediction
        vader_results = np.array([vader_sentiment(sentence) for sentence in batch_sentences])
        vader_scores[start_idx:end_idx] = vader_results[:, 0]
        vader_labels[start_idx:end_idx] = vader_results[:, 1]
    
    # Assign predictions back to the DataFrame
    df["bert_score"] = bert_scores
    df["bert_label"] = bert_labels
    df["vader_score"] = vader_scores
    df["vader_label"] = vader_labels
    df.to_csv(filelist[index] + '_with_sentiment_scores.csv', index=False)

In [51]:
#Started execution at 21:50 

In [52]:
print(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S"))

27.Feb 2025 22:50:28


In [41]:
df.tail(20)

Unnamed: 0.1,Unnamed: 0,recommendationid,clean_sentence,tokenized_sentence,voted_up,informative,heuristic,bert_score,bert_label,vader_score,vader_label
118711,523525,52918061,every experience unique different,every experience is unique and different,1,1.0,"[('G',)]",0.99978,Positive,0.0,Neutral
118712,523534,43980732,find gems spark inspiration rare,you will find gems here and there that will sp...,1,1.0,"[('G',)]",0.999736,Positive,0.3919,Positive
118713,523544,34593712,game like space minecraft stand game either,this game is like a space minecraft and i can ...,0,1.0,"[('G',)]",0.011154,Negative,0.3612,Positive
118714,523555,52284388,hopping planet planet explore tech make way core,you do this by hopping from planet to planet a...,0,1.0,"[('G',)]",0.999579,Positive,0.0,Neutral
118715,523557,25383677,slow pace makes relaxing gameplay,its slow pace makes for a relaxing gameplay,1,1.0,"[('G',)]",0.998674,Positive,0.4939,Positive
118716,523558,28126654,standard priced game divide number hours revie...,its a standard priced game divide it by the nu...,1,1.0,"[('G',)]",0.036033,Negative,0.4939,Positive
118717,523560,60575720,game feels pretty good still handles well,in vr this game feels pretty good and still ha...,1,1.0,"[('G',)]",0.999841,Positive,0.802,Positive
118718,523563,24867214,got bored quickly launch liking,i got bored quickly after launch but now im li...,1,1.0,"[('G', 'U')]",0.998245,Positive,0.4588,Positive
118719,523565,24973617,sure game repetitive love repetitive openworld...,sure this game is very repetitive but i love r...,1,1.0,"[('G',)]",0.991135,Positive,-0.4411,Negative
118720,523574,24913169,flying feels messy restrictive still feels lik...,flying feels messy and restrictive but it stil...,1,1.0,"[('G',)]",0.122557,Negative,0.3612,Positive
