In [1]:
import pandas as pd
# Load the cleaned dataset, then work on an independent copy that holds only
# the review text and the 'voted_up' label.
df = pd.read_csv('final_cleaned_dataset.csv')
filtered_df = df.loc[:, ['review_text', 'voted_up']].copy()
print(filtered_df.head())

  from pandas.core import (


                                         review_text  voted_up
0  "yeah man, i'm making a game. it's gonna be a ...     False
1  i like the part where you jump on enemies and ...      True
2  to be perfectly honest, it ends up feeling lik...     False
3  didn't know this was planned as a series, so h...      True
4              how do you even open this walkthrough     False


In [2]:
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single module-level VADER analyzer, built once and reused by
# analyze_review_sentiment() for every sentence it scores.
# NOTE(review): presumably needs NLTK's 'vader_lexicon' resource (and 'punkt'
# for sent_tokenize) to be available locally; no nltk.download() call is
# visible in this notebook, so confirm for fresh environments.
analyzer = SentimentIntensityAnalyzer()

def analyze_review_sentiment(review):
    """Return the mean per-sentence VADER compound score of a review.

    The review is split into sentences with ``sent_tokenize``; each sentence
    is scored with the module-level ``analyzer`` and the ``'compound'``
    values are averaged.

    Args:
        review: The review text. Anything that is not a ``str`` (for example
            ``None`` or the float NaN a DataFrame holds for a missing value)
            is treated as a review with no sentences.

    Returns:
        The arithmetic mean of the sentence compound scores, or ``0`` when
        there is nothing to score (non-string input or no sentences).
    """
    # sent_tokenize only accepts text; guard so one missing value does not
    # abort a whole DataFrame.apply() run with a TypeError.
    if not isinstance(review, str):
        return 0

    compound_scores = [
        analyzer.polarity_scores(sentence)['compound']
        for sentence in sent_tokenize(review)
    ]
    if not compound_scores:
        return 0  # Default to 0 if there are no sentences
    return sum(compound_scores) / len(compound_scores)

# Score every review element-wise and keep the result as a new column.
filtered_df['compound_score'] = filtered_df['review_text'].map(analyze_review_sentiment)

print(filtered_df)

                                              review_text  voted_up  \
0       "yeah man, i'm making a game. it's gonna be a ...     False   
1       i like the part where you jump on enemies and ...      True   
2       to be perfectly honest, it ends up feeling lik...     False   
3       didn't know this was planned as a series, so h...      True   
4                   how do you even open this walkthrough     False   
...                                                   ...       ...   
165847   you don't need a review to know what this is. :)      True   
165848  the h-content in this dlc is very nice with 1 ...     False   
165849  as usual with this stuff, i like the pin-ups, ...      True   
165850  i know its just a demo....but damn the game is...     False   
165851                    i like the module on this ship.      True   

        compound_score  
0            -0.177789  
1            -0.680800  
2             0.510333  
3             0.520586  
4             0.000000

In [3]:
print(filtered_df.head())
# Drop reviews whose compound score is exactly 0. That is the value
# analyze_review_sentiment returns when there are no sentences (presumably
# also what VADER yields for text without sentiment-bearing words; confirm).
# .copy() makes the result an independent frame: later cells assign new
# columns to filtered_df, which on a boolean-mask slice would raise
# SettingWithCopyWarning.
filtered_df = filtered_df[filtered_df['compound_score'] != 0].copy()


                                         review_text  voted_up  compound_score
0  "yeah man, i'm making a game. it's gonna be a ...     False       -0.177789
1  i like the part where you jump on enemies and ...      True       -0.680800
2  to be perfectly honest, it ends up feeling lik...     False        0.510333
3  didn't know this was planned as a series, so h...      True        0.520586
4              how do you even open this walkthrough     False        0.000000


In [4]:
from sklearn.metrics import classification_report

# Turn the compound score into a binary prediction: strictly above the
# threshold means "recommended" (1), anything else "not recommended" (0).
# Tune the threshold if needed.
threshold = 0.2
filtered_df['predicted_voted_up'] = filtered_df['compound_score'].gt(threshold).astype(int)

# Ground-truth labels as integers (True -> 1, False -> 0) so they line up
# with the integer predictions.
filtered_df['voted_up'] = filtered_df['voted_up'].astype(int)

# Per-class precision / recall / F1 of the thresholded score vs. the labels.
report = classification_report(
    y_true=filtered_df['voted_up'],
    y_pred=filtered_df['predicted_voted_up'],
    target_names=['Not Recommended', 'Recommended'],
)

print(report)

                 precision    recall  f1-score   support

Not Recommended       0.47      0.80      0.59     37836
    Recommended       0.92      0.72      0.80    118193

       accuracy                           0.74    156029
      macro avg       0.69      0.76      0.70    156029
   weighted avg       0.81      0.74      0.75    156029



In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 20% of the reviews in filtered_df as a test set; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    filtered_df['review_text'],
    filtered_df['voted_up'],
    test_size=0.2,
    random_state=42,
)

# Baseline features: TF-IDF capped at 10,000 features. The vectorizer is
# fitted on the training split only and then reused, unchanged, to transform
# the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, lowercase=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [6]:
from sklearn.linear_model import LogisticRegression

# Fit the baseline classifier on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train_tfidf, y_train)

# Predict the held-out split and print the per-class metrics.
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.68      0.86      0.76      7499
           1       0.95      0.87      0.91     23707

    accuracy                           0.87     31206
   macro avg       0.82      0.87      0.84     31206
weighted avg       0.89      0.87      0.87     31206



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch

# Tokenization
# Load the tokenizer for the 'bert-base-uncased' checkpoint, the same
# checkpoint name the classification model is loaded from further down, so
# the token ids it produces match that model's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset
class ReviewsDataset(Dataset):
    """Map-style dataset that tokenizes one review per item, on access.

    Args:
        texts: Review strings, indexable by position (``texts[0]`` ...).
        labels: Integer class labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors="pt")``. It must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``
            entries whose first axis is a batch axis of size one.
        max_length: Length every example is padded or truncated to. Defaults
            to 512, the value that used to be hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Fail fast: a mismatch would otherwise surface as an IndexError in
        # the middle of training, or silently ignore trailing labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize review ``idx`` and return its tensors plus the label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the batch
            axis added by ``return_tensors="pt"`` is stripped with ``[0]``)
            and ``'labels'`` as a ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        label = self.labels[idx]
        return {
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0],
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Convert data to torch Dataset
# .tolist() turns each pandas Series into a plain list, so ReviewsDataset's
# positional lookups (texts[idx], labels[idx]) use 0..len-1 instead of the
# shuffled DataFrame index labels left over from train_test_split.
train_dataset = ReviewsDataset(X_train.tolist(), y_train.tolist(), tokenizer)
test_dataset = ReviewsDataset(X_test.tolist(), y_test.tolist(), tokenizer)

# Training Arguments
# NOTE(review): no evaluation schedule and no compute_metrics are configured
# in this cell, so eval_dataset is presumably only used if evaluation is
# triggered explicitly elsewhere; confirm.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Model
# Two output labels for the binary 'voted_up' target. The classifier head is
# not part of the checkpoint and is newly initialized (see the warning in the
# cell output), so the model must be fine-tuned before use.
# This rebinds `model`, which an earlier cell bound to the LogisticRegression
# baseline.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Rows where the thresholded sentiment prediction disagrees with the label.
filtered_df2 = filtered_df[filtered_df['voted_up'].ne(filtered_df['predicted_voted_up'])]

# Keep at most the first 100 mismatches (all of them when there are fewer).
reviews_to_print = filtered_df2[['review_text', 'voted_up', 'predicted_voted_up', 'compound_score']].head(100)

# Show each mismatched review with its real label, predicted label and score.
# One print call with sep="\n" emits the same text as four separate prints.
for index, row in reviews_to_print.iterrows():
    print(
        f"Review Text: {row['review_text']}",
        f"Real Value: {row['voted_up']}",
        f"Predicted Value: {row['predicted_voted_up']}\n",
        f"compound_score: {row['compound_score']}\n",
        sep="\n",
    )

