In [7]:
import pandas as pd
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [8]:
# Ensure necessary NLTK resources are available
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')  # Ensure full tokenizer support
stop_words = set(stopwords.words('english'))

# Add domain-specific words to the stop words list
stop_words.update(["steam"])

# Load the dataset
file_path = "sample_reviews_dataset.csv"
df = pd.read_csv(file_path, encoding='latin1')

#print(df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sabina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Sabina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sabina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sabina\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


FileNotFoundError: [Errno 2] No such file or directory: 'sample_reviews_dataset.csv'

Data Preprocessing: Splitting reviews into sentences & cleaning data

In [None]:
# Check if review column exists
if 'review' not in df.columns:
    raise ValueError("The dataset does not contain a 'review' column")

# Remove rows with empty review strings
df = df.dropna(subset=['review'])

# Select relevant columns
#df = df[['recommendationid', 'review', 'voted_up']].dropna()
df = df.dropna()

# Convert sentiment labels (True -> 1, False -> 0)
df['label'] = df['voted_up'].astype(int)

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    words = word_tokenize(text)  # Tokenize words
    words = [word for word in words if len(word) > 2 and word not in stop_words]  # Filter stopwords & short words
    return ' '.join(words)

# Function to clean text
def tokenize_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    words = word_tokenize(text)  # Tokenize words
    return ' '.join(words)

# Function to split reviews into unique sentences
def split_into_sentences(row):
    #sentences = list(set(sent_tokenize(row['review'])))  # Ensure unique sentences per review
    sentences = list(sent_tokenize(row['review']))
    cleaned_sentences = [clean_text(sentence) for sentence in sentences]
    token_sentences = [tokenize_text(sentence) for sentence in sentences]
    return [{'recommendationid': row['recommendationid'],
             'clean_sentence': sentence,
             'tokenized_sentence': to_sentence,
             'voted_up': row['label'],
            'language' : row['language'],
            'timestamp_created' : row['timestamp_created'],
            'timestamp_updated' : row['timestamp_updated'],
            'votes_up' : row['votes_up'],
            'votes_funny' : row['votes_funny'],
            'weighted_vote_score' : row['weighted_vote_score'],
            'comment_count' : row['comment_count'],
            'steam_purchase' : row['steam_purchase'],
            'received_for_free' : row['received_for_free'],
            'written_during_early_access' : row['written_during_early_access'],
            'author_num_games_owned' : row['author_num_games_owned'],
            'author_num_reviews' : row['author_num_reviews'],
            'author_playtime_forever' : row['author_playtime_forever'],
            'author_playtime_last_two_weeks' : row['author_playtime_last_two_weeks'],
            'author_last_played' : row['author_last_played']
            } for sentence, to_sentence in zip(cleaned_sentences, token_sentences)]


# Expand reviews into individual unique sentences
sentence_data = []
for _, row in df.iterrows():
    sentence_data.extend(split_into_sentences(row))

df_sentences = pd.DataFrame(sentence_data).drop_duplicates()

# Display the first few rows
print(df_sentences.head(20))

    recommendationid                                     clean_sentence  \
0           70427607  game elements many games sewn one incredibly well   
1           70427607  bit survival fps space sim trading farming bas...   
2           70427607  result beautifully presented journey discovery...   
3           70427607               would recommend everyone adventurous   
4           70426209                            game random gen presets   
5           70426209                                       voice acting   
6           70426209                                           gets old   
7           70426209                               none less swell time   
8           70425814                     first played years ago fun bit   
9           70425814                    started playing recently better   
10          70425814  criticisms many asteroids asteroids spaced bet...   
11          70425814                        deeper space less asteroids   
12          70425814  clu

In [None]:
df_sentences.describe
df_sentences.to_csv('./alt_no_mans_sky_steam_review_data_split_reviews_no_index_FULL.csv', index=False) #Save the dataframe with all the extracted sentences and extra columns 
#df_sentences.to_csv('./split_reviews_no_index.csv', index=False)
#df_sentences.to_csv('./split_reviews_with_index.csv', index=True)
df_sentences = df_sentences[['recommendationid', 'clean_sentence', 'tokenized_sentence', 'voted_up']]
df_sentences.to_csv('./alt_no_mans_sky_steam_review_data_split_reviews_no_index_reduced_columns.csv', index=False) #Save the dataframe with just the extracted sentences

Vocabulary creation for LSTM model

In [None]:
# Build Vocabulary
vocab = {'<PAD>': 0, '<UNK>': 1}
word_freq = {}
for sentence in df_sentences['sentence']:
    for word in sentence.split():
        word_freq[word] = word_freq.get(word, 0) + 1

for word, freq in word_freq.items():
    if len(word) > 2 and freq >= 5:  # Exclude short and rare words
        vocab[word] = len(vocab)

vocab_size = len(vocab)

# Convert text to indices
def text_to_indices(text):
    return [vocab.get(word, vocab['<UNK>']) for word in text.split()]

df_sentences['indexed_sentence'] = df_sentences['sentence'].apply(text_to_indices)

# Dynamically determine max sequence length
max_len = int(df_sentences['indexed_sentence'].apply(len).quantile(0.95))

# Pad sequences
def pad_sequence(seq, max_len=max_len):
    return seq[:max_len] + [vocab['<PAD>']] * max(0, max_len - len(seq))

df_sentences['padded_sentence'] = df_sentences['indexed_sentence'].apply(lambda x: pad_sequence(x, max_len=max_len))

# Split into train and test sets
df_train, df_test = train_test_split(df_sentences, test_size=0.2, random_state=42)

# Convert DataFrame to PyTorch Dataset
class SentimentDataset(Dataset):
    def __init__(self, data):
        self.sentences = torch.tensor(data['padded_sentence'].tolist(), dtype=torch.long)
        self.labels = torch.tensor(data['label'].tolist(), dtype=torch.float)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.labels[idx]

# Initialize Dataloaders
batch_size = 128
dataset_train = SentimentDataset(df_train)
dataset_test = SentimentDataset(df_test)
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)


GPU/CPU setup

In [None]:
# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Training LSTM model

In [None]:

# Define LSTM Model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128, output_dim=1, n_layers=2, drop_prob=0.5):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out.mean(dim=1))  # Mean pooling instead of last output
        return out

# Initialize model, loss function, and optimizer
model = SentimentLSTM(vocab_size).to(device)
criterion = nn.BCEWithLogitsLoss()  # More numerically stable
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, dataloader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}')

# Train the model
train_model(model, dataloader_train, criterion, optimizer, epochs=5)

# Display the first few rows
print(df_sentences.head())

Epoch 1/5, Loss: 0.6487
Epoch 2/5, Loss: 0.4859
Epoch 3/5, Loss: 0.4261
Epoch 4/5, Loss: 0.4340
Epoch 5/5, Loss: 0.4190
   recommendationid                                           sentence  label  \
0          70427607  result beautifully presented journey discovery...      1   
1          70427607               would recommend everyone adventurous      1   
2          70427607  game elements many games sewn one incredibly well      1   
3          70427607  bit survival fps space sim trading farming bas...      1   
4          70426209                                       voice acting      1   

                                    indexed_sentence  \
0                  [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]   
1                                       [3, 4, 5, 1]   
2                          [6, 1, 7, 8, 1, 9, 1, 10]   
3  [11, 12, 1, 13, 1, 14, 1, 15, 16, 1, 1, 1, 17,...   
4                                             [1, 1]   

                                     padded_sentence  
0

Evaluating LSTM model

In [None]:
# Evaluation function
def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            predicted = (torch.sigmoid(outputs) >= 0.5).long()
            correct += (predicted == labels.long()).sum().item()
            total += labels.size(0)
    print(f'Test Loss: {total_loss/len(dataloader):.4f}, Accuracy: {correct/total:.4f}')

# Evaluate the model
evaluate_model(model, dataloader_test, criterion)

Test Loss: 0.4111, Accuracy: 0.8560


Data Analysis

In [None]:
# Initialize Dataloaders
batch_size = 128
dataset_sentences = SentimentDataset(df_sentences)
dataloader_sentences = DataLoader(dataset_sentences, batch_size=batch_size, shuffle=False)

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Predict sentiment scores
def predict_sentiment(model, dataloader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            outputs = model(inputs).squeeze()
            predictions.extend(torch.sigmoid(outputs).tolist())
    return predictions

# Get predictions for entire dataset
df_sentences['predicted_lstm_score'] = predict_sentiment(model, dataloader_sentences)
df_sentences['predicted_lstm_label'] = (df_sentences['predicted_lstm_score'] >= 0.5).astype(int)

# Append predictions next to label column
df_sentences = df_sentences[['recommendationid', 'sentence', 'label', 'predicted_lstm_score', 'predicted_lstm_label']]

# Compute accuracy
accuracy = (df_sentences['predicted_lstm_label'] == df_sentences['label']).mean()
print(f'Overall Accuracy: {accuracy:.4f}')

# Save the updated dataset
#df_sentences.to_csv("predicted_sentiment_dataset.csv", index=False)

print(df_sentences.head(20))

Overall Accuracy: 0.8464
    recommendationid                                           sentence  \
0           70427607  result beautifully presented journey discovery...   
1           70427607               would recommend everyone adventurous   
2           70427607  game elements many games sewn one incredibly well   
3           70427607  bit survival fps space sim trading farming bas...   
4           70426209                                       voice acting   
5           70426209                            game random gen presets   
6           70426209                               none less swell time   
7           70426209                                           gets old   
8           70425814  criticisms many asteroids asteroids spaced bet...   
9           70425814                                      civilisations   
10          70425814                           waste time trying figure   
11          70425814                           weird stuff gives dejavu   
