In [16]:
import pandas as pd
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split

# Ensure necessary NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the dataset
file_path = "sample reviews dataset.csv"
df = pd.read_csv(file_path, encoding='latin1')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sabina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sabina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Preprocessing: Splitting reviews into sentences & cleaning data

In [17]:
# Remove rows with empty review strings
df = df.dropna(subset=['review'])

# Select relevant columns
df_individual = df[['recommendationid', 'review', 'voted_up']].dropna()

# Convert sentiment labels (True -> 1, False -> 0)
df_individual['label'] = df_individual['voted_up'].astype(int)

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    words = word_tokenize(text)  # Tokenize words
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

# Function to split reviews into sentences
def split_into_sentences(row):
    sentences = sent_tokenize(row['review'])
    cleaned_sentences = [clean_text(sentence) for sentence in sentences]
    return [{'recommendationid': row['recommendationid'], 'sentence': sentence, 'label': row['label']} for sentence in cleaned_sentences]

# Expand reviews into individual sentences
sentence_data = []
for _, row in df_individual.iterrows():
    sentence_data.extend(split_into_sentences(row))

# Create a new DataFrame with split sentences
df_sentences = pd.DataFrame(sentence_data)

# Display the first few rows
print(df_sentences.head())

   recommendationid                                           sentence  label
0          70427607  game elements many games sewn one incredibly well      1
1          70427607  bit survival fps space sim trading farming bas...      1
2          70427607  result beautifully presented journey discovery...      1
3          70427607               would recommend everyone adventurous      1
4          70426209                          game k random gen presets      1


Vocabulary creation for LSTM model

In [18]:
# Build Vocabulary
vocab = {'<PAD>': 0, '<UNK>': 1}
word_freq = {}
for sentence in df_sentences['sentence']:
    for word in sentence.split():
        word_freq[word] = word_freq.get(word, 0) + 1

for word, freq in word_freq.items():
    if freq >= 5:  # Threshold for rare words
        vocab[word] = len(vocab)

vocab_size = len(vocab)

# Convert text to indices
def text_to_indices(text):
    return [vocab.get(word, vocab['<UNK>']) for word in text.split()]

df_sentences['indexed_sentence'] = df_sentences['sentence'].apply(text_to_indices)

# Pad sequences
def pad_sequence(seq, max_len=50):
    return seq[:max_len] + [vocab['<PAD>']] * max(0, max_len - len(seq))

df_sentences['padded_sentence'] = df_sentences['indexed_sentence'].apply(lambda x: pad_sequence(x, max_len=50))

# Split into train and test sets
df_train, df_test = train_test_split(df_sentences, test_size=0.2, random_state=42)

# Convert DataFrame to PyTorch Dataset
class SentimentDataset(Dataset):
    def __init__(self, data):
        self.sentences = torch.tensor(data['padded_sentence'].tolist(), dtype=torch.long)
        self.labels = torch.tensor(data['label'].tolist(), dtype=torch.long)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.labels[idx]

# Create train and test DataLoaders
batch_size = 128
dataset_train = SentimentDataset(df_train)
dataset_test = SentimentDataset(df_test)
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)


Training LSTM model

In [19]:

# Define LSTM Model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128, output_dim=1, n_layers=2, drop_prob=0.5):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1])
        return self.sigmoid(out)

# Initialize model, loss function, and optimizer
model = SentimentLSTM(vocab_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, dataloader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}')

# Train the model
train_model(model, dataloader, criterion, optimizer, epochs=5)

# Display the first few rows
print(df_sentences.head())

Epoch 1/5, Loss: 0.5905
Epoch 2/5, Loss: 0.4376
Epoch 3/5, Loss: 0.4396
Epoch 4/5, Loss: 0.4281
Epoch 5/5, Loss: 0.4299
   recommendationid                                           sentence  label  \
0          70427607  game elements many games sewn one incredibly well      1   
1          70427607  bit survival fps space sim trading farming bas...      1   
2          70427607  result beautifully presented journey discovery...      1   
3          70427607               would recommend everyone adventurous      1   
4          70426209                          game k random gen presets      1   

                                  indexed_sentence  \
0                         [2, 1, 3, 4, 1, 5, 1, 6]   
1  [7, 8, 1, 9, 1, 10, 1, 11, 12, 1, 1, 1, 13, 14]   
2               [1, 1, 1, 1, 1, 1, 1, 15, 1, 1, 1]   
3                                  [16, 17, 18, 1]   
4                                  [2, 1, 1, 1, 1]   

                                     padded_sentence  
0  [2, 1, 3, 

Evaluating LSTM model

In [20]:
# Evaluate Model
def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels.float())
            total_loss += loss.item()
            predicted = (outputs >= 0.5).long()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    print(f'Test Loss: {total_loss/len(dataloader):.4f}, Accuracy: {correct/total:.4f}')


# Evaluate the model
evaluate_model(model, dataloader_test, criterion)

Test Loss: 0.4250, Accuracy: 0.8492


Insights