In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import matplotlib.pyplot as plt
import re
import string
from nltk.corpus import stopwords
import nltk
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

In [None]:
# Read the exported YouTube comments table and preview its first rows.
csv_path = 'youtube_comments_cleaned.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,CommentID,VideoID,VideoTitle,AuthorName,AuthorChannelID,CommentText,Sentiment,Likes,Replies,PublishedAt,CountryCode,CategoryID
0,UgyRjrEdJIPrf68uND14AaABAg,mcY4M9gjtsI,They killed my friend.#tales #movie #shorts,@OneWhoWandered,UC_-UEXaBL1dqqUPGkDll49A,Anyone know what movie this is?,Neutral,0,2,2025-01-15 00:54:55,NZ,1
1,UgxXxEIySAwnMNw8D7N4AaABAg,2vuXcw9SZbA,Man Utd conceding first penalty at home in yea...,@chiefvon3068,UCZ1LcZESjYqzaQRhjdZJFwg,The fact they're holding each other back while...,Positive,0,0,2025-01-13 23:51:46,AU,17
2,UgxB0jh2Ur41mcXr5IB4AaABAg,papg2tsoFzg,Welcome to Javascript Course,@Abdulla-ip8qr,UCWBK35w5Swy1iF5xIbEyw3A,waiting next video will be?,Neutral,1,0,2020-07-06 13:18:16,IN,27
3,UgwMOh95MfK0GuXLLrF4AaABAg,31KTdfRH6nY,Building web applications in Java with Spring ...,@finnianthehuman,UCwQ2Z03nOcMxWozBb_Cv66w,Thanks for the great video.\n\nI don't underst...,Neutral,0,1,2024-09-18 12:04:12,US,27
4,UgxJuUe5ysG8OSbABAl4AaABAg,-hV6aeyPHPA,After a new engine her car dies on her way hom...,@ryoutubeplaylistb6137,UCTTcJ0tsAKQokmHB2qVb1qQ,Good person helping good people.\nThis is how ...,Positive,3,1,2025-01-10 19:39:03,US,2


In [None]:
# Keep only the comment text and its sentiment label. The explicit .copy()
# makes `data` an independent frame, so the column assignment below does not
# write into a slice of the loaded frame (pandas SettingWithCopyWarning).
data = data[["CommentText", "Sentiment"]].copy()

# Sentiment name -> integer class id, plus the inverse mapping for decoding.
label_map = {"Negative": 0, "Positive": 1, "Neutral": 2}
rev_map = {class_id: name for name, class_id in label_map.items()}

# Sentiment values that are not keys of label_map become NaN here.
data["Sentiment"] = data["Sentiment"].map(label_map)
data.head()

Unnamed: 0,CommentText,Sentiment
0,Anyone know what movie this is?,2
1,The fact they're holding each other back while...,1
2,waiting next video will be?,2
3,Thanks for the great video.\n\nI don't underst...,2
4,Good person helping good people.\nThis is how ...,1


In [None]:
def clean_text(text):
    """Normalize one raw comment into lowercase, space-separated ASCII words.

    Steps, in order:
      1. drop URLs (tokens starting with ``http``) and ``@mentions``;
      2. drop every character that is not an ASCII letter, digit or whitespace;
      3. lowercase;
      4. collapse all whitespace runs (including newlines) to single spaces.

    URLs and mentions are removed *before* punctuation is stripped: once the
    ``@``, ``:``, ``/`` and ``.`` characters are gone the mention pattern can
    no longer match, and user handles would leak into the vocabulary.

    Args:
        text: Raw comment. Any non-string value (e.g. NaN) is treated as empty.

    Returns:
        The cleaned comment, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # IGNORECASE because lowercasing now happens after this step.
    text = re.sub(r"http\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"@\S+", "", text)
    # This also removes all punctuation, so no separate punctuation pass is needed.
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = text.lower()
    # \s+ covers newlines, so embedded line breaks become single spaces.
    return re.sub(r"\s+", " ", text).strip()


# Add the normalized text as a new column and compare it with the raw text.
cleaned_series = data["CommentText"].apply(clean_text)
data["cleaned_comment"] = cleaned_series

preview_columns = ["CommentText", "cleaned_comment"]
print(data[preview_columns].head())

                                         CommentText  \
0                    Anyone know what movie this is?   
1  The fact they're holding each other back while...   
2                        waiting next video will be?   
3  Thanks for the great video.\n\nI don't underst...   
4  Good person helping good people.\nThis is how ...   

                                     cleaned_comment  
0                     anyone know what movie this is  
1  the fact theyre holding each other back while ...  
2                         waiting next video will be  
3  thanks for the great video i dont understand w...  
4  good person helping good people this is how it...  


In [None]:
# The raw text is no longer needed once the cleaned column exists.
data = data.drop("CommentText", axis=1)
# Removes rows whose Sentiment could not be mapped to a class id (NaN).
data = data.dropna()
# clean_text() turns missing / non-string comments into "", so dropna() never
# sees them as missing. Drop empty comments explicitly: they contain no tokens
# and would become zero-length sequences downstream.
data = data[data["cleaned_comment"] != ""]
data.head()

Unnamed: 0,Sentiment,cleaned_comment
0,2,anyone know what movie this is
1,1,the fact theyre holding each other back while ...
2,2,waiting next video will be
3,2,thanks for the great video i dont understand w...
4,1,good person helping good people this is how it...


In [None]:
# Pull the cleaned comments and their labels out of the frame as plain lists.
texts = data["cleaned_comment"].astype(str).tolist()
labels = data["Sentiment"].tolist()

# Whitespace tokenization of every comment.
tokenized_texts = [comment.lower().split() for comment in texts]

# Word frequencies over the whole corpus.
token_counts = Counter(token for tokens in tokenized_texts for token in tokens)

# The 5000 most frequent words get ids 2, 3, ...; ids 0 and 1 are reserved
# for the padding and unknown-word tokens.
vocab = {
    word: word_id
    for word_id, (word, _count) in enumerate(token_counts.most_common(5000), start=2)
}
vocab.update({"<PAD>": 0, "<UNK>": 1})


def encode(seq):
    """Translate a token sequence into vocabulary ids.

    Tokens missing from the module-level ``vocab`` are replaced by the id
    stored under ``"<UNK>"``.
    """
    token_ids = []
    for token in seq:
        token_ids.append(vocab.get(token, vocab["<UNK>"]))
    return token_ids


# Encode every comment as a tensor of vocabulary ids. The dtype is explicit
# because torch.tensor([]) (an empty comment) defaults to float32, and integer
# ids are what the embedding layer needs.
encoded_texts = [
    torch.tensor(encode(seq), dtype=torch.long) for seq in tokenized_texts
]
# Right-pad with the <PAD> id (0) to the longest comment in the corpus.
padded_texts = pad_sequence(encoded_texts, batch_first=True, padding_value=0)
# Class-index targets for CrossEntropyLoss must be int64, even when the
# Sentiment column was stored as floats.
labels_tensor = torch.tensor(labels, dtype=torch.long)

In [None]:
# Re-index the labels to contiguous ids 0..num_classes-1.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

# Rebuild the target tensor from the encoded labels so the targets, the
# stratification labels and num_classes always agree (they would not if one
# of the original class ids were absent from the data).
# label_encoder.classes_ maps encoded ids back to the original ids.
labels_tensor = torch.tensor(labels, dtype=torch.long)

In [None]:
# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
split_parts = train_test_split(
    padded_texts,
    labels_tensor,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


# Dataset and DataLoader
class LengthAwareDataset(Dataset):
    """Dataset yielding ``(sequence, label, true_length)`` triples.

    The sequences handed to this class may already be right-padded to a
    common width, so ``len(sequence)`` would report the padded width for every
    sample. The true length is therefore computed as the number of entries
    that differ from ``pad_idx``. This is exact as long as ``pad_idx`` is used
    only for padding and never as a real token id.

    Args:
        encoded_seqs: Indexable collection of id sequences (tensor rows or lists).
        labels: Indexable collection of targets aligned with ``encoded_seqs``.
        pad_idx: Id used for padding (default 0).
    """

    def __init__(self, encoded_seqs, labels, pad_idx=0):
        self.seqs = encoded_seqs
        self.labels = labels
        self.pad_idx = pad_idx

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        seq = self.seqs[idx]
        n_tokens = int((torch.as_tensor(seq) != self.pad_idx).sum())
        # pack_padded_sequence rejects zero-length samples, so an all-padding
        # sequence is reported with length 1.
        return seq, self.labels[idx], max(n_tokens, 1)

def collate_fn(batch):
    """Assemble ``(sequence, label, length)`` samples into batch tensors.

    The sample list is sorted in place by length, longest first. The
    sequences are then right-padded with 0 to a common width.

    Returns:
        Tuple ``(padded_sequences, labels, lengths)`` of tensors, all in the
        sorted order.
    """
    batch.sort(key=lambda sample: sample[2], reverse=True)
    sequences, targets, seq_lens = zip(*batch)
    batch_inputs = pad_sequence(sequences, batch_first=True, padding_value=0)
    batch_targets = torch.tensor(targets)
    batch_lengths = torch.tensor(seq_lens)
    return batch_inputs, batch_targets, batch_lengths

In [None]:
# Wrap both splits so that each sample also carries its sequence length.
train_dataset = LengthAwareDataset(X_train, y_train)
test_dataset = LengthAwareDataset(X_test, y_test)

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = {"batch_size": 256, "collate_fn": collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
class LSTM(nn.Module):
    """Single-layer bidirectional LSTM classifier over token-id sequences.

    Pipeline: embedding -> bidirectional LSTM -> dropout -> 64-unit ReLU
    layer -> linear layer producing ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimensionality.
        hidden_dim: LSTM hidden size per direction.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability applied to the final hidden state.
    """

    def __init__(
        self, vocab_size, embed_dim, hidden_dim, num_classes, dropout_rate=0.3
    ):
        super().__init__()
        # padding_idx=0 keeps the <PAD> embedding fixed at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(hidden_dim * 2, 64)  # bidirectional doubles the hidden_dim
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x: Integer tensor of token ids, batch-first.
            lengths: Per-sample sequence lengths (tensor on any device, or a
                list of ints). Values below 1 are treated as 1.

        Returns:
            Tensor of logits with one row per sample in ``x``.
        """
        # pack_padded_sequence needs int64 lengths on the CPU and rejects
        # zero-length samples; normalise here so callers need not care.
        lengths = (
            torch.as_tensor(lengths).to(device="cpu", dtype=torch.int64).clamp(min=1)
        )
        embedded = self.embedding(x)
        packed = pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        # With packed input the final hidden states are taken at each
        # sequence's true end, not at the padded end.
        _, (h_n, _) = self.lstm(packed)
        # h_n[-2] / h_n[-1] are the final forward / backward hidden states.
        h_n = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        out = self.dropout(h_n)
        out = self.relu(self.fc1(out))
        return self.fc2(out)

In [None]:
# Build the classifier with one embedding row per vocabulary entry and move
# it to the selected device.
vocab_size = len(vocab)
lstm = LSTM(vocab_size, embed_dim=64, hidden_dim=128, num_classes=num_classes)
lstm = lstm.to(device)

# Multi-class cross-entropy loss, optimised with Adam.
lstm_criterion = nn.CrossEntropyLoss()
lstm_optimizer = optim.Adam(lstm.parameters(), lr=1e-3)

In [None]:
# Per-epoch accuracy histories, appended to by the training loop.
train_acc = []
val_acc = []

In [None]:
# Train for 10 epochs. After each epoch, record the accuracy on the training
# batches and on the held-out loader.
for epoch in range(10):
    # ---- training pass ----
    lstm.train()
    all_preds, all_labels = [], []
    for X_batch, y_batch, lengths in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        # `lengths` is never moved to the device: pack_padded_sequence needs
        # it on the CPU, so a device round-trip would be a wasted copy.
        lstm_optimizer.zero_grad()
        out = lstm(X_batch, lengths.cpu())
        loss = lstm_criterion(out, y_batch)
        loss.backward()
        lstm_optimizer.step()
        # These predictions come from train mode, i.e. with dropout active.
        all_preds.extend(out.argmax(1).cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())
    acc = accuracy_score(all_labels, all_preds)
    train_acc.append(acc)

    # ---- validation pass ----
    # NOTE: the "Val" figure is computed on test_loader, the held-out split.
    lstm.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for X_batch, y_batch, lengths in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            out = lstm(X_batch, lengths.cpu())
            all_preds.extend(out.argmax(1).cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())
    val_acc.append(accuracy_score(all_labels, all_preds))
    print(
        f"Epoch {epoch+1} - Train Acc: {train_acc[-1]:.4f}, Val Acc: {val_acc[-1]:.4f}"
    )