<a href="https://colab.research.google.com/github/rizz778/btp-project/blob/main/notebooks/supervised_comment_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LOADING DEPENDENCIES**

In [1]:
!pip install transformers datasets scikit-learn --quiet

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np


# **DATASET PREPROCESSING**

In [22]:
df = pd.read_csv('/content/revised_review_time.csv')
df.columns

Index(['Name', 'Review', 'sentiment_score', 'sentiment', 'topic', 'translated',
       'review_time'],
      dtype='object')

In [28]:
# Shift integrity labels: -1 → 0, 0 → 1, 1 → 2
df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})
df['topic'] = df['topic'].astype(int)


In [29]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical labels
label_encoder_topic = LabelEncoder()
topic_labels = label_encoder_topic.fit_transform(df['topic'])

label_encoder_sentiment = LabelEncoder()
integrity_labels = label_encoder_sentiment.fit_transform(df['sentiment'])

# Train-test split
train_texts, val_texts, train_topics, val_topics, train_integrity, val_integrity = train_test_split(
    df['translated'].tolist(),
    topic_labels,
    integrity_labels,
    test_size=0.2,
    stratify=topic_labels, # Stratify by topic labels
    random_state=42
)

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [33]:
class CommentDataset(Dataset):
    def __init__(self, texts, topics, integrities):
        self.encodings = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128)
        self.topics = torch.tensor(topics, dtype=torch.long)
        self.integrities = torch.tensor(integrities, dtype=torch.long)  # multi-class now

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['topic'] = self.topics[idx]
        item['integrity'] = self.integrities[idx]
        return item

    def __len__(self):
        return len(self.topics)


train_dataset = CommentDataset(train_texts, train_topics, train_integrity)
val_dataset = CommentDataset(val_texts, val_topics, val_integrity)


In [34]:
class CommentEmbeddingModel(nn.Module):
    def __init__(self, num_topics, num_integrity_classes):
        super(CommentEmbeddingModel, self).__init__()
        self.encoder = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.encoder.config.hidden_size
        self.topic_head = nn.Linear(hidden_size, num_topics)
        self.integrity_head = nn.Linear(hidden_size, num_integrity_classes)

    def forward(self, input_ids, attention_mask):
        output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(output.pooler_output)
        topic_logits = self.topic_head(pooled)
        integrity_logits = self.integrity_head(pooled)
        return topic_logits, integrity_logits, pooled




In [57]:
print(np.unique(train_integrity, return_counts=True))


(array([0]), array([1476]))


In [56]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from sklearn.utils.class_weight import compute_class_weight
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F
import os

# Compute class weights for topic
# Ensure all unique classes from the original data are considered
all_topic_classes = np.unique(train_topics)  # FULL dataset's topic labels
topic_weights = compute_class_weight(class_weight='balanced', classes=all_topic_classes, y=train_topics)
topic_weights = torch.tensor(topic_weights, dtype=torch.float).to(device)

# Compute class weights for integrity
# Ensure all unique classes from the original data are considered (0, 1, 2)
all_integrity_classes = np.array([0, 1, 2]) # Convert list to numpy array
integrity_weights = compute_class_weight(class_weight='balanced', classes=all_integrity_classes, y=train_integrity)
integrity_weights = torch.tensor(integrity_weights, dtype=torch.float).to(device)


topic_loss_fn = nn.CrossEntropyLoss(weight=topic_weights)
integrity_loss_fn = nn.CrossEntropyLoss(weight=integrity_weights)


train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

ValueError: classes should have valid labels that are in y

In [50]:
model = CommentEmbeddingModel(num_topics=5, num_integrity_classes=3).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

num_epochs = 10
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(0.1 * total_steps),
                                            num_training_steps=total_steps)


In [51]:
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    topic_epoch_loss = 0
    integrity_epoch_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        topic_labels = batch['topic'].to(device)
        integrity_labels = batch['integrity'].to(device)

        optimizer.zero_grad()
        topic_logits, integrity_logits, _ = model(input_ids, attention_mask)

        loss1 = topic_loss_fn(topic_logits, topic_labels)
        loss2 = integrity_loss_fn(integrity_logits, integrity_labels)

        loss = loss1 + loss2
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        topic_epoch_loss += loss1.item()
        integrity_epoch_loss += loss2.item()

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  🔹 Total Loss:      {total_loss:.4f}")
    print(f"  🔸 Topic Loss:      {topic_epoch_loss:.4f}")
    print(f"  🔸 Integrity Loss:  {integrity_epoch_loss:.4f}")

    # Optional: save best model (add validation loss if available)
    if total_loss < best_val_loss:
        best_val_loss = total_loss
        torch.save(model.state_dict(), 'best_model.pt')
        print("✅ Model checkpointed.")





  return forward_call(*args, **kwargs)


RuntimeError: weight tensor should be defined either for all 3 classes or no classes but got weight tensor of shape: [1]