In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import json
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import os
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import wandb
from torch.optim.lr_scheduler import ExponentialLR

In [2]:
# Hugging Face Hub model id. It is passed to Trainer as `model_card`, which uses it
# to load both the tokenizer and the pretrained encoder.
MODEL_NAME = "FacebookAI/roberta-base"

In [3]:
# Authenticate with Weights & Biases without hard-coding the API key in the notebook:
# the key is fetched from the secret named "WANDB_API_KEY" through kaggle_secrets
# and handed straight to wandb.login.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("WANDB_API_KEY")
wandb.login(key = secret_value_0)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshobhitshukla6535[0m ([33mshobhitshukla6535-iit-kharagpur[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
class QuestionsDataset(Dataset):
    """Map-style dataset that tokenizes one question per item.

    Columns are read by position: column 1 of ``dataset`` is the question text and,
    unless ``is_test`` is set, column 2 is the label.

    Args:
        dataset: pandas DataFrame holding the questions (accessed with ``.iloc``).
        tokenizer: callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation``, ``add_special_tokens`` and ``return_tensors`` keyword
            arguments and returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors that carry a leading batch dimension of 1.
        is_test: when True no label is read and items contain no ``"targets"`` key.
        max_length: every question is padded/truncated to this many tokens.
    """

    def __init__(self, dataset, tokenizer, is_test=False, max_length=62):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenized question at position ``idx``.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer output
            with its leading batch dimension removed) and, when ``is_test`` is False,
            ``"targets"``: a float32 tensor of shape ``(1,)`` holding the label.
        """
        text = str(self.dataset.iloc[idx, 1])

        # Call the tokenizer directly; `encode_plus` is the deprecated spelling of
        # the same operation.
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        # squeeze(0) removes only the batch dimension added by return_tensors="pt".
        # A bare squeeze() would also collapse the sequence dimension whenever it
        # has size 1, producing a 0-d tensor that cannot be batched.
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

        if not self.is_test:
            label = float(self.dataset.iloc[idx, 2])
            item["targets"] = torch.tensor([label], dtype=torch.float32)

        return item

In [5]:
class BERT_MODEL(nn.Module):
    """Pretrained transformer encoder with a small MLP head producing one logit.

    Args:
        model_name: identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super(BERT_MODEL, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's own config instead of assuming 768, so
        # checkpoints with a different hidden width can be used as well.
        hidden_size = self.model.config.hidden_size
        self.linear = nn.Sequential(
            nn.Linear(hidden_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 1),
        )

    def forward(self, input_ids, attention_mask):
        """Return one raw logit per example (no sigmoid applied).

        The head consumes the encoder's ``pooler_output``, so the encoder must
        expose that attribute on its output object.
        """
        encoder_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = encoder_output.pooler_output
        return self.linear(pooled_output)

In [6]:
class Trainer:
    """Fine-tunes ``BERT_MODEL`` as a binary classifier and tracks the run in W&B.

    Construction loads the CSV, builds stratified train/validation loaders, creates
    the model, loss, optimizer and LR scheduler, and starts a W&B run.
    ``train()`` then runs the optimisation loop with periodic checkpointing and
    validation.

    Args:
        dataset_path: path to a CSV with ``question_text`` and ``target`` columns.
        model_card: model identifier used for both the tokenizer and the encoder.
        checkpoint_dir: directory for checkpoints, history and the final weights
            (created if missing).
        device: torch device string; defaults to CUDA when available.
        pos_weight: pair of class weights, interpreted as ``(negative, positive)``
            (assumption based on the two-element default; confirm with the author).
            Their ratio is passed to ``BCEWithLogitsLoss`` as ``pos_weight``.
            Pass ``None`` for an unweighted loss.
        length_percentile: forwarded to ``get_dataloader``; currently unused there
            because the sequence length is fixed.
        batch_size: batch size of both data loaders.
        epochs: number of passes over the training loader.
        seed: seed passed to ``torch.manual_seed``.
    """

    LEARNING_RATE = 1e-5
    # Checkpoint + LR decay + validation are triggered on this step cadence.
    EVAL_EVERY_STEPS = 2000

    def __init__(
        self,
        dataset_path,
        model_card,
        checkpoint_dir="checkpoints",
        device="cuda" if torch.cuda.is_available() else "cpu",
        pos_weight=(1, 70),
        length_percentile=99.9,
        batch_size=32,
        epochs=2,
        seed=42,
    ):
        self.device = device
        self.batch_size = batch_size
        self.epochs = epochs
        self.num_epochs = epochs
        self.seed = seed
        # Seed before the model is built so the randomly initialised classification
        # head is reproducible (train() seeds again before the loop starts).
        torch.manual_seed(self.seed)
        print(f"Using {self.device} device")
        self.model_name = model_card
        self.get_dataloader(dataset_path, length_percentile)
        print(f"Train dataset size: {len(self.train_dl.dataset)}")
        print(f"Validation dataset size: {len(self.val_dl.dataset)}")
        self.model = BERT_MODEL(self.model_name)
        print(f"Model: {self.model_name} initialized")
        # Honour the pos_weight argument (it used to be accepted but ignored).
        self.criterion = self._build_criterion(pos_weight, self.device)
        self.checkpoint_dir = checkpoint_dir
        os.makedirs(checkpoint_dir, exist_ok=True)
        self.optimizer = optim.AdamW(self.model.parameters(), lr=self.LEARNING_RATE)
        self.scheduler = ExponentialLR(self.optimizer, gamma=0.9)

        wandb.init(
            project="question-classification",
            config={
                "epochs": self.epochs,
                "batch_size": self.batch_size,
                "learning_rate": self.LEARNING_RATE,
                "model": self.model_name,
            },
        )

    @staticmethod
    def _build_criterion(pos_weight, device):
        """Build the BCE-with-logits loss, weighting positives by ``pos / neg``.

        Args:
            pos_weight: ``None`` for an unweighted loss, otherwise a two-element
                sequence ``(negative_weight, positive_weight)``.
            device: device on which the weight tensor is created.

        Raises:
            ValueError: if the negative weight is zero.
        """
        if pos_weight is None:
            return nn.BCEWithLogitsLoss()

        negative_weight, positive_weight = pos_weight
        if negative_weight == 0:
            raise ValueError("pos_weight: the negative-class weight must be non-zero")

        ratio = float(positive_weight) / float(negative_weight)
        weight = torch.tensor([ratio], dtype=torch.float32, device=device)
        return nn.BCEWithLogitsLoss(pos_weight=weight)

    @staticmethod
    def get_max_length(df_train, tokenizer, length_percentile=99.9):
        """Return the ``length_percentile``-th percentile of token counts as an int.

        Tokenizes ``df_train.question_text`` with truncation enabled. The input
        frame is left untouched.
        """
        encoded = tokenizer(df_train.question_text.tolist(), truncation=True)
        token_counts = [len(ids) for ids in encoded["input_ids"]]
        return int(np.percentile(token_counts, length_percentile))

    def get_dataloader(self, path, length_percentile=99.9):
        """Read the CSV at ``path`` and populate ``self.train_dl`` / ``self.val_dl``.

        The data is split 80/20, stratified on ``target``, with a fixed split seed.
        Only the training loader shuffles.
        """
        df_train = pd.read_csv(path)
        df_train.target = df_train.target.astype("int16")
        tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
        # Fixed sequence length. The percentile-based alternative would be
        # self.get_max_length(df_train, tokenizer, length_percentile), which has to
        # tokenize the whole training set first.
        max_length = 62
        train_df, val_df = train_test_split(
            df_train,
            stratify=df_train.target,
            test_size=0.2,
            random_state=42,
        )

        train_ds = QuestionsDataset(
            train_df, tokenizer, is_test=False, max_length=max_length
        )
        val_ds = QuestionsDataset(
            val_df, tokenizer, is_test=False, max_length=max_length
        )

        self.train_dl = DataLoader(train_ds, batch_size=self.batch_size, shuffle=True)
        self.val_dl = DataLoader(val_ds, batch_size=self.batch_size)

    @staticmethod
    def find_best_f1(outputs, labels):
        """Scan thresholds 0.25, 0.26, ... (< 0.85) and return the best F1.

        Args:
            outputs: array of predicted probabilities.
            labels: array of binary ground-truth labels.

        Returns:
            ``(best_f1, threshold)``. The first threshold reaching the best score
            wins; ``(0.0, 0.0)`` is returned when no threshold gives F1 > 0.
        """
        best_f1, best_threshold = 0.0, 0.0

        for candidate in np.arange(0.25, 0.85, 0.01):
            score = f1_score(labels, outputs > candidate, zero_division=0)
            if score > best_f1:
                best_f1, best_threshold = score, candidate

        return float(best_f1), float(best_threshold)

    @staticmethod
    def get_preds(logits, threshold):
        """Convert logits to binary predictions based on the threshold"""
        return (torch.sigmoid(logits) > threshold).float()

    def train(self):
        """Run the training loop and return the history dict.

        Every optimisation step logs the running mean training loss to W&B. On the
        ``EVAL_EVERY_STEPS`` cadence the LR scheduler is stepped, a checkpoint is
        written and validation runs. At the end the history JSON and the final
        weights are saved to ``checkpoint_dir`` and uploaded with ``wandb.save``.

        Returns:
            dict with keys ``"Train_Loss"`` (one entry per step), ``"Val_Loss"``
            and ``"F1"`` (one entry per validation run).
        """
        torch.manual_seed(self.seed)
        self.model.to(self.device)
        self.criterion.to(self.device)
        history = {"Train_Loss": [], "Val_Loss": [], "F1": []}
        agg_loss = 0
        step = 0
        for epoch in range(self.num_epochs):
            print("*" * 10 + f" Epoch - {epoch + 1} " + "*" * 10)
            # Make sure dropout is active; validate() puts the model in eval mode.
            self.model.train()
            progress_bar = tqdm(
                self.train_dl,
                desc="Optimizing",
                unit="batch",
                leave=True,
                dynamic_ncols=True,
            )

            for batch in progress_bar:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                # (batch, 1) -> (batch,) to match the squeezed logits below
                targets = batch["targets"].to(self.device).squeeze(1)

                self.optimizer.zero_grad()
                logits = self.model(input_ids, attention_mask)
                loss = self.criterion(logits.squeeze(1), targets)
                agg_loss += loss.item()
                loss.backward()

                nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

                self.optimizer.step()

                # Running mean over all steps seen so far (across epochs).
                running_loss = agg_loss / (step + 1)
                history["Train_Loss"].append(running_loss)
                wandb.log({"Train Loss": running_loss}, step=step + 1)

                progress_bar.set_postfix({"Train loss": running_loss}, refresh=True)

                step = step + 1

                # NOTE: because of the "+ 1" this fires at steps 1999, 3999, ...
                if (step + 1) % self.EVAL_EVERY_STEPS == 0:
                    # The file name depends only on the epoch, so later checkpoints
                    # within an epoch overwrite earlier ones.
                    checkpoint_path = os.path.join(
                        self.checkpoint_dir, f"model_epoch_{epoch + 1}.pth"
                    )
                    self.scheduler.step()
                    print(f"Learning Rate after step {step}: {self.scheduler.get_last_lr()}")
                    # get_last_lr() returns one value per param group; log a scalar
                    # so W&B can chart it.
                    wandb.log({"last_lr": self.scheduler.get_last_lr()[0]}, step=step)

                    torch.save(
                        {
                            "epoch": epoch + 1,
                            "model_state_dict": self.model.state_dict(),
                            "optimizer_state_dict": self.optimizer.state_dict(),
                            "loss": history["Train_Loss"][-1],
                        },
                        checkpoint_path,
                    )
                    print(f"Checkpoint saved at {checkpoint_path}")
                    val_loss, best_f1 = self.validate()
                    # Back to training mode so dropout stays enabled for the
                    # remaining steps.
                    self.model.train()
                    wandb.log({"Validation Loss": val_loss, "F1 Score": best_f1}, step=step)
                    history["Val_Loss"].append(val_loss)
                    history["F1"].append(best_f1)

        history_path = os.path.join(self.checkpoint_dir, "training_history.json")
        with open(history_path, "w") as f:
            json.dump(history, f)

        wandb.save(history_path)

        model_path = os.path.join(self.checkpoint_dir, "final_model.pth")
        torch.save(self.model.state_dict(), model_path)

        wandb.save(model_path)

        print("Training complete! History and model saved to W&B.")
        wandb.finish()
        return history

    def validate(self):
        """Evaluate on the validation loader.

        The model is switched to eval mode for the pass and restored to whatever
        mode it was in before the call, so calling this mid-training does not leave
        dropout disabled.

        Returns:
            ``(val_loss, best_f1)``: the mean per-batch loss and the best F1 found
            by ``find_best_f1`` over the sigmoid of the collected logits.
        """
        was_training = self.model.training
        self.model.eval()
        agg_val_loss = 0
        all_logits, all_targets = [], []

        try:
            with torch.no_grad():
                for batch in tqdm(self.val_dl, desc="Validating", unit="batch"):
                    input_ids = batch["input_ids"].to(self.device)
                    attention_mask = batch["attention_mask"].to(self.device)
                    targets = batch["targets"].to(self.device).squeeze(1)

                    logits = self.model(input_ids, attention_mask).squeeze(1)
                    loss = self.criterion(logits, targets)
                    agg_val_loss += loss.item()

                    all_logits.append(logits.cpu())
                    all_targets.append(targets.cpu())
        finally:
            self.model.train(was_training)

        all_logits = torch.cat(all_logits, dim=0)
        all_targets = torch.cat(all_targets, dim=0)

        val_loss = agg_val_loss / len(self.val_dl)
        best_f1, threshold = self.find_best_f1(
            torch.sigmoid(all_logits).numpy(), all_targets.numpy()
        )

        print(
            f"Validation - Loss: {val_loss:.4f}, F1 Score: {best_f1:.4f} (Threshold: {threshold:.2f})"
        )

        return val_loss, best_f1

In [7]:
# Build the trainer on the Quora insincere-questions training CSV. Construction
# reads the CSV, creates the train/validation loaders and the model, and starts a
# W&B run, so this cell is not free to re-run.
trainer = Trainer(
    dataset_path="/kaggle/input/quora-insincere-questions-classification/train.csv",
    model_card=MODEL_NAME,
    checkpoint_dir="checkpoints",
    pos_weight=[1, 20],
    length_percentile=99.9,
    batch_size=64,
    epochs=2
)

Using cuda device
Train dataset size: 1044897
Validation dataset size: 261225


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: FacebookAI/roberta-base initialized


In [None]:
# Run the full training loop. `history` is a dict with the keys "Train_Loss",
# "Val_Loss" and "F1"; train() also writes it to training_history.json in the
# checkpoint directory.
history = trainer.train()