In [1]:
"""
EACL 2026 - Project 1: Hate Speech Intent and Impact Classification
Multi-Label Classification using 15 Transformer Models
"""

'\nEACL 2026 - Project 1: Hate Speech Intent and Impact Classification\nMulti-Label Classification using 15 Transformer Models\n'

In [2]:
# ============================================================================
# IMPORTS
# ============================================================================

import os
import gc
import json
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    hamming_loss
)

from torch.optim import AdamW

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
    LongformerTokenizer,
    LongformerForSequenceClassification,
    BigBirdTokenizer,
    BigBirdForSequenceClassification,
    BartTokenizer,
    BartForSequenceClassification,
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoModel
)
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass

warnings.filterwarnings('ignore')

2025-10-04 15:54:21.412198: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759593261.576121      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759593261.620418      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# ============================================================================
# EXPERIMENTAL SETUP AND HYPERPARAMETERS
# ============================================================================

@dataclass
class ExperimentConfig:
    """Central configuration for the experiment.

    Groups the dataset/output paths, the random seed, the split fractions,
    the training and optimiser hyperparameters, the per-model-family
    sequence-length limits and the two label vocabularies.

    ``INTENT_LABELS`` and ``IMPACT_LABELS`` default to the project's label
    sets. A caller may pass its own lists; those are kept as given instead of
    being overwritten in ``__post_init__``.
    """
    
    # Paths
    DATASET_PATH: str = "/kaggle/input/eacl-26-dataset-1/EACL_26-HateSpeechDataset.json"
    OUTPUT_DIR: str = "/kaggle/working"
    
    # Reproducibility
    RANDOM_SEED: int = 42
    
    # Data splits
    TRAIN_SIZE: float = 0.6
    VAL_SIZE: float = 0.2
    TEST_SIZE: float = 0.2
    
    # Training hyperparameters (following M-HELP)
    BATCH_SIZE: int = 16
    LEARNING_RATE: float = 5e-5
    NUM_EPOCHS: int = 10
    WEIGHT_DECAY: float = 1e-2
    WARMUP_STEPS: int = 500
    MAX_GRAD_NORM: float = 1.0
    
    # Early stopping
    EARLY_STOPPING_PATIENCE: int = 3
    
    # Optimizer parameters
    ADAM_BETA1: float = 0.9
    ADAM_BETA2: float = 0.999
    ADAM_EPSILON: float = 1e-8
    
    # Model-specific parameters
    DROPOUT: float = 0.2
    
    # Max sequence lengths per model type
    MAX_SEQ_LENGTH_STANDARD: int = 512
    MAX_SEQ_LENGTH_LONG: int = 1024
    
    # Label definitions. None means "use the default vocabulary", which is
    # filled in by __post_init__ (a mutable list cannot be a plain dataclass
    # default).
    INTENT_LABELS: Optional[List[str]] = None
    IMPACT_LABELS: Optional[List[str]] = None
    
    def __post_init__(self):
        """Fill in the default label lists for any vocabulary left as None."""
        if self.INTENT_LABELS is None:
            self.INTENT_LABELS = [
                "Affective Aggression",
                "Derisive Trolling",
                "Dominance & Subjugation",
                "Ideological Expression",
                "Performative Reinforcement",
                "Strategic Incitement",
                "Threat & Intimidation"
            ]
        
        if self.IMPACT_LABELS is None:
            self.IMPACT_LABELS = [
                "Disruption of Public Discourse",
                "Glorification of Hate",
                "Incitement to Discrimination/Exclusion",
                "Incitement to Violence",
                "Misinformation/Disinformation Nexus",
                "Normalization of Prejudice",
                "Psychological Harm",
                "Stigmatization & Dehumanization"
            ]

# Module-level configuration instance built with the defaults above.
CONFIG = ExperimentConfig()

In [4]:
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def set_seed(seed: int) -> None:
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def clear_memory() -> None:
    """Run the Python garbage collector, then empty PyTorch's CUDA cache."""
    # Garbage collection comes first, then the allocator cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

def validate_dataset_structure(data: List[Dict]) -> None:
    """Check that `data` has the structure the rest of the pipeline reads.

    The dataset must be a non-empty list of dicts. Each dict needs the keys
    'sample_id', 'text' and 'labels'; 'labels' must be a dict whose 'intent'
    and 'impact' entries are lists.

    Args:
        data: Parsed dataset, as returned by `json.load`.

    Raises:
        ValueError: On the first sample (or container) that breaks the
            structure above. The message names the offending sample index.
    """
    if not isinstance(data, list):
        raise ValueError(f"Dataset must be a list, got {type(data)}")
    
    if len(data) == 0:
        raise ValueError("Dataset is empty")
    
    required_keys = {'sample_id', 'text', 'labels'}
    for idx, sample in enumerate(data):
        # Without this guard a non-dict sample fails on `.keys()` with an
        # AttributeError rather than the ValueError used everywhere else.
        if not isinstance(sample, dict):
            raise ValueError(f"Sample {idx} must be dict, got {type(sample)}")
        
        missing_keys = required_keys - set(sample.keys())
        if missing_keys:
            raise ValueError(f"Sample {idx} missing keys: {missing_keys}")
        
        labels = sample['labels']
        if not isinstance(labels, dict):
            raise ValueError(f"Sample {idx}: 'labels' must be dict, got {type(labels)}")
        
        if 'intent' not in labels or 'impact' not in labels:
            raise ValueError(f"Sample {idx}: labels must have 'intent' and 'impact' keys")
        
        # Intent is checked before impact, so the first reported problem is
        # the same one the original ordering would report.
        for field_name in ('intent', 'impact'):
            if not isinstance(labels[field_name], list):
                raise ValueError(f"Sample {idx}: {field_name} must be list")


In [5]:
# ============================================================================
# DATA LOADING AND PREPROCESSING
# ============================================================================

class DataProcessor:
    """Load the JSON dataset, binarise its labels and split it into train/val/test."""
    
    def __init__(self, config: ExperimentConfig):
        """Store the config and create one binarizer per task.

        Passing `classes` fixes the column order of the encoded matrices to
        the order of the label lists in the config.
        """
        self.config = config
        self.intent_mlb = MultiLabelBinarizer(classes=config.INTENT_LABELS)
        self.impact_mlb = MultiLabelBinarizer(classes=config.IMPACT_LABELS)
        
    def load_data(self) -> List[Dict]:
        """Read the dataset from `config.DATASET_PATH` and validate its structure.

        Returns:
            The parsed list of samples.

        Raises:
            FileNotFoundError: If the dataset file does not exist.
            ValueError: If the file is not valid JSON.
            RuntimeError: For any other failure, including a structure
                validation error.
        """
        try:
            with open(self.config.DATASET_PATH, 'r', encoding='utf-8') as f:
                data = json.load(f)
            print(f"Successfully loaded {len(data)} samples from {self.config.DATASET_PATH}")
            
            validate_dataset_structure(data)
            print("Dataset structure validation passed")
            
            return data
            
        # `from e` keeps the original exception attached as the explicit cause.
        except FileNotFoundError as e:
            raise FileNotFoundError(f"Dataset not found at {self.config.DATASET_PATH}") from e
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON format: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading dataset: {e}") from e
    
    @staticmethod
    def _report_unknown_labels(
        task: str,
        label_lists: List[List[str]],
        known_labels: Optional[List[str]]
    ) -> None:
        """Print every label in `label_lists` that is missing from `known_labels`.

        A binarizer built with a fixed `classes` list ignores such labels and
        only emits a warning, which this notebook suppresses globally, so the
        mismatch is reported with `print` instead.
        """
        if known_labels is None:
            return
        seen = {label for labels in label_lists for label in labels}
        unknown = sorted(seen - set(known_labels), key=str)
        if unknown:
            print(
                f"WARNING: {len(unknown)} unknown {task} label(s) will be "
                f"ignored by the binarizer: {unknown}"
            )
    
    def prepare_labels(self, data: List[Dict]) -> Tuple[np.ndarray, np.ndarray]:
        """Encode the intent and impact label lists as binary indicator matrices.

        Args:
            data: Validated samples; each needs `labels['intent']` and
                `labels['impact']`.

        Returns:
            `(intent_encoded, impact_encoded)`, one row per sample.
        """
        intent_labels = [sample['labels']['intent'] for sample in data]
        impact_labels = [sample['labels']['impact'] for sample in data]
        
        self._report_unknown_labels('intent', intent_labels, self.config.INTENT_LABELS)
        self._report_unknown_labels('impact', impact_labels, self.config.IMPACT_LABELS)
        
        intent_encoded = self.intent_mlb.fit_transform(intent_labels)
        impact_encoded = self.impact_mlb.fit_transform(impact_labels)
        
        print(f"Intent labels shape: {intent_encoded.shape}")
        print(f"Impact labels shape: {impact_encoded.shape}")
        
        return intent_encoded, impact_encoded
    
    def split_data(
        self, 
        data: List[Dict], 
        intent_labels: np.ndarray, 
        impact_labels: np.ndarray
    ) -> Dict[str, Tuple[List[Dict], np.ndarray, np.ndarray]]:
        """Split samples and both label matrices into train/val/test sets.

        The split is done in two shuffled steps with the configured seed:
        first train versus the rest, then the rest into val and test.

        Returns:
            A dict with keys 'train', 'val' and 'test'; each value is
            `(samples, intent_labels, impact_labels)` for that split.
        """
        
        # First split: train + temp. The train fraction is implied by
        # VAL_SIZE + TEST_SIZE; TRAIN_SIZE itself is not read here.
        holdout_fraction = self.config.VAL_SIZE + self.config.TEST_SIZE
        train_data, temp_data, train_intent, temp_intent, train_impact, temp_impact = train_test_split(
            data,
            intent_labels,
            impact_labels,
            test_size=holdout_fraction,
            random_state=self.config.RANDOM_SEED,
            shuffle=True
        )
        
        # Second split: val + test. VAL_SIZE is rescaled because it is
        # expressed as a fraction of the full dataset, not of the holdout.
        val_size_adjusted = self.config.VAL_SIZE / holdout_fraction
        val_data, test_data, val_intent, test_intent, val_impact, test_impact = train_test_split(
            temp_data,
            temp_intent,
            temp_impact,
            test_size=(1 - val_size_adjusted),
            random_state=self.config.RANDOM_SEED,
            shuffle=True
        )
        
        splits = {
            'train': (train_data, train_intent, train_impact),
            'val': (val_data, val_intent, val_impact),
            'test': (test_data, test_intent, test_impact)
        }
        
        print(f"Train: {len(train_data)} samples")
        print(f"Val: {len(val_data)} samples")
        print(f"Test: {len(test_data)} samples")
        
        return splits

In [6]:
# ============================================================================
# DATASET CLASS
# ============================================================================

class MultiLabelTextDataset(Dataset):
    """Dataset that tokenises one text per lookup and pairs it with its label row.

    Each item is a dict with 'input_ids' and 'attention_mask' (flattened
    tokenizer output, padded/truncated to `max_length`) and 'labels'
    (a float tensor built from the matching row of `labels`).
    """
    
    def __init__(
        self, 
        texts: List[str], 
        labels: np.ndarray, 
        tokenizer, 
        max_length: int
    ):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length
        
    def __len__(self) -> int:
        """Number of samples, taken from the length of `texts`."""
        return len(self.texts)
    
    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Tokenise sample `idx` and return its model inputs plus label tensor."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        
        tokenizer_kwargs = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        encoded = self.tokenizer(sample_text, **tokenizer_kwargs)
        
        # Flatten each tokenizer tensor to one dimension.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.FloatTensor(sample_label)
        return item

In [7]:
# ============================================================================
# MODEL WRAPPER FOR SENTENCE TRANSFORMERS
# ============================================================================

class SentenceTransformerClassifier(nn.Module):
    """Sentence Transformer encoder plus a dropout + linear classification head.

    Only construction is implemented. The encoder works on raw text rather
    than token IDs, so there is no token-level forward pass here;
    `ModelTrainer.train_sentence_transformer_model` trains a head on
    pre-computed embeddings instead.
    """
    
    def __init__(self, model_name: str, num_labels: int, dropout: float = 0.2):
        """Load the encoder and size the linear head from its embedding dimension.

        Args:
            model_name: Sentence Transformer checkpoint to load.
            num_labels: Number of output logits.
            dropout: Dropout probability applied before the linear layer.
        """
        super().__init__()
        self.encoder = SentenceTransformer(model_name)
        embedding_dim = self.encoder.get_sentence_embedding_dimension()
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(embedding_dim, num_labels)
        
    def forward(self, input_ids, attention_mask):
        """Not implemented; always raises.

        The previous body was `pass`, which returned None and only failed
        later, when a caller tried to read logits from the result.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "SentenceTransformerClassifier.forward is not implemented: sentence "
            "transformers expect text, not token IDs. Use "
            "ModelTrainer.train_sentence_transformer_model instead."
        )

In [8]:
# ============================================================================
# EARLY STOPPING
# ============================================================================

class EarlyStopping:
    """Signal when validation loss has stopped improving.

    An epoch counts as an improvement only when its loss is finite and lower
    than the best loss so far by more than `min_delta`. Every other epoch,
    including an exact tie or a NaN loss, uses up one unit of patience.

    Attributes:
        patience: Consecutive non-improving epochs allowed before stopping.
        min_delta: Margin by which the loss must beat the best value.
        counter: Non-improving epochs since the last improvement.
        best_loss: Lowest loss accepted so far, or None before the first one.
        early_stop: True once `counter` has reached `patience`; stays True.
    """
    
    def __init__(self, patience: int = 3, min_delta: float = 0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        
    def __call__(self, val_loss: float) -> bool:
        """Record one epoch's validation loss.

        Args:
            val_loss: Validation loss of the epoch that just finished.

        Returns:
            True if training should stop, otherwise False.
        """
        # NaN is the only value that differs from itself. It must never be
        # stored as best_loss: every later comparison against NaN is False,
        # which previously meant the counter was reset on each epoch.
        is_nan = val_loss != val_loss
        
        if not is_nan and (
            self.best_loss is None
            or val_loss < self.best_loss - self.min_delta
        ):
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            
        return self.early_stop

In [9]:
# ============================================================================
# METRICS CALCULATOR
# ============================================================================

class MetricsCalculator:
    """Namespace for the multi-label evaluation metrics."""
    
    @staticmethod
    def calculate_metrics(
        y_true: np.ndarray, 
        y_pred: np.ndarray, 
        label_names: List[str]
    ) -> Dict[str, float]:
        """Compute aggregate and per-label scores for multi-label predictions.

        Args:
            y_true: Ground-truth indicator matrix.
            y_pred: Predicted indicator matrix.
            label_names: Names used to key the per-label entries, indexed in
                the same order as the per-label score arrays.

        Returns:
            A dict holding 'subset_accuracy', 'macro_f1', 'weighted_f1',
            'micro_f1' and 'hamming_loss', followed by '<label>_f1',
            '<label>_precision' and '<label>_recall' for every label name.
        """
        
        def _f1(average):
            # zero_division=0 is passed to every score call in this method.
            return f1_score(y_true, y_pred, average=average, zero_division=0)
        
        results = {'subset_accuracy': accuracy_score(y_true, y_pred)}
        for average in ('macro', 'weighted', 'micro'):
            results[f'{average}_f1'] = _f1(average)
        results['hamming_loss'] = hamming_loss(y_true, y_pred)
        
        # Per-label metrics: average=None yields one score per label column.
        per_label_scores = {
            'f1': _f1(None),
            'precision': precision_score(y_true, y_pred, average=None, zero_division=0),
            'recall': recall_score(y_true, y_pred, average=None, zero_division=0),
        }
        
        for column, label in enumerate(label_names):
            for metric_name in ('f1', 'precision', 'recall'):
                results[f'{label}_{metric_name}'] = per_label_scores[metric_name][column]
        
        return results

In [10]:
# ============================================================================
# MODEL TRAINER
# ============================================================================

class ModelTrainer:
    """Handle model training and evaluation."""
    
    def __init__(self, config: ExperimentConfig):
        """Store the config, pick the compute device and report the execution mode.

        Sets `self.device` to CUDA when available (otherwise CPU) and
        `self.use_data_parallel` to True when more than one CUDA device is
        visible.
        """
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        gpu_count = torch.cuda.device_count()
        self.use_data_parallel = gpu_count > 1
        print(f"Using device: {self.device}")
        if self.use_data_parallel:
            print(f"DataParallel enabled with {gpu_count} GPUs")
        elif self.device.type == 'cuda':
            print("Single GPU mode")
        else:
            # Previously reported as "Single GPU mode" even without a GPU.
            print("CPU mode (no CUDA device available)")
        
    def get_model_and_tokenizer(
        self, 
        model_name: str, 
        num_labels: int
    ) -> Tuple[Optional[nn.Module], object, int]:
        """Load the model and tokenizer for `model_name`, set up for multi-label output.

        The architecture is chosen from substrings of the checkpoint name:
        Longformer, BigBird, BART and LED use their dedicated classes; T5,
        FLAN and Pegasus checkpoints get a linear head on the seq2seq encoder;
        everything else goes through `AutoModelForSequenceClassification`.

        Args:
            model_name: Checkpoint identifier passed to `from_pretrained`.
            num_labels: Number of output logits.

        Returns:
            `(model, tokenizer, max_length)`. The model is moved to
            `self.device` and, when `self.use_data_parallel` is set, wrapped
            in `nn.DataParallel`. For names containing 'sentence-transformers'
            the result is `(None, None, MAX_SEQ_LENGTH_STANDARD)`.

        Raises:
            RuntimeError: If loading fails for any reason.
        """
        
        try:
            # Sentence transformers get no model/tokenizer here. This test
            # runs first so that a name such as ".../sentence-t5-..." is not
            # captured by the 't5' / 'bart' / 'led' substring tests below.
            if 'sentence-transformers' in model_name:
                return None, None, self.config.MAX_SEQ_LENGTH_STANDARD
            
            from transformers.modeling_outputs import SequenceClassifierOutput
            
            lowered_name = model_name.lower()
            
            # Long-context models
            if 'longformer' in lowered_name:
                tokenizer = LongformerTokenizer.from_pretrained(model_name)
                model = LongformerForSequenceClassification.from_pretrained(
                    model_name,
                    num_labels=num_labels,
                    problem_type="multi_label_classification",
                    ignore_mismatched_sizes=True
                )
                max_length = self.config.MAX_SEQ_LENGTH_LONG
                
            elif 'bigbird' in lowered_name:
                tokenizer = BigBirdTokenizer.from_pretrained(model_name)
                model = BigBirdForSequenceClassification.from_pretrained(
                    model_name,
                    num_labels=num_labels,
                    problem_type="multi_label_classification",
                    ignore_mismatched_sizes=True
                )
                max_length = self.config.MAX_SEQ_LENGTH_LONG
                
            # Encoder-decoder models
            elif 'bart' in lowered_name:
                tokenizer = BartTokenizer.from_pretrained(model_name)
                model = BartForSequenceClassification.from_pretrained(
                    model_name,
                    num_labels=num_labels,
                    problem_type="multi_label_classification",
                    ignore_mismatched_sizes=True
                )
                max_length = self.config.MAX_SEQ_LENGTH_STANDARD

            # LED (Longformer Encoder-Decoder)
            elif 'led' in lowered_name:
                from transformers import LEDTokenizer, LEDForSequenceClassification
                tokenizer = LEDTokenizer.from_pretrained(model_name)
                
                try:
                    model = LEDForSequenceClassification.from_pretrained(
                        model_name,
                        num_labels=num_labels,
                        problem_type="multi_label_classification"
                    )
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt and
                    # SystemExit are no longer swallowed. Fallback: encoder
                    # output of the first token fed to a linear head.
                    from transformers import LEDModel
                    base_model = LEDModel.from_pretrained(model_name)
                    
                    class LEDClassifierWrapper(nn.Module):
                        """LED encoder with a linear head on the first token's hidden state."""
                        
                        def __init__(self, led_model, num_labels):
                            super().__init__()
                            self.led = led_model
                            self.classifier = nn.Linear(led_model.config.d_model, num_labels)
                            
                        def forward(self, input_ids, attention_mask, **kwargs):
                            # Set global attention on first token
                            global_attention_mask = torch.zeros_like(input_ids)
                            global_attention_mask[:, 0] = 1
                            
                            encoder_outputs = self.led.encoder(
                                input_ids=input_ids,
                                attention_mask=attention_mask,
                                global_attention_mask=global_attention_mask
                            )
                            pooled = encoder_outputs.last_hidden_state[:, 0, :]
                            logits = self.classifier(pooled)
                            return SequenceClassifierOutput(logits=logits)
                    
                    model = LEDClassifierWrapper(base_model, num_labels)
                
                max_length = self.config.MAX_SEQ_LENGTH_LONG  # 1024 for LED
                
            elif 't5' in lowered_name or 'flan' in lowered_name or 'pegasus' in lowered_name:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                from transformers import AutoModelForSeq2SeqLM
                base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
                
                class T5ClassifierWrapper(nn.Module):
                    """Seq2seq encoder with masked mean pooling and a linear head."""
                    
                    def __init__(self, t5_model, num_labels):
                        super().__init__()
                        self.t5 = t5_model
                        self.classifier = nn.Linear(t5_model.config.d_model, num_labels)
                        
                    def forward(self, input_ids, attention_mask, **kwargs):
                        # get_encoder() is used instead of `.encoder` because
                        # not every seq2seq class routed here exposes the
                        # encoder as a direct attribute (e.g. Pegasus).
                        encoder_outputs = self.t5.get_encoder()(
                            input_ids=input_ids,
                            attention_mask=attention_mask
                        )
                        hidden_states = encoder_outputs.last_hidden_state
                        
                        # Average over real tokens only. Inputs are padded to
                        # max_length, so an unmasked mean would be dominated
                        # by padding positions.
                        token_mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
                        token_counts = token_mask.sum(dim=1).clamp(min=1.0)
                        pooled = (hidden_states * token_mask).sum(dim=1) / token_counts
                        logits = self.classifier(pooled)
                        
                        # Return compatible output
                        return SequenceClassifierOutput(logits=logits)
                
                model = T5ClassifierWrapper(base_model, num_labels)
                max_length = self.config.MAX_SEQ_LENGTH_STANDARD
                
            # Standard transformers
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_name,
                    num_labels=num_labels,
                    problem_type="multi_label_classification",
                    ignore_mismatched_sizes=True
                )
                max_length = self.config.MAX_SEQ_LENGTH_STANDARD
            
            model.to(self.device)
            print(f"Successfully loaded {model_name}")
            print(f"Max sequence length: {max_length}")

            if self.use_data_parallel:
                # No explicit device_ids: DataParallel then uses every visible
                # GPU, matching the count printed in __init__ (the previous
                # code was hard-wired to devices 0 and 1).
                model = nn.DataParallel(model)
                print(f"Model wrapped in DataParallel")
            
            return model, tokenizer, max_length
            
        except Exception as e:
            raise RuntimeError(f"Failed to load model {model_name}: {str(e)}") from e



    def train_sentence_transformer_model(
        self,
        model_name: str,
        train_texts: List[str],
        train_labels: np.ndarray,
        val_texts: List[str],
        val_labels: np.ndarray,
        task_name: str
    ) -> Optional[Dict[str, float]]:
        """Train a classification head on frozen sentence-transformer embeddings.

        All texts are encoded once. Only a dropout + linear head is then
        trained on those embeddings with BCE-with-logits loss, a linear
        warmup/decay schedule and early stopping on validation loss.

        Args:
            model_name: Sentence Transformer checkpoint to load.
            train_texts: Training texts.
            train_labels: Training indicator matrix; its second dimension
                sets the number of output logits.
            val_texts: Validation texts.
            val_labels: Validation indicator matrix.
            task_name: Printed in the banner; if it contains 'intent'
                (case-insensitive) the intent label names key the metrics,
                otherwise the impact label names are used.

        Returns:
            Validation metrics computed from the predictions of the last
            epoch that ran, or None if anything failed.
        """
        print(f"\n{'='*80}")
        print(f"Training {model_name} for {task_name}")
        print(f"{'='*80}")
        
        try:
            # Load sentence transformer
            print(f"Loading sentence transformer: {model_name}")
            encoder = SentenceTransformer(model_name)
            embedding_dim = encoder.get_sentence_embedding_dimension()
            print(f"Embedding dimension: {embedding_dim}")
            
            # Create classification head
            classifier = nn.Sequential(
                nn.Dropout(self.config.DROPOUT),
                nn.Linear(embedding_dim, train_labels.shape[1])
            ).to(self.device)
            
            # Wrap in DataParallel if available. No explicit device_ids, so
            # every visible GPU is used instead of a hard-coded [0, 1].
            if self.use_data_parallel:
                classifier = nn.DataParallel(classifier)
                print("Classifier wrapped in DataParallel")
            
            # Encode all texts (batch processing). The device follows
            # self.device so the pipeline also runs without CUDA; it used to
            # be hard-coded to 'cuda'.
            encode_device = str(self.device)
            print("Encoding texts...")
            train_embeddings = encoder.encode(
                train_texts, 
                batch_size=32,
                show_progress_bar=True, 
                convert_to_tensor=True,
                device=encode_device
            )
            val_embeddings = encoder.encode(
                val_texts, 
                batch_size=32,
                show_progress_bar=True, 
                convert_to_tensor=True,
                device=encode_device
            )
            
            # Move to device
            train_embeddings = train_embeddings.to(self.device)
            val_embeddings = val_embeddings.to(self.device)
            train_labels_torch = torch.FloatTensor(train_labels).to(self.device)
            val_labels_torch = torch.FloatTensor(val_labels).to(self.device)
            
            # Setup training
            optimizer = AdamW(
                classifier.parameters(),
                lr=self.config.LEARNING_RATE,
                betas=(self.config.ADAM_BETA1, self.config.ADAM_BETA2),
                eps=self.config.ADAM_EPSILON,
                weight_decay=self.config.WEIGHT_DECAY
            )
            
            batch_size = self.config.BATCH_SIZE
            num_train = len(train_embeddings)
            # Ceiling division: the loop below also runs the final partial
            # batch, so floor division under-counted the scheduler steps.
            steps_per_epoch = (num_train + batch_size - 1) // batch_size
            total_steps = steps_per_epoch * self.config.NUM_EPOCHS
            # NOTE(review): WARMUP_STEPS is not clamped to total_steps; on a
            # small dataset warmup may take up most or all of training.
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.config.WARMUP_STEPS,
                num_training_steps=total_steps
            )
            
            criterion = nn.BCEWithLogitsLoss()
            early_stopping = EarlyStopping(patience=self.config.EARLY_STOPPING_PATIENCE)
            
            best_val_loss = float('inf')
            
            # Training loop
            for epoch in range(self.config.NUM_EPOCHS):
                print(f"\nEpoch {epoch + 1}/{self.config.NUM_EPOCHS}")
                
                # Train
                classifier.train()
                total_loss = 0
                num_batches = 0
                
                # Reshuffle every epoch, mirroring shuffle=True on the
                # training DataLoader in train_model.
                epoch_order = torch.randperm(num_train, device=train_embeddings.device)
                
                progress_bar = tqdm(
                    range(0, num_train, batch_size),
                    desc="Training",
                    leave=False
                )
                
                for start in progress_bar:
                    batch_indices = epoch_order[start:start + batch_size]
                    batch_embeddings = train_embeddings[batch_indices]
                    batch_labels = train_labels_torch[batch_indices]
                    
                    optimizer.zero_grad()
                    logits = classifier(batch_embeddings)
                    loss = criterion(logits, batch_labels)
                    loss.backward()
                    
                    torch.nn.utils.clip_grad_norm_(classifier.parameters(), self.config.MAX_GRAD_NORM)
                    optimizer.step()
                    scheduler.step()
                    
                    total_loss += loss.item()
                    num_batches += 1
                    progress_bar.set_postfix({'loss': loss.item()})
                
                train_loss = total_loss / num_batches
                
                # Validate (the whole validation set in a single forward pass)
                classifier.eval()
                with torch.no_grad():
                    val_logits = classifier(val_embeddings)
                    val_loss = criterion(val_logits, val_labels_torch).item()
                    val_preds = (torch.sigmoid(val_logits) > 0.5).cpu().numpy()
                
                print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
                
                if early_stopping(val_loss):
                    print(f"Early stopping triggered at epoch {epoch + 1}")
                    break
                
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
            
            print(f"\nTraining completed for {model_name}")
            print(f"Best validation loss: {best_val_loss:.4f}")
            
            # Calculate metrics
            # NOTE(review): val_preds comes from the last epoch that ran,
            # which is not necessarily the epoch with the best validation loss.
            metrics = MetricsCalculator.calculate_metrics(
                val_labels,
                val_preds,
                self.config.INTENT_LABELS if 'intent' in task_name.lower() 
                else self.config.IMPACT_LABELS
            )
            
            # Clean up
            del encoder
            del classifier
            del train_embeddings
            del val_embeddings
            clear_memory()
            
            return metrics
            
        except Exception as e:
            print(f"\n{'!'*80}")
            print(f"TRAINING FAILED for {model_name}")
            print(f"{'!'*80}")
            print(f"Error: {str(e)}")
            clear_memory()
            return None

    
    def train_epoch(
        self, 
        model: nn.Module, 
        dataloader: DataLoader, 
        optimizer: torch.optim.Optimizer,
        scheduler,
        criterion: nn.Module
    ) -> float:
        """Run one optimisation pass over `dataloader`.

        For each batch: forward pass, loss, backward pass, gradient clipping
        to `MAX_GRAD_NORM`, then an optimizer step and a scheduler step.

        Returns:
            The summed batch loss divided by the number of batches.

        Raises:
            MemoryError: If a batch fails with a RuntimeError whose message
                contains "out of memory".
            RuntimeError: For any other RuntimeError raised while processing
                a batch.
        """
        model.train()
        running_loss = 0
        
        batches = tqdm(dataloader, desc="Training", leave=False)
        
        for batch in batches:
            try:
                # Move the three tensors this step needs to the device.
                input_ids, attention_mask, labels = (
                    batch[key].to(self.device)
                    for key in ('input_ids', 'attention_mask', 'labels')
                )
                
                optimizer.zero_grad()
                
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                
                # Use `.logits` when present, else the first element.
                if hasattr(outputs, 'logits'):
                    logits = outputs.logits
                else:
                    logits = outputs[0]
                
                batch_loss = criterion(logits, labels)
                batch_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), self.config.MAX_GRAD_NORM)
                optimizer.step()
                scheduler.step()
                
                loss_value = batch_loss.item()
                running_loss += loss_value
                batches.set_postfix({'loss': loss_value})
                
            except RuntimeError as err:
                detail = str(err)
                if "out of memory" not in detail:
                    raise RuntimeError(f"Training error: {detail}")
                raise MemoryError(f"OOM Error during training: {detail}")
        
        return running_loss / len(dataloader)
    
    def evaluate(
        self, 
        model: nn.Module, 
        dataloader: DataLoader, 
        criterion: nn.Module
    ) -> Tuple[float, np.ndarray, np.ndarray]:
        """Score `model` on `dataloader` without computing gradients.

        Returns:
            `(mean_loss, predictions, labels)`. The loss is the summed batch
            loss divided by the number of batches. Predictions are boolean
            (sigmoid of the logits thresholded at 0.5) and are stacked row-wise
            in batch order, as are the labels.

        Raises:
            MemoryError: If a batch fails with a RuntimeError whose message
                contains "out of memory".
            RuntimeError: For any other RuntimeError raised while processing
                a batch.
        """
        model.eval()
        running_loss = 0
        pred_chunks, label_chunks = [], []
        
        batches = tqdm(dataloader, desc="Evaluating", leave=False)
        
        with torch.no_grad():
            for batch in batches:
                try:
                    input_ids, attention_mask, labels = (
                        batch[key].to(self.device)
                        for key in ('input_ids', 'attention_mask', 'labels')
                    )
                    
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    
                    # Use `.logits` when present, else the first element.
                    if hasattr(outputs, 'logits'):
                        logits = outputs.logits
                    else:
                        logits = outputs[0]
                    
                    running_loss += criterion(logits, labels).item()
                    
                    # Convert logits to predictions (threshold at 0.5)
                    batch_preds = torch.sigmoid(logits) > 0.5
                    
                    pred_chunks.append(batch_preds.cpu().numpy())
                    label_chunks.append(labels.cpu().numpy())
                    
                except RuntimeError as err:
                    detail = str(err)
                    if "out of memory" not in detail:
                        raise RuntimeError(f"Evaluation error: {detail}")
                    raise MemoryError(f"OOM Error during evaluation: {detail}")
        
        stacked_preds = np.vstack(pred_chunks)
        stacked_labels = np.vstack(label_chunks)
        
        return running_loss / len(dataloader), stacked_preds, stacked_labels
    
    def train_model(
        self,
        model_name: str,
        train_texts: List[str],
        train_labels: np.ndarray,
        val_texts: List[str],
        val_labels: np.ndarray,
        task_name: str
    ) -> Optional[Dict[str, float]]:
        """Fine-tune one model and return its validation metrics.

        Loads the model and tokenizer, builds the datasets and loaders, then
        trains with AdamW, a linear warmup/decay schedule, BCE-with-logits
        loss and early stopping on validation loss.

        Args:
            model_name: Checkpoint identifier. Names containing
                'sentence-transformers' are delegated to
                `train_sentence_transformer_model`.
            train_texts: Training texts.
            train_labels: Training indicator matrix; its second dimension
                sets the number of output logits.
            val_texts: Validation texts.
            val_labels: Validation indicator matrix.
            task_name: Printed in the banner; if it contains 'intent'
                (case-insensitive) the intent label names key the metrics,
                otherwise the impact label names are used.

        Returns:
            Validation metrics computed from the predictions of the last
            epoch that ran, or None if training failed (the error is printed).
        """
        # get_model_and_tokenizer returns (None, None, ...) for these
        # checkpoints, which used to crash inside the dataset. Route them to
        # the embedding-based pipeline, which prints its own banner.
        if 'sentence-transformers' in model_name:
            return self.train_sentence_transformer_model(
                model_name,
                train_texts,
                train_labels,
                val_texts,
                val_labels,
                task_name
            )
        
        print(f"\n{'='*80}")
        print(f"Training {model_name} for {task_name}")
        print(f"{'='*80}")
        
        # Bound up front so the `finally` block can always release them.
        model = tokenizer = optimizer = scheduler = None
        
        try:
            # Get model and tokenizer
            model, tokenizer, max_length = self.get_model_and_tokenizer(
                model_name, 
                train_labels.shape[1]
            )
            if model is None or tokenizer is None:
                raise RuntimeError(f"No model/tokenizer available for {model_name}")
            
            # Create datasets
            train_dataset = MultiLabelTextDataset(
                train_texts, train_labels, tokenizer, max_length
            )
            val_dataset = MultiLabelTextDataset(
                val_texts, val_labels, tokenizer, max_length
            )
            
            train_loader = DataLoader(
                train_dataset, 
                batch_size=self.config.BATCH_SIZE, 
                shuffle=True
            )
            val_loader = DataLoader(
                val_dataset, 
                batch_size=self.config.BATCH_SIZE, 
                shuffle=False
            )
            
            # Setup optimizer and scheduler
            optimizer = AdamW(
                model.parameters(),
                lr=self.config.LEARNING_RATE,
                betas=(self.config.ADAM_BETA1, self.config.ADAM_BETA2),
                eps=self.config.ADAM_EPSILON,
                weight_decay=self.config.WEIGHT_DECAY
            )
            
            total_steps = len(train_loader) * self.config.NUM_EPOCHS
            # NOTE(review): WARMUP_STEPS is not clamped to total_steps; on a
            # small dataset warmup may take up most or all of training.
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.config.WARMUP_STEPS,
                num_training_steps=total_steps
            )
            
            criterion = nn.BCEWithLogitsLoss()
            early_stopping = EarlyStopping(patience=self.config.EARLY_STOPPING_PATIENCE)
            
            # Training loop
            best_val_loss = float('inf')
            
            for epoch in range(self.config.NUM_EPOCHS):
                print(f"\nEpoch {epoch + 1}/{self.config.NUM_EPOCHS}")
                
                train_loss = self.train_epoch(
                    model, train_loader, optimizer, scheduler, criterion
                )
                val_loss, val_preds, val_labels_actual = self.evaluate(
                    model, val_loader, criterion
                )
                
                print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
                
                if early_stopping(val_loss):
                    print(f"Early stopping triggered at epoch {epoch + 1}")
                    break
                
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
            
            print(f"\nTraining completed for {model_name}")
            print(f"Best validation loss: {best_val_loss:.4f}")
            
            # Final metrics (return validation metrics)
            # NOTE(review): val_preds comes from the last epoch that ran,
            # which is not necessarily the epoch with the best validation loss.
            metrics = MetricsCalculator.calculate_metrics(
                val_labels_actual,
                val_preds,
                self.config.INTENT_LABELS if 'intent' in task_name.lower() 
                else self.config.IMPACT_LABELS
            )
            
            return metrics
            
        except MemoryError as e:
            print(f"\n{'!'*80}")
            print(f"OUT OF MEMORY ERROR for {model_name}")
            print(f"{'!'*80}")
            print(f"Error: {str(e)}")
            print("Consider:")
            print("- Reducing batch size")
            print("- Using a smaller model variant")
            print("- Using gradient accumulation")
            return None
            
        except Exception as e:
            print(f"\n{'!'*80}")
            print(f"TRAINING FAILED for {model_name}")
            print(f"{'!'*80}")
            print(f"Error: {str(e)}")
            return None
        
        finally:
            # Clean up on success and on failure alike. The references are
            # dropped before the cache is emptied; the old failure path
            # called clear_memory() while `model` was still bound.
            del model
            del tokenizer
            del optimizer
            del scheduler
            clear_memory()

In [11]:
# ============================================================================
# T5 CLASSIFICATION WRAPPER
# ============================================================================

class T5ClassificationWrapper(nn.Module):
    """Classification adapter around a T5 model.

    Only the encoder of the wrapped model is run. The hidden state at
    sequence position 0 is passed through a linear layer that maps
    ``d_model`` features to ``num_labels`` logits.

    Args:
        t5_model: Object exposing ``config.d_model`` and an ``encoder``
            callable that accepts ``input_ids`` / ``attention_mask`` keyword
            arguments and returns something with ``last_hidden_state``.
        num_labels: Number of output logits.
    """

    def __init__(self, t5_model, num_labels: int):
        super().__init__()
        self.t5 = t5_model
        hidden_size = t5_model.config.d_model
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return logits in a ``SequenceClassifierOutput``."""
        # Imported here so the result has the same container type the other
        # sequence-classification models return.
        from transformers.modeling_outputs import SequenceClassifierOutput

        encoded = self.t5.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Position 0 of every sequence serves as the pooled representation.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return SequenceClassifierOutput(logits=self.classifier(first_token_state))

In [12]:
# ============================================================================
# EXPERIMENT RUNNER
# ============================================================================

class ExperimentRunner:
    """Orchestrate the complete experiment.

    Loads and splits the dataset, trains every model returned by
    ``get_model_list`` on the intent task and then on the impact task,
    collects the metrics dictionaries returned by the trainer, and writes
    summary CSV files into ``config.OUTPUT_DIR``.

    Attributes:
        config: Experiment configuration; this class reads ``OUTPUT_DIR``,
            ``INTENT_LABELS`` and ``IMPACT_LABELS`` from it.
        processor: ``DataProcessor`` used to load, label and split the data.
        trainer: ``ModelTrainer`` used for every training run.
        results: ``{'intent': [...], 'impact': [...]}``; one metrics dict per
            successful training run.
    """

    def __init__(self, config: "ExperimentConfig"):
        self.config = config
        self.processor = DataProcessor(config)
        self.trainer = ModelTrainer(config)
        self.results = {
            'intent': [],
            'impact': []
        }

    def get_model_list(self) -> List[Tuple[str, str, str]]:
        """Return the models to train as ``(display name, model type, hf id)`` tuples."""
        # NOTE(review): the "HateBERT" display name is paired with the
        # HateXplain BERT checkpoint id; HateBERT itself is usually published
        # as "GroNLP/hateBERT" - confirm which one was intended.
        models = [
            ("BERT-base", "Encoder", "bert-base-uncased"),
            ("HateBERT", "Encoder (Domain)", "Hate-speech-CNERG/bert-base-uncased-hatexplain"),
            ("RoBERTa-base", "Encoder", "roberta-base"),
            ("Twitter-RoBERTa-Hate", "Encoder (Domain)", "cardiffnlp/twitter-roberta-base-hate"),
            ("DistilRoBERTa", "Encoder (Light)", "distilroberta-base"),  # NEW
            ("DistilBERT", "Encoder (Light)", "distilbert-base-uncased"),
            ("DeBERTa-v3-small", "Advanced Encoder", "microsoft/deberta-v3-small"),
            ("XLM-RoBERTa-base", "Advanced Encoder", "xlm-roberta-base"),  # NEW
            ("ToxicBERT", "Advanced Encoder (Domain)", "unitary/toxic-bert"),
            ("ELECTRA-small", "Advanced Encoder", "google/electra-small-discriminator"),
            ("ELECTRA-base", "Advanced Encoder", "google/electra-base-discriminator"),
            ("ConvBERT-base", "Advanced Encoder", "YituTech/conv-bert-base"),  # NEW
            ("ALBERT-base-v2", "Advanced Encoder", "albert-base-v2"),
            ("BART-base", "Encoder-Decoder", "facebook/bart-base"),
            ("DistilBART-cnn", "Encoder-Decoder (Light)", "sshleifer/distilbart-cnn-6-6"),
        ]
        return models

    @staticmethod
    def _label_distribution(label_names, train_labels, val_labels, test_labels):
        """Build a per-label count table (Train / Val / Test / Total) for one task.

        Each label matrix is indexed as ``matrix[:, i]`` for the i-th entry of
        ``label_names`` and that column is summed.
        """
        rows = []
        for i, label in enumerate(label_names):
            train_count = train_labels[:, i].sum()
            val_count = val_labels[:, i].sum()
            test_count = test_labels[:, i].sum()
            rows.append({
                'Label': label,
                'Train': train_count,
                'Val': val_count,
                'Test': test_count,
                'Total': train_count + val_count + test_count
            })
        return pd.DataFrame(rows)

    def save_label_distribution(self, splits):
        """Save label distribution statistics for train/val/test splits.

        Args:
            splits: Mapping with ``'train'``, ``'val'`` and ``'test'`` keys, each
                holding a ``(data, intent_labels, impact_labels)`` triple.

        Writes ``project1_intent_label_distribution.csv`` and
        ``project1_impact_label_distribution.csv`` into ``config.OUTPUT_DIR``
        and prints both tables.
        """
        # Only the label matrices are needed here; the samples are ignored.
        _, train_intent, train_impact = splits['train']
        _, val_intent, val_impact = splits['val']
        _, test_intent, test_impact = splits['test']

        intent_dist_df = self._label_distribution(
            self.config.INTENT_LABELS, train_intent, val_intent, test_intent
        )
        impact_dist_df = self._label_distribution(
            self.config.IMPACT_LABELS, train_impact, val_impact, test_impact
        )

        # Save to CSV
        os.makedirs(self.config.OUTPUT_DIR, exist_ok=True)

        intent_dist_path = os.path.join(self.config.OUTPUT_DIR, 'project1_intent_label_distribution.csv')
        impact_dist_path = os.path.join(self.config.OUTPUT_DIR, 'project1_impact_label_distribution.csv')

        intent_dist_df.to_csv(intent_dist_path, index=False)
        impact_dist_df.to_csv(impact_dist_path, index=False)

        print(f"Saved: {intent_dist_path}")
        print(f"Saved: {impact_dist_path}")

        # Print summary
        print("\n" + "="*80)
        print("INTENT LABEL DISTRIBUTION")
        print("="*80)
        print(intent_dist_df.to_string(index=False))

        print("\n" + "="*80)
        print("IMPACT LABEL DISTRIBUTION")
        print("="*80)
        print(impact_dist_df.to_string(index=False))

    def _train_and_record(self, task_key, task_name, model_name, model_type, hf_id,
                          train_texts, train_labels, val_texts, val_labels):
        """Run one training job and append its metrics to ``self.results[task_key]``.

        Model ids containing ``'sentence-transformers'`` are sent to the
        trainer's sentence-transformer method; every other id goes through
        ``train_model``. A ``None`` return from the trainer is treated as a
        failed run and nothing is recorded for it.
        """
        if 'sentence-transformers' in hf_id:
            train_fn = self.trainer.train_sentence_transformer_model
        else:
            train_fn = self.trainer.train_model

        metrics = train_fn(
            hf_id,
            train_texts,
            train_labels,
            val_texts,
            val_labels,
            task_name
        )
        if metrics is None:
            return

        # Tag the metrics so rows can be identified in the result tables.
        metrics['model_name'] = model_name
        metrics['model_type'] = model_type
        metrics['hf_id'] = hf_id
        self.results[task_key].append(metrics)

    def run_experiments(self):
        """Run all experiments.

        Loads the data, prepares and splits the labels, saves the label
        distribution, trains every model on the intent task and then on the
        impact task, and finally writes the collected results to disk.
        """
        print("\n" + "="*80)
        print("STARTING EXPERIMENT: Project 1 - Hate Speech Classification")
        print("="*80)

        # Load and prepare data
        print("\n[1/5] Loading dataset...")
        data = self.processor.load_data()

        print("\n[2/5] Preparing labels...")
        intent_labels, impact_labels = self.processor.prepare_labels(data)

        print("\n[3/5] Splitting data...")
        splits = self.processor.split_data(data, intent_labels, impact_labels)
        self.save_label_distribution(splits)

        train_data, train_intent, train_impact = splits['train']
        val_data, val_intent, val_impact = splits['val']
        # The test split is not used below: the trainer only receives the
        # train and validation data.

        train_texts = [s['text'] for s in train_data]
        val_texts = [s['text'] for s in val_data]

        # Get model list
        models = self.get_model_list()

        print(f"\n[4/5] Training {len(models)} models on 2 tasks...")
        print(f"Total training runs: {len(models) * 2}")

        # Train all models: intent first, then impact, for each model.
        for idx, (model_name, model_type, hf_id) in enumerate(models, 1):
            print(f"\n{'#'*80}")
            print(f"MODEL {idx}/{len(models)}: {model_name} ({model_type})")
            print(f"{'#'*80}")

            self._train_and_record(
                'intent', "Intent Classification",
                model_name, model_type, hf_id,
                train_texts, train_intent, val_texts, val_intent
            )
            self._train_and_record(
                'impact', "Impact Classification",
                model_name, model_type, hf_id,
                train_texts, train_impact, val_texts, val_impact
            )

        print("\n[5/5] Saving results...")
        self.save_results()

        print("\n" + "="*80)
        print("EXPERIMENT COMPLETED")
        print("="*80)

    def save_results(self):
        """Save all results to CSV files.

        For each task with at least one recorded run, two files are written
        into ``config.OUTPUT_DIR``: ``project1_<task>_results.csv`` with the
        headline metrics and ``project1_<task>_per_label.csv`` with every
        column whose name contains one of the task's label names.

        A task with no recorded runs is skipped with a message instead of
        raising ``KeyError`` on the empty frame, so the other task's results
        are still written.
        """
        os.makedirs(self.config.OUTPUT_DIR, exist_ok=True)

        # Reorder columns for main metrics
        main_cols = ['model_name', 'model_type', 'subset_accuracy', 'macro_f1',
                     'weighted_f1', 'micro_f1', 'hamming_loss']

        task_labels = (
            ('intent', self.config.INTENT_LABELS),
            ('impact', self.config.IMPACT_LABELS),
        )

        # Build one frame per task that actually has results.
        frames = {}
        for task, labels in task_labels:
            records = self.results[task]
            if not records:
                print(f"No successful {task} runs; skipping {task} result files")
                continue
            frames[task] = (pd.DataFrame(records), labels)

        # Main results
        for task, (task_df, _) in frames.items():
            main_path = os.path.join(self.config.OUTPUT_DIR, f'project1_{task}_results.csv')
            task_df[main_cols].to_csv(main_path, index=False)
            print(f"Saved: {main_path}")

        # Per-label results
        for task, (task_df, labels) in frames.items():
            label_cols = ['model_name', 'model_type'] + \
                         [col for col in task_df.columns if any(label in col for label in labels)]
            per_label_path = os.path.join(self.config.OUTPUT_DIR, f'project1_{task}_per_label.csv')
            task_df[label_cols].to_csv(per_label_path, index=False)
            print(f"Saved: {per_label_path}")

In [13]:
# ============================================================================
# MAIN EXECUTION
# ============================================================================

    
# Set seed for reproducibility
# (set_seed and CONFIG are not defined in this cell; presumably they come
# from earlier cells - run those first on a fresh kernel.)
set_seed(CONFIG.RANDOM_SEED)

# Initialize and run experiment
# run_experiments() loads and splits the data, trains every listed model on
# the intent and impact tasks, and writes the result CSVs to CONFIG.OUTPUT_DIR.
runner = ExperimentRunner(CONFIG)
runner.run_experiments()

# Final banner pointing at the directory that holds the generated CSV files.
print("\n" + "="*80)
print("All results saved to:", CONFIG.OUTPUT_DIR)
print("="*80)

Using device: cuda
DataParallel enabled with 2 GPUs

STARTING EXPERIMENT: Project 1 - Hate Speech Classification

[1/5] Loading dataset...
Successfully loaded 3296 samples from /kaggle/input/eacl-26-dataset-1/EACL_26-HateSpeechDataset.json
Dataset structure validation passed

[2/5] Preparing labels...
Intent labels shape: (3296, 7)
Impact labels shape: (3296, 8)

[3/5] Splitting data...
Train: 1977 samples
Val: 659 samples
Test: 660 samples
Saved: /kaggle/working/project1_intent_label_distribution.csv
Saved: /kaggle/working/project1_impact_label_distribution.csv

INTENT LABEL DISTRIBUTION
                     Label  Train  Val  Test  Total
      Affective Aggression    251   90   100    441
         Derisive Trolling    229   75    79    383
   Dominance & Subjugation    316  107    97    520
    Ideological Expression    422  145   150    717
Performative Reinforcement    258   80    86    424
      Strategic Incitement    455  154   144    753
     Threat & Intimidation    278   84  

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded bert-base-uncased
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5788 | Val Loss: 0.4214

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3326 | Val Loss: 0.2378

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1898 | Val Loss: 0.1644

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1121 | Val Loss: 0.1363

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0781 | Val Loss: 0.1689

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0563 | Val Loss: 0.1160

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0376 | Val Loss: 0.1246

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0250 | Val Loss: 0.1232

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0184 | Val Loss: 0.1210
Early stopping triggered at epoch 9

Training completed for bert-base-uncased
Best validation loss: 0.1160

Training bert-base-uncased for Impact Classification


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded bert-base-uncased
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5768 | Val Loss: 0.4775

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4147 | Val Loss: 0.3501

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3078 | Val Loss: 0.2944

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2258 | Val Loss: 0.2915

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1611 | Val Loss: 0.2581

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1039 | Val Loss: 0.2522

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0672 | Val Loss: 0.2641

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0426 | Val Loss: 0.2651

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0310 | Val Loss: 0.2648
Early stopping triggered at epoch 9

Training completed for bert-base-uncased
Best validation loss: 0.2522

################################################################################
MODEL 2/15: HateBERT (Encoder (Domain))
################################################################################

Training Hate-speech-CNERG/bert-base-uncased-hatexplain for Intent Classification


tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Hate-speech-CNERG/bert-base-uncased-hatexplain and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Successfully loaded Hate-speech-CNERG/bert-base-uncased-hatexplain
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5138 | Val Loss: 0.3805

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3247 | Val Loss: 0.2672

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2179 | Val Loss: 0.1929

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1433 | Val Loss: 0.1776

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0956 | Val Loss: 0.1439

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0646 | Val Loss: 0.1328

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0476 | Val Loss: 0.1345

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0338 | Val Loss: 0.1275

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0250 | Val Loss: 0.1263

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0198 | Val Loss: 0.1278

Training completed for Hate-speech-CNERG/bert-base-uncased-hatexplain
Best validation loss: 0.1263

Training Hate-speech-CNERG/bert-base-uncased-hatexplain for Impact Classification


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Hate-speech-CNERG/bert-base-uncased-hatexplain and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded Hate-speech-CNERG/bert-base-uncased-hatexplain
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5829 | Val Loss: 0.4926

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4290 | Val Loss: 0.3686

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3185 | Val Loss: 0.3154

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2407 | Val Loss: 0.2782

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1693 | Val Loss: 0.2808

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1121 | Val Loss: 0.2836

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0733 | Val Loss: 0.2825
Early stopping triggered at epoch 7

Training completed for Hate-speech-CNERG/bert-base-uncased-hatexplain
Best validation loss: 0.2782

################################################################################
MODEL 3/15: RoBERTa-base (Encoder)
################################################################################

Training roberta-base for Intent Classification


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded roberta-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5626 | Val Loss: 0.3433

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2564 | Val Loss: 0.1783

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1653 | Val Loss: 0.1557

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1219 | Val Loss: 0.1436

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1042 | Val Loss: 0.1329

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0826 | Val Loss: 0.1573

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0584 | Val Loss: 0.1203

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0413 | Val Loss: 0.1359

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0297 | Val Loss: 0.1245

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0207 | Val Loss: 0.1211
Early stopping triggered at epoch 10

Training completed for roberta-base
Best validation loss: 0.1203

Training roberta-base for Impact Classification


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded roberta-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5791 | Val Loss: 0.4439

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3630 | Val Loss: 0.3050

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2710 | Val Loss: 0.2583

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2149 | Val Loss: 0.2505

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1665 | Val Loss: 0.2460

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1144 | Val Loss: 0.2667

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0750 | Val Loss: 0.2502

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0481 | Val Loss: 0.2542
Early stopping triggered at epoch 8

Training completed for roberta-base
Best validation loss: 0.2460

################################################################################
MODEL 4/15: Twitter-RoBERTa-Hate (Encoder (Domain))
################################################################################

Training cardiffnlp/twitter-roberta-base-hate for Intent Classification


config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Successfully loaded cardiffnlp/twitter-roberta-base-hate
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5089 | Val Loss: 0.3306

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2602 | Val Loss: 0.2020

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1747 | Val Loss: 0.1526

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1398 | Val Loss: 0.1689

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1070 | Val Loss: 0.1273

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0768 | Val Loss: 0.1460

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0574 | Val Loss: 0.1361

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0416 | Val Loss: 0.1331
Early stopping triggered at epoch 8

Training completed for cardiffnlp/twitter-roberta-base-hate
Best validation loss: 0.1273

Training cardiffnlp/twitter-roberta-base-hate for Impact Classification


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded cardiffnlp/twitter-roberta-base-hate
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5569 | Val Loss: 0.4336

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3714 | Val Loss: 0.3119

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2841 | Val Loss: 0.2879

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2305 | Val Loss: 0.2692

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1780 | Val Loss: 0.2469

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1234 | Val Loss: 0.2561

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0824 | Val Loss: 0.2741

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0547 | Val Loss: 0.2647
Early stopping triggered at epoch 8

Training completed for cardiffnlp/twitter-roberta-base-hate
Best validation loss: 0.2469

################################################################################
MODEL 5/15: DistilRoBERTa (Encoder (Light))
################################################################################

Training distilroberta-base for Intent Classification


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded distilroberta-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5546 | Val Loss: 0.3624

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2795 | Val Loss: 0.1968

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1727 | Val Loss: 0.1708

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1366 | Val Loss: 0.1429

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0976 | Val Loss: 0.1241

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0714 | Val Loss: 0.1217

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0538 | Val Loss: 0.1207

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0369 | Val Loss: 0.1218

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0286 | Val Loss: 0.1244

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0213 | Val Loss: 0.1238
Early stopping triggered at epoch 10

Training completed for distilroberta-base
Best validation loss: 0.1207

Training distilroberta-base for Impact Classification


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded distilroberta-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5953 | Val Loss: 0.4669

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3802 | Val Loss: 0.3182

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2870 | Val Loss: 0.2820

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2395 | Val Loss: 0.2582

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1849 | Val Loss: 0.2791

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1359 | Val Loss: 0.2666

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0892 | Val Loss: 0.2771
Early stopping triggered at epoch 7

Training completed for distilroberta-base
Best validation loss: 0.2582

################################################################################
MODEL 6/15: DistilBERT (Encoder (Light))
################################################################################

Training distilbert-base-uncased for Intent Classification


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded distilbert-base-uncased
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5523 | Val Loss: 0.4095

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3225 | Val Loss: 0.2371

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1873 | Val Loss: 0.1713

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1185 | Val Loss: 0.1445

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0792 | Val Loss: 0.1415

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0574 | Val Loss: 0.1485

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0382 | Val Loss: 0.1337

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0240 | Val Loss: 0.1374

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0162 | Val Loss: 0.1347

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0122 | Val Loss: 0.1347
Early stopping triggered at epoch 10

Training completed for distilbert-base-uncased
Best validation loss: 0.1337

Training distilbert-base-uncased for Impact Classification


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded distilbert-base-uncased
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5968 | Val Loss: 0.4935

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4256 | Val Loss: 0.3758

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3101 | Val Loss: 0.3038

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2340 | Val Loss: 0.2802

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1653 | Val Loss: 0.2723

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1075 | Val Loss: 0.2749

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0649 | Val Loss: 0.2792

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0397 | Val Loss: 0.2856
Early stopping triggered at epoch 8

Training completed for distilbert-base-uncased
Best validation loss: 0.2723

################################################################################
MODEL 7/15: DeBERTa-v3-small (Advanced Encoder)
################################################################################

Training microsoft/deberta-v3-small for Intent Classification


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded microsoft/deberta-v3-small
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5877 | Val Loss: 0.3921

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3199 | Val Loss: 0.2168

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1852 | Val Loss: 0.1689

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1311 | Val Loss: 0.2027

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1129 | Val Loss: 0.1576

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0843 | Val Loss: 0.1378

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0599 | Val Loss: 0.1546

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0440 | Val Loss: 0.1312

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0307 | Val Loss: 0.1330

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0227 | Val Loss: 0.1347

Training completed for microsoft/deberta-v3-small
Best validation loss: 0.1312

Training microsoft/deberta-v3-small for Impact Classification


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded microsoft/deberta-v3-small
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.6045 | Val Loss: 0.5005

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4187 | Val Loss: 0.3423

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3025 | Val Loss: 0.2982

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2444 | Val Loss: 0.2801

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1955 | Val Loss: 0.2732

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1482 | Val Loss: 0.2866

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1051 | Val Loss: 0.2784

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0696 | Val Loss: 0.2964
Early stopping triggered at epoch 8

Training completed for microsoft/deberta-v3-small
Best validation loss: 0.2732

################################################################################
MODEL 8/15: XLM-RoBERTa-base (Advanced Encoder)
################################################################################

Training xlm-roberta-base for Intent Classification


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded xlm-roberta-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5364 | Val Loss: 0.3997

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3398 | Val Loss: 0.2452

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2270 | Val Loss: 0.1903

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1732 | Val Loss: 0.1601

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1392 | Val Loss: 0.1571

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0968 | Val Loss: 0.1418

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0784 | Val Loss: 0.1516

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0583 | Val Loss: 0.1445

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0461 | Val Loss: 0.1439
Early stopping triggered at epoch 9

Training completed for xlm-roberta-base
Best validation loss: 0.1418

Training xlm-roberta-base for Impact Classification


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded xlm-roberta-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5971 | Val Loss: 0.4990

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4366 | Val Loss: 0.3546

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3371 | Val Loss: 0.3338

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2773 | Val Loss: 0.2857

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2282 | Val Loss: 0.2730

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1742 | Val Loss: 0.2682

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1292 | Val Loss: 0.2753

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0970 | Val Loss: 0.2742

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0695 | Val Loss: 0.2701
Early stopping triggered at epoch 9

Training completed for xlm-roberta-base
Best validation loss: 0.2682

################################################################################
MODEL 9/15: ToxicBERT (Advanced Encoder (Domain))
################################################################################

Training unitary/toxic-bert for Intent Classification


tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at unitary/toxic-bert and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded unitary/toxic-bert
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4984 | Val Loss: 0.4098

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3430 | Val Loss: 0.2706

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2054 | Val Loss: 0.1774

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1277 | Val Loss: 0.1716

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0934 | Val Loss: 0.1474

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0671 | Val Loss: 0.1440

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0484 | Val Loss: 0.1338

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0345 | Val Loss: 0.1465

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0245 | Val Loss: 0.1368

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0198 | Val Loss: 0.1349
Early stopping triggered at epoch 10

Training completed for unitary/toxic-bert
Best validation loss: 0.1338

Training unitary/toxic-bert for Impact Classification


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at unitary/toxic-bert and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded unitary/toxic-bert
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5930 | Val Loss: 0.4926

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4189 | Val Loss: 0.3550

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3019 | Val Loss: 0.3063

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2222 | Val Loss: 0.2784

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1606 | Val Loss: 0.2758

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1058 | Val Loss: 0.2684

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0688 | Val Loss: 0.2711

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0458 | Val Loss: 0.2653

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0332 | Val Loss: 0.2681

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0277 | Val Loss: 0.2706

Training completed for unitary/toxic-bert
Best validation loss: 0.2653

################################################################################
MODEL 10/15: ELECTRA-small (Advanced Encoder)
################################################################################

Training google/electra-small-discriminator for Intent Classification


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded google/electra-small-discriminator
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.6564 | Val Loss: 0.5991

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5338 | Val Loss: 0.4551

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4215 | Val Loss: 0.3821

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3471 | Val Loss: 0.3139

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2654 | Val Loss: 0.2480

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2028 | Val Loss: 0.2084

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1584 | Val Loss: 0.1902

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1291 | Val Loss: 0.1771

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1102 | Val Loss: 0.1704

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0995 | Val Loss: 0.1678

Training completed for google/electra-small-discriminator
Best validation loss: 0.1678

Training google/electra-small-discriminator for Impact Classification


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded google/electra-small-discriminator
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.6718 | Val Loss: 0.6359

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5953 | Val Loss: 0.5444

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5096 | Val Loss: 0.4726

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4402 | Val Loss: 0.4113

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3675 | Val Loss: 0.3567

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3055 | Val Loss: 0.3285

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2620 | Val Loss: 0.3070

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2294 | Val Loss: 0.2987

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2080 | Val Loss: 0.2909

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1948 | Val Loss: 0.2921

Training completed for google/electra-small-discriminator
Best validation loss: 0.2909

################################################################################
MODEL 11/15: ELECTRA-base (Advanced Encoder)
################################################################################

Training google/electra-base-discriminator for Intent Classification


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded google/electra-base-discriminator
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5746 | Val Loss: 0.4355

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3785 | Val Loss: 0.2889

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2191 | Val Loss: 0.1801

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1400 | Val Loss: 0.1529

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0970 | Val Loss: 0.1469

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0788 | Val Loss: 0.1342

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0565 | Val Loss: 0.1246

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0400 | Val Loss: 0.1219

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0268 | Val Loss: 0.1250

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0193 | Val Loss: 0.1176

Training completed for google/electra-base-discriminator
Best validation loss: 0.1176

Training google/electra-base-discriminator for Impact Classification


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded google/electra-base-discriminator
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.6140 | Val Loss: 0.5208

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4604 | Val Loss: 0.3823

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3332 | Val Loss: 0.2939

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2414 | Val Loss: 0.2768

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1894 | Val Loss: 0.2729

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1311 | Val Loss: 0.2482

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0861 | Val Loss: 0.2737

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0538 | Val Loss: 0.2694

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0341 | Val Loss: 0.2782
Early stopping triggered at epoch 9

Training completed for google/electra-base-discriminator
Best validation loss: 0.2482

################################################################################
MODEL 12/15: ConvBERT-base (Advanced Encoder)
################################################################################

Training YituTech/conv-bert-base for Intent Classification


config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/423M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of ConvBertForSequenceClassification were not initialized from the model checkpoint at YituTech/conv-bert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded YituTech/conv-bert-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5689 | Val Loss: 0.4290

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3536 | Val Loss: 0.2546

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1999 | Val Loss: 0.1640

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1285 | Val Loss: 0.2285

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1133 | Val Loss: 0.1540

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0738 | Val Loss: 0.1526

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0545 | Val Loss: 0.1309

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0380 | Val Loss: 0.1280

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0269 | Val Loss: 0.1276

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0201 | Val Loss: 0.1270

Training completed for YituTech/conv-bert-base
Best validation loss: 0.1270

Training YituTech/conv-bert-base for Impact Classification


Some weights of ConvBertForSequenceClassification were not initialized from the model checkpoint at YituTech/conv-bert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded YituTech/conv-bert-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.6268 | Val Loss: 0.5221

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4572 | Val Loss: 0.3776

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3227 | Val Loss: 0.2912

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2334 | Val Loss: 0.2830

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1848 | Val Loss: 0.2773

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1271 | Val Loss: 0.2718

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0833 | Val Loss: 0.2555

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0508 | Val Loss: 0.2589

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0330 | Val Loss: 0.2704

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0250 | Val Loss: 0.2680
Early stopping triggered at epoch 10

Training completed for YituTech/conv-bert-base
Best validation loss: 0.2555

################################################################################
MODEL 13/15: ALBERT-base-v2 (Advanced Encoder)
################################################################################

Training albert-base-v2 for Intent Classification


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded albert-base-v2
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4945 | Val Loss: 0.3427

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2856 | Val Loss: 0.2232

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1864 | Val Loss: 0.1796

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1431 | Val Loss: 0.1930

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1205 | Val Loss: 0.1459

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0891 | Val Loss: 0.1666

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0657 | Val Loss: 0.1399

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0446 | Val Loss: 0.1339

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0296 | Val Loss: 0.1368

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0205 | Val Loss: 0.1337

Training completed for albert-base-v2
Best validation loss: 0.1337

Training albert-base-v2 for Impact Classification


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded albert-base-v2
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5466 | Val Loss: 0.4449

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3895 | Val Loss: 0.3387

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3094 | Val Loss: 0.3059

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2653 | Val Loss: 0.2797

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2137 | Val Loss: 0.2784

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1566 | Val Loss: 0.2568

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1041 | Val Loss: 0.2712

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0674 | Val Loss: 0.2650

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0432 | Val Loss: 0.2626
Early stopping triggered at epoch 9

Training completed for albert-base-v2
Best validation loss: 0.2568

################################################################################
MODEL 14/15: BART-base (Encoder-Decoder)
################################################################################

Training facebook/bart-base for Intent Classification


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded facebook/bart-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4581 | Val Loss: 0.2878

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2122 | Val Loss: 0.1571

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1436 | Val Loss: 0.1853

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1121 | Val Loss: 0.1214

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0916 | Val Loss: 0.1346

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0681 | Val Loss: 0.1235

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0468 | Val Loss: 0.1194

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0329 | Val Loss: 0.1187

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0237 | Val Loss: 0.1154

Epoch 10/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0176 | Val Loss: 0.1148

Training completed for facebook/bart-base
Best validation loss: 0.1148

Training facebook/bart-base for Impact Classification


Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded facebook/bart-base
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5311 | Val Loss: 0.3984

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3296 | Val Loss: 0.3003

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2587 | Val Loss: 0.2692

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2176 | Val Loss: 0.2691

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1715 | Val Loss: 0.2623

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1186 | Val Loss: 0.2544

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0799 | Val Loss: 0.2678

Epoch 8/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0524 | Val Loss: 0.2565

Epoch 9/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0346 | Val Loss: 0.2620
Early stopping triggered at epoch 9

Training completed for facebook/bart-base
Best validation loss: 0.2544

################################################################################
MODEL 15/15: DistilBART-cnn (Encoder-Decoder (Light))
################################################################################

Training sshleifer/distilbart-cnn-6-6 for Intent Classification


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/460M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/460M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at sshleifer/distilbart-cnn-6-6 and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded sshleifer/distilbart-cnn-6-6
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.4430 | Val Loss: 0.2145

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1687 | Val Loss: 0.1534

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1181 | Val Loss: 0.1953

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0945 | Val Loss: 0.1318

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0640 | Val Loss: 0.1439

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0393 | Val Loss: 0.1344

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0238 | Val Loss: 0.1357
Early stopping triggered at epoch 7

Training completed for sshleifer/distilbart-cnn-6-6
Best validation loss: 0.1318

Training sshleifer/distilbart-cnn-6-6 for Impact Classification


Some weights of BartForSequenceClassification were not initialized from the model checkpoint at sshleifer/distilbart-cnn-6-6 and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded sshleifer/distilbart-cnn-6-6
Max sequence length: 512
Model wrapped in DataParallel

Epoch 1/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.5342 | Val Loss: 0.3663

Epoch 2/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.3002 | Val Loss: 0.2847

Epoch 3/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.2313 | Val Loss: 0.2875

Epoch 4/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1743 | Val Loss: 0.2621

Epoch 5/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1129 | Val Loss: 0.2817

Epoch 6/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0681 | Val Loss: 0.2894

Epoch 7/10


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0334 | Val Loss: 0.3011
Early stopping triggered at epoch 7

Training completed for sshleifer/distilbart-cnn-6-6
Best validation loss: 0.2621

[5/5] Saving results...
Saved: /kaggle/working/project1_intent_results.csv
Saved: /kaggle/working/project1_impact_results.csv
Saved: /kaggle/working/project1_intent_per_label.csv
Saved: /kaggle/working/project1_impact_per_label.csv

EXPERIMENT COMPLETED

All results saved to: /kaggle/working
