In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import time
import os
import shutil
from torch.utils.data import DataLoader
from tqdm.auto import tqdm




In [2]:
class DataProcessor:
    """Handles all data preprocessing and loading operations"""
    
    def __init__(self, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def preprocess_data(self, examples, is_training=True):
        """Preprocesses the data by tokenizing inputs and labels"""
        inputs = [f"classify if this sentence is humorous: {text}" for text in examples['text']]
        targets = examples['label_text']
        
        # Tokenize inputs
        model_inputs = self.tokenizer(
            inputs,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        if is_training:
            # Tokenize labels
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(
                    targets,
                    max_length=10,
                    truncation=True,
                    padding="max_length",
                    return_tensors="pt"
                ).input_ids
                labels[labels == self.tokenizer.pad_token_id] = -100
            model_inputs["labels"] = labels

        return model_inputs

    def create_dataloader(self, dataset, batch_size=8, shuffle=True):
        """Creates a DataLoader from the dataset"""
        preprocessed_dataset = Dataset.from_dict({
            'text': dataset['text'],
            'label_text': dataset['label_text']
        })
        
        def collate_fn(examples):
            return self.preprocess_data({
                'text': [ex['text'] for ex in examples],
                'label_text': [ex['label_text'] for ex in examples]
            })
        
        return DataLoader(
            preprocessed_dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_fn
        )

    @staticmethod
    def prepare_dataset(csv_path):
        """Prepares the initial dataset from CSV"""
        data = pd.read_csv(csv_path)
        data.columns = ["text", "label"]
        data['label_text'] = data['label'].apply(lambda x: "humorous" if x == 1 else "not humorous")
        dataset = Dataset.from_pandas(data)
        return dataset.train_test_split(test_size=0.2, seed=42) 

In [3]:
class ModelTrainer:
    """Handles model training and evaluation"""
    
    def __init__(self, model, device):
        self.model = model
        self.device = device
        self.temp_checkpoint_dir = './temp_checkpoint'

    def train(self, train_dataloader, val_dataloader=None, 
              epochs=3, learning_rate=2e-5, save_best_only=True):
        """Trains the model with the given parameters"""
        
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)
        best_loss = float('inf')
        
        # Create temporary checkpoint directory
        os.makedirs(self.temp_checkpoint_dir, exist_ok=True)
        
        print(f"Training on {self.device}")
        
        try:
            for epoch in range(epochs):
                epoch_start = time.time()
                self._train_epoch(train_dataloader, optimizer, epoch, epochs)
                
                # Validation
                if val_dataloader is not None:
                    val_loss = self.evaluate(val_dataloader)
                    print(f"\nEpoch {epoch + 1}: Val Loss = {val_loss:.4f}")
                    
                    if save_best_only and val_loss < best_loss:
                        best_loss = val_loss
                        self._save_checkpoint(epoch, optimizer, best_loss)
                
                print(f"Epoch {epoch + 1} completed in {time.time() - epoch_start:.2f} seconds")

            # Load best model if it exists
            if save_best_only:
                self._load_best_model()
                
        finally:
            # Cleanup
            self._cleanup()

    def _train_epoch(self, train_dataloader, optimizer, epoch, epochs):
        """Trains for one epoch"""
        self.model.train()
        total_loss = 0
        train_progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}')
        
        for batch in train_progress_bar:
            batch = {k: v.to(self.device) for k, v in batch.items()}
            
            optimizer.zero_grad()
            outputs = self.model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            train_progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
        
        return total_loss / len(train_dataloader)

    def evaluate(self, val_dataloader):
        """Evaluates the model on validation data"""
        self.model.eval()
        total_loss = 0
        
        with torch.no_grad():
            for batch in val_dataloader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                total_loss += outputs.loss.item()
                
        return total_loss / len(val_dataloader)

    def _save_checkpoint(self, epoch, optimizer, loss):
        """Saves model checkpoint"""
        torch.save({
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        }, os.path.join(self.temp_checkpoint_dir, 'best_model.pt'))

    def _load_best_model(self):
        """Loads the best model from checkpoint"""
        checkpoint_path = os.path.join(self.temp_checkpoint_dir, 'best_model_for_t5_base.pt')
        if os.path.exists(checkpoint_path):
            checkpoint = torch.load(checkpoint_path)
            self.model.load_state_dict(checkpoint['model_state_dict'])

    def _cleanup(self):
        """Cleans up temporary files"""
        if os.path.exists(self.temp_checkpoint_dir):
            shutil.rmtree(self.temp_checkpoint_dir)

In [4]:
class Predictor:
    """Handles model prediction"""
    
    def __init__(self, model, tokenizer, device, max_length=128):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_length = max_length

    def predict(self, sentence):
        """Makes a prediction for a single sentence"""
        self.model.eval()
        input_text = f"classify if this sentence is humorous: {sentence}"
        inputs = self.tokenizer(
            input_text,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length"
        ).to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(**inputs)
        
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

In [5]:
if __name__ == "__main__":
    # Initialize common components
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

    # Initialize the three main components
    data_processor = DataProcessor(tokenizer)
    trainer = ModelTrainer(model, device)
    predictor = Predictor(model, tokenizer, device)

    # Prepare dataset
    dataset = data_processor.prepare_dataset("dataset.csv")
    
    # Create dataloaders
    train_dataloader = data_processor.create_dataloader(dataset['train'], batch_size=8)
    val_dataloader = data_processor.create_dataloader(dataset['test'], batch_size=8, shuffle=False)
    
    # Train the model
    trainer.train(
        train_dataloader,
        val_dataloader,
        epochs=3,
        learning_rate=2e-5,
        save_best_only=True
    ) 

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Training on cuda


Epoch 1/3:   0%|          | 0/20000 [00:00<?, ?it/s]




Epoch 1: Val Loss = 0.0132
Epoch 1 completed in 3738.94 seconds


Epoch 2/3:   0%|          | 0/20000 [00:00<?, ?it/s]


Epoch 2: Val Loss = 0.0121
Epoch 2 completed in 3711.92 seconds


Epoch 3/3:   0%|          | 0/20000 [00:00<?, ?it/s]


Epoch 3: Val Loss = 0.0130
Epoch 3 completed in 3710.42 seconds


In [6]:
# Make a prediction
test_sentence = "5 reasons the 2016 election feels so personal" 
prediction = predictor.predict(test_sentence)
print(f"\nSentence: '{test_sentence}'\nPrediction: {prediction}")


Sentence: '5 reasons the 2016 election feels so personal'
Prediction: not humorous




In [16]:
from sklearn.metrics import classification_report, accuracy_score

def evaluate(model, val_dataloader, device):
    """Evaluates the model on validation data and prints classification metrics."""
    model.eval()
    all_preds = []
    all_labels = []
    
    print("Starting evaluation...")

    with torch.no_grad():
        for i, batch in enumerate(val_dataloader):
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            print(f"Processing batch {i + 1}/{len(val_dataloader)}")

            # Perform forward pass and get logits
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
            logits = outputs.logits

            # Get the most probable class (0 or 1) for each example
            preds = torch.argmax(logits, dim=-1).cpu().numpy().flatten()

            # Extract the ground truth labels, flatten them, and remove padding labels (-100)
            labels = batch["labels"].cpu().numpy().flatten()
            labels = [label for label in labels if label != -100]  # Keep only valid labels

            # Collect predictions and labels
            all_preds.extend(preds[:len(labels)])  # Ensure preds length matches labels
            all_labels.extend(labels)

    # Debug: Print lengths of all_labels and all_preds
    print(f"Length of all_labels: {len(all_labels)}, Length of all_preds: {len(all_preds)}")

    # Calculate and print classification metrics
    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=["not humorous", "humorous"])
    
    print(f"\nAccuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)

    print("Evaluation completed successfully.")


In [17]:
evaluate(model, val_dataloader, device)


Starting evaluation...
Processing batch 1/5000
Processing batch 2/5000
Processing batch 3/5000




Processing batch 4/5000
Processing batch 5/5000
Processing batch 6/5000
Processing batch 7/5000
Processing batch 8/5000
Processing batch 9/5000
Processing batch 10/5000
Processing batch 11/5000
Processing batch 12/5000
Processing batch 13/5000
Processing batch 14/5000
Processing batch 15/5000
Processing batch 16/5000
Processing batch 17/5000
Processing batch 18/5000
Processing batch 19/5000
Processing batch 20/5000
Processing batch 21/5000
Processing batch 22/5000
Processing batch 23/5000
Processing batch 24/5000
Processing batch 25/5000
Processing batch 26/5000
Processing batch 27/5000
Processing batch 28/5000
Processing batch 29/5000
Processing batch 30/5000
Processing batch 31/5000
Processing batch 32/5000
Processing batch 33/5000
Processing batch 34/5000
Processing batch 35/5000
Processing batch 36/5000
Processing batch 37/5000
Processing batch 38/5000
Processing batch 39/5000
Processing batch 40/5000
Processing batch 41/5000
Processing batch 42/5000
Processing batch 43/5000
Proces

ValueError: Number of classes, 3, does not match size of target_names, 2. Try specifying the labels parameter