<a href="https://colab.research.google.com/github/reagan13/gpt2-distilbert-thesis-files/blob/main/notebook/Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multitask Learning with GPT2

## Import Libraries

In [1]:
import json
import os
import time
from typing import List, Dict, Optional
from collections import Counter, defaultdict
import time
import sys

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Model, GPT2Config, GPT2TokenizerFast, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from datetime import datetime  # Add this import

# Choose the compute device once; later cells read the module-level `device`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Selected device: {device}")

# Report GPU details when one is present, otherwise say that we fall back to CPU.
if device.type != "cuda":
    print("No GPU detected. Running on CPU.")
else:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Initial GPU Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    # Release cached blocks held by the CUDA allocator before the run starts.
    torch.cuda.empty_cache()

# Updated function to check device of tensors or models
def check_device(item, name="Item"):
    """Print which device a tensor or module lives on.

    Args:
        item: A ``torch.Tensor`` or ``torch.nn.Module``; anything else is
            reported as unsupported.
        name: Label used in the printed message.
    """
    if isinstance(item, torch.Tensor):
        print(f"{name} is on: {item.device}")
        return

    if not isinstance(item, torch.nn.Module):
        print(f"{name} is not a tensor or model: {type(item)}")
        return

    # A module is reported by the device of its first parameter.
    for first_param in item.parameters():
        print(f"{name} is on: {first_param.device}")
        break
    else:
        print(f"{name} has no parameters to check")

# New function to save print statements locally and to Google Drive
def setup_logging(save_path: str, filename: str = "training_log.txt", drive_root: str = "/content/drive/"):
    """Tee everything written to ``sys.stdout`` into two log files.

    One log file is created under ``save_path`` (relative to the current
    working directory unless absolute) and a second one under
    ``drive_root/save_path``. ``sys.stdout`` is replaced by an object that
    forwards every write to both files and to the previous stdout.

    Calling this function again replaces the previously installed tee (its
    files are closed) instead of wrapping it, so re-running the notebook cell
    does not duplicate output or leave stale handles open.

    Failures are reported with a printed message rather than raised, so a
    missing Drive mount does not abort the run.

    Args:
        save_path: Directory for the local log; also the sub-directory used
            below ``drive_root``.
        filename: Name of the log file in both locations.
        drive_root: Directory under which the second copy is written.
    """
    # Unwrap a tee installed by an earlier call before touching any files:
    # otherwise the same paths would be open twice in "w" mode and every
    # message would be written through both layers.
    current = sys.stdout
    if getattr(current, "_is_tee_logger", False):
        sys.stdout = current.stdout
        current.close_files()

    log_file = None
    drive_log_file = None
    installed = False
    try:
        os.makedirs(save_path, exist_ok=True)
        log_path = os.path.join(save_path, filename)
        print(f"Local log path created: {log_path}")

        # Strip leading slashes so an absolute save_path still lands below
        # drive_root (os.path.join would otherwise discard drive_root and both
        # logs would point at the same file).
        drive_path = os.path.join(drive_root, save_path.lstrip("/"))
        os.makedirs(drive_path, exist_ok=True)
        drive_log_path = os.path.join(drive_path, filename)
        print(f"Google Drive log path created: {drive_log_path}")

        class Logger:
            """File-like object that fans writes out to two files and stdout."""

            # Marker used by setup_logging to recognise an installed tee.
            _is_tee_logger = True

            def __init__(self, file_handle, drive_file_handle, original_stdout):
                self.file = file_handle
                self.drive_file = drive_file_handle
                self.stdout = original_stdout

            def write(self, message):
                self.file.write(message)
                self.drive_file.write(message)
                self.stdout.write(message)

            def flush(self):
                self.file.flush()
                self.drive_file.flush()
                self.stdout.flush()

            def close_files(self):
                """Close both log files; the wrapped stdout is left untouched."""
                for handle in (self.file, self.drive_file):
                    if not handle.closed:
                        handle.close()

            def __getattr__(self, attr):
                # Only reached for attributes not defined above (isatty,
                # encoding, fileno, ...); defer those to the wrapped stream.
                original = self.__dict__.get("stdout")
                if original is None:
                    raise AttributeError(attr)
                return getattr(original, attr)

        log_file = open(log_path, "w", encoding="utf-8")
        drive_log_file = open(drive_log_path, "w", encoding="utf-8")

        sys.stdout = Logger(log_file, drive_log_file, sys.stdout)
        installed = True
        print(f"Logging started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Log file: {log_path}")
        print(f"Drive log file: {drive_log_path}")

    except Exception as e:
        # If the tee never got installed, nothing else owns the handles that
        # were opened so far, so close them here to avoid leaking them.
        if not installed:
            for handle in (log_file, drive_log_file):
                if handle is not None:
                    handle.close()
        print(f"Error setting up logging: {e}")


# Stamp the notebook output with a human-readable date, e.g. "March 06, 2025".
print("Current date: {}".format(datetime.now().strftime("%B %d, %Y")))


Selected device: cpu
No GPU detected. Running on CPU.
Current date: March 06, 2025


## Dataset Loading Functions

In [2]:
def load_dataset(json_file: str) -> List[Dict]:
    """Read a UTF-8 JSON file and return its parsed content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (annotated as a list of
        sample dictionaries).
    """
    with open(json_file, mode="r", encoding="utf-8") as handle:
        records = json.load(handle)
    return records

def detect_labels(data: List[Dict]) -> Dict[str, Dict]:
    """Build label-to-index encoders for categories, intents and NER tags.

    Every NER entity type contributes a ``B-<type>`` and an ``I-<type>`` tag;
    the ``O`` tag is always present. Indices follow the sorted order of the
    labels. A summary and the label distributions are printed.

    Args:
        data: List of sample dictionaries with ``category``, ``intent`` and
            ``ner_labels_only`` keys.

    Returns:
        Dictionary with ``category_encoder``, ``intent_encoder`` and
        ``ner_label_encoder`` mappings.

    Raises:
        TypeError: If ``data`` is not a list.
        ValueError: If ``ner_labels_only`` is not a list or one of its entries
            lacks the ``label``/``text`` fields.
    """
    started_at = time.time()

    if not isinstance(data, list):
        raise TypeError("Input 'data' must be a list of dictionaries")
    if len(data) == 0:
        return {"category_encoder": {}, "intent_encoder": {}, "ner_label_encoder": {"O": 0}}

    def _encode(values):
        # Stable label -> index mapping based on sorted order.
        return {value: position for position, value in enumerate(sorted(values))}

    category_counts, intent_counts, ner_counts = Counter(), Counter(), Counter()
    missing_fields = defaultdict(int)

    for index, sample in enumerate(data):
        try:
            # Both lookups happen before any counting, so a sample that lacks
            # either field contributes to neither distribution.
            category, intent = sample["category"], sample["intent"]
            category_counts[category] += 1
            intent_counts[intent] += 1

            entities = sample["ner_labels_only"]
            if not isinstance(entities, list):
                raise ValueError(f"'ner_labels_only' must be a list at sample {index}")
            for entity in entities:
                well_formed = isinstance(entity, dict) and "label" in entity and "text" in entity
                if not well_formed:
                    raise ValueError(f"NER label must have 'label' and 'text' fields at sample {index}")
                for prefix in ("B-", "I-"):
                    ner_counts[f"{prefix}{entity['label']}"] += 1
        except KeyError as err:
            # Tally the missing key and move on to the next sample.
            missing_fields[str(err).strip("'")] += 1

    if missing_fields:
        print("Warning: Missing fields detected:")
        for field, count in missing_fields.items():
            print(f"  - '{field}' missing in {count} samples")

    # The counters hold exactly the labels that were seen; "O" is always added.
    category_encoder = _encode(category_counts)
    intent_encoder = _encode(intent_counts)
    ner_label_encoder = _encode({"O", *ner_counts})

    print(f"Dataset summary:\n  - {len(data)} samples\n  - {len(category_encoder)} categories\n  - {len(intent_encoder)} intents\n  - {len(ner_label_encoder)} NER tags")
    print("Category distribution:", dict(category_counts))
    print("Intent distribution:", dict(intent_counts))
    print("NER tag distribution (non-O):", dict(ner_counts))
    print(f"Processing time: {time.time() - started_at:.3f} seconds")

    return {
        "category_encoder": category_encoder,
        "intent_encoder": intent_encoder,
        "ner_label_encoder": ner_label_encoder,
    }

## Tokenization and NER Alignment

In [3]:
def tokenize_text_gpt2(text: str, gpt2_tokenizer, max_length: int) -> Dict[str, torch.Tensor]:
    """Encode ``text`` with the given tokenizer, padded/truncated to ``max_length``.

    Args:
        text: Input string.
        gpt2_tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension.
        max_length: Target sequence length.

    Returns:
        Dict with ``gpt2_input_ids`` and ``gpt2_attention_mask``, each with the
        leading batch dimension squeezed away.
    """
    encoded = gpt2_tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].squeeze(0)
    attention_mask = encoded["attention_mask"].squeeze(0)
    return {"gpt2_input_ids": input_ids, "gpt2_attention_mask": attention_mask}

def align_ner_labels(text: str, ner_labels: List[Dict], tokenizer, ner_label_encoder: Dict, max_length: int) -> torch.Tensor:
    """Project character-level entity annotations onto token positions.

    Each entity's surface string is searched in ``text`` (every
    non-overlapping occurrence). Tokens whose character span overlaps an
    occurrence receive ``B-<label>`` (first overlapping token) or
    ``I-<label>`` (the rest); all other positions, including padding, get
    ``O``. Tags unknown to ``ner_label_encoder`` fall back to ``O``.

    Entries missing ``text`` or ``label`` are skipped with a warning, and
    entries with an empty ``text`` are skipped silently (an empty string
    matches at every position and would never let the search advance).

    Args:
        text: The raw input string.
        ner_labels: Entity dicts with ``text`` and ``label`` keys (may be
            empty or ``None``).
        tokenizer: Callable tokenizer that supports ``return_offsets_mapping``.
        ner_label_encoder: Mapping from tag string to integer id; must
            contain ``"O"``.
        max_length: Padded sequence length.

    Returns:
        ``torch.long`` tensor of length ``max_length`` with one tag id per
        token position.
    """
    outside_id = ner_label_encoder["O"]

    # Validate before sorting: the sort key reads entry["text"], so invalid
    # entries must be filtered out first or the warning below is unreachable.
    usable = []
    for label in ner_labels or []:
        if "text" not in label or "label" not in label:
            print(f"Warning: Skipping invalid NER entry {label} (missing 'text' or 'label')")
            continue
        if not label["text"]:
            continue
        usable.append(label)

    # Longest surface strings first (stable for ties).
    # NOTE(review): shorter entities are applied later and overwrite tokens
    # already tagged by a longer, overlapping entity — confirm this is intended.
    usable.sort(key=lambda entry: len(entry["text"]), reverse=True)

    encoding = tokenizer(
        text, max_length=max_length, padding="max_length", truncation=True, return_offsets_mapping=True, return_tensors="pt"
    )
    token_to_char_map = encoding["offset_mapping"][0].tolist()
    ner_aligned = [outside_id] * max_length

    for label in usable:
        label_text, label_type = label["text"], label["label"]
        begin_id = ner_label_encoder.get(f"B-{label_type}", outside_id)
        inside_id = ner_label_encoder.get(f"I-{label_type}", outside_id)

        search_from = 0
        while True:
            label_start = text.find(label_text, search_from)
            if label_start == -1:
                break
            label_end = label_start + len(label_text)
            search_from = label_end  # label_text is non-empty, so this always advances

            first_token = True
            for i, (start, end) in enumerate(token_to_char_map):
                # (0, 0) offsets are treated as padding and skipped.
                if start == 0 and end == 0:
                    continue
                # Half-open interval overlap between the token and the entity.
                if max(start, label_start) < min(end, label_end):
                    ner_aligned[i] = begin_id if first_token else inside_id
                    first_token = False

    return torch.tensor(ner_aligned, dtype=torch.long)

## Dataset and Dataloader

In [4]:
class MultiTaskDataset(Dataset):
    """Map-style dataset that yields tokenized text plus category, intent and NER targets."""

    def __init__(self, data: List[Dict], gpt2_tokenizer, label_encoders, max_length: int):
        """Store the raw samples, tokenizer, label encoders and padded length."""
        self.max_length = max_length
        self.label_encoders = label_encoders
        self.gpt2_tokenizer = gpt2_tokenizer
        self.data = data

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the sample at ``idx``.

        Returns:
            Dict with ``input_ids``, ``attention_mask``, ``category_labels``,
            ``intent_labels`` and ``ner_labels`` tensors.
        """
        record = self.data[idx]
        instruction = record["instruction"]
        encoders = self.label_encoders

        tokens = tokenize_text_gpt2(instruction, self.gpt2_tokenizer, self.max_length)
        token_tags = align_ner_labels(
            instruction,
            record["ner_labels_only"],
            self.gpt2_tokenizer,
            encoders["ner_label_encoder"],
            self.max_length,
        )

        # Direct indexing: a category or intent absent from the encoder raises KeyError.
        category_id = encoders["category_encoder"][record["category"]]
        intent_id = encoders["intent_encoder"][record["intent"]]

        return {
            "input_ids": tokens["gpt2_input_ids"],
            "attention_mask": tokens["gpt2_attention_mask"],
            "category_labels": torch.tensor(category_id, dtype=torch.long),
            "intent_labels": torch.tensor(intent_id, dtype=torch.long),
            "ner_labels": token_tags,
        }

def get_dataloaders(train_data, val_data, test_data, gpt2_tokenizer, label_encoders, batch_size, num_workers, max_length):
    """Wrap the three splits in ``MultiTaskDataset`` and return their DataLoaders.

    Only the training loader shuffles. Pinned memory is enabled when the
    module-level ``device`` is CUDA.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    shared_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=(device.type == "cuda"),
    )

    def _loader_for(split, shuffle):
        # One dataset + loader per split, differing only in shuffling.
        dataset = MultiTaskDataset(split, gpt2_tokenizer, label_encoders, max_length)
        return DataLoader(dataset, shuffle=shuffle, **shared_options)

    train_loader = _loader_for(train_data, True)
    val_loader = _loader_for(val_data, False)
    test_loader = _loader_for(test_data, False)
    return train_loader, val_loader, test_loader

## Model Architecture

In [5]:
class BaselineGPT2MultiTask(nn.Module):
    """Frozen GPT-2 encoder with three trainable heads: intent, category and token-level NER."""

    @staticmethod
    def _make_head(hidden_size: int, num_outputs: int, dropout_rate: float) -> nn.Sequential:
        """Build one Linear -> Tanh -> Dropout -> Linear classification head."""
        return nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_outputs),
        )

    def __init__(self, num_intents: int, num_categories: int, num_ner_labels: int, dropout_rate: float):
        """Load pretrained ``gpt2``, freeze it and create the task heads.

        Args:
            num_intents: Number of intent classes.
            num_categories: Number of category classes.
            num_ner_labels: Number of NER tags.
            dropout_rate: Dropout probability used inside every head.
        """
        super().__init__()
        self.config = GPT2Config.from_pretrained('gpt2')
        self.gpt2 = GPT2Model.from_pretrained('gpt2')

        # The backbone is not trained; only the heads receive gradients.
        self.gpt2.requires_grad_(False)

        width = self.config.n_embd
        self.intent_head = self._make_head(width, num_intents, dropout_rate)
        self.category_head = self._make_head(width, num_categories, dropout_rate)
        self.ner_head = self._make_head(width, num_ner_labels, dropout_rate)

        # One cross-entropy criterion per task.
        self.intent_loss_fn = nn.CrossEntropyLoss()
        self.category_loss_fn = nn.CrossEntropyLoss()
        self.ner_loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
                intent_labels: Optional[torch.Tensor] = None,
                category_labels: Optional[torch.Tensor] = None,
                ner_labels: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Compute logits for all three tasks and, when every label is given, the losses.

        Returns:
            Dict with ``intent_logits``, ``category_logits`` and ``ner_logits``.
            If all three label tensors are provided it also holds ``loss``
            (sum of the three task losses), ``intent_loss``, ``category_loss``
            and ``ner_loss``.
        """
        hidden_states = self.gpt2(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # Sequence-level representation: the hidden state at index
        # (number of attended tokens - 1) of every row.
        last_positions = attention_mask.sum(dim=1) - 1
        gather_index = last_positions.view(-1, 1, 1).expand(-1, 1, hidden_states.size(-1))
        pooled = hidden_states.gather(1, gather_index).squeeze(1)

        intent_logits = self.intent_head(pooled)
        category_logits = self.category_head(pooled)
        ner_logits = self.ner_head(hidden_states)

        output_dict = {
            'intent_logits': intent_logits,
            'category_logits': category_logits,
            'ner_logits': ner_logits
        }

        # Inference path: without a full set of labels there is nothing to score.
        if intent_labels is None or category_labels is None or ner_labels is None:
            return output_dict

        intent_loss = self.intent_loss_fn(intent_logits, intent_labels)
        category_loss = self.category_loss_fn(category_logits, category_labels)

        # Token-level loss only over positions the attention mask marks as 1.
        keep = attention_mask.view(-1) == 1
        flat_logits = ner_logits.view(-1, ner_logits.size(-1))
        ner_loss = self.ner_loss_fn(flat_logits[keep], ner_labels.view(-1)[keep])

        output_dict['loss'] = intent_loss + category_loss + ner_loss
        output_dict['intent_loss'] = intent_loss
        output_dict['category_loss'] = category_loss
        output_dict['ner_loss'] = ner_loss
        return output_dict

## Training Loop

In [6]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

def _run_epoch(model, loader, desc, optimizer=None, report_first_batch=False):
    """Run one pass over ``loader``; trains when ``optimizer`` is given, else evaluates.

    NER predictions and labels are collected only at positions where the
    attention mask is 1, matching the masking used by the model's NER loss,
    so padded positions do not enter the NER F1.

    Returns:
        Tuple ``(mean_loss, intent_accuracy, category_macro_f1, ner_macro_f1)``
        of plain Python floats.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    intent_preds, intent_labels = [], []
    category_preds, category_labels = [], []
    ner_preds, ner_labels = [], []

    with tqdm(loader, desc=desc, leave=False) as loop, torch.set_grad_enabled(training):
        for step, batch in enumerate(loop):
            inputs = {k: v.to(device) for k, v in batch.items()}

            if report_first_batch and step == 0:
                check_device(inputs["input_ids"], "Input IDs")
                # CUDA memory statistics are only meaningful when running on a GPU.
                if device.type == "cuda":
                    print(f"GPU Memory Allocated After Data Load: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

            if training:
                optimizer.zero_grad()
            outputs = model(**inputs)
            loss = outputs["loss"]
            if training:
                loss.backward()
                optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            loop.set_postfix(loss=batch_loss)

            intent_preds.extend(torch.argmax(outputs["intent_logits"], dim=-1).cpu().tolist())
            intent_labels.extend(inputs["intent_labels"].cpu().tolist())
            category_preds.extend(torch.argmax(outputs["category_logits"], dim=-1).cpu().tolist())
            category_labels.extend(inputs["category_labels"].cpu().tolist())

            # Boolean-mask indexing flattens to the real (non-padding) tokens only.
            real_tokens = inputs["attention_mask"].bool()
            ner_preds.extend(torch.argmax(outputs["ner_logits"], dim=-1)[real_tokens].cpu().tolist())
            ner_labels.extend(inputs["ner_labels"][real_tokens].cpu().tolist())

    intent_acc = accuracy_score(intent_labels, intent_preds)
    category_f1 = precision_recall_fscore_support(category_labels, category_preds, average="macro", zero_division=0)[2]
    ner_f1 = precision_recall_fscore_support(ner_labels, ner_preds, average="macro", zero_division=0)[2]

    return total_loss / len(loader), float(intent_acc), float(category_f1), float(ner_f1)


def train_model(model, train_loader, val_loader, num_epochs, learning_rate):
    """Train the multi-task model and validate after every epoch.

    Only parameters with ``requires_grad=True`` are optimised (AdamW). The
    model is moved to the module-level ``device``. After each epoch the
    train/validation loss, intent accuracy, category macro-F1 and NER
    macro-F1 are printed and appended to the returned history.

    Args:
        model: Module whose forward returns ``loss`` and the three logits.
        train_loader: Iterable of training batches (dicts of tensors).
        val_loader: Iterable of validation batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: AdamW learning rate.

    Returns:
        Dict of per-epoch lists keyed ``train_loss``, ``val_loss``,
        ``train_intent_acc``, ``val_intent_acc``, ``train_category_f1``,
        ``val_category_f1``, ``train_ner_f1`` and ``val_ner_f1``.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable, lr=learning_rate)
    history = {
        "train_loss": [], "val_loss": [],
        "train_intent_acc": [], "val_intent_acc": [],
        "train_category_f1": [], "val_category_f1": [],
        "train_ner_f1": [], "val_ner_f1": []
    }

    # Move model to device and verify
    model.to(device)
    check_device(model, "Model")

    for epoch in range(num_epochs):
        train_loss, train_intent_acc, train_category_f1, train_ner_f1 = _run_epoch(
            model,
            train_loader,
            desc=f"Epoch {epoch+1}/{num_epochs} [Training]",
            optimizer=optimizer,
            report_first_batch=(epoch == 0),
        )
        val_loss, val_intent_acc, val_category_f1, val_ner_f1 = _run_epoch(
            model,
            val_loader,
            desc=f"Epoch {epoch+1}/{num_epochs} [Validation]",
        )

        history["train_loss"].append(train_loss)
        history["train_intent_acc"].append(train_intent_acc)
        history["train_category_f1"].append(train_category_f1)
        history["train_ner_f1"].append(train_ner_f1)
        history["val_loss"].append(val_loss)
        history["val_intent_acc"].append(val_intent_acc)
        history["val_category_f1"].append(val_category_f1)
        history["val_ner_f1"].append(val_ner_f1)

        print(f"Epoch {epoch+1}/{num_epochs}:")
        print(f"  Train Loss:      {history['train_loss'][-1]:.4f}")
        print(f"  Val Loss:       {history['val_loss'][-1]:.4f}\n")
        print(f"  Train Intent Acc: {train_intent_acc:.4f}")
        print(f"  Val Intent Acc:  {val_intent_acc:.4f}\n")
        print(f"  Train Category F1:{train_category_f1:.4f}")
        print(f"  Val Category F1: {val_category_f1:.4f}\n")
        print(f"  Train NER F1:     {train_ner_f1:.4f}")
        print(f"  Val NER F1:      {val_ner_f1:.4f}\n")

    return history

## Evaluation

In [7]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

def evaluate_model(model, test_loader):
    """Evaluate the model on the test set and print a summary.

    Batches are moved to the device of the model's first parameter. NER
    predictions and labels are compared only at positions where the attention
    mask is 1, matching the masking used by the model's NER loss, so padded
    positions do not enter the NER F1.

    Args:
        model: Module whose forward returns ``loss`` and the three logits.
        test_loader: Iterable of test batches (dicts of tensors).

    Returns:
        Dict with ``loss`` (mean batch loss), ``intent_accuracy``,
        ``category_f1`` and ``ner_f1`` (macro-averaged) as Python floats.
    """
    model.eval()
    all_intent_preds, all_intent_labels = [], []
    all_category_preds, all_category_labels = [], []
    all_ner_preds, all_ner_labels = [], []
    total_loss = 0

    device = next(model.parameters()).device  # Get the device from the model

    with torch.no_grad(), tqdm(test_loader, desc="Evaluation", leave=True) as test_loop:
        for batch in test_loop:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            batch_loss = outputs["loss"].item()
            total_loss += batch_loss

            # Update progress bar with current loss
            test_loop.set_postfix(loss=batch_loss)

            all_intent_preds.extend(torch.argmax(outputs["intent_logits"], dim=-1).cpu().tolist())
            all_intent_labels.extend(inputs["intent_labels"].cpu().tolist())
            all_category_preds.extend(torch.argmax(outputs["category_logits"], dim=-1).cpu().tolist())
            all_category_labels.extend(inputs["category_labels"].cpu().tolist())

            # Boolean-mask indexing flattens to the real (non-padding) tokens only.
            real_tokens = inputs["attention_mask"].bool()
            all_ner_preds.extend(torch.argmax(outputs["ner_logits"], dim=-1)[real_tokens].cpu().tolist())
            all_ner_labels.extend(inputs["ner_labels"][real_tokens].cpu().tolist())

    # Compute metrics
    intent_acc = float(accuracy_score(all_intent_labels, all_intent_preds))
    category_f1 = float(precision_recall_fscore_support(all_category_labels, all_category_preds, average="macro", zero_division=0)[2])
    ner_f1 = float(precision_recall_fscore_support(all_ner_labels, all_ner_preds, average="macro", zero_division=0)[2])
    avg_loss = total_loss / len(test_loader)

    results = {
        "loss": avg_loss,
        "intent_accuracy": intent_acc,
        "category_f1": category_f1,
        "ner_f1": ner_f1
    }

    print(f"Test Results:")
    print(f"  Loss:            {avg_loss:.4f}")
    print(f"  Intent Acc:      {intent_acc:.4f}")
    print(f"  Category F1:     {category_f1:.4f}")
    print(f"  NER F1:          {ner_f1:.4f}")

    return results

## Save Artifacts

### Save Locally

Save artifacts (label encoders, training metrics, and test results)

In [8]:

def save_artifacts(label_encoders, metrics, test_results, save_path):
    """
    Write the run's JSON artifacts into ``save_path``.

    Three files are produced: ``label_encoders.json``,
    ``training_metrics.json`` and ``test_results.json``. The directory is
    created when missing and a confirmation line is printed per file.

    Args:
        label_encoders (dict): Dictionary containing label encoders.
        metrics (dict): Dictionary containing training metrics.
        test_results (dict): Dictionary containing test results.
        save_path (str): Directory where the artifacts will be saved.
    """
    os.makedirs(save_path, exist_ok=True)

    # (file name, payload, label used in the confirmation message)
    outputs = (
        ("label_encoders.json", label_encoders, "Label encoders"),
        ("training_metrics.json", metrics, "Training metrics"),
        ("test_results.json", test_results, "Test results"),
    )
    for file_name, payload, title in outputs:
        target = os.path.join(save_path, file_name)
        with open(target, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=4)
        print(f"{title} saved to {target}")


Saving training config

In [9]:

def save_training_config(config, save_path, filename="training_config.json"):
    """
    Dump the training configuration as indented JSON inside ``save_path``.

    The directory is created when it does not exist yet.

    Args:
        config (dict): Dictionary containing training hyperparameters.
        save_path (str): Directory where the config file will be saved.
        filename (str): Name of the config file (default: "training_config.json").
    """
    os.makedirs(save_path, exist_ok=True)

    config_path = os.path.join(save_path, filename)
    with open(config_path, mode="w", encoding="utf-8") as handle:
        json.dump(config, handle, indent=4, ensure_ascii=False)

    print(f"Training configuration saved to {config_path}")

Save Model (pt file and tokenizer)

In [10]:

def save_full_model(model, gpt2_tokenizer, save_path):
    """
    Store the model object and the tokenizer below ``save_path``.

    The model is written with ``torch.save(model, ...)`` (the whole object,
    not just a state dict) to ``<save_path>/model/full_model.pt``; the
    tokenizer is written with ``save_pretrained`` to ``<save_path>/tokenizer``.

    Args:
        model: The model to be saved.
        gpt2_tokenizer: The tokenizer to be saved.
        save_path (str): Directory where the model and tokenizer will be saved.
    """
    model_dir = os.path.join(save_path, "model")
    tokenizer_dir = os.path.join(save_path, "tokenizer")

    # Model: the directory must exist before torch.save can write into it.
    os.makedirs(model_dir, exist_ok=True)
    checkpoint_file = os.path.join(model_dir, "full_model.pt")
    torch.save(model, checkpoint_file)
    print(f"Full model saved to {checkpoint_file}")

    # Tokenizer
    gpt2_tokenizer.save_pretrained(tokenizer_dir)
    print(f"Tokenizer saved to {tokenizer_dir}")


### Save to GDrive

Initialize Google Drive

In [11]:
import os
import json
import torch
from google.colab import drive

def mount_drive():
    """Mount Google Drive at ``/content/drive`` and print a confirmation."""
    mount_point = '/content/drive'
    drive.mount(mount_point)
    print("Google Drive mounted successfully.")

# def save_to_drive(local_path, drive_folder):
#     """Save a file or directory to Google Drive."""
#     drive_path = os.path.join("/content/drive", drive_folder)
#     if os.path.isdir(local_path):
#         os.system(f'cp -r {local_path} {drive_path}')
#     else:
#         os.system(f'cp {local_path} {drive_path}')
#     print(f"Saved {local_path} to {drive_path}")


Saving Training Configurations

In [12]:

def save_training_config_to_drive(config, drive_path, filename="training_config.json", drive_root="/content/drive/"):
    """
    Save the training configuration to a JSON file in Google Drive.

    The target directory is computed once and used both for directory
    creation and for the file path, so the two can no longer disagree.
    Leading slashes in ``drive_path`` are ignored so the file always lands
    below ``drive_root``.

    Args:
        config (dict): Dictionary containing training hyperparameters.
        drive_path (str): Directory, relative to ``drive_root``, where the config file will be saved.
        filename (str): Name of the config file (default: "training_config.json").
        drive_root (str): Directory under which ``drive_path`` is resolved
            (default: "/content/drive/").
    """
    target_dir = os.path.join(drive_root, drive_path.lstrip("/"))
    os.makedirs(target_dir, exist_ok=True)
    config_path = os.path.join(target_dir, filename)

    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)
    print(f"Training configuration saved to {config_path}")

Save artifacts (label encoders, training metrics, and test results)

In [13]:

def save_artifacts_to_drive(label_encoders, metrics, test_results, drive_path, drive_root="/content/drive/"):
    """
    Save label encoders, training metrics, and test results to Google Drive.

    Writes ``label_encoders.json``, ``training_metrics.json`` and
    ``test_results.json``. The target directory is computed once and used for
    both directory creation and the file paths. Leading slashes in
    ``drive_path`` are ignored so the files always land below ``drive_root``.

    Args:
        label_encoders (dict): Dictionary containing label encoders.
        metrics (dict): Dictionary containing training metrics.
        test_results (dict): Dictionary containing test results.
        drive_path (str): Directory, relative to ``drive_root``, where the artifacts will be saved.
        drive_root (str): Directory under which ``drive_path`` is resolved
            (default: "/content/drive/").
    """
    target_dir = os.path.join(drive_root, drive_path.lstrip("/"))
    os.makedirs(target_dir, exist_ok=True)

    # (file name, payload, label used in the confirmation message)
    outputs = (
        ("label_encoders.json", label_encoders, "Label encoders"),
        ("training_metrics.json", metrics, "Training metrics"),
        ("test_results.json", test_results, "Test results"),
    )
    for file_name, payload, title in outputs:
        target = os.path.join(target_dir, file_name)
        with open(target, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=4)
        print(f"{title} saved to {target}")



Save Model (pt file and tokenizer)

In [14]:

def save_full_model_to_drive(model, gpt2_tokenizer, drive_path, drive_root="/content/drive/"):
    """
    Save the entire model and tokenizer to Google Drive.

    The model object is written with ``torch.save(model, ...)`` to
    ``<drive_root>/<drive_path>/model/full_model.pt`` and the tokenizer with
    ``save_pretrained`` to ``<drive_root>/<drive_path>/tokenizer``. Leading
    slashes in ``drive_path`` are ignored so that an absolute-looking path
    cannot redirect the output outside ``drive_root``.

    Args:
        model: The model to be saved.
        gpt2_tokenizer: The tokenizer to be saved.
        drive_path (str): Directory, relative to ``drive_root``, where the model and tokenizer will be saved.
        drive_root (str): Directory under which ``drive_path`` is resolved
            (default: "/content/drive/").
    """
    target_dir = os.path.join(drive_root, drive_path.lstrip("/"))

    model_path = os.path.join(target_dir, "model")
    os.makedirs(model_path, exist_ok=True)

    # Save the full model (architecture + weights)
    model_file_path = os.path.join(model_path, "full_model.pt")
    torch.save(model, model_file_path)
    print(f"Full model saved to {model_file_path}")

    # Save tokenizer
    tokenizer_path = os.path.join(target_dir, "tokenizer")
    gpt2_tokenizer.save_pretrained(tokenizer_path)
    print(f"Tokenizer saved to {tokenizer_path}")

## Main Execution

### Paths and Hyperparameters

In [15]:
# Run configuration: dataset files, hyperparameters and output folder.
train_file, val_file, test_file = "train.json", "val.json", "test.json"
batch_size, num_epochs = 16, 1
learning_rate = 2e-5
max_length = 128
num_workers = 2
dropout_rate = 0.24
save_path = "MyDrive/thesis/baseline/test_6"  # increment for different testing

# Snapshot of the settings above plus device and date, saved next to the results.
training_config = dict(
    train_file=train_file,
    val_file=val_file,
    test_file=test_file,
    batch_size=batch_size,
    num_epochs=num_epochs,
    learning_rate=learning_rate,
    max_length=max_length,
    num_workers=num_workers,
    model_name="BaselineGPT2MultiTask",
    gpt2_base="gpt2",
    dropout_rate=dropout_rate,
    device=str(device),
    date=datetime.now().strftime('%B %d, %Y'),
)



In [16]:
# Mount Google Drive first: setup_logging creates its second log file under /content/drive.
mount_drive()
# From here on, everything printed is also written to the log files under save_path.
setup_logging(save_path)


Mounted at /content/drive
Google Drive mounted successfully.
Local log path created: MyDrive/thesis/baseline/test_6/training_log.txt
Google Drive log path created: /content/drive/MyDrive/thesis/baseline/test_6/training_log.txt
Logging started at 2025-03-06 21:52:49
Log file: MyDrive/thesis/baseline/test_6/training_log.txt
Drive log file: /content/drive/MyDrive/thesis/baseline/test_6/training_log.txt


### Initialization

In [17]:


# Load the three splits; each one is truncated to a small slice for a quick demo run.
print("Loading datasets...\n")
train_data = load_dataset(train_file)[:100]  # Limited for demo
val_data = load_dataset(val_file)[:20]
test_data = load_dataset(test_file)[:20]

print("*" * 30)
print(f"""Dataset Summary:
Training samples: {len(train_data)}
Validation samples: {len(val_data)}
Test samples: {len(test_data)}""")

# Build the label encoders from the training split only.
# NOTE(review): MultiTaskDataset indexes the encoders directly, so a category or
# intent that occurs only in val/test would raise KeyError — confirm the splits
# share the same label set.
label_encoders = detect_labels(train_data)


# Initialize tokenizer; add a [PAD] token when the tokenizer does not define one,
# because the tokenization helpers pad every sequence to max_length.
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
if gpt2_tokenizer.pad_token is None:
    gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Create data loaders
train_loader, val_loader, test_loader = get_dataloaders(
    train_data, val_data, test_data, gpt2_tokenizer, label_encoders, batch_size, num_workers, max_length
)

# Initialize model; the head sizes come from the detected label sets.
model = BaselineGPT2MultiTask(
    num_intents=len(label_encoders["intent_encoder"]),
    num_categories=len(label_encoders["category_encoder"]),
    num_ner_labels=len(label_encoders["ner_label_encoder"]),
    dropout_rate=dropout_rate
)
# Resize the embedding table to the tokenizer's current vocabulary size
# (which includes the pad token when one was added above).
if gpt2_tokenizer.pad_token_id is not None:
    model.gpt2.resize_token_embeddings(len(gpt2_tokenizer))


model.to(device)  # Move the model to the selected device (GPU when available, else CPU)
check_device(model, "Model before training")  # Print where the model's parameters live

# Save training config before training
save_training_config(training_config, save_path)

# Train model
print("*" * 30)
print("Starting training...")

start_time = time.time()

# `metrics` is the per-epoch history returned by train_model.
metrics = train_model(model, train_loader, val_loader, num_epochs, learning_rate)
print(f"Training completed in {(time.time() - start_time) / 60:.2f} minutes")

print("*" * 30)

# Evaluate model on the held-out test split
print("Evaluating on test set...")
test_results = evaluate_model(model, test_loader)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Evaluation: 100%|██████████| 2/2 [00:10<00:00,  5.19s/it, loss=8.55]


### Saving the Artifacts (models, labels, tokenizer, etc.)

In [18]:
# Save the training config again now that the run has finished, locally and to Google Drive
save_training_config(training_config, save_path)
save_training_config_to_drive(training_config, save_path)

# Save label encoders, training history and test results locally and to Google Drive
save_artifacts(label_encoders, metrics, test_results, save_path)
save_artifacts_to_drive(label_encoders, metrics, test_results, save_path)

# Save full model and tokenizer locally and to Google Drive
save_full_model(model, gpt2_tokenizer, save_path)
save_full_model_to_drive(model, gpt2_tokenizer, save_path)