<a href="https://colab.research.google.com/github/reagan13/gpt2-distilbert-thesis-files/blob/main/notebook/Hybrid_Model_Parallel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multitask Learning with Hybrid (GPT2-Distilbert)

## Import Libraries

In [1]:
import json
import os
import time
from typing import List, Dict, Optional
from collections import Counter, defaultdict
import sys

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2Model, GPT2Config, GPT2TokenizerFast,
    DistilBertModel, DistilBertTokenizerFast,  # Added for DistilBERT
    AdamW, get_linear_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from datetime import datetime

# Verify device and GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Selected device: {device}")

if device.type == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Initial GPU Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    torch.cuda.empty_cache()
else:
    print("No GPU detected. Running on CPU.")

# Function to check device of tensors or models (retained)
def check_device(item, name="Item"):
    if isinstance(item, torch.nn.Module):
        param = next(item.parameters(), None)
        if param is not None:
            print(f"{name} is on: {param.device}")
        else:
            print(f"{name} has no parameters to check")
    elif isinstance(item, torch.Tensor):
        print(f"{name} is on: {item.device}")
    else:
        print(f"{name} is not a tensor or model: {type(item)}")

def setup_logging(save_path: str, filename: str = "training_log.txt"):
    try:
        # Ensure local directory exists
        os.makedirs(save_path, exist_ok=True)
        local_log_path = os.path.join(save_path, filename)
        print(f"Local log path created: {local_log_path}")

        # Ensure Google Drive directory exists
        drive_base_path = '/content/drive/'
        if not os.path.exists(drive_base_path):
            raise FileNotFoundError("Google Drive not mounted. Please mount it first.")
        drive_path = os.path.join(drive_base_path, save_path)
        os.makedirs(drive_path, exist_ok=True)
        drive_log_path = os.path.join(drive_path, filename)
        print(f"Google Drive log path created: {drive_log_path}")

        class Logger:
            def __init__(self, local_path, drive_path):
                # Open files in append mode with no buffering
                self.local_file = open(local_path, "a", encoding="utf-8", buffering=1)  # Line-buffered
                self.drive_file = open(drive_path, "a", encoding="utf-8", buffering=1)  # Line-buffered
                self.original_stdout = sys.stdout

            def write(self, message):
                # Write to both files and original stdout
                self.local_file.write(message)
                self.drive_file.write(message)
                self.original_stdout.write(message)
                # Force flush to ensure immediate write
                self.local_file.flush()
                self.drive_file.flush()
                self.original_stdout.flush()

            def flush(self):
                self.local_file.flush()
                self.drive_file.flush()
                self.original_stdout.flush()

            def close(self):
                self.local_file.close()
                self.drive_file.close()
                sys.stdout = self.original_stdout

        # Instantiate logger and redirect stdout
        logger = Logger(local_log_path, drive_log_path)
        sys.stdout = logger

        # Initial log messages
        print(f"Logging started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Log file: {local_log_path}")
        print(f"Drive log file: {drive_log_path}")

        return logger  # Return logger for manual control if needed

    except Exception as e:
        print(f"Error setting up logging: {e}", file=sys.__stdout__)  # Log to original stdout on error
        sys.stdout = sys.__stdout__  # Reset stdout on failure
        return None


print(f"Current date: {datetime.now().strftime('%B %d, %Y')}")

Selected device: cuda
GPU Name: Tesla T4
Initial GPU Memory Allocated: 0.00 MB
Current date: March 07, 2025


## Dasaet Loading Functions

In [2]:
def load_dataset(json_file: str) -> List[Dict]:
    """Load dataset from a JSON file."""
    with open(json_file, 'r', encoding='utf-8') as f:
        return json.load(f)

def detect_labels(data: List[Dict]) -> Dict[str, Dict]:
    """Detect unique labels and create encoders for categories, intents, and NER tags."""
    start_time = time.time()
    if not isinstance(data, list):
        raise TypeError("Input 'data' must be a list of dictionaries")
    if not data:
        return {"category_encoder": {}, "intent_encoder": {}, "ner_label_encoder": {"O": 0}}

    unique_categories = set()
    unique_intents = set()
    unique_ner_labels = set(["O"])
    missing_fields = defaultdict(int)
    category_counts = Counter()
    intent_counts = Counter()
    ner_counts = Counter()

    for i, sample in enumerate(data):
        try:
            category = sample["category"]
            intent = sample["intent"]
            unique_categories.add(category)
            unique_intents.add(intent)
            category_counts[category] += 1
            intent_counts[intent] += 1

            ner_labels = sample["ner_labels_only"]
            if not isinstance(ner_labels, list):
                raise ValueError(f"'ner_labels_only' must be a list at sample {i}")
            for label in ner_labels:
                if not isinstance(label, dict) or "label" not in label or "text" not in label:
                    raise ValueError(f"NER label must have 'label' and 'text' fields at sample {i}")
                label_type = label["label"]
                unique_ner_labels.add(f"B-{label_type}")
                unique_ner_labels.add(f"I-{label_type}")
                ner_counts[f"B-{label_type}"] += 1
                ner_counts[f"I-{label_type}"] += 1
        except KeyError as e:
            missing_fields[str(e).strip("'")] += 1
            continue

    if missing_fields:
        print("Warning: Missing fields detected:")
        for field, count in missing_fields.items():
            print(f"  - '{field}' missing in {count} samples")

    category_encoder = {cat: idx for idx, cat in enumerate(sorted(unique_categories))}
    intent_encoder = {intent: idx for idx, intent in enumerate(sorted(unique_intents))}
    ner_label_encoder = {ner: idx for idx, ner in enumerate(sorted(unique_ner_labels))}

    print(f"Dataset summary:\n  - {len(data)} samples\n  - {len(category_encoder)} categories\n  - {len(intent_encoder)} intents\n  - {len(ner_label_encoder)} NER tags")
    print("Category distribution:", dict(category_counts))
    print("Intent distribution:", dict(intent_counts))
    print("NER tag distribution (non-O):", dict(ner_counts))
    print(f"Processing time: {time.time() - start_time:.3f} seconds")

    return {"category_encoder": category_encoder, "intent_encoder": intent_encoder, "ner_label_encoder": ner_label_encoder}

## Tokenization and NER Alignment

In [3]:
def tokenize_text_hybrid(text: str, gpt2_tokenizer, distilbert_tokenizer, max_length: int) -> Dict[str, torch.Tensor]:
    """Tokenize text using both GPT-2 and DistilBERT tokenizers."""
    gpt2_inputs = gpt2_tokenizer(
        text, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    distilbert_inputs = distilbert_tokenizer(
        text, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    return {
        "gpt2_input_ids": gpt2_inputs["input_ids"].squeeze(0),
        "gpt2_attention_mask": gpt2_inputs["attention_mask"].squeeze(0),
        "distilbert_input_ids": distilbert_inputs["input_ids"].squeeze(0),
        "distilbert_attention_mask": distilbert_inputs["attention_mask"].squeeze(0)
    }

def align_ner_labels(text: str, ner_labels: List[Dict], tokenizer, ner_label_encoder: Dict, max_length: int) -> torch.Tensor:
    """Align NER labels with tokenized input (using GPT-2 tokenizer for consistency)."""
    sorted_labels = sorted(ner_labels, key=lambda x: len(x["text"]), reverse=True) if ner_labels else []
    encoding = tokenizer(
        text, max_length=max_length, padding="max_length", truncation=True, return_offsets_mapping=True, return_tensors="pt"
    )
    token_to_char_map = encoding["offset_mapping"][0].tolist()
    ner_aligned = [ner_label_encoder["O"]] * max_length

    for label in sorted_labels:
        if "text" not in label or "label" not in label:
            print(f"Warning: Skipping invalid NER entry {label} (missing 'text' or 'label')")
            continue
        try:
            label_text, label_type = label["text"], label["label"]
            start_pos = 0
            while True:
                label_start = text.find(label_text, start_pos)
                if label_start == -1:
                    break
                label_end = label_start + len(label_text)
                start_pos = label_end
                first_token = True
                for i, (start, end) in enumerate(token_to_char_map):
                    if start == 0 and end == 0:
                        continue
                    if max(start, label_start) < min(end, label_end):
                        prefix = "B-" if first_token else "I-"
                        first_token = False
                        ner_aligned[i] = ner_label_encoder.get(f"{prefix}{label_type}", ner_label_encoder["O"])
        except KeyError as e:
            print(f"Warning: Label '{e}' not found in encoder. Skipping.")

    return torch.tensor(ner_aligned, dtype=torch.long)

## Dataset and Dataloader

In [4]:
class MultiTaskDataset(Dataset):
    def __init__(self, data: List[Dict], gpt2_tokenizer, distilbert_tokenizer, label_encoders, max_length: int):
        self.data = data
        self.gpt2_tokenizer = gpt2_tokenizer
        self.distilbert_tokenizer = distilbert_tokenizer
        self.label_encoders = label_encoders
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        text = sample["instruction"]
        hybrid_inputs = tokenize_text_hybrid(text, self.gpt2_tokenizer, self.distilbert_tokenizer, self.max_length)
        ner_labels = align_ner_labels(text, sample["ner_labels_only"], self.gpt2_tokenizer, self.label_encoders["ner_label_encoder"], self.max_length)

        return {
            "gpt2_input_ids": hybrid_inputs["gpt2_input_ids"],
            "gpt2_attention_mask": hybrid_inputs["gpt2_attention_mask"],
            "distilbert_input_ids": hybrid_inputs["distilbert_input_ids"],
            "distilbert_attention_mask": hybrid_inputs["distilbert_attention_mask"],
            "category_labels": torch.tensor(self.label_encoders["category_encoder"][sample["category"]], dtype=torch.long),
            "intent_labels": torch.tensor(self.label_encoders["intent_encoder"][sample["intent"]], dtype=torch.long),
            "ner_labels": ner_labels
        }

def get_dataloaders(train_data, val_data, test_data, gpt2_tokenizer, distilbert_tokenizer, label_encoders, batch_size, num_workers, max_length):
    """Create DataLoaders for train, validation, and test sets."""
    pin_memory = device.type == "cuda"
    train_dataset = MultiTaskDataset(train_data, gpt2_tokenizer, distilbert_tokenizer, label_encoders, max_length)
    val_dataset = MultiTaskDataset(val_data, gpt2_tokenizer, distilbert_tokenizer, label_encoders, max_length)
    test_dataset = MultiTaskDataset(test_data, gpt2_tokenizer, distilbert_tokenizer, label_encoders, max_length)

    return (
        DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=pin_memory),
        DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory),
        DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)
    )

## Model Architecture

In [5]:
class FusionLayer(nn.Module):
    def __init__(self, gpt2_dim: int, bert_dim: int, output_dim: int, dropout_rate: float):
        super().__init__()
        self.gpt2_proj = nn.Linear(gpt2_dim, output_dim)
        self.bert_proj = nn.Linear(bert_dim, output_dim)
        self.fusion = nn.Sequential(
            nn.Linear(output_dim * 2, output_dim),
            nn.Tanh(),
            nn.Dropout(dropout_rate)
        )
        self.layer_norm = nn.LayerNorm(output_dim)

    def forward(self, gpt2_features: torch.Tensor, bert_features: torch.Tensor) -> torch.Tensor:
        gpt2_proj = self.gpt2_proj(gpt2_features)
        bert_proj = self.bert_proj(bert_features)
        concat_features = torch.cat([gpt2_proj, bert_proj], dim=-1)
        fused = self.fusion(concat_features)
        return self.layer_norm(fused)

class HybridGPT2DistilBERTMultiTask(nn.Module):
    def __init__(self, num_intents: int, num_categories: int, num_ner_labels: int, dropout_rate: float):
        super().__init__()
        self.gpt2_config = GPT2Config.from_pretrained('gpt2')
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        for param in self.gpt2.parameters():
            param.requires_grad = False
        for param in self.distilbert.parameters():
            param.requires_grad = False

        gpt2_dim = self.gpt2_config.n_embd
        bert_dim = self.distilbert.config.hidden_size
        hidden_size = gpt2_dim  # Keeping output dim same as GPT-2 for consistency

        self.fusion_layer = FusionLayer(gpt2_dim, bert_dim, hidden_size, dropout_rate)

        self.intent_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_intents)
        )
        self.category_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_categories)
        )
        self.ner_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_ner_labels)
        )

        self.intent_loss_fn = nn.CrossEntropyLoss()
        self.category_loss_fn = nn.CrossEntropyLoss()
        self.ner_loss_fn = nn.CrossEntropyLoss()

    def forward(self, gpt2_input_ids: torch.Tensor, gpt2_attention_mask: torch.Tensor,
                distilbert_input_ids: torch.Tensor, distilbert_attention_mask: torch.Tensor,
                intent_labels: Optional[torch.Tensor] = None,
                category_labels: Optional[torch.Tensor] = None,
                ner_labels: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:

        gpt2_outputs = self.gpt2(input_ids=gpt2_input_ids, attention_mask=gpt2_attention_mask)
        distilbert_outputs = self.distilbert(input_ids=distilbert_input_ids, attention_mask=distilbert_attention_mask)

        gpt2_features = gpt2_outputs.last_hidden_state
        bert_features = distilbert_outputs.last_hidden_state

        fused_features = self.fusion_layer(gpt2_features, bert_features)

        batch_size = fused_features.shape[0]
        sequence_lengths = gpt2_attention_mask.sum(dim=1) - 1
        last_token_indexes = sequence_lengths.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, fused_features.shape[-1])
        sequence_repr = torch.gather(fused_features, 1, last_token_indexes).squeeze(1)

        intent_logits = self.intent_head(sequence_repr)
        category_logits = self.category_head(sequence_repr)
        ner_logits = self.ner_head(fused_features)

        output_dict = {
            'intent_logits': intent_logits,
            'category_logits': category_logits,
            'ner_logits': ner_logits
        }

        if all(label is not None for label in [intent_labels, category_labels, ner_labels]):
            intent_loss = self.intent_loss_fn(intent_logits, intent_labels)
            category_loss = self.category_loss_fn(category_logits, category_labels)
            active_loss = gpt2_attention_mask.view(-1) == 1
            active_logits = ner_logits.view(-1, ner_logits.size(-1))[active_loss]
            active_labels = ner_labels.view(-1)[active_loss]
            ner_loss = self.ner_loss_fn(active_logits, active_labels)

            output_dict.update({
                'loss': intent_loss + category_loss + ner_loss,
                'intent_loss': intent_loss,
                'category_loss': category_loss,
                'ner_loss': ner_loss
            })

        return output_dict

## Training Loop

In [6]:
from tqdm import tqdm

def train_model(model, train_loader, val_loader, num_epochs, learning_rate):
    """Train the multi-task model."""
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
    history = {
        "train_loss": [], "val_loss": [],
        "train_intent_acc": [], "val_intent_acc": [],
        "train_category_f1": [], "val_category_f1": [],
        "train_ner_f1": [], "val_ner_f1": []
    }

    model.to(device)
    check_device(model, "Model")

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        all_train_intent_preds, all_train_intent_labels = [], []
        all_train_category_preds, all_train_category_labels = [], []
        all_train_ner_preds, all_train_ner_labels = [], []

        with tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Training]", leave=False) as train_loop:
            for i, batch in enumerate(train_loop):
                optimizer.zero_grad()
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)
                loss = outputs["loss"]
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                train_loop.set_postfix(loss=loss.item())

                intent_preds = torch.argmax(outputs["intent_logits"], dim=-1).cpu().numpy()
                category_preds = torch.argmax(outputs["category_logits"], dim=-1).cpu().numpy()
                ner_preds = torch.argmax(outputs["ner_logits"], dim=-1).cpu().numpy()

                all_train_intent_preds.extend(intent_preds)
                all_train_intent_labels.extend(batch["intent_labels"].cpu().numpy())
                all_train_category_preds.extend(category_preds)
                all_train_category_labels.extend(batch["category_labels"].cpu().numpy())
                all_train_ner_preds.extend(ner_preds.flatten())
                all_train_ner_labels.extend(batch["ner_labels"].cpu().numpy().flatten())

        train_intent_acc = accuracy_score(all_train_intent_labels, all_train_intent_preds)
        train_category_f1 = precision_recall_fscore_support(all_train_category_labels, all_train_category_preds, average="macro", zero_division=0)[2]
        train_ner_f1 = precision_recall_fscore_support(all_train_ner_labels, all_train_ner_preds, average="macro", zero_division=0)[2]

        history["train_loss"].append(total_loss / len(train_loader))
        history["train_intent_acc"].append(train_intent_acc)
        history["train_category_f1"].append(train_category_f1)
        history["train_ner_f1"].append(train_ner_f1)

        model.eval()
        val_loss = 0
        all_val_intent_preds, all_val_intent_labels = [], []
        all_val_category_preds, all_val_category_labels = [], []
        all_val_ner_preds, all_val_ner_labels = [], []

        val_loop = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Validation]", leave=False)
        with torch.no_grad():
            for batch in val_loop:
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)
                batch_val_loss = outputs["loss"].item()
                val_loss += batch_val_loss
                val_loop.set_postfix(loss=batch_val_loss)

                intent_preds = torch.argmax(outputs["intent_logits"], dim=-1).cpu().numpy()
                category_preds = torch.argmax(outputs["category_logits"], dim=-1).cpu().numpy()
                ner_preds = torch.argmax(outputs["ner_logits"], dim=-1).cpu().numpy()

                all_val_intent_preds.extend(intent_preds)
                all_val_intent_labels.extend(batch["intent_labels"].cpu().numpy())
                all_val_category_preds.extend(category_preds)
                all_val_category_labels.extend(batch["category_labels"].cpu().numpy())
                all_val_ner_preds.extend(ner_preds.flatten())
                all_val_ner_labels.extend(batch["ner_labels"].cpu().numpy().flatten())

        val_intent_acc = accuracy_score(all_val_intent_labels, all_val_intent_preds)
        val_category_f1 = precision_recall_fscore_support(all_val_category_labels, all_val_category_preds, average="macro", zero_division=0)[2]
        val_ner_f1 = precision_recall_fscore_support(all_val_ner_labels, all_val_ner_preds, average="macro", zero_division=0)[2]

        history["val_loss"].append(val_loss / len(val_loader))
        history["val_intent_acc"].append(val_intent_acc)
        history["val_category_f1"].append(val_category_f1)
        history["val_ner_f1"].append(val_ner_f1)

        print(f"Epoch {epoch+1}/{num_epochs}:")
        print(f"  Train Loss:      {history['train_loss'][-1]:.4f}")
        print(f"  Val Loss:       {history['val_loss'][-1]:.4f}\n")
        print(f"  Train Intent Acc: {train_intent_acc:.4f}")
        print(f"  Val Intent Acc:  {val_intent_acc:.4f}\n")
        print(f"  Train Category F1:{train_category_f1:.4f}")
        print(f"  Val Category F1: {val_category_f1:.4f}\n")
        print(f"  Train NER F1:     {train_ner_f1:.4f}")
        print(f"  Val NER F1:      {val_ner_f1:.4f}\n")

    return history

## Evaluation

In [7]:
def evaluate_model(model, test_loader):
    """Evaluate the model on the test set."""
    model.eval()
    all_intent_preds, all_intent_labels = [], []
    all_category_preds, all_category_labels = [], []
    all_ner_preds, all_ner_labels = [], []
    total_loss = 0

    device = next(model.parameters()).device

    test_loop = tqdm(test_loader, desc="Evaluation", leave=True)
    with torch.no_grad():
        for batch in test_loop:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            batch_loss = outputs["loss"].item()
            total_loss += batch_loss
            test_loop.set_postfix(loss=batch_loss)

            intent_preds = torch.argmax(outputs["intent_logits"], dim=-1).cpu().numpy()
            category_preds = torch.argmax(outputs["category_logits"], dim=-1).cpu().numpy()
            ner_preds = torch.argmax(outputs["ner_logits"], dim=-1).cpu().numpy()

            all_intent_preds.extend(intent_preds)
            all_intent_labels.extend(batch["intent_labels"].cpu().numpy())
            all_category_preds.extend(category_preds)
            all_category_labels.extend(batch["category_labels"].cpu().numpy())
            all_ner_preds.extend(ner_preds.flatten())
            all_ner_labels.extend(batch["ner_labels"].cpu().numpy().flatten())

    intent_acc = accuracy_score(all_intent_labels, all_intent_preds)
    category_f1 = precision_recall_fscore_support(all_category_labels, all_category_preds, average="macro", zero_division=0)[2]
    ner_f1 = precision_recall_fscore_support(all_ner_labels, all_ner_preds, average="macro", zero_division=0)[2]
    avg_loss = total_loss / len(test_loader)

    results = {
        "loss": avg_loss,
        "intent_accuracy": intent_acc,
        "category_f1": category_f1,
        "ner_f1": ner_f1
    }

    print(f"Test Results:")
    print(f"  Loss:            {avg_loss:.4f}")
    print(f"  Intent Acc:      {intent_acc:.4f}")
    print(f"  Category F1:     {category_f1:.4f}")
    print(f"  NER F1:          {ner_f1:.4f}")

    return results

## Save Artifacts

### Save Locally

In [8]:
def save_artifacts(label_encoders, metrics, test_results, save_path):
    os.makedirs(save_path, exist_ok=True)
    label_encoders_path = os.path.join(save_path, "label_encoders.json")
    with open(label_encoders_path, "w", encoding="utf-8") as f:
        json.dump(label_encoders, f, ensure_ascii=False, indent=4)
    print(f"Label encoders saved to {label_encoders_path}")

    training_metrics_path = os.path.join(save_path, "training_metrics.json")
    with open(training_metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, ensure_ascii=False, indent=4)
    print(f"Training metrics saved to {training_metrics_path}")

    test_results_path = os.path.join(save_path, "test_results.json")
    with open(test_results_path, "w", encoding="utf-8") as f:
        json.dump(test_results, f, ensure_ascii=False, indent=4)
    print(f"Test results saved to {test_results_path}")

def save_training_config(config, save_path, filename="training_config.json"):
    os.makedirs(save_path, exist_ok=True)
    config_path = os.path.join(save_path, filename)
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)
    print(f"Training configuration saved to {config_path}")

def save_full_model(model, gpt2_tokenizer, distilbert_tokenizer, save_path):
    model_path = os.path.join(save_path, "model")
    os.makedirs(model_path, exist_ok=True)
    model_file_path = os.path.join(model_path, "full_model.pt")
    torch.save(model, model_file_path)
    print(f"Full model saved to {model_file_path}")

    gpt2_tokenizer_path = os.path.join(save_path, "gpt2_tokenizer")
    gpt2_tokenizer.save_pretrained(gpt2_tokenizer_path)
    print(f"GPT-2 tokenizer saved to {gpt2_tokenizer_path}")

    distilbert_tokenizer_path = os.path.join(save_path, "distilbert_tokenizer")
    distilbert_tokenizer.save_pretrained(distilbert_tokenizer_path)
    print(f"DistilBERT tokenizer saved to {distilbert_tokenizer_path}")


### Save to GDrive

## Main Execution

In [9]:

from google.colab import drive

def mount_drive():
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")

def save_training_config_to_drive(config, drive_path, filename="training_config.json"):
    os.makedirs("/content/drive/" + drive_path, exist_ok=True)
    config_path = os.path.join("/content/drive/", drive_path, filename)
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)
    print(f"Training configuration saved to {config_path}")

def save_artifacts_to_drive(label_encoders, metrics, test_results, drive_path):
    os.makedirs("/content/drive/" + drive_path, exist_ok=True)
    label_encoders_path = os.path.join("/content/drive/", drive_path, "label_encoders.json")
    with open(label_encoders_path, "w", encoding="utf-8") as f:
        json.dump(label_encoders, f, ensure_ascii=False, indent=4)
    print(f"Label encoders saved to {label_encoders_path}")

    training_metrics_path = os.path.join("/content/drive/", drive_path, "training_metrics.json")
    with open(training_metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, ensure_ascii=False, indent=4)
    print(f"Training metrics saved to {training_metrics_path}")

    test_results_path = os.path.join("/content/drive/", drive_path, "test_results.json")
    with open(test_results_path, "w", encoding="utf-8") as f:
        json.dump(test_results, f, ensure_ascii=False, indent=4)
    print(f"Test results saved to {test_results_path}")

def save_full_model_to_drive(model, gpt2_tokenizer, distilbert_tokenizer, drive_path):
    model_path = os.path.join("/content/drive/", drive_path, "model")
    os.makedirs(model_path, exist_ok=True)
    model_file_path = os.path.join(model_path, "full_model.pt")
    torch.save(model, model_file_path)
    print(f"Full model saved to {model_file_path}")

    gpt2_tokenizer_path = os.path.join("/content/drive/", drive_path, "gpt2_tokenizer")
    gpt2_tokenizer.save_pretrained(gpt2_tokenizer_path)
    print(f"GPT-2 tokenizer saved to {gpt2_tokenizer_path}")

    distilbert_tokenizer_path = os.path.join("/content/drive/", drive_path, "distilbert_tokenizer")
    distilbert_tokenizer.save_pretrained(distilbert_tokenizer_path)
    print(f"DistilBERT tokenizer saved to {distilbert_tokenizer_path}")

### Paths and Hyperparameters

In [10]:
# Paths and Hyperparameters
train_file = "train.json"
val_file = "val.json"
test_file = "test.json"
batch_size = 32
num_epochs = 5
learning_rate = 3e-5
max_length = 128
num_workers = 2
save_path = "MyDrive/thesis/hybrid_fusion/test_1"
dropout_rate = 0.2

training_config = {
    "train_file": train_file,
    "val_file": val_file,
    "test_file": test_file,
    "batch_size": batch_size,
    "num_epochs": num_epochs,
    "learning_rate": learning_rate,
    "max_length": max_length,
    "num_workers": num_workers,
    "model_name": "HybridGPT2DistilBERTMultiTask",
    "gpt2_base": "gpt2",
    "distilbert_base": "distilbert-base-uncased",
    "dropout_rate": dropout_rate,
    "device": str(device),
    "date": datetime.now().strftime('%B %d, %Y')
}


### Initialization

In [11]:

# Mount Google Drive
mount_drive()
# Setup logging and keep logger in scope
logger = setup_logging(save_path)
if logger is None:
    raise RuntimeError("Logging setup failed. Check Google Drive mount and permissions.")


# Load datasets
print("Loading datasets...\n")
train_data = load_dataset(train_file)
val_data = load_dataset(val_file)
test_data = load_dataset(test_file)

print("*" * 30)
print(f"""Dataset Summary:
Training samples: {len(train_data)}
Validation samples: {len(val_data)}
Test samples: {len(test_data)}""")

# Detect labels
label_encoders = detect_labels(train_data)

# Initialize tokenizers
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
if gpt2_tokenizer.pad_token is None:
    gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Create data loaders
train_loader, val_loader, test_loader = get_dataloaders(
    train_data, val_data, test_data, gpt2_tokenizer, distilbert_tokenizer, label_encoders, batch_size, num_workers, max_length
)

# Initialize model
model = HybridGPT2DistilBERTMultiTask(
    num_intents=len(label_encoders["intent_encoder"]),
    num_categories=len(label_encoders["category_encoder"]),
    num_ner_labels=len(label_encoders["ner_label_encoder"]),
    dropout_rate=dropout_rate
)
if gpt2_tokenizer.pad_token_id is not None:
    model.gpt2.resize_token_embeddings(len(gpt2_tokenizer))

model.to(device)
check_device(model, "Model before training")

# Save training config
save_training_config(training_config, save_path)

# Train model
print("*" * 30)
print("Starting training...")
start_time = time.time()
metrics = train_model(model, train_loader, val_loader, num_epochs, learning_rate)
print(f"Training completed in {(time.time() - start_time) / 60:.2f} minutes")
print("*" * 30)

# Evaluate model
print("Evaluating on test set...")
test_results = evaluate_model(model, test_loader)


Mounted at /content/drive
Google Drive mounted successfully.
Local log path created: MyDrive/thesis/hybrid_fusion/test_1/training_log.txt
Google Drive log path created: /content/drive/MyDrive/thesis/hybrid_fusion/test_1/training_log.txt
Logging started at 2025-03-07 07:08:30
Log file: MyDrive/thesis/hybrid_fusion/test_1/training_log.txt
Drive log file: /content/drive/MyDrive/thesis/hybrid_fusion/test_1/training_log.txt
Loading datasets...

******************************
Dataset Summary:
Training samples: 21497
Validation samples: 2687
Test samples: 2688
Dataset summary:
  - 21497 samples
  - 11 categories
  - 27 intents
  - 19 NER tags
Category distribution: {'cancel': 751, 'account': 4826, 'refund': 2391, 'contact': 1590, 'order': 3225, 'delivery': 1605, 'shipping': 1526, 'invoice': 1592, 'feedback': 1570, 'subscription': 833, 'payment': 1588}
Intent distribution: {'check_cancellation_fee': 751, 'edit_account': 814, 'check_refund_policy': 797, 'contact_human_agent': 798, 'switch_accou

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model before training is on: cuda:0
Training configuration saved to MyDrive/thesis/hybrid_fusion/test_1/training_config.json
******************************
Starting training...
Model is on: cuda:0




Epoch 1/5:
  Train Loss:      2.8405
  Val Loss:       0.8194

  Train Intent Acc: 0.4968
  Val Intent Acc:  0.7868

  Train Category F1:0.7118
  Val Category F1: 0.9587

  Train NER F1:     0.0651
  Val NER F1:      0.3608





KeyboardInterrupt: 

### Saving the Artifacts(models, labels, tokenizer, etc)

In [None]:

# Save artifacts
save_training_config(training_config, save_path)
save_training_config_to_drive(training_config, save_path)
save_artifacts(label_encoders, metrics, test_results, save_path)
save_artifacts_to_drive(label_encoders, metrics, test_results, save_path)
save_full_model(model, gpt2_tokenizer, distilbert_tokenizer, save_path)
save_full_model_to_drive(model, gpt2_tokenizer, distilbert_tokenizer, save_path)

# Inference

In [None]:
import torch.nn.functional as F

def run_inference(model_path: str, gpt2_tokenizer_path: str, distilbert_tokenizer_path: str, label_encoders_path: str, input_text: str, max_length: int = 128):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the full model
    model = torch.load(model_path, map_location=device, weights_only=False)
    model.eval()

    # Load tokenizers
    gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(gpt2_tokenizer_path)
    distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained(distilbert_tokenizer_path)

    # Load label encoders
    with open(label_encoders_path, "r", encoding="utf-8") as f:
        label_encoders = json.load(f)
    intent_decoder = {v: k for k, v in label_encoders["intent_encoder"].items()}
    category_decoder = {v: k for k, v in label_encoders["category_encoder"].items()}
    ner_decoder = {v: k for k, v in label_encoders["ner_label_encoder"].items()}

    # Preprocess input
    gpt2_inputs = gpt2_tokenizer(
        input_text, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True, return_offsets_mapping=True
    )
    distilbert_inputs = distilbert_tokenizer(
        input_text, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True
    )
    inputs = {
        "gpt2_input_ids": gpt2_inputs["input_ids"].to(device),
        "gpt2_attention_mask": gpt2_inputs["attention_mask"].to(device),
        "distilbert_input_ids": distilbert_inputs["input_ids"].to(device),
        "distilbert_attention_mask": distilbert_inputs["attention_mask"].to(device)
    }
    offset_mapping = gpt2_inputs["offset_mapping"][0].cpu().tolist()

    # Run inference
    with torch.no_grad():
        outputs = model(**inputs)

        intent_logits = outputs["intent_logits"]
        intent_probs = F.softmax(intent_logits, dim=-1)[0]
        intent_pred_idx = torch.argmax(intent_probs).item()
        intent_confidence = intent_probs[intent_pred_idx].item()
        intent_label = intent_decoder[intent_pred_idx]

        category_logits = outputs["category_logits"]
        category_probs = F.softmax(category_logits, dim=-1)[0]
        category_pred_idx = torch.argmax(category_probs).item()
        category_confidence = category_probs[category_pred_idx].item()
        category_label = category_decoder[category_pred_idx]

        ner_logits = outputs["ner_logits"][0]
        ner_probs = F.softmax(ner_logits, dim=-1)
        ner_pred_idxs = torch.argmax(ner_probs, dim=-1).tolist()
        ner_confidences = torch.max(ner_probs, dim=-1).values.tolist()
        ner_labels = [ner_decoder[idx] for idx in ner_pred_idxs]

        seq_len = inputs["gpt2_attention_mask"][0].sum().item()
        ner_labels = ner_labels[:seq_len]
        ner_confidences = ner_confidences[:seq_len]
        offset_mapping = offset_mapping[:seq_len]

    # Detect entity spans
    entities = []
    current_entity = None
    entity_start = None
    entity_confidences = []

    for i, (label, conf, (start, end)) in enumerate(zip(ner_labels, ner_confidences, offset_mapping)):
        if label.startswith("B-"):
            if current_entity is not None:
                entity_text = input_text[entity_start:start]
                entities.append({
                    "entity": entity_text.strip(),
                    "label": current_entity,
                    "confidence": sum(entity_confidences) / len(entity_confidences)
                })
            current_entity = label[2:]
            entity_start = start
            entity_confidences = [conf]

        elif label.startswith("I-") and current_entity == label[2:]:
            entity_confidences.append(conf)

        elif label == "O" and current_entity is not None:
            entity_text = input_text[entity_start:start]
            entities.append({
                "entity": entity_text.strip(),
                "label": current_entity,
                "confidence": sum(entity_confidences) / len(entity_confidences)
            })
            current_entity = None
            entity_confidences = []

    if current_entity is not None:
        entity_text = input_text[entity_start:offset_mapping[-1][1]]
        entities.append({
            "entity": entity_text.strip(),
            "label": current_entity,
            "confidence": sum(entity_confidences) / len(entity_confidences)
        })

    results = {
        "intent": {"label": intent_label, "confidence": intent_confidence},
        "category": {"label": category_label, "confidence": category_confidence},
        "ner": entities
    }

    return results

# Example usage
input_text = "I want a refund amount of $2323 now"
results = run_inference(
    model_path="MyDrive/thesis/hybrid/test_1/model/full_model.pt",
    gpt2_tokenizer_path="MyDrive/thesis/hybrid/test_1/gpt2_tokenizer",
    distilbert_tokenizer_path="MyDrive/thesis/hybrid/test_1/distilbert_tokenizer",
    label_encoders_path="MyDrive/thesis/hybrid/test_1/label_encoders.json",
    input_text=input_text
)

print("Inference Results:")
print(f"Intent: {results['intent']['label']} (Confidence: {results['intent']['confidence']:.4f})")
print(f"Category: {results['category']['label']} (Confidence: {results['category']['confidence']:.4f})")
print("NER:")
for entity in results['ner']:
    print(f"  Entity: {entity['entity']} | Label: {entity['label']} | Confidence: {entity['confidence']:.4f}")