<a href="https://colab.research.google.com/github/reagan13/gpt2-distilbert-thesis-files/blob/main/notebook/Hybrid_Concat_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hybrid V2

## Import

In [None]:
# Cell 1: Imports and Setup
import json
from typing import List, Dict, Optional
import time
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Config, GPT2Model, DistilBertModel, GPT2TokenizerFast, DistilBertTokenizerFast
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
import os
from google.colab import drive


# Mount Google Drive at /content/drive
drive.mount('/content/drive')
# Folder on the mounted drive that receives every artifact this notebook writes
# (the text log plus the JSON files and model weights saved further down).
output_dir = "/content/drive/MyDrive/thesis/Hybrid_Concat_Freeze"
os.makedirs(output_dir, exist_ok=True)  # exist_ok=True: re-running the cell is not an error
# Path of the text log that log_to_file() appends to
log_file = os.path.join(output_dir, "training_log.txt")

def log_to_file(message: str):
    """Append ``message`` plus a trailing newline to the file at ``log_file``.

    The file is opened in append mode on every call, so earlier log lines are
    kept and the file is created on first use.
    """
    with open(log_file, mode='a') as sink:
        sink.write(f"{message}\n")

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
# `device` is read as a global by the DataLoader and training helpers below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
log_to_file(f"Using device: {device}")

def check_device(tensor_or_model, name: str):
    """Write the device of a tensor or module to the log file.

    Args:
        tensor_or_model: Either an ``nn.Module`` (the device of its first
            parameter is reported) or any object exposing a ``.device``
            attribute, such as a tensor.
        name: Human-readable label used in the log line.
    """
    if not isinstance(tensor_or_model, torch.nn.Module):
        location = tensor_or_model.device
    else:
        # A module has no single device; report where its first parameter lives.
        location = next(tensor_or_model.parameters()).device
    log_to_file(f"{name} is on: {location}")


Mounted at /content/drive


## Data Loading and Label Detection

In [None]:
def load_dataset(json_file: str) -> List[Dict]:
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        json_file: Path to the JSON file to load.

    Returns:
        The decoded JSON document. Callers in this notebook treat it as a list
        of sample dictionaries.
    """
    log_to_file(f"Loading dataset from {json_file}...")
    with open(json_file, mode='r', encoding='utf-8') as source:
        records = json.load(source)
    log_to_file(f"Loaded {len(records)} samples from {json_file}")
    return records

def detect_labels(data: List[Dict]) -> Dict[str, Dict]:
    """Build label-to-index encoders for categories, intents and NER tags.

    Every sample must provide ``"category"``, ``"intent"`` and
    ``"ner_labels_only"`` (an iterable of dicts with a ``"label"`` key). Each
    entity type contributes a ``B-<type>`` and an ``I-<type>`` tag; ``"O"`` is
    always present. Indices follow the sorted order of the label names.

    Args:
        data: The samples to scan.

    Returns:
        A dict with the keys ``"category_encoder"``, ``"intent_encoder"`` and
        ``"ner_label_encoder"``, each mapping label name -> integer index.
        For empty input the first two are empty and the NER encoder is
        ``{"O": 0}``.
    """
    log_to_file("Detecting labels...")
    if not data:
        log_to_file("Warning: Empty dataset")
        return {"category_encoder": {}, "intent_encoder": {}, "ner_label_encoder": {"O": 0}}

    def _index_sorted(names) -> Dict[str, int]:
        # Sorted order makes the numbering independent of sample order.
        return {name: position for position, name in enumerate(sorted(names))}

    categories, intents, ner_tags = set(), set(), {"O"}
    for sample in data:
        categories.add(sample["category"])
        intents.add(sample["intent"])
        for entity in sample["ner_labels_only"]:
            entity_type = entity["label"]
            ner_tags.update((f"B-{entity_type}", f"I-{entity_type}"))

    category_encoder = _index_sorted(categories)
    intent_encoder = _index_sorted(intents)
    ner_label_encoder = _index_sorted(ner_tags)

    log_to_file(f"Label detection summary: Categories={len(category_encoder)}, Intents={len(intent_encoder)}, NER tags={len(ner_label_encoder)}")
    return {"category_encoder": category_encoder, "intent_encoder": intent_encoder, "ner_label_encoder": ner_label_encoder}


## Tokenization and NER Label Alignment

In [None]:

# Cell 3: Tokenization and NER Alignment
def tokenize_text(text: str, gpt2_tokenizer, distilbert_tokenizer, max_length: int) -> Dict[str, torch.Tensor]:
    """Encode ``text`` with both tokenizers, padded/truncated to ``max_length``.

    Args:
        text: The raw input string.
        gpt2_tokenizer: Callable tokenizer used for the GPT-2 branch.
        distilbert_tokenizer: Callable tokenizer used for the DistilBERT branch.
        max_length: Target length passed to both tokenizers.

    Returns:
        A dict with ``gpt2_input_ids``, ``gpt2_attention_mask``,
        ``distilbert_input_ids`` and ``distilbert_attention_mask``. Each value
        is the tokenizer output with its leading dimension squeezed away.
    """
    encode_kwargs = dict(max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    encodings = {
        "gpt2": gpt2_tokenizer(text, **encode_kwargs),
        "distilbert": distilbert_tokenizer(text, **encode_kwargs),
    }
    # squeeze(0) drops the dimension that return_tensors="pt" adds in front.
    return {
        f"{branch}_{field}": encoded[field].squeeze(0)
        for branch, encoded in encodings.items()
        for field in ("input_ids", "attention_mask")
    }

def align_ner_labels(text: str, ner_labels: List[Dict], tokenizer, ner_label_encoder: Dict, max_length: int, sample_idx: int = -1) -> torch.Tensor:
    """Project character-level entity annotations onto token-level BIO tag ids.

    Each entity is located in ``text`` by case-insensitive substring search
    (every occurrence is tagged). Tokens whose character span overlaps an
    occurrence receive ``B-<label>`` for the first overlapping token and
    ``I-<label>`` for the rest; all other positions keep the ``"O"`` id.

    Args:
        text: The raw input string.
        ner_labels: Entity dicts with ``"text"`` and ``"label"`` keys. Entries
            missing either key, or whose text is empty / not a string, are
            skipped.
        tokenizer: Callable tokenizer that supports
            ``return_offsets_mapping=True``.
        ner_label_encoder: Mapping from tag name to id; must contain ``"O"``.
            Tags absent from the encoder fall back to the ``"O"`` id.
        max_length: Length of the returned tag sequence.
        sample_idx: When ``>= 0`` the call counts towards the module-level
            ``print_counter``; only the first two such calls write debug
            output to the log.

    Returns:
        A ``torch.long`` tensor of length ``max_length`` holding tag ids.
    """
    global print_counter
    if 'print_counter' not in globals():
        print_counter = 0

    should_log = print_counter < 2 and sample_idx >= 0
    if should_log:
        log_to_file(f"Aligning NER labels for text: {text[:50]}...")
        log_to_file(f"NER Labels: {ner_labels}")
    if sample_idx >= 0:
        print_counter += 1

    # Validate BEFORE sorting: the sort key reads label["text"], so a malformed
    # entry would otherwise raise KeyError instead of being skipped.
    usable_labels = []
    for label in ner_labels or []:
        if "text" not in label or "label" not in label:
            if should_log:
                log_to_file(f"Warning: Skipping invalid NER entry {label}")
            continue
        if not isinstance(label["text"], str) or not label["text"]:
            # An empty needle makes str.find() return start_pos forever,
            # which would never terminate the search loop below.
            if should_log:
                log_to_file(f"Warning: Skipping NER entry with empty text {label}")
            continue
        usable_labels.append(label)

    # NOTE(review): longer entities are processed first, so a shorter entity
    # nested inside a longer one overwrites the longer one's tags. Confirm
    # whether that priority is intended.
    sorted_labels = sorted(usable_labels, key=lambda x: len(x["text"]), reverse=True)

    encoding = tokenizer(
        text, max_length=max_length, padding="max_length", truncation=True, return_offsets_mapping=True, return_tensors="pt"
    )
    token_to_char_map = encoding["offset_mapping"][0].tolist()
    outside_id = ner_label_encoder["O"]
    ner_aligned = [outside_id] * max_length

    # NOTE(review): matching happens on the lower-cased text while the offsets
    # refer to the original text; this assumes lower() preserves string length.
    text_lower = text.lower()
    found_count = 0
    not_found_count = 0

    for label in sorted_labels:
        label_text, label_type = label["text"], label["label"]
        label_text_lower = label_text.lower()
        start_pos = 0
        found_at_least_once = False
        while True:
            label_start = text_lower.find(label_text_lower, start_pos)
            if label_start == -1:
                if not found_at_least_once:
                    not_found_count += 1
                break
            label_end = label_start + len(label_text_lower)
            found_at_least_once = True
            found_count += 1
            start_pos = label_end  # resume the search after this occurrence
            first_token = True
            tokens_tagged = False
            for i, (start, end) in enumerate(token_to_char_map):
                if start == 0 and end == 0:
                    # (0, 0) offsets mark padding / special tokens
                    continue
                if start < label_end and end > label_start and end > start:
                    prefix = "B-" if first_token else "I-"
                    first_token = False
                    ner_aligned[i] = ner_label_encoder.get(f"{prefix}{label_type}", outside_id)
                    tokens_tagged = True
            if not tokens_tagged and should_log:
                log_to_file(f"Warning: No tokens aligned for '{label_text}' ({label_type}) at {label_start}-{label_end}")

    if should_log:
        tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())
        seq_len = encoding["attention_mask"][0].sum().item()
        # Build the id -> tag lookup once; setdefault keeps the first tag for
        # an id, matching a first-match linear search.
        id_to_tag = {}
        for tag, tag_id in ner_label_encoder.items():
            id_to_tag.setdefault(tag_id, tag)
        log_to_file("Debug Aligned Labels:")
        log_to_file("Token | Aligned Label")
        for token, label_idx in zip(tokens[:seq_len], ner_aligned[:seq_len]):
            log_to_file(f"{token:<15} | {id_to_tag[label_idx]}")
        log_to_file(f"Alignment Summary: {found_count} entities found, {not_found_count} entities not found")

    return torch.tensor(ner_aligned, dtype=torch.long)

## Dataset and DataLoader

In [None]:
# Cell 4: Dataset and DataLoader
class MultiTaskDataset(Dataset):
    """Map-style dataset yielding dual-tokenizer inputs plus three label sets.

    Each item is built lazily in ``__getitem__`` from a raw sample dict that
    provides ``"instruction"``, ``"category"``, ``"intent"`` and
    ``"ner_labels_only"``.
    """

    def __init__(self, data: List[Dict], gpt2_tokenizer, distilbert_tokenizer, label_encoders, max_length: int):
        """Store the raw samples, tokenizers, label encoders and target length."""
        self.data = data
        self.gpt2_tokenizer = gpt2_tokenizer
        self.distilbert_tokenizer = distilbert_tokenizer
        self.label_encoders = label_encoders
        self.max_length = max_length
        log_to_file(f"Initialized dataset with {len(data)} samples")

    def __len__(self):
        """Number of raw samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and encode its category, intent and NER labels."""
        record = self.data[idx]
        utterance = record["instruction"]
        encoded = tokenize_text(utterance, self.gpt2_tokenizer, self.distilbert_tokenizer, self.max_length)
        # NER tags are aligned against the GPT-2 tokenization only.
        ner_tags = align_ner_labels(
            utterance,
            record["ner_labels_only"],
            self.gpt2_tokenizer,
            self.label_encoders["ner_label_encoder"],
            self.max_length,
            sample_idx=idx,
        )

        input_keys = ("gpt2_input_ids", "gpt2_attention_mask", "distilbert_input_ids", "distilbert_attention_mask")
        item = {key: encoded[key] for key in input_keys}

        category_id = self.label_encoders["category_encoder"][record["category"]]
        item["category_labels"] = torch.tensor(category_id, dtype=torch.long)
        intent_id = self.label_encoders["intent_encoder"][record["intent"]]
        item["intent_labels"] = torch.tensor(intent_id, dtype=torch.long)
        item["ner_labels"] = ner_tags
        return item

def get_dataloaders(train_data, val_data, test_data, gpt2_tokenizer, distilbert_tokenizer, label_encoders, batch_size, num_workers, max_length):
    """Wrap the three splits in ``MultiTaskDataset`` objects and ``DataLoader``s.

    Only the training loader shuffles. Memory pinning is enabled when the
    global ``device`` is a CUDA device.

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    use_pinned_memory = device.type == "cuda"
    log_to_file("Creating DataLoaders...")

    datasets = [
        MultiTaskDataset(split, gpt2_tokenizer, distilbert_tokenizer, label_encoders, max_length)
        for split in (train_data, val_data, test_data)
    ]
    shuffle_flags = (True, False, False)  # train, val, test
    train_loader, val_loader, test_loader = (
        DataLoader(dataset, batch_size=batch_size, shuffle=flag, num_workers=num_workers, pin_memory=use_pinned_memory)
        for dataset, flag in zip(datasets, shuffle_flags)
    )

    log_to_file(f"DataLoaders created: Train={len(train_loader)}, Val={len(val_loader)}, Test={len(test_loader)}")
    return train_loader, val_loader, test_loader



## Model Architecture

In [None]:

# Cell 5: Model Definition
class FusionLayer(nn.Module):
    """Fuse GPT-2 and DistilBERT features by projection and concatenation.

    Both inputs are linearly projected to ``output_dim``, concatenated on the
    last axis, mixed by ``Linear -> Tanh -> Dropout`` and layer-normalised.
    """

    def __init__(self, gpt2_dim: int, bert_dim: int, output_dim: int, dropout_rate: float):
        super().__init__()
        self.gpt2_proj = nn.Linear(gpt2_dim, output_dim)
        self.bert_proj = nn.Linear(bert_dim, output_dim)
        mixing_stack = [
            nn.Linear(2 * output_dim, output_dim),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
        ]
        self.fusion = nn.Sequential(*mixing_stack)
        self.layer_norm = nn.LayerNorm(output_dim)

    def forward(self, gpt2_features: torch.Tensor, bert_features: torch.Tensor,
                attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the fused features; ``attention_mask`` is accepted but not used."""
        projections = (self.gpt2_proj(gpt2_features), self.bert_proj(bert_features))
        mixed = self.fusion(torch.cat(projections, dim=-1))
        return self.layer_norm(mixed)

class HybridGPT2DistilBERTMultiTask(nn.Module):
    """Multi-task head (intent, category, NER) over fused GPT-2 + DistilBERT features.

    Both pretrained backbones are loaded and frozen. Their last hidden states
    are combined position by position by ``FusionLayer``. A masked mean over
    the sequence feeds the intent and category heads; the per-token fused
    features feed the NER head.

    NOTE(review): the two backbones use different tokenizers, so position ``i``
    of one sequence presumably does not cover the same text as position ``i``
    of the other. Confirm that position-wise fusion is intended.
    """

    def __init__(self, num_intents: int, num_categories: int, num_ner_labels: int,
                 dropout_rate: float, loss_weights: Dict[str, float],
                 ner_class_weights: Optional[torch.Tensor], category_class_weights: Optional[torch.Tensor],
                 intent_class_weights: Optional[torch.Tensor]):
        """Load the backbones, freeze them and build the fusion layer and heads.

        Args:
            num_intents: Number of intent classes.
            num_categories: Number of category classes.
            num_ner_labels: Number of NER tags.
            dropout_rate: Dropout probability for the fusion layer and heads.
            loss_weights: Multipliers for the ``'intent'``, ``'category'`` and
                ``'ner'`` losses in the combined loss.
            ner_class_weights: Per-class weights for the NER loss, or ``None``.
            category_class_weights: Per-class weights for the category loss, or ``None``.
            intent_class_weights: Per-class weights for the intent loss, or ``None``.
        """
        super().__init__()
        print("Initializing model...")
        self.gpt2_config = GPT2Config.from_pretrained('gpt2')
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # Option 1 (default): freeze every backbone parameter so that only the
        # fusion layer and the task heads are trained.
        for param in self.gpt2.parameters():
            param.requires_grad = False
        for param in self.distilbert.parameters():
            param.requires_grad = False
        log_to_file("All GPT-2 and DistilBERT layers remain frozen by default")

        # Option 2 (experiment): unfreeze the last 2 blocks. Uncomment the GPT-2
        # loop, the DistilBERT loop, or both, and adjust the log message.
        # for param in self.gpt2.h[-2:].parameters():
        #     param.requires_grad = True
        # for param in self.distilbert.transformer.layer[-2:].parameters():
        #     param.requires_grad = True
        # log_to_file("Unfroze last 2 layers of GPT-2 and DistilBERT")

        gpt2_dim = self.gpt2_config.n_embd  # 768
        bert_dim = self.distilbert.config.hidden_size  # 768
        hidden_size = gpt2_dim

        self.fusion_layer = FusionLayer(gpt2_dim, bert_dim, hidden_size, dropout_rate)

        # Attribute names and layer order are part of the checkpoint format
        # (state_dict keys), so they must not change.
        self.intent_head = self._make_head(hidden_size, num_intents, dropout_rate)
        self.category_head = self._make_head(hidden_size, num_categories, dropout_rate)
        self.ner_head = self._make_head(hidden_size, num_ner_labels, dropout_rate)

        self.intent_loss_fn = nn.CrossEntropyLoss(weight=intent_class_weights)
        self.category_loss_fn = nn.CrossEntropyLoss(weight=category_class_weights)
        self.ner_loss_fn = nn.CrossEntropyLoss(weight=ner_class_weights)
        self.loss_weights = loss_weights

        log_to_file(f"Model initialized with loss weights: {self.loss_weights}")
        if intent_class_weights is not None:
            log_to_file(f"Intent class weights applied: {intent_class_weights[:5]}...")
        if category_class_weights is not None:
            log_to_file(f"Category class weights applied: {category_class_weights[:5]}...")
        if ner_class_weights is not None:
            log_to_file(f"NER class weights applied: {ner_class_weights[:5]}...")

    @staticmethod
    def _make_head(hidden_size: int, num_outputs: int, dropout_rate: float) -> nn.Sequential:
        """Build one task head: Linear -> Tanh -> Dropout -> Linear."""
        return nn.Sequential(
            nn.Linear(hidden_size, hidden_size), nn.Tanh(), nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_outputs)
        )

    def forward(self, gpt2_input_ids: torch.Tensor, gpt2_attention_mask: torch.Tensor,
                distilbert_input_ids: torch.Tensor, distilbert_attention_mask: torch.Tensor,
                intent_labels: Optional[torch.Tensor] = None,
                category_labels: Optional[torch.Tensor] = None,
                ner_labels: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Compute task logits and, when all three label tensors are given, the losses.

        Returns:
            A dict with ``'intent_logits'``, ``'category_logits'`` and
            ``'ner_logits'``. When ``intent_labels``, ``category_labels`` and
            ``ner_labels`` are all provided it also contains ``'loss'`` (the
            weighted sum) and the individual ``'intent_loss'``,
            ``'category_loss'`` and ``'ner_loss'``.
        """
        gpt2_outputs = self.gpt2(input_ids=gpt2_input_ids, attention_mask=gpt2_attention_mask)
        distilbert_outputs = self.distilbert(input_ids=distilbert_input_ids, attention_mask=distilbert_attention_mask)

        gpt2_features = gpt2_outputs.last_hidden_state
        bert_features = distilbert_outputs.last_hidden_state

        fused_features = self.fusion_layer(gpt2_features, bert_features, gpt2_attention_mask)

        # Masked mean over the sequence axis, using the GPT-2 attention mask.
        masked_features = fused_features * gpt2_attention_mask.unsqueeze(-1)
        # clamp(min=1): a row with no attended GPT-2 tokens (e.g. empty text)
        # would otherwise divide 0 by 0 and inject NaN into the loss.
        token_counts = gpt2_attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        sequence_repr = masked_features.sum(dim=1) / token_counts

        intent_logits = self.intent_head(sequence_repr)
        category_logits = self.category_head(sequence_repr)
        ner_logits = self.ner_head(fused_features)

        output_dict = {
            'intent_logits': intent_logits,
            'category_logits': category_logits,
            'ner_logits': ner_logits
        }

        if all(label is not None for label in [intent_labels, category_labels, ner_labels]):
            intent_loss = self.intent_loss_fn(intent_logits, intent_labels)
            category_loss = self.category_loss_fn(category_logits, category_labels)
            # The NER loss only covers positions attended by BOTH tokenizations.
            combined_mask = (gpt2_attention_mask * distilbert_attention_mask)
            active_loss = combined_mask.view(-1) == 1
            active_logits = ner_logits.view(-1, ner_logits.size(-1))[active_loss]
            active_labels = ner_labels.view(-1)[active_loss]
            ner_loss = self.ner_loss_fn(active_logits, active_labels)

            total_loss = (self.loss_weights['intent'] * intent_loss +
                          self.loss_weights['category'] * category_loss +
                          self.loss_weights['ner'] * ner_loss)

            output_dict.update({
                'loss': total_loss,
                'intent_loss': intent_loss,
                'category_loss': category_loss,
                'ner_loss': ner_loss
            })

        return output_dict


## Training Function

In [None]:

# Cell 6: Training Function
def _macro_f1(true_labels, predictions) -> float:
    """Macro-averaged F1 as a plain Python float (JSON serialisable)."""
    return float(precision_recall_fscore_support(true_labels, predictions, average="macro", zero_division=0)[2])


def _run_epoch(model, loader, desc: str, optimizer=None) -> Dict[str, float]:
    """Run one pass over ``loader`` and return its mean loss and macro F1 scores.

    When ``optimizer`` is given the pass trains (backward + step on every
    batch); otherwise gradients are disabled. The caller is responsible for
    putting ``model`` into train/eval mode.

    Returns:
        Dict with ``"loss"``, ``"intent_f1"``, ``"category_f1"`` and ``"ner_f1"``.
    """
    training = optimizer is not None
    running_loss = 0.0
    predictions = {"intent": [], "category": [], "ner": []}
    references = {"intent": [], "category": [], "ner": []}

    with torch.set_grad_enabled(training):
        for step, batch in enumerate(tqdm(loader, desc=desc)):
            if training:
                optimizer.zero_grad()
            inputs = {k: v.to(device) for k, v in batch.items()}
            if training and step == 0:
                check_device(inputs["gpt2_input_ids"], "GPT-2 Input IDs")
            outputs = model(**inputs)
            loss = outputs["loss"]
            if training:
                loss.backward()
                optimizer.step()
            running_loss += loss.item()

            for task in ("intent", "category"):
                predictions[task].extend(torch.argmax(outputs[f"{task}_logits"], dim=-1).cpu().numpy())
                references[task].extend(batch[f"{task}_labels"].cpu().numpy())

            # Score NER only on the positions the loss is computed on
            # (attended by both tokenizers). Padding positions all carry the
            # "O" id and would otherwise distort the token-level F1.
            token_mask = (batch["gpt2_attention_mask"] * batch["distilbert_attention_mask"]).cpu().numpy().astype(bool)
            ner_pred = torch.argmax(outputs["ner_logits"], dim=-1).cpu().numpy()
            ner_true = batch["ner_labels"].cpu().numpy()
            predictions["ner"].extend(ner_pred[token_mask])
            references["ner"].extend(ner_true[token_mask])

    return {
        "loss": running_loss / len(loader),
        "intent_f1": _macro_f1(references["intent"], predictions["intent"]),
        "category_f1": _macro_f1(references["category"], predictions["category"]),
        "ner_f1": _macro_f1(references["ner"], predictions["ner"]),
    }


def train_model(model, train_loader, val_loader, num_epochs, learning_rate):
    """Train ``model`` with AdamW and validate after every epoch.

    Only parameters with ``requires_grad=True`` are handed to the optimizer.
    The model is moved to the global ``device``. Per-epoch results are written
    to the log file.

    Args:
        model: Module whose forward returns a dict with ``"loss"`` and the
            three ``*_logits`` entries when called with a batch.
        train_loader: DataLoader for the training split.
        val_loader: DataLoader for the validation split.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: AdamW learning rate.

    Returns:
        A history dict with one float per epoch under ``train_loss``,
        ``val_loss`` and ``{train,val}_{intent,category,ner}_f1``. NER F1 is
        computed on non-padding tokens only.
    """
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
    model.to(device)
    check_device(model, "Model")
    history = {"train_loss": [], "val_loss": [], "train_intent_f1": [], "val_intent_f1": [],
               "train_category_f1": [], "val_category_f1": [], "train_ner_f1": [], "val_ner_f1": []}

    for epoch in range(num_epochs):
        model.train()
        log_to_file(f"Starting Epoch {epoch+1}/{num_epochs} [Training]")
        train_stats = _run_epoch(model, train_loader, "Training", optimizer)

        model.eval()
        log_to_file(f"Epoch {epoch+1}/{num_epochs} [Validation]")
        val_stats = _run_epoch(model, val_loader, "Validation")

        for split, stats in (("train", train_stats), ("val", val_stats)):
            for metric, value in stats.items():
                history[f"{split}_{metric}"].append(value)

        log_to_file(f"Epoch {epoch+1}/{num_epochs}:")
        log_to_file(f"  Train Loss: {train_stats['loss']:.4f}, Intent F1: {train_stats['intent_f1']:.4f}, Category F1: {train_stats['category_f1']:.4f}, NER F1: {train_stats['ner_f1']:.4f}")
        log_to_file(f"  Val Loss: {val_stats['loss']:.4f}, Intent F1: {val_stats['intent_f1']:.4f}, Category F1: {val_stats['category_f1']:.4f}, NER F1: {val_stats['ner_f1']:.4f}")
    return history



## Evaluation Function

In [None]:

# Cell 7: Evaluation Function
def evaluate_model(model, test_loader, label_encoders, gpt2_tokenizer):
    """Evaluate ``model`` on ``test_loader`` and log the results.

    Batches are moved to the device of the model's first parameter. For the
    first sample of the first batch, a token-by-token table of predicted vs.
    true NER tags is written to the log (padding tokens are left out).

    Args:
        model: Module whose forward returns a dict with ``"loss"`` and the
            three ``*_logits`` entries when called with a batch.
        test_loader: DataLoader for the test split.
        label_encoders: Dict containing ``"ner_label_encoder"`` (tag -> id).
        gpt2_tokenizer: Tokenizer used to turn GPT-2 ids back into tokens.

    Returns:
        Dict of floats: ``"loss"`` (mean over batches), ``"intent_f1"``,
        ``"category_f1"`` and ``"ner_f1"`` (macro F1). NER F1 is computed on
        non-padding tokens only.
    """
    model.eval()
    total_loss = 0
    all_intent_preds, all_intent_labels = [], []
    all_category_preds, all_category_labels = [], []
    all_ner_preds, all_ner_labels = [], []
    device = next(model.parameters()).device

    # id -> tag lookup built once; setdefault keeps the first tag per id.
    id_to_ner = {}
    for tag, tag_id in label_encoders["ner_label_encoder"].items():
        id_to_ner.setdefault(tag_id, tag)

    log_to_file("Evaluating model on test set...")
    with torch.no_grad():
        for i, batch in enumerate(tqdm(test_loader, desc="Evaluation")):
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            total_loss += outputs["loss"].item()
            intent_preds = torch.argmax(outputs["intent_logits"], dim=-1).cpu().numpy()
            category_preds = torch.argmax(outputs["category_logits"], dim=-1).cpu().numpy()
            ner_preds = torch.argmax(outputs["ner_logits"], dim=-1).cpu().numpy()
            ner_true = batch["ner_labels"].cpu().numpy()
            all_intent_preds.extend(intent_preds)
            all_intent_labels.extend(batch["intent_labels"].cpu().numpy())
            all_category_preds.extend(category_preds)
            all_category_labels.extend(batch["category_labels"].cpu().numpy())

            # Score NER only where the loss is computed (positions attended by
            # both tokenizers). Padding positions all carry the "O" id and
            # would otherwise distort the token-level F1.
            token_mask = (batch["gpt2_attention_mask"] * batch["distilbert_attention_mask"]).cpu().numpy().astype(bool)
            all_ner_preds.extend(ner_preds[token_mask])
            all_ner_labels.extend(ner_true[token_mask])

            if i == 0:
                tokens = gpt2_tokenizer.convert_ids_to_tokens(inputs["gpt2_input_ids"][0].tolist())
                attended = batch["gpt2_attention_mask"][0].cpu().numpy().astype(bool)
                log_to_file("Sample Prediction (First Batch):")
                log_to_file("Token | Predicted NER | True NER")
                for token, pred, true, keep in zip(tokens, ner_preds[0], ner_true[0], attended):
                    if not keep:
                        continue  # skip padding tokens in the sample dump
                    pred_label = id_to_ner[int(pred)]
                    true_label = id_to_ner[int(true)]
                    log_to_file(f"{token:<15} | {pred_label:<15} | {true_label}")

    intent_f1 = precision_recall_fscore_support(all_intent_labels, all_intent_preds, average="macro", zero_division=0)[2]
    category_f1 = precision_recall_fscore_support(all_category_labels, all_category_preds, average="macro", zero_division=0)[2]
    ner_f1 = precision_recall_fscore_support(all_ner_labels, all_ner_preds, average="macro", zero_division=0)[2]
    avg_loss = total_loss / len(test_loader)
    log_to_file(f"Test Results:")
    log_to_file(f"  Loss: {avg_loss:.4f}")
    log_to_file(f"  Intent F1: {intent_f1:.4f}")
    log_to_file(f"  Category F1: {category_f1:.4f}")
    log_to_file(f"  NER F1: {ner_f1:.4f}")
    return {"loss": float(avg_loss), "intent_f1": float(intent_f1), "category_f1": float(category_f1), "ner_f1": float(ner_f1)}


## Main Execution

### Hyperparameters

In [None]:

# Cell 8: Main Execution
# Dataset files; relative paths are resolved against the notebook's working directory
train_file = "train.json"
val_file = "val.json"
test_file = "test.json"
# Optimisation settings
batch_size = 16
num_epochs = 3
learning_rate = 2e-5
# Both tokenizers pad/truncate every example to this many tokens
max_length = 128
# Number of DataLoader worker processes
num_workers = 2
# Dropout probability used in the fusion layer and in every task head
dropout_rate = 0.2
# Per-task multipliers applied to the individual losses before they are summed.
# NOTE(review): the values add up to 0.8 rather than 1.0 — confirm this is intentional.
loss_weights = {"intent": 0.2, "category": 0.2, "ner": 0.4}


### Load Data, Build Label Encoders, and Compute Class Weights

In [None]:

def _inverse_frequency_weights(counts, total, encoder):
    """Return ``total / (n_classes * count)`` per class, in encoder order, on ``device``.

    Classes absent from ``counts`` are treated as having a count of 1.
    """
    n_classes = len(encoder)
    weights = [total / (n_classes * counts.get(name, 1)) for name in encoder]
    return torch.tensor(weights, dtype=torch.float).to(device)


train_data, val_data, test_data = (load_dataset(path) for path in (train_file, val_file, test_file))
log_to_file(f"Dataset sizes: Train={len(train_data)}, Val={len(val_data)}, Test={len(test_data)}")
if len(train_data) == 0:
    raise ValueError("Training dataset is empty!")

# Encoders are derived from the training split only.
label_encoders = detect_labels(train_data)

# Compute class weights
# NER counts are a heuristic: one B- and one I- count per annotated entity,
# and "O" is assumed to be ten times as frequent as all entity tags combined.
ner_counts = Counter()
for sample in train_data:
    for label in sample["ner_labels_only"]:
        ner_counts.update((f"B-{label['label']}", f"I-{label['label']}"))
ner_counts["O"] = sum(ner_counts.values()) * 10
total_ner = sum(ner_counts.values())
ner_class_weights = _inverse_frequency_weights(ner_counts, total_ner, label_encoders["ner_label_encoder"])

category_counts = Counter(sample["category"] for sample in train_data)
total_category = sum(category_counts.values())
category_class_weights = _inverse_frequency_weights(category_counts, total_category, label_encoders["category_encoder"])

intent_counts = Counter(sample["intent"] for sample in train_data)
total_intent = sum(intent_counts.values())
intent_class_weights = _inverse_frequency_weights(intent_counts, total_intent, label_encoders["intent_encoder"])

for task_name, task_weights in (("NER", ner_class_weights),
                                ("Category", category_class_weights),
                                ("Intent", intent_class_weights)):
    log_to_file(f"{task_name} class weights computed: {task_weights[:5]}...")


### Tokenizer and Model Initialization

In [None]:


# Fast tokenizers are used; align_ner_labels() asks the GPT-2 tokenizer for
# character offset mappings.
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# padding="max_length" needs a pad token; add one to any tokenizer that lacks it.
# This must happen before the DataLoaders are built and before the embedding resize below.
if gpt2_tokenizer.pad_token is None:
    gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# NOTE(review): presumably a no-op for DistilBERT, which is expected to ship with a pad token — confirm.
if distilbert_tokenizer.pad_token is None:
    distilbert_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
log_to_file("Tokenizers initialized")

train_loader, val_loader, test_loader = get_dataloaders(
    train_data, val_data, test_data, gpt2_tokenizer, distilbert_tokenizer, label_encoders, batch_size, num_workers, max_length
)

# Output sizes of the three heads come from the label encoders built on the training split.
model = HybridGPT2DistilBERTMultiTask(
    num_intents=len(label_encoders["intent_encoder"]),
    num_categories=len(label_encoders["category_encoder"]),
    num_ner_labels=len(label_encoders["ner_label_encoder"]),
    dropout_rate=dropout_rate,
    loss_weights=loss_weights,
    ner_class_weights=ner_class_weights,
    category_class_weights=category_class_weights,
    intent_class_weights=intent_class_weights
)
# Grow the GPT-2 embedding matrix to the tokenizer's current vocabulary size so
# that the id of the added pad token has an embedding row.
if gpt2_tokenizer.pad_token_id is not None:
    model.gpt2.resize_token_embeddings(len(gpt2_tokenizer))
model.to(device)
check_device(model, "Model before training")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Initializing model...


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


### Start Training

In [None]:

# Save hyperparameters
# Written before training starts, so the run configuration is on disk even if
# training is interrupted.
hyperparameters = {
    "batch_size": batch_size,
    "num_epochs": num_epochs,
    "learning_rate": learning_rate,
    "max_length": max_length,
    "num_workers": num_workers,
    "dropout_rate": dropout_rate,
    "loss_weights": loss_weights,
    "num_intents": len(label_encoders["intent_encoder"]),
    "num_categories": len(label_encoders["category_encoder"]),
    "num_ner_labels": len(label_encoders["ner_label_encoder"])
}
with open(os.path.join(output_dir, "hyperparameters.json"), 'w') as f:
    json.dump(hyperparameters, f, indent=4)
log_to_file("Hyperparameters saved to hyperparameters.json")

# Train and record the wall-clock duration in minutes
start_time = time.time()
history = train_model(model, train_loader, val_loader, num_epochs, learning_rate)
training_time = (time.time() - start_time) / 60
log_to_file(f"Training completed in {training_time:.2f} minutes")

Training: 100%|██████████| 1344/1344 [04:28<00:00,  5.00it/s]
Validation: 100%|██████████| 168/168 [00:32<00:00,  5.14it/s]
Training: 100%|██████████| 1344/1344 [04:32<00:00,  4.92it/s]
Validation: 100%|██████████| 168/168 [00:32<00:00,  5.12it/s]
Training: 100%|██████████| 1344/1344 [04:32<00:00,  4.93it/s]
Validation: 100%|██████████| 168/168 [00:32<00:00,  5.13it/s]


### Evaluate

#### Save Training History, Test Results and Model

In [None]:
# Persist the label encoders (label name -> id) next to the other run artifacts.
# Presumably the inference section needs them to map predicted ids back to names.
with open(os.path.join(output_dir, "label_encoders.json"), 'w') as f:
    json.dump(label_encoders, f, indent=4)


In [None]:

# Save training history
# `history` is the dict returned by train_model(): one float per epoch per metric.
with open(os.path.join(output_dir, "training_history.json"), 'w') as f:
    json.dump(history, f, indent=4)
log_to_file("Training history saved to training_history.json")

# Evaluate on the held-out test split
test_results = evaluate_model(model, test_loader, label_encoders, gpt2_tokenizer)

# Save test results
with open(os.path.join(output_dir, "test_results.json"), 'w') as f:
    json.dump(test_results, f, indent=4)
log_to_file("Test results saved to test_results.json")

# Save model
# Only the state_dict is stored; loading it requires rebuilding the same
# architecture first.
model_path = os.path.join(output_dir, "hybrid_model.pth")
torch.save(model.state_dict(), model_path)
log_to_file(f"Model saved to {model_path}")

Evaluation: 100%|██████████| 168/168 [00:33<00:00,  5.06it/s]


# Inference

In [None]:
import json
import os

import torch
import torch.nn as nn  # the model classes defined below subclass nn.Module
from transformers import GPT2TokenizerFast, DistilBertTokenizerFast, GPT2Config, GPT2Model, DistilBertModel
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the model class (matching the trained architecture)
class FusionLayer(nn.Module):
    """Fuse GPT-2 and DistilBERT features by projection and concatenation.

    Both inputs are linearly projected to ``output_dim``, concatenated on the
    last axis, mixed by ``Linear -> Tanh -> Dropout`` and layer-normalised.
    Attribute names and layer order determine the state_dict keys, so they
    must stay as they are for saved weights to load.
    """

    def __init__(self, gpt2_dim: int, bert_dim: int, output_dim: int, dropout_rate: float):
        super().__init__()
        self.gpt2_proj = nn.Linear(gpt2_dim, output_dim)
        self.bert_proj = nn.Linear(bert_dim, output_dim)
        mixing_stack = [
            nn.Linear(2 * output_dim, output_dim),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
        ]
        self.fusion = nn.Sequential(*mixing_stack)
        self.layer_norm = nn.LayerNorm(output_dim)

    def forward(self, gpt2_features: torch.Tensor, bert_features: torch.Tensor,
                attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the fused features; ``attention_mask`` is accepted but not used."""
        projections = (self.gpt2_proj(gpt2_features), self.bert_proj(bert_features))
        mixed = self.fusion(torch.cat(projections, dim=-1))
        return self.layer_norm(mixed)

class HybridGPT2DistilBERTMultiTask(nn.Module):
    """Joint intent / category / NER model over frozen GPT-2 and DistilBERT encoders.

    Token features from both encoders are merged by ``FusionLayer``. The fused
    per-token features feed the NER head directly; their attention-masked mean
    feeds the intent and category heads.

    Args:
        num_intents: Number of intent classes (size of the intent logits).
        num_categories: Number of category classes.
        num_ner_labels: Number of NER tag classes.
        dropout_rate: Dropout probability used in the fusion layer and heads.
        loss_weights: Optional per-task loss weights; defaults to
            ``{'intent': 0.3, 'category': 0.3, 'ner': 0.4}``.
        ner_class_weights: Optional class weights for the NER loss.
        category_class_weights: Optional class weights for the category loss.
        intent_class_weights: Optional class weights for the intent loss.
    """

    def __init__(self, num_intents: int, num_categories: int, num_ner_labels: int,
                 dropout_rate: float,
                 loss_weights: Optional[dict] = None,  # Optional for inference
                 ner_class_weights: Optional[torch.Tensor] = None,  # Optional for inference
                 category_class_weights: Optional[torch.Tensor] = None,  # Optional for inference
                 intent_class_weights: Optional[torch.Tensor] = None):  # Optional for inference
        super().__init__()
        self.gpt2_config = GPT2Config.from_pretrained('gpt2')
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # Freeze every parameter of both pretrained encoders (default); only
        # the fusion layer and the task heads stay trainable.
        for param in self.gpt2.parameters():
            param.requires_grad = False
        for param in self.distilbert.parameters():
            param.requires_grad = False

        # Option to unfreeze the last 2 layers of each encoder (uncomment as needed):
        # for param in self.gpt2.h[-2:].parameters():
        #     param.requires_grad = True
        # for param in self.distilbert.transformer.layer[-2:].parameters():
        #     param.requires_grad = True

        gpt2_dim = self.gpt2_config.n_embd
        bert_dim = self.distilbert.config.hidden_size
        hidden_size = gpt2_dim  # fused features keep the GPT-2 embedding width

        self.fusion_layer = FusionLayer(gpt2_dim, bert_dim, hidden_size, dropout_rate)

        # Three heads with the same shape: Linear -> Tanh -> Dropout -> Linear.
        self.intent_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size), nn.Tanh(), nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_intents)
        )
        self.category_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size), nn.Tanh(), nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_categories)
        )
        self.ner_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size), nn.Tanh(), nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_ner_labels)
        )

        # Loss functions are not used by forward(); they are kept so the module
        # layout stays compatible with the training setup. weight=None is the
        # CrossEntropyLoss default, so no conditional is needed.
        self.intent_loss_fn = nn.CrossEntropyLoss(weight=intent_class_weights)
        self.category_loss_fn = nn.CrossEntropyLoss(weight=category_class_weights)
        self.ner_loss_fn = nn.CrossEntropyLoss(weight=ner_class_weights)
        self.loss_weights = loss_weights or {'intent': 0.3, 'category': 0.3, 'ner': 0.4}

    def forward(self, gpt2_input_ids: torch.Tensor, gpt2_attention_mask: torch.Tensor,
                distilbert_input_ids: torch.Tensor, distilbert_attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Compute logits for all three tasks.

        Args:
            gpt2_input_ids: Token ids for the GPT-2 encoder.
            gpt2_attention_mask: Mask for the GPT-2 encoder; also used to pool
                the fused features for the sequence-level heads.
            distilbert_input_ids: Token ids for the DistilBERT encoder.
            distilbert_attention_mask: Mask for the DistilBERT encoder.

        Returns:
            Dict with ``intent_logits`` and ``category_logits`` (one row per
            sequence) and ``ner_logits`` (one row per token position).

        NOTE(review): the two encoders' features are fused position by
        position, so both inputs must be padded to the same length; whether
        the two tokenizers' positions line up semantically is not checked here.
        """
        gpt2_outputs = self.gpt2(input_ids=gpt2_input_ids, attention_mask=gpt2_attention_mask)
        distilbert_outputs = self.distilbert(input_ids=distilbert_input_ids, attention_mask=distilbert_attention_mask)

        gpt2_features = gpt2_outputs.last_hidden_state
        bert_features = distilbert_outputs.last_hidden_state

        fused_features = self.fusion_layer(gpt2_features, bert_features, gpt2_attention_mask)

        # Masked mean over the sequence axis. The token count is clamped to at
        # least 1 so that a fully padded sequence (e.g. empty input text) pools
        # to zeros instead of producing 0/0 = NaN logits.
        masked_features = fused_features * gpt2_attention_mask.unsqueeze(-1)
        token_counts = gpt2_attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        sequence_repr = masked_features.sum(dim=1) / token_counts

        intent_logits = self.intent_head(sequence_repr)
        category_logits = self.category_head(sequence_repr)
        ner_logits = self.ner_head(fused_features)

        return {'intent_logits': intent_logits, 'category_logits': category_logits, 'ner_logits': ner_logits}

# Tokenization function
def tokenize_text(text: str, gpt2_tokenizer, distilbert_tokenizer, max_length: int) -> Dict[str, torch.Tensor]:
    """Encode ``text`` with both tokenizers using identical padding settings.

    Each tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``return_tensors="pt"``; the leading batch axis of every returned
    tensor is then squeezed away.

    Args:
        text: Raw input string.
        gpt2_tokenizer: Callable tokenizer used for the GPT-2 inputs.
        distilbert_tokenizer: Callable tokenizer used for the DistilBERT inputs.
        max_length: Length both encodings are padded/truncated to.

    Returns:
        Dict with ``gpt2_input_ids``, ``gpt2_attention_mask``,
        ``distilbert_input_ids`` and ``distilbert_attention_mask``.
    """
    encode_options = {
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    gpt2_batch = gpt2_tokenizer(text, **encode_options)
    bert_batch = distilbert_tokenizer(text, **encode_options)

    features = {}
    features["gpt2_input_ids"] = gpt2_batch["input_ids"].squeeze(0)
    features["gpt2_attention_mask"] = gpt2_batch["attention_mask"].squeeze(0)
    features["distilbert_input_ids"] = bert_batch["input_ids"].squeeze(0)
    features["distilbert_attention_mask"] = bert_batch["attention_mask"].squeeze(0)
    return features

# Inference function
def inference(model, text: str, gpt2_tokenizer, distilbert_tokenizer, label_encoders, max_length: int, device):
    """Run the model on one text, print the predictions and return them.

    Args:
        model: Callable model exposing ``eval()`` and returning a dict with
            ``intent_logits``, ``category_logits`` and ``ner_logits``.
        text: Raw input string.
        gpt2_tokenizer: GPT-2 tokenizer; also used to turn ids back into tokens.
        distilbert_tokenizer: DistilBERT tokenizer.
        label_encoders: Dict with ``intent_encoder``, ``category_encoder`` and
            ``ner_label_encoder``, each mapping label name -> integer id.
        max_length: Padding/truncation length passed to ``tokenize_text``.
        device: Device the input tensors are moved to.

    Returns:
        ``{"intent": str, "category": str, "ner": [(token, ner_label), ...]}``
        where the NER list covers only positions attended by the GPT-2 mask.

    Raises:
        ValueError: If a predicted id has no entry in the matching encoder.
    """
    model.eval()
    print(f"Inference on text: {text[:50]}...")

    # Tokenize input
    inputs = tokenize_text(text, gpt2_tokenizer, distilbert_tokenizer, max_length)
    inputs = {k: v.unsqueeze(0).to(device) for k, v in inputs.items()}  # Add batch dimension

    # Run model
    with torch.no_grad():
        outputs = model(**inputs)

    # Build each id -> label lookup once instead of rebuilding key/value lists
    # and doing a linear search for every predicted token.
    decode_intent = _build_label_decoder(label_encoders["intent_encoder"], "intent")
    decode_category = _build_label_decoder(label_encoders["category_encoder"], "category")
    decode_ner = _build_label_decoder(label_encoders["ner_label_encoder"], "NER")

    # Decode predictions
    intent_pred = torch.argmax(outputs["intent_logits"], dim=-1).item()
    category_pred = torch.argmax(outputs["category_logits"], dim=-1).item()

    # Number of attended GPT-2 positions; slicing with it assumes the padding
    # sits at the end of the sequence.
    seq_len = int(inputs["gpt2_attention_mask"][0].sum().item())
    ner_preds = torch.argmax(outputs["ner_logits"], dim=-1)[0, :seq_len].tolist()

    # Map to labels
    intent_label = decode_intent(intent_pred)
    category_label = decode_category(category_pred)
    tokens = gpt2_tokenizer.convert_ids_to_tokens(inputs["gpt2_input_ids"][0, :seq_len].tolist())
    ner_labels = [decode_ner(pred) for pred in ner_preds]

    # Print results
    print(f"Predicted Intent: {intent_label}")
    print(f"Predicted Category: {category_label}")
    print("NER Predictions:")
    print("Token | Predicted NER")
    for token, ner_label in zip(tokens, ner_labels):
        print(f"{token:<15} | {ner_label}")

    return {"intent": intent_label, "category": category_label, "ner": list(zip(tokens, ner_labels))}


def _build_label_decoder(encoder, task: str):
    """Return a function mapping an integer id back to its label for ``encoder``."""
    id_to_label = {}
    for label, idx in encoder.items():
        # setdefault keeps the first label when an id is duplicated, which is
        # what a first-match search over the encoder values would return.
        id_to_label.setdefault(idx, label)

    def decode(idx):
        try:
            return id_to_label[idx]
        except KeyError:
            raise ValueError(f"{idx} is not a known {task} label id") from None

    return decode

# Load necessary components
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
output_dir = "/content/drive/MyDrive/thesis/Hybrid_Concat_Freeze"
model_path = os.path.join(output_dir, "hybrid_model.pth")

# Load saved label encoders (label name -> integer id per task)
with open(os.path.join(output_dir, "label_encoders.json"), 'r') as f:
    label_encoders = json.load(f)
print("Loaded label encoders from label_encoders.json")

# Load tokenizers; add a '[PAD]' token to any tokenizer that has no pad token,
# since tokenize_text pads every input to max_length.
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
if gpt2_tokenizer.pad_token is None:
    gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
if distilbert_tokenizer.pad_token is None:
    distilbert_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load hyperparameters (for max_length and dropout_rate)
with open(os.path.join(output_dir, "hyperparameters.json"), 'r') as f:
    hyperparameters = json.load(f)

# Initialize model with minimal arguments (loss-related args are optional)
model = HybridGPT2DistilBERTMultiTask(
    num_intents=len(label_encoders["intent_encoder"]),
    num_categories=len(label_encoders["category_encoder"]),
    num_ner_labels=len(label_encoders["ner_label_encoder"]),
    dropout_rate=hyperparameters["dropout_rate"]  # Use saved dropout_rate
)

# Resize embeddings before loading the state dict so the embedding matrix
# matches the tokenizer vocabulary (including the added pad token).
if gpt2_tokenizer.pad_token_id is not None:
    model.gpt2.resize_token_embeddings(len(gpt2_tokenizer))

# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
state_dict = torch.load(model_path, map_location=device, weights_only=True)

# strict=False tolerates checkpoint entries this inference model does not have
# (presumably the loss-function class-weight buffers — verify against the
# training cell). Anything else that fails to match is reported rather than
# silently ignored.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys:
    print(f"Warning: weights missing from checkpoint: {load_result.missing_keys}")
unexpected_non_loss_keys = [key for key in load_result.unexpected_keys if "_loss_fn." not in key]
if unexpected_non_loss_keys:
    print(f"Warning: unexpected checkpoint entries ignored: {unexpected_non_loss_keys}")
model.to(device)
model.eval()

# Perform inference
example_text = "I want to cancel order ord-123123 or or refund $123"
results = inference(model, example_text, gpt2_tokenizer, distilbert_tokenizer, label_encoders, max_length=hyperparameters["max_length"], device=device)
print(f"Inference results: {results}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded label encoders from label_encoders.json
Inference on text: I want to cancel order ord-123123 or or refund $12...
Predicted Intent: cancel_order
Predicted Category: order
NER Predictions:
Token | Predicted NER
I               | O
Ġwant           | O
Ġto             | O
Ġcancel         | O
Ġorder          | O
Ġord            | B-order_number
-               | I-order_number
123             | I-order_number
123             | I-order_number
Ġor             | I-order_number
Ġor             | I-order_number
Ġrefund         | O
Ġ$              | B-refund_amount
123             | I-refund_amount
Inference results: {'intent': 'cancel_order', 'category': 'order', 'ner': [('I', 'O'), ('Ġwant', 'O'), ('Ġto', 'O'), ('Ġcancel', 'O'), ('Ġorder', 'O'), ('Ġord', 'B-order_number'), ('-', 'I-order_number'), ('123', 'I-order_number'), ('123', 'I-order_number'), ('Ġor', 'I