In [1]:
# Install dependencies
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install -q transformers scikit-learn pandas matplotlib openpyxl


In [3]:
# Imports & Config
import os, math, random, json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from transformers import AutoTokenizer, AutoModel

# Reproducibility: seed every random number generator used in this notebook.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Encoder / tokenizer checkpoint and training hyper-parameters.
BACKBONE = "bert-base-uncased"
MAX_LEN = 256
BATCH_SIZE = 8
LR = 2e-5
EPOCHS = 3

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# One binary classification head is trained per entry, in this order.
LABEL_COLUMNS = [
    "hardware",
    "software",
    "ai_models",
    "data_pipelines",
    "user_interface",
    "integrations",
]
print("Using device:", DEVICE)

Using device: cuda


In [5]:
# Load dataset
CSV_PATH = "/content/augmented_real_fda_dataset.csv"

df = pd.read_csv(CSV_PATH, encoding='latin-1')

# Keep only the columns we need. `.copy()` makes the slice an independent
# frame so the column assignments below do not write into a view of the
# original (avoids SettingWithCopyWarning).
keep_cols = ["text_chunk"] + LABEL_COLUMNS
df = df[keep_cols].copy()

# Normalize text. Missing values must be filled BEFORE the string cast:
# `astype(str)` turns NaN into the literal text "nan", which would then
# survive any later "drop empty text" filter.
df["text_chunk"] = df["text_chunk"].fillna("").astype(str).str.strip()

# Convert labels into binary presence/absence per head
def to_binary(cell):
    """Collapse a raw label cell into a presence flag.

    Returns 0 when the cell is missing (per ``pd.isna``) or when its
    stripped string form is empty, a single "-", or "na" in any letter
    case. Every other value returns 1.
    """
    if not pd.isna(cell):
        token = str(cell).strip().lower()
        if token not in ("", "-", "na"):
            return 1
    return 0

# Turn every label column into a 0/1 integer column via to_binary.
for col in LABEL_COLUMNS:
    df[col] = df[col].map(to_binary).astype(int)

# Keep only rows whose text is non-empty, then renumber the index from 0.
has_text = df["text_chunk"].str.len() > 0
df = df[has_text].reset_index(drop=True)

# Summary: row count, number of positive rows per label, and a small preview.
print("Rows:", len(df))
positives = df[LABEL_COLUMNS].sum().rename("positives_per_head")
print(positives)
print("Sample rows:")
display(df.head(3))

Rows: 641
hardware          207
software          541
ai_models         176
data_pipelines    493
user_interface    421
integrations      393
Name: positives_per_head, dtype: int64
Sample rows:


Unnamed: 0,text_chunk,hardware,software,ai_models,data_pipelines,user_interface,integrations
0,CINA-ASPECTS is a standalone computer-aided di...,1,1,0,1,0,0
1,CINA-ASPECTS is a standalone executable progra...,1,1,1,1,1,1
2,The score includes which ASPECT regions are id...,1,1,1,1,1,0


In [6]:
# Dataset & DataLoader
# Load the pretrained tokenizer for the checkpoint named by BACKBONE.
tokenizer = AutoTokenizer.from_pretrained(BACKBONE)

class FDADataset(Dataset):
    """Map-style dataset pairing each text chunk with its multi-label target.

    Args:
        df: frame with a "text_chunk" column and one column per entry of
            LABEL_COLUMNS.
        tokenizer: callable applied to a single text in ``__getitem__``.
        max_len: length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = df["text_chunk"].tolist()
        # Stored as float32 so the targets can feed a BCE-style loss directly.
        self.labels = df[LABEL_COLUMNS].values.astype(np.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus a "labels" tensor."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Drop the leading dimension of size 1 from every tokenizer output.
        item = {}
        for key, tensor in encoded.items():
            item[key] = tensor.squeeze(0)
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# A plain random 80/20 hold-out is used here: stratifying on the multi-label
# targets runs into sparsity problems, so no stratification is applied.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)

train_ds, test_ds = (
    FDADataset(frame, tokenizer, MAX_LEN) for frame in (train_df, test_df)
)

# Only the training batches are shuffled; the evaluation order stays fixed.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=BATCH_SIZE)

len(train_ds), len(test_ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(512, 129)

In [7]:
# Model: Shared Encoder + Multi-Head Classifier
class MultiHeadExtractor(nn.Module):
    """Shared pretrained encoder followed by one single-logit linear head per label.

    Args:
        backbone: checkpoint name handed to ``AutoModel.from_pretrained``.
        num_heads: number of independent binary heads to create.
    """

    def __init__(self, backbone="bert-base-uncased", num_heads=6):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(backbone)
        width = self.encoder.config.hidden_size
        # Each head maps the pooled vector to a single logit.
        head_layers = []
        for _ in range(num_heads):
            head_layers.append(nn.Linear(width, 1))
        self.heads = nn.ModuleList(head_layers)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``{"logits": (B, num_heads) tensor, "loss": tensor or None}``.

        The loss is a ``BCEWithLogitsLoss`` over all heads and is only
        computed when ``labels`` is given.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the pooled representation.
        cls_vector = encoded.last_hidden_state[:, 0, :]
        per_head = [head(cls_vector) for head in self.heads]
        logits = torch.cat(per_head, dim=1)
        if labels is None:
            return {"logits": logits, "loss": None}
        criterion = nn.BCEWithLogitsLoss()
        return {"logits": logits, "loss": criterion(logits, labels)}

# Build the model with one head per label column and move it to the chosen device.
model = MultiHeadExtractor(backbone=BACKBONE, num_heads=len(LABEL_COLUMNS))
model = model.to(DEVICE)

# Total parameter count, shown in millions as the cell output.
param_count = sum(p.numel() for p in model.parameters())
param_count / 1e6


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

109.486854

In [9]:
#Train & Evaluate
# AdamW over every model parameter (encoder and all heads) with learning rate LR.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Args:
        x: scalar or array-like of logits.

    Returns:
        Probabilities with the same shape as ``x``; a scalar input yields a
        NumPy scalar. Floating input dtypes are preserved.

    The exponent is always taken of a non-positive number, so large-magnitude
    logits cannot overflow ``np.exp`` (the naive form emits a RuntimeWarning
    for very negative inputs).
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))  # always in (0, 1]
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return out[()]

def evaluate(model, loader, threshold=0.5):
    """Run ``model`` over ``loader`` and compute multi-label metrics.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            that returns a mapping with a "logits" tensor.
        loader: iterable of batches with "input_ids", "attention_mask" and
            "labels" tensors.
        threshold: probability at or above which a head counts as predicted.

    Returns:
        ``{"micro": {"precision", "recall", "f1"},
           "per_head": {head: {"precision", "recall", "f1", "auc"}}}``
        with one "per_head" entry per name in LABEL_COLUMNS.

    Raises:
        ValueError: if ``loader`` yields no batches.

    Note:
        The model is switched to eval mode and left in it.
    """
    model.eval()
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].cpu().numpy()
            out = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = out["logits"].cpu().numpy()
            all_logits.append(logits)
            all_labels.append(labels)
    if not all_logits:
        # Fail with a clear message instead of np.vstack's generic error.
        raise ValueError("evaluate() received a loader with no batches")
    all_logits = np.vstack(all_logits)
    all_labels = np.vstack(all_labels)
    probs = sigmoid(all_logits)
    preds = (probs >= threshold).astype(int)

    # micro-average metrics
    p_micro, r_micro, f_micro, _ = precision_recall_fscore_support(
        all_labels, preds, average="micro", zero_division=0
    )
    # per-head metrics
    head_metrics = {}
    for i, head in enumerate(LABEL_COLUMNS):
        p, r, f, _ = precision_recall_fscore_support(
            all_labels[:, i], preds[:, i], average="binary", zero_division=0
        )
        try:
            auc = roc_auc_score(all_labels[:, i], probs[:, i])
        except ValueError:
            # Raised when AUC is undefined (e.g. only one class present in the
            # targets). Any other exception is a real error and propagates.
            auc = float("nan")
        head_metrics[head] = {"precision": p, "recall": r, "f1": f, "auc": auc}
    return {"micro": {"precision": p_micro, "recall": r_micro, "f1": f_micro}, "per_head": head_metrics}

# Fine-tune for EPOCHS passes over train_loader; after each pass, score the
# model on test_loader and remember the highest micro-F1 seen.
best_f1 = 0.0
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        # Clear gradients from the previous step before the new backward pass.
        optimizer.zero_grad()
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)["loss"]
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    metrics = evaluate(model, test_loader, threshold=0.5)
    print(f"Epoch {epoch} | train_loss={avg_loss:.4f} | micro P/R/F1 = {metrics['micro']}")
    print("Per-head metrics:")
    for k,v in metrics["per_head"].items():
        print(f"  - {k}: P={v['precision']:.3f} R={v['recall']:.3f} F1={v['f1']:.3f} AUC={v['auc']:.3f}")
    if metrics["micro"]["f1"] > best_f1:
        best_f1 = metrics["micro"]["f1"]
print("Best micro-F1:", best_f1)


Epoch 1 | train_loss=0.5324 | micro P/R/F1 = {'precision': 0.822680412371134, 'recall': 0.8807947019867549, 'f1': 0.8507462686567164}
Per-head metrics:
  - hardware: P=0.828 R=0.571 F1=0.676 AUC=0.880
  - software: P=0.836 R=1.000 F1=0.911 AUC=0.957
  - ai_models: P=1.000 R=0.258 F1=0.410 AUC=0.910
  - data_pipelines: P=0.867 R=1.000 F1=0.929 AUC=0.931
  - user_interface: P=0.857 R=0.876 F1=0.867 AUC=0.837
  - integrations: P=0.716 R=0.975 F1=0.825 AUC=0.852
Epoch 2 | train_loss=0.3300 | micro P/R/F1 = {'precision': 0.899581589958159, 'recall': 0.9492273730684326, 'f1': 0.9237379162191193}
Per-head metrics:
  - hardware: P=0.914 R=0.762 F1=0.831 AUC=0.953
  - software: P=0.907 R=1.000 F1=0.951 AUC=0.980
  - ai_models: P=0.897 R=0.839 F1=0.867 AUC=0.968
  - data_pipelines: P=0.936 R=0.990 F1=0.963 AUC=0.942
  - user_interface: P=0.913 R=0.944 F1=0.928 AUC=0.948
  - integrations: P=0.830 R=0.975 F1=0.897 AUC=0.919
Epoch 3 | train_loss=0.1988 | micro P/R/F1 = {'precision': 0.9458874458874

In [11]:
# Inference on custom text
def predict_heads(text, threshold=0.5):
    """Score one text with the trained model.

    Args:
        text: input passed to the tokenizer.
        threshold: minimum probability for a head to be reported as predicted.

    Returns:
        ``(result, preds)`` where ``result`` maps each name in LABEL_COLUMNS
        to its probability as a float, and ``preds`` lists the names whose
        probability is at or above ``threshold``.
    """
    model.eval()
    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LEN,
    ).to(DEVICE)
    with torch.no_grad():
        # Forward only the two inputs that the model's forward() accepts.
        logits = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])["logits"]
        # Keep the probabilities of the first row only.
        probs = torch.sigmoid(logits).cpu().numpy()[0]

    result = {}
    for idx, head in enumerate(LABEL_COLUMNS):
        result[head] = float(probs[idx])
    preds = [head for head in result if result[head] >= threshold]
    return result, preds

# Example: score a hand-written description and list the predicted heads.
sample_text = (
    "The software runs on a Linux server and uses deep learning to process "
    "DICOM CT images, then exports DICOM to PACS."
)
scores, heads = predict_heads(sample_text, threshold=0.5)
print("Scores:", json.dumps(scores, indent=2))
print("Predicted heads:", heads)

Scores: {
  "hardware": 0.32658544182777405,
  "software": 0.9732354283332825,
  "ai_models": 0.6761454343795776,
  "data_pipelines": 0.9021890759468079,
  "user_interface": 0.04732263460755348,
  "integrations": 0.9350360035896301
}
Predicted heads: ['software', 'ai_models', 'data_pipelines', 'integrations']


In [None]:
# Save model & tokenizer
SAVE_DIR = "/content/multitask_fda_extractor"
os.makedirs(SAVE_DIR, exist_ok=True)

# Model weights (state dict only; the class definition is needed to reload them).
weights_path = os.path.join(SAVE_DIR, "pytorch_model.bin")
torch.save(model.state_dict(), weights_path)

# Label order, backbone name and max length, stored next to the weights.
export_meta = {"labels": LABEL_COLUMNS, "backbone": BACKBONE, "max_len": MAX_LEN}
with open(os.path.join(SAVE_DIR, "labels.json"), "w") as f:
    json.dump(export_meta, f, indent=2)

# Tokenizer files go into the same directory.
tokenizer.save_pretrained(SAVE_DIR)
print("Saved to:", SAVE_DIR)


Saved to: /content/multitask_fda_extractor


In [None]:
# Bundle the saved model directory into a single zip archive (shell command).
!zip -r /content/multitask_fda_extractor.zip /content/multitask_fda_extractor

# Hand the archive to google.colab's files.download helper.
# NOTE(review): the google.colab import presumably only resolves inside Colab — confirm before running elsewhere.
from google.colab import files
files.download('/content/multitask_fda_extractor.zip')

  adding: content/multitask_fda_extractor/ (stored 0%)
  adding: content/multitask_fda_extractor/special_tokens_map.json (deflated 42%)
  adding: content/multitask_fda_extractor/pytorch_model.bin (deflated 7%)
  adding: content/multitask_fda_extractor/tokenizer_config.json (deflated 75%)
  adding: content/multitask_fda_extractor/labels.json (deflated 31%)
  adding: content/multitask_fda_extractor/tokenizer.json (deflated 71%)
  adding: content/multitask_fda_extractor/vocab.txt (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>