In [1]:
!pip install -q pennylane transformers datasets tqdm accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m930.8/930.8 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.9/167.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Standard library and data handling
from dataclasses import dataclass, field
from typing import Optional, List
import os

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import AdamW

# Quantum
import pennylane as qml
from pennylane.templates import StronglyEntanglingLayers

# Hugging Face
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling # <-- ADD THIS
from datasets import load_dataset
from tqdm import tqdm

# --- Configuration Object ---
# All parameters are in one place for easy modification
@dataclass
class TrainingConfig:
    """Every tunable value for fine-tuning and for the final generation demo."""

    # Checkpoint name handed to the model wrapper when it is constructed.
    model_name: str = "gpt2"

    # --- Fine-tuning ---
    do_train: bool = True        # when False the training cell only prints a notice
    epochs: int = 1
    lr: float = 5e-5
    batch_size: int = 4          # reduce this if memory runs out
    block_size: int = 128        # token count of each training sample
    output_dir: str = "./quantum_gpt2_wikitext"  # adapter weights are saved/loaded here

    # --- Generation demo ---
    prompt: str = "Hello from a quantum‑enhanced decoder."
    max_new_tokens: int = 64

    # --- Hardware: resolved per instance, "cuda" if available, otherwise "cpu" ---
    device: str = field(
        default_factory=lambda: ("cpu", "cuda")[bool(torch.cuda.is_available())]
    )


config = TrainingConfig()

In [3]:
import numpy as np
from dataclasses import dataclass, field
from typing import Optional, List
import os

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F

# Quantum
import pennylane as qml
from pennylane.templates import StronglyEntanglingLayers

@dataclass
class QAGConfig:
    """Hyper-parameters of a single QuantumAttentionGate.

    The fields are consumed by QuantumAttentionGate (circuit size, device, output
    scaling) and by the attention hook in GPT2WithQuantumGate (``reduce``).
    """

    # Number of circuit wires; also the output width of the angle projection.
    num_qubits: int = 6
    # Number of entangling layers (first dimension of the trainable circuit weights).
    q_layers: int = 2
    # How the hook summarises the sequence: "mean" averages over the sequence axis,
    # any other value selects the last position.
    reduce: str = "mean"
    # Gate output is 1 + scale_range * tanh(...), i.e. within scale_range of 1.
    scale_range: float = 0.5
    # Device name passed to qml.device.
    device_name: str = "default.qubit"

# --- Custom autograd function bridging PyTorch and the PennyLane QNode ---
class QuantumCircuitFunction(torch.autograd.Function):
    """Run a PennyLane QNode inside a PyTorch graph with a hand-written backward pass.

    ``forward`` evaluates ``qnode(angles, weights)`` on NumPy copies of the inputs and
    returns the expectation values as a tensor. ``backward`` asks PennyLane for the
    Jacobian of those expectation values with respect to both inputs and contracts it
    with the incoming gradient.
    """

    @staticmethod
    def forward(ctx, angles, weights, qnode):
        """Evaluate the circuit.

        Args:
            ctx: autograd context; stores the QNode and the input tensors.
            angles: 1-D tensor of rotation angles for one sample.
            weights: tensor of trainable circuit weights.
            qnode: callable ``qnode(angles_np, weights_np)`` returning the
                expectation values (a sequence of scalars or a 1-D array).

        Returns:
            Tensor of expectation values with the dtype and device of ``angles``.
        """
        ctx.qnode = qnode
        ctx.save_for_backward(angles, weights)

        # .cpu() is required before .numpy(): tensors living on CUDA cannot be
        # converted to NumPy directly.
        angles_np = angles.detach().cpu().numpy()
        weights_np = weights.detach().cpu().numpy()

        # The QNode may hand back a list of 0-d values; collapse it into one array.
        exp_vals_np = np.asarray(qnode(angles_np, weights_np))

        return torch.tensor(exp_vals_np, dtype=angles.dtype, device=angles.device)

    @staticmethod
    def backward(ctx, grad_output):
        """Chain rule through the circuit.

        Args:
            ctx: autograd context populated by ``forward``.
            grad_output: gradient of the loss w.r.t. the expectation values.

        Returns:
            ``(grad_angles, grad_weights, None)`` — one entry per ``forward`` input;
            the QNode itself receives no gradient.
        """
        qnode = ctx.qnode
        angles, weights = ctx.saved_tensors

        angles_np = angles.detach().cpu().numpy()
        weights_np = weights.detach().cpu().numpy()

        def stacked_expvals(a, w):
            # Stack with PennyLane's autograd-aware NumPy so the differentiation
            # trace survives; plain numpy.array() on the list of traced values
            # would disconnect the output from the inputs.
            return qml.numpy.stack(qnode(a, w))

        # With a list for argnum, qml.jacobian returns one Jacobian per argument.
        jacobian_fn = qml.jacobian(stacked_expvals, argnum=[0, 1])
        j_angles, j_weights = jacobian_fn(angles_np, weights_np)

        # Bring both Jacobians to the dtype/device of the incoming gradient so the
        # matrix products below are well defined.
        j_angles = torch.tensor(
            np.asarray(j_angles), dtype=grad_output.dtype, device=grad_output.device
        )
        j_weights = torch.tensor(
            np.asarray(j_weights), dtype=grad_output.dtype, device=grad_output.device
        )

        # grad_input = grad_output @ jacobian, with grad_output as a row vector.
        grad_row = grad_output.unsqueeze(0)
        grad_angles = (grad_row @ j_angles).squeeze(0)
        # Flatten the weight axes so one matmul covers every circuit weight.
        grad_weights = grad_row @ j_weights.reshape(j_weights.shape[0], -1)
        grad_weights = grad_weights.reshape(weights.shape)

        grad_angles = grad_angles.to(dtype=angles.dtype, device=angles.device)
        grad_weights = grad_weights.to(dtype=weights.dtype, device=weights.device)

        return grad_angles, grad_weights, None

class QuantumAttentionGate(nn.Module):
    """Map a per-sample summary vector to one multiplicative scale per attention head.

    Pipeline: linear projection to rotation angles -> variational circuit
    (RY encoding followed by StronglyEntanglingLayers) -> Pauli-Z expectation
    values -> linear layer to head logits -> ``1 + scale_range * tanh(logits)``.
    """

    def __init__(self, in_dim: int, n_heads: int, cfg: QAGConfig):
        super().__init__()
        n_wires = cfg.num_qubits

        self.cfg = cfg
        self.n_heads = n_heads
        # Classical layers around the circuit (created in this order on purpose:
        # it fixes both the RNG consumption and the state_dict layout).
        self.to_angles = nn.Linear(in_dim, n_wires)
        self.post = nn.Linear(n_wires, n_heads)
        self.scale_range = cfg.scale_range
        # Trainable circuit weights, small random initialisation.
        self.q_weights = nn.Parameter(0.01 * torch.randn(cfg.q_layers, n_wires, 3))
        self.dev = qml.device(cfg.device_name, wires=n_wires)

        def circuit(angles, weights):
            # Angle-encode the input, one RY rotation per wire.
            for wire in range(cfg.num_qubits):
                qml.RY(angles[wire], wires=wire)
            StronglyEntanglingLayers(weights, wires=range(cfg.num_qubits))
            return [qml.expval(qml.PauliZ(wire)) for wire in range(cfg.num_qubits)]

        # The QNode is differentiated with the parameter-shift rule.
        self.circuit = qml.qnode(self.dev, diff_method="parameter-shift")(circuit)

    def forward(self, summary_vec: torch.Tensor) -> torch.Tensor:
        """Return head scales of shape (batch, n_heads) for a 2-D ``summary_vec``."""
        n_rows, _ = summary_vec.shape
        angles = self.to_angles(summary_vec)
        # The circuit is evaluated one sample at a time.
        qfeat = torch.stack([
            QuantumCircuitFunction.apply(angles[row], self.q_weights, self.circuit)
            for row in range(n_rows)
        ])
        return 1.0 + self.scale_range * torch.tanh(self.post(qfeat))

class GPT2WithQuantumGate(nn.Module):
    """Causal LM whose attention-module outputs are rescaled by quantum gates.

    One QuantumAttentionGate is created per transformer block. A forward hook on
    each block's attention module multiplies chunks of the attention output by the
    scales the gate produces from a summary of the module's input hidden states.

    Args:
        model_name: checkpoint name passed to the Hugging Face ``from_pretrained`` loaders.
        freeze_lm: when True, every parameter of the language model has
            ``requires_grad`` set to False so only the gates are trainable.
        qcfg: gate configuration (instance or the QAGConfig class itself). ``None``
            creates a fresh QAGConfig, so separate models never share one config object.
    """

    def __init__(self, model_name: str = "gpt2", freeze_lm: bool = True,
                 qcfg: Optional["QAGConfig"] = None):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            # Reuse EOS as padding when the tokenizer defines no pad token.
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.lm = AutoModelForCausalLM.from_pretrained(model_name)

        if freeze_lm:
            for p in self.lm.parameters():
                p.requires_grad = False

        hidden = self.lm.config.n_embd
        n_layers = self.lm.config.n_layer
        n_heads = self.lm.config.n_head
        if qcfg is None:
            qcfg = QAGConfig()
        # Accept the config class as well as an instance.
        self.qcfg = qcfg() if isinstance(qcfg, type) else qcfg

        self.qgates = nn.ModuleList([
            QuantumAttentionGate(in_dim=hidden, n_heads=n_heads, cfg=self.qcfg)
            for _ in range(n_layers)
        ])

        # Keep the handles so the hooks can be removed later if needed.
        self._hooks: List[torch.utils.hooks.RemovableHandle] = []
        for i, block in enumerate(self.lm.transformer.h):
            handle = block.attn.register_forward_hook(self._make_attn_hook(i))
            self._hooks.append(handle)

    def _make_attn_hook(self, layer_idx: int):
        """Build the forward hook that applies gate ``layer_idx`` to an attention module."""
        gate = self.qgates[layer_idx]

        def hook(module, inputs, outputs):
            hidden_states = inputs[0]
            attn_output = outputs[0]
            B, T, H = attn_output.shape
            n_heads = module.num_heads
            head_dim = H // n_heads
            # Split the hidden dimension into n_heads equal chunks.
            # NOTE(review): if the module's output has already been mixed by an
            # output projection, these chunks are not the raw per-head outputs —
            # confirm against the transformers implementation.
            attn_heads = attn_output.view(B, T, n_heads, head_dim)

            if self.qcfg.reduce == "mean":
                summary = hidden_states.mean(dim=1)
            else:
                summary = hidden_states[:, -1, :]

            # One scale per (sample, head), broadcast over positions and channels.
            scales = gate(summary).view(B, 1, n_heads, 1)
            gated = (attn_heads * scales).view(B, T, H)
            # Returning a value from a forward hook replaces the module's output.
            return (gated,) + outputs[1:]

        return hook

    def forward(self, *args, **kwargs):
        """Delegate to the wrapped language model (the hooks apply the gates)."""
        return self.lm(*args, **kwargs)

    def generate(self, prompt: str, **kwargs):
        """Generate a continuation of ``prompt`` and return the decoded text.

        Args:
            prompt: text to continue.
            **kwargs: forwarded to ``self.lm.generate``. An optional ``device``
                entry is consumed here (the model is moved to it) and is not
                forwarded. Without it the model stays on its current device.
        """
        device = kwargs.pop("device", None)
        if device is None:
            first_param = next(self.lm.parameters(), None)
            device = first_param.device if first_param is not None else torch.device("cpu")
        else:
            # Move the module before entering inference mode.
            self.to(device)
        self.lm.eval()
        with torch.inference_mode():
            enc = self.tokenizer(prompt, return_tensors="pt").to(device)
            out_ids = self.lm.generate(**enc, **kwargs)
        return self.tokenizer.decode(out_ids[0], skip_special_tokens=True)

    def save_quantum_adapters(self, save_directory: str):
        """Write the gates' state dict to ``<save_directory>/qgates.pth``."""
        os.makedirs(save_directory, exist_ok=True)
        torch.save(self.qgates.state_dict(), os.path.join(save_directory, "qgates.pth"))
        print(f"Quantum Attention Gate weights saved to {save_directory}")

    def load_quantum_adapters(self, load_directory: str):
        """Load gate weights from ``<load_directory>/qgates.pth``.

        Raises:
            FileNotFoundError: if the file does not exist.
        """
        qgates_path = os.path.join(load_directory, "qgates.pth")
        if not os.path.exists(qgates_path):
            raise FileNotFoundError(f"Could not find qgates.pth in {load_directory}")
        # Map onto the device the gates currently live on instead of relying on a
        # notebook-level global.
        first_param = next(self.qgates.parameters(), None)
        map_location = first_param.device if first_param is not None else torch.device("cpu")
        # weights_only=True restricts unpickling to tensors and plain containers.
        state = torch.load(qgates_path, map_location=map_location, weights_only=True)
        self.qgates.load_state_dict(state)
        print(f"Quantum Attention Gate weights loaded from {load_directory}")

In [4]:
def prepare_wikitext_data(tokenizer, block_size, split="train[:10%]", num_proc=2):
    """Load WikiText-2 (raw), tokenize it and cut it into fixed-length blocks.

    Args:
        tokenizer: callable tokenizer applied to batches of text.
        block_size: number of tokens per training sample; must be positive.
        split: dataset split expression. The default uses 10% of the training
            split for a faster demo; pass "train" for the full split.
        num_proc: number of worker processes used by both ``map`` calls.

    Returns:
        The grouped dataset, every row holding ``block_size`` tokens per column
        plus a ``labels`` column copied from ``input_ids``.

    Raises:
        ValueError: if ``block_size`` is not a positive integer.
    """
    from itertools import chain

    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    print("Loading and preparing WikiText-2 dataset...")
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split=split)

    def tokenize_function(examples):
        return tokenizer(examples["text"])

    tokenized_dataset = dataset.map(
        tokenize_function, batched=True, num_proc=num_proc, remove_columns=["text"]
    )

    def group_texts(examples):
        # Concatenate every column of the batch into one long list; chain is
        # linear in the total length, unlike sum(list_of_lists, []).
        concatenated_examples = {
            k: list(chain.from_iterable(examples[k])) for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the tail that does not fill a complete block.
        total_length = (total_length // block_size) * block_size
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    lm_dataset = tokenized_dataset.map(
        group_texts, batched=True, batch_size=1000, num_proc=num_proc
    )
    print(f"Dataset prepared with {len(lm_dataset)} samples.")
    return lm_dataset

def train(model, tokenizer, cfg: TrainingConfig):
    """Fine-tune the trainable parameters of ``model`` on WikiText-2.

    Args:
        model: module whose forward accepts ``input_ids``, ``attention_mask`` and
            ``labels`` and returns an object with a ``loss`` attribute; must also
            provide ``save_quantum_adapters``.
        tokenizer: tokenizer used for dataset preparation and batch collation.
        cfg: training configuration (device, epochs, batch size, learning rate,
            block size, output directory).

    Returns:
        List with the average loss of each epoch (NaN for an epoch without batches).
    """
    # Batches are moved to cfg.device below, so the model has to live there too.
    model.to(cfg.device)
    model.train()
    dataset = prepare_wikitext_data(tokenizer, block_size=cfg.block_size)

    # Collator for causal language modelling (mlm=False) turns lists into tensors.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    data_loader = DataLoader(
        dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        collate_fn=data_collator
    )

    # Only parameters that still require gradients are optimised.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=cfg.lr)

    print("\n=== Starting Fine-Tuning of Quantum Gates ===")
    print(f"Device: {cfg.device}, Epochs: {cfg.epochs}, Batch Size: {cfg.batch_size}, LR: {cfg.lr}")

    epoch_losses = []
    for epoch in range(cfg.epochs):
        print(f"\n--- Epoch {epoch+1}/{cfg.epochs} ---")
        total_loss = 0
        num_batches = 0
        for batch in tqdm(data_loader, desc=f"Epoch {epoch+1}"):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(cfg.device)
            attention_mask = batch['attention_mask'].to(cfg.device)
            labels = batch['labels'].to(cfg.device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Guard against an empty loader instead of dividing by zero.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch+1} finished. Average Loss: {avg_loss:.4f}")

    model.save_quantum_adapters(cfg.output_dir)
    return epoch_losses

In [5]:
# Build the gated model with a frozen base LM and place it on the configured device.
model = GPT2WithQuantumGate(model_name=config.model_name, freeze_lm=True)
model = model.to(config.device)

# Training is optional and controlled by the configuration flag.
if not config.do_train:
    print("Skipping training as per configuration.")
else:
    train(model, model.tokenizer, config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading and preparing WikiText-2 dataset...


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/3672 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/3672 [00:00<?, ? examples/s]

Dataset prepared with 1877 samples.

=== Starting Fine-Tuning of Quantum Gates ===
Device: cpu, Epochs: 1, Batch Size: 4, LR: 5e-05

--- Epoch 1/1 ---


Epoch 1:   0%|          | 0/470 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1: 100%|██████████| 470/470 [41:59<00:00,  5.36s/it]

Epoch 1 finished. Average Loss: 4.5440
Quantum Attention Gate weights saved to ./quantum_gpt2_wikitext





In [6]:
print("\n=== Running Demo with Fine-Tuned Gates ===")

# Create a new model instance for the demo
demo_model = GPT2WithQuantumGate(model_name=config.model_name, freeze_lm=True)

# Only the adapter loading is guarded: a FileNotFoundError raised later (e.g. during
# generation) must not be reported as "missing adapter weights".
try:
    demo_model.load_quantum_adapters(config.output_dir)
except FileNotFoundError:
    print("\nCould not find trained adapter weights. Please ensure training was run successfully.")
    print("You can run generation with random gates by removing the call to `load_quantum_adapters`.")
else:
    demo_model.to(config.device)

    # Generate text
    generated_text = demo_model.generate(
        prompt=config.prompt,
        max_new_tokens=config.max_new_tokens,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        pad_token_id=demo_model.tokenizer.eos_token_id
    )

    print("\n--- PROMPT ---")
    print(config.prompt)
    print("\n--- GENERATED TEXT ---")
    print(generated_text)


=== Running Demo with Fine-Tuned Gates ===
Quantum Attention Gate weights loaded from ./quantum_gpt2_wikitext

--- PROMPT ---
Hello from a quantum‑enhanced decoder.

--- GENERATED TEXT ---
Hello from a quantum‑enhanced decoder. A classical quantum computer is one which can perform calculations on data in a way that is consistent with quantum mechanics. It's a more modern computer that has the capability to perform calculations on data in a way that is consistent with quantum mechanics. So it's not quite as sophisticated as the classical computer. I think that the question
