# In [None]:
#!/usr/bin/env python3
"""
Refactored Quantum-Geometric Code WITHOUT Gradients or Backprop
---------------------------------------------------------------
This script:
  1) Demonstrates radial diffusion PDE updates (Heun-Euler).
  2) Maintains embeddings for tokens, updates them iteratively based on
     a negative-distance "score" (no gradient, just numeric rules).
  3) PDE residual acts as a measure of how "aligned" or "relaxed" the wavefunction is.
  4) Negative distance among tokens measures how "close" or "far" tokens are in embedding space.
  5) We apply dynamic dt updates or embedding updates using these scores, 
     with no .backward(), no optimizer, no autograd.

Usage:
   python quantum_no_grad.py

Date: 2025-02-09
"""

import math
import torch
from torch.utils.data import Dataset, DataLoader
import torch.profiler

###############################################################################
# I. Utility: Device, Precision (No Grad)
###############################################################################

def get_device():
    """
    Pick the torch device for this run: "cuda" when torch reports that CUDA
    is available, otherwise "cpu".
    """
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)

###############################################################################
# II. Radial Diffusion PDE: (No Potential), Heun Step
###############################################################################

def radial_diffusion_rhs(u: torch.Tensor, r_grid: torch.Tensor, alpha: float):
    """
    Right-hand side of the radial diffusion PDE (potential = 0):

        ∂u/∂t = alpha * (1/r) d/dr ( r du/dr )

    evaluated with a conservative finite-difference stencil (no autograd).

    The radial fluxes r * du/dr are evaluated at the half nodes r ± dr/2:

        du_dt[i] = alpha / (r_i * dr) *
                   ( (r_i + dr/2) * (u[i+1] - u[i]) / dr
                   - (r_i - dr/2) * (u[i] - u[i-1]) / dr )

    Using the node radius r_i for both fluxes would cancel the 1/r factor and
    reduce the stencil to the plain second difference u'', dropping the
    (1/r) du/dr term of the operator.

    Args:
        u:      field values; the leading dimension is indexed along r_grid.
        r_grid: 1-D radial grid; the spacing is taken from its first two
                entries, so a uniform grid is assumed.
        alpha:  diffusion coefficient.

    Returns:
        Tensor shaped like u. The first and last radial entries are left at
        zero (no update is computed on the boundaries). If the grid has fewer
        than three points there are no interior nodes and the result is all
        zeros.
    """
    du_dt = torch.zeros_like(u)
    n = r_grid.shape[0]
    if n < 3:
        # No interior node to update.
        return du_dt

    dr = r_grid[1] - r_grid[0]

    # Interior radii, reshaped so they broadcast over any trailing dims of u.
    r = r_grid[1:n - 1].reshape((-1,) + (1,) * (u.dim() - 1))

    # One-sided differences to the right and to the left of each interior node.
    d_plus = (u[2:n] - u[1:n - 1]) / dr
    d_minus = (u[1:n - 1] - u[0:n - 2]) / dr

    # Fluxes r * du/dr at the half nodes r ± dr/2.
    flux_plus = (r + 0.5 * dr) * d_plus
    flux_minus = (r - 0.5 * dr) * d_minus

    du_dt[1:n - 1] = alpha * (flux_plus - flux_minus) / (r * dr)
    return du_dt

def heun_euler_step(u: torch.Tensor,
                    r_grid: torch.Tensor,
                    alpha: float,
                    dt: float) -> torch.Tensor:
    """
    Advance u by one explicit Heun (predictor-corrector) step of size dt,
    without autograd:

      slope at the current state      -> Euler predictor u + dt*slope
      slope at the predicted state    -> average the two slopes
      result = u + dt/2 * (slope_start + slope_end)

    Both slopes come from radial_diffusion_rhs on the same grid and alpha.
    The input tensor u is not modified; a new tensor is returned.
    """
    def slope(state: torch.Tensor) -> torch.Tensor:
        return radial_diffusion_rhs(state, r_grid, alpha)

    slope_start = slope(u)
    predictor = u + dt*slope_start
    slope_end = slope(predictor)
    return u + 0.5*dt*(slope_start + slope_end)

###############################################################################
# III. Negative Distance & Embedding Updates
###############################################################################

def negative_distance_matrix(embs: torch.Tensor) -> torch.Tensor:
    """
    M[i,j] = -||embs[i] - embs[j]|| (no gradient).

    All pairwise differences are formed in one broadcasted operation instead
    of a Python double loop over the D rows, so the work happens in a handful
    of tensor kernels rather than D*D separate norm calls.

    Args:
        embs: tensor whose leading dimension indexes the D items; any
              remaining dimensions are flattened into one feature vector
              per item before the Euclidean norm is taken.

    Returns:
        A [D, D] tensor on the same device and with the same dtype as embs.
        Entries are <= 0 and the diagonal is zero. Input with zero elements
        yields an all-zero [D, D] matrix.

    Note:
        The intermediate difference tensor has D*D*E elements, which is fine
        for the small token counts used here but grows quickly with D.
    """
    D_ = embs.shape[0]
    if embs.numel() == 0:
        # Nothing to compare (no rows or no features): every distance is 0.
        return torch.zeros(D_, D_, device=embs.device, dtype=embs.dtype)

    flat = embs.reshape(D_, -1)                     # [D, E]
    diff = flat.unsqueeze(1) - flat.unsqueeze(0)    # [D, D, E]
    return -diff.norm(dim=-1)

def update_embeddings(embs: torch.Tensor,
                      dt: float,
                      dist_scale: float = 0.1
                      ) -> torch.Tensor:
    """
    Gradient-free "collapse" step: move every embedding a little toward the
    average of all embeddings.

      1) M = negative_distance_matrix(embs); its entries are <= 0.
      2) mean_dist = |mean(M)|, the mean pairwise distance (the zero
         diagonal is included in the mean).
      3) step = dt * dist_scale * mean_dist   (non-negative)
      4) embs_new = embs + step * (avg_vec - embs)

    The step size must be built from the magnitude of the mean negative
    distance. Using the signed (negative) mean directly would flip the
    direction of the shift and push embeddings away from the average, which
    contradicts the intended relaxation toward the center.

    Args:
        embs:       [D, E] tensor of embeddings.
        dt:         current step size.
        dist_scale: extra scale factor applied to the shift.

    Returns:
        A new tensor with the shifted embeddings; embs itself is not modified.
        For 0 <= step <= 1 each row moves along the segment toward the
        average; a step above 1 would overshoot it.
    """
    M = negative_distance_matrix(embs)
    mean_dist = abs(M.mean().item())  # scalar, >= 0

    # Average embedding that every row is pulled toward.
    avg_vec = embs.mean(dim=0, keepdim=True)  # shape [1, E]

    shift_mag = dt * dist_scale * mean_dist

    # Shift all rows at once.
    return embs + shift_mag*(avg_vec - embs)

###############################################################################
# IV. PDE Residual as a Score, Dynamic dt
###############################################################################

def pde_residual(u: torch.Tensor, r_grid: torch.Tensor, alpha: float):
    """
    Scalar "how far from steady state" score for u.

    The residual is radial_diffusion_rhs(u, r_grid, alpha); a steady state
    would make it ~0 everywhere. The score returned is the mean of the
    squared residual over all entries, as a plain Python float.
    """
    residual = radial_diffusion_rhs(u, r_grid, alpha)
    mean_square = residual.pow(2).mean()
    return mean_square.item()

def dynamic_dt_update(dt: float,
                      pde_score: float,
                      negdist_score: float,
                      dt_scale: float = 0.1):
    """
    Rescale the step size from the two scores:

      dt_new = dt * (1 - dt_scale*(pde_score + negdist_score))

    and never let the result drop below 1e-6.

    With non-negative scores the factor is at most 1, so dt can only shrink
    or stay the same: larger scores => smaller dt => finer steps. The factor
    exceeds 1 (dt grows) only if the summed score is negative. A factor that
    would make dt_new smaller than 1e-6 (including negative values) yields
    the floor 1e-6 instead.
    """
    combined_score = pde_score + negdist_score
    factor = 1.0 - dt_scale*combined_score
    candidate = dt * factor
    return 1e-6 if candidate < 1e-6 else candidate

###############################################################################
# V. Synthetic Dataset (No Grad) & Iterative "Training"
###############################################################################

class SyntheticTokenDataset(Dataset):
    """
    Map-style dataset that fabricates random token sequences on demand.

    Nothing is stored: every lookup draws a fresh random sequence, so the
    index is ignored and repeated lookups of the same index generally return
    different tokens. No gradients are involved.

    Attributes (set from the constructor arguments):
      total_samples: reported dataset length.
      seq_length:    number of tokens in each returned sequence.
      vocab_size:    exclusive upper bound for token ids (ids start at 0).
    """
    def __init__(self, total_samples=1000, seq_length=32, vocab_size=10000):
        super().__init__()
        self.total_samples, self.seq_length, self.vocab_size = (
            total_samples, seq_length, vocab_size
        )

    def __len__(self):
        # Length is whatever the caller asked for; no data backs it.
        return self.total_samples

    def __getitem__(self, idx):
        # idx is unused: each call samples seq_length ids in [0, vocab_size).
        sample_shape = (self.seq_length,)
        return torch.randint(low=0, high=self.vocab_size, size=sample_shape)

###############################################################################
# VI. Helper Functions: Sentence -> Embeddings
###############################################################################

def polar_to_cartesian(r, theta):
    """
    Convert polar coordinates (radius r, angle theta in radians) into the
    Cartesian pair (r*cos(theta), r*sin(theta)).
    """
    return r*math.cos(theta), r*math.sin(theta)

def sentence_to_embeddings(sentence, N, device, dtype):
    """
    Lay the D tokens of `sentence` out on a planar spiral and return them as
    a [D, N] tensor on `device` with `dtype`.

    For token index i:
      r_i     = 0.3 + 0.6*(i/(D-1))   (r_i = 0.3 when D == 1)
      theta_i = 2π*(i/D)
      columns 0 and 1 receive (r_i*cos(theta_i), r_i*sin(theta_i))

    Only the token count matters; the token values are not read. Columns
    beyond the first two stay zero. Because columns 0 and 1 are written, N
    must be at least 2 whenever the sentence is non-empty.
    """
    count = len(sentence)
    embs = torch.zeros((count, N), device=device, dtype=dtype)

    # Radial progress runs from 0 to 1 across the sentence; a single token
    # has no span to divide by, so its progress is pinned to 0.
    span = float(count - 1) if count > 1 else None

    for idx in range(count):
        progress = idx/span if span is not None else 0.0
        radius = 0.3 + 0.6*progress
        angle = 2.0*math.pi*(idx/float(count))
        x_coord, y_coord = polar_to_cartesian(radius, angle)
        embs[idx, 0] = x_coord
        embs[idx, 1] = y_coord
    return embs

###############################################################################
# VII. Main No-Grad Loop
###############################################################################

def main():
    """
    Run the gradient-free demo end to end and print progress.

    Each step:
      1) advances the radial wavefunction u with one Heun step,
      2) scores u with the mean squared PDE residual,
      3) relaxes the sentence embeddings with update_embeddings,
      4) scores the embeddings with the mean negative pairwise distance,
      5) rescales dt from both scores.

    The DataLoader only paces the loop: one step is taken per batch and the
    batch contents are not used. After the loop, a sample of u, a corner of
    the negative-distance matrix and a greedily re-ordered sentence are
    printed.
    """
    device = get_device()
    print("[INFO] Device:", device)

    # Basic numeric config
    dtype = torch.float32
    # PDE alpha = 2 (interpreting N=4 => alpha = N/2).
    alpha = 2.0

    # Radial grid
    n_r = 50
    r_grid = torch.linspace(0, 1, steps=n_r, device=device, dtype=dtype)

    # PDE wavefunction: Gaussian bump centred at r = 0.5, updated numerically.
    u = torch.exp(-((r_grid - 0.5)**2)/(2*0.01))

    # dt init
    # NOTE(review): 0.05 looks far above the usual explicit-diffusion
    # stability limit (~dr^2/(2*alpha), about 1e-4 on this grid), so the first
    # step likely blows u up and dynamic_dt_update then pins dt at its floor.
    # Confirm this is the intended demo behaviour.
    dt = 0.05

    # Example sentence -> embeddings
    sentence = ["I", "like", "quantum", "mechanics", "with", "pizza"]
    N = 4
    embs = sentence_to_embeddings(sentence, N, device, dtype)

    # Synthetic dataset: used only to drive the batch loop.
    dataset = SyntheticTokenDataset(total_samples=200, seq_length=16, vocab_size=1000)
    loader = DataLoader(dataset, batch_size=8, shuffle=True)

    max_epochs = 3
    # Cap on the TOTAL number of steps: the counter is never reset between
    # epochs, so the whole run stops once this many steps have been taken.
    max_steps = 20

    # Defined up front so the final report still works if no step runs.
    M = negative_distance_matrix(embs)

    # No gradients => we just do iterative PDE + embedding updates
    step_count = 0
    for epoch in range(max_epochs):
        for _token_batch in loader:
            step_count += 1

            # (1) PDE step with Heun-Euler. heun_euler_step returns a new
            # tensor and leaves its input untouched, so rebinding is enough.
            u = heun_euler_step(u, r_grid, alpha, dt)

            # PDE score = mean of residual^2 (shared helper).
            pde_score = pde_residual(u, r_grid, alpha)

            # (2) Update embeddings => negative distance logic
            embs = update_embeddings(embs, dt, dist_scale=0.05)

            # Negative distance "score" (<= 0)
            M = negative_distance_matrix(embs)
            neg_score = M.mean().item()

            # (3) Adjust dt from both scores; the distance score is passed as
            # a magnitude so that both terms shrink dt.
            dt_old = dt
            dt = dynamic_dt_update(dt, pde_score, abs(neg_score), dt_scale=0.02)

            if step_count % 5 == 0:
                print(f"Epoch={epoch}, step={step_count}, PDE_score={pde_score:.4e}, "
                      f"negdist_score={neg_score:.4e}, dt={dt_old:.3e} -> {dt:.3e}")

            if step_count >= max_steps:
                break
        if step_count >= max_steps:
            break

    # Final output
    print("\n[INFO] Final wavefunction sample:", u[::10].cpu().numpy())
    print("[INFO] Negative distance matrix sample:\n", M[:3, :3].cpu().numpy())

    # Show a quick "reconstructed" sentence
    # (Force 'I'->'like' start => then pick next by max neg dist)
    def reconstruct_sentence(sentence, negdist_mat):
        """
        Greedy ordering: keep the first two tokens fixed, then repeatedly
        append the unvisited token with the largest (closest to zero)
        negative distance to the current one. Ties go to the lowest index.
        """
        D_ = len(sentence)
        if D_ < 2:
            return sentence[:]
        visited = [False]*D_
        visited[0] = visited[1] = True
        order = [sentence[0], sentence[1]]
        current = 1
        for _ in range(D_ - 2):
            # One transfer per row instead of one .item() call per entry.
            row = negdist_mat[current].tolist()
            # Always picks a real unvisited index, never a sentinel.
            current = max((j for j in range(D_) if not visited[j]),
                          key=row.__getitem__)
            visited[current] = True
            order.append(sentence[current])
        return order

    reorder = reconstruct_sentence(sentence, M)
    print("\nReconstructed sentence (no grad approach):", reorder)


if __name__ == "__main__":
    # Run main() under torch.profiler: CPU activity is always recorded and
    # CUDA activity is added when a CUDA device is available.
    cuda_present = torch.cuda.is_available()
    activities = [torch.profiler.ProfilerActivity.CPU]
    if cuda_present:
        activities.append(torch.profiler.ProfilerActivity.CUDA)

    profiler_options = dict(
        activities=activities,
        record_shapes=True,
        profile_memory=True,
        with_stack=False,
    )
    with torch.profiler.profile(**profiler_options) as prof:
        main()

    # Summaries: total CPU time first, then self CUDA time when applicable.
    print("\n== Profiler Summary (Top 10 by CPU time) ==")
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
    if cuda_present:
        print("\n== Profiler Summary (Top 10 by self CUDA time) ==")
        print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=10))
