In [None]:
import torch, torchvision, torch.nn as nn
from torch.utils.data import DataLoader
# Run on the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [None]:
import torch, numpy as np, random, os
# One seed shared by every random number generator the notebook relies on.
SEED = 42

# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only matters for interpreters launched afterwards.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed Python, NumPy, the PyTorch CPU generator and all CUDA generators, in that order.
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(SEED)

In [None]:
# Convert images to tensors, then standardise each RGB channel with the
# mean / std below (presumably the usual CIFAR-10 statistics — TODO confirm).
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010)),
])

# Settings shared by both splits so evaluation is fed the same way as training.
# Pinned host memory is only requested when a GPU exists: pin_memory=True on a
# CPU-only machine has no benefit and makes DataLoader emit a warning.
loader_kwargs = dict(batch_size=256, num_workers=2,
                     pin_memory=torch.cuda.is_available())

train_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                         download=True, transform=transform)
train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)

test_set = torchvision.datasets.CIFAR10(root='./data', train=False,
                                        download=True, transform=transform)
# Evaluation order is kept fixed (shuffle=False).
test_loader = DataLoader(test_set, shuffle=False, **loader_kwargs)

In [None]:
from torchvision.models import ResNet18_Weights

# Start from the ImageNet-pretrained ResNet-18 checkpoint.
model = torchvision.models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)

# Swap the final fully connected layer for a 10-output head, reusing the
# width of the layer it replaces (512 for ResNet-18).
model.fc = nn.Linear(model.fc.in_features, 10)

model = model.to(DEVICE)


In [None]:
EPOCHS = 15

# Loss, optimiser and a cosine learning-rate schedule spanning the whole run.
crit = nn.CrossEntropyLoss()
opt = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=5e-4,
)
sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=EPOCHS)

for ep in range(EPOCHS):
    model.train()
    for x, y in train_loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        opt.zero_grad()
        loss = crit(model(x), y)
        loss.backward()
        opt.step()

    # The scheduler advances once per epoch, after all batches of that epoch.
    sched.step()

In [None]:
def top1(net, loader=None, device=None):
    """Return the top-1 accuracy of ``net`` as a percentage in [0, 100].

    Args:
        net: Module called as ``net(x)``; it is switched to eval mode.
        loader: Iterable of ``(inputs, labels)`` batches. Defaults to the
            notebook-level ``test_loader``.
        device: Device the batches are moved to. Defaults to the
            notebook-level ``DEVICE``.

    Raises:
        ValueError: if ``loader`` yields no samples (instead of an opaque
            ZeroDivisionError).
    """
    # Globals are resolved at call time so single-argument calls keep working.
    if loader is None:
        loader = test_loader
    if device is None:
        device = DEVICE

    net.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            pred = net(x).argmax(1)  # index of the largest score per sample
            correct += (pred == y).sum().item()
            total += y.size(0)

    if total == 0:
        raise ValueError("top1: loader yielded no samples")
    return 100 * correct / total

# Score the fine-tuned network on the CIFAR-10 test split (top1 returns a percentage).
baseline_acc = top1(model)
print(f"Fine-tuned Top-1 = {baseline_acc:.2f} %")

In [None]:
# Persist only the weights (state_dict); whoever loads this file must rebuild
# the ResNet-18 architecture with a 10-output head before calling load_state_dict.
ckpt_path = "res18_cifar10_finetuned.pt"
torch.save(model.state_dict(), ckpt_path)
print(f"Checkpoint saved to {ckpt_path}")

In [None]:
import copy, torch, torchvision, torch.nn as nn
from torchvision.models import ResNet18_Weights

# Same device selection as at the top of the notebook.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
ckpt_path = "res18_cifar10_finetuned.pt"


# Rebuild the architecture without pretrained weights, attach the 10-output
# head, then restore the fine-tuned parameters from the checkpoint.
base_clean = torchvision.models.resnet18(weights=None)
base_clean.fc = nn.Linear(base_clean.fc.in_features, 10)
base_clean.load_state_dict(torch.load(ckpt_path, map_location=DEVICE))

# Module.to() and Module.eval() both modify the module in place.
base_clean.to(DEVICE)
base_clean.eval()

def top1(net, loader=None, device=None):
    """Return the top-1 accuracy of ``net`` as a percentage in [0, 100].

    Args:
        net: Module called as ``net(x)``; it is switched to eval mode.
        loader: Iterable of ``(inputs, labels)`` batches. Defaults to the
            notebook-level ``test_loader``.
        device: Device the batches are moved to. Defaults to the
            notebook-level ``DEVICE``.

    Raises:
        ValueError: if ``loader`` yields no samples (instead of an opaque
            ZeroDivisionError).
    """
    # Globals are resolved at call time so single-argument calls keep working.
    if loader is None:
        loader = test_loader
    if device is None:
        device = DEVICE

    net.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            pred = net(x).argmax(1)  # index of the largest score per sample
            correct += (pred == y).sum().item()
            total += y.size(0)

    if total == 0:
        raise ValueError("top1: loader yielded no samples")
    return 100 * correct / total

# Sanity check: the model rebuilt from the checkpoint is scored on the same test loader.
print(f"Baseline check → Top-1 {top1(base_clean):.2f} %")


In [None]:
from torch import linalg as LA

def compress_conv(conv: nn.Conv2d, rank: int) -> nn.Module:
    """Factor a spatial convolution into a low-rank pair of convolutions.

    The weight is flattened to an ``(OC, IC*kh*kw)`` matrix and decomposed
    with an SVD. The top ``rank`` components are kept and ``sqrt(S)`` is split
    evenly between the two factors, giving

        conv_k: IC   -> rank, kernel (kh, kw), same padding/dilation as ``conv``
        conv_1: rank -> OC,   kernel 1x1, carries the bias (if any)

    Args:
        conv: Layer to factor. It is returned unchanged when its kernel height
            is 1, its stride is not ``(1, 1)``, or it is a grouped convolution.
        rank: Number of singular components to keep. Values larger than
            ``min(OC, IC*kh*kw)`` are clamped to that limit.

    Returns:
        ``conv`` itself when skipped, otherwise ``nn.Sequential(conv_k, conv_1)``.

    Raises:
        ValueError: if ``rank`` is smaller than 1 for a layer that would be factored.
    """
    kh, kw = conv.kernel_size
    if kh == 1 or conv.stride != (1, 1) or conv.groups != 1:
        return conv
    if rank < 1:
        raise ValueError(f"rank must be >= 1, got {rank}")

    OC, IC = conv.out_channels, conv.in_channels
    weight = conv.weight.detach()
    Wm = weight.reshape(OC, -1).cpu()
    # The SVD runs on the CPU; reduced-precision weights are upcast for it.
    if Wm.dtype not in (torch.float32, torch.float64):
        Wm = Wm.float()

    # A reduced SVD only has min(OC, IC*kh*kw) components; asking for more
    # would make the weight copies below fail on a shape mismatch.
    rank = min(int(rank), min(Wm.shape))

    U, S, Vh = torch.linalg.svd(Wm, full_matrices=False)
    root = S[:rank].sqrt()
    U_r = U[:, :rank] * root
    V_r = (root.unsqueeze(1) * Vh[:rank]).reshape(rank, IC, kh, kw)

    has_bias = conv.bias is not None
    device, dtype = weight.device, weight.dtype

    # The spatial factor must reuse the original padding settings so the
    # output size matches the layer it replaces.
    conv_k = nn.Conv2d(
        IC, rank, (kh, kw),
        stride=1, padding=conv.padding, dilation=conv.dilation,
        groups=1, bias=False, padding_mode=conv.padding_mode,
        device=device, dtype=dtype)

    # Only create a bias when the original layer had one, so bias-free convs
    # do not gain extra parameters.
    conv_1 = nn.Conv2d(
        rank, OC, 1,
        stride=1, padding=0,
        groups=1, bias=has_bias,
        device=device, dtype=dtype)

    with torch.no_grad():
        conv_k.weight.copy_(V_r)
        conv_1.weight.copy_(U_r.reshape(OC, rank, 1, 1))
        if has_bias:
            conv_1.bias.copy_(conv.bias)

    return nn.Sequential(conv_k, conv_1)


def compress_model(base: nn.Module,
                   energy: float = 0.99,
                   return_rank: bool = False):
    """Return a deep copy of ``base`` with its spatial convolutions factored.

    For every ``nn.Conv2d`` whose kernel height exceeds 1, the smallest rank
    whose cumulative squared singular values reach ``energy`` of the total is
    chosen and the layer is handed to ``compress_conv``.

    Args:
        base: Model to compress; it is not modified.
        energy: Fraction of squared singular-value mass to retain per layer.
        return_rank: When True, also return the ranks that were applied.

    Returns:
        The compressed copy, or ``(copy, rank_map)`` when ``return_rank`` is
        True. ``rank_map`` maps module names to ranks and only lists layers
        that were actually replaced (``compress_conv`` returns some layers
        unchanged).
    """
    import copy

    model = copy.deepcopy(base)
    rank_map = {}

    # Snapshot the module list first: the module tree is edited during the walk.
    for name, m in list(model.named_modules()):
        if not (isinstance(m, nn.Conv2d) and m.kernel_size[0] > 1):
            continue

        # Rank selection does not need autograd bookkeeping.
        with torch.no_grad():
            S = torch.linalg.svdvals(m.weight.reshape(m.out_channels, -1))
            cum = torch.cumsum(S**2, 0) / torch.sum(S**2)
            r = int((cum < energy).sum()) + 1
        # With energy >= 1 (or rounding) every entry can compare below the
        # threshold, which would give one more than the available components.
        r = min(r, S.numel())

        replacement = compress_conv(m, r)
        if replacement is m:
            # compress_conv declined this layer; do not report a rank for it.
            continue
        rank_map[name] = r

        parent, _, child = name.rpartition('.')
        tgt = model.get_submodule(parent) if parent else model
        setattr(tgt, child, replacement)

    return (model, rank_map) if return_rank else model



In [None]:
# compress_model deep-copies its argument internally, so base_clean can be
# passed directly without a second copy of the whole network.
model_99 = compress_model(base_clean, 0.99).eval().to(DEVICE)

acc_99 = top1(model_99)
# Parameter count reported in millions.
params_99 = sum(p.numel() for p in model_99.parameters()) / 1e6
print(f"SVD(99%) → Top-1 {acc_99:.2f} % · Params {params_99:.2f} M")


In [None]:
# Sweep the retained-energy threshold and record (energy, relative size,
# accuracy, parameter count in millions) for each compressed model.
energies = [0.99, 0.95, 0.90, 0.85, 0.80, 0.75]
results  = []
baseline_params = sum(p.numel() for p in base_clean.parameters()) / 1e6

for e in energies:
    # compress_model already works on its own deep copy of base_clean, so an
    # additional copy.deepcopy per iteration is unnecessary.
    m_c = compress_model(base_clean, e).eval().to(DEVICE)
    acc = top1(m_c)
    params = sum(p.numel() for p in m_c.parameters()) / 1e6
    comp_ratio = params / baseline_params  # 1.0 means "same size as baseline"
    print(f"SVD({e*100:.0f}%) → Top-1 {acc:.2f}% · {params:.2f} M params")
    results.append((e, comp_ratio, acc, params))


In [None]:
import matplotlib.pyplot as plt

# Each results entry is (energy, relative params, accuracy, params in millions).
comp = [r[1] for r in results]
acc  = [r[2] for r in results]

# The red marker has to show the uncompressed network. results[0] is the first
# sweep entry (a compressed model), so the baseline is scored explicitly here.
baseline_top1 = top1(base_clean)

fig, ax = plt.subplots(figsize=(6,4))
ax.plot(comp, acc, marker='o', label='SVD-compressed')
ax.scatter(1.0, baseline_top1, color='red', label='Baseline')
ax.set_xlabel('Compression Ratio (relative params)')
ax.set_ylabel('Top-1 Accuracy (%)')
ax.set_title('Compression vs Accuracy – ResNet-18 / CIFAR-10')
ax.grid(True)
ax.legend()
plt.show()


In [None]:

# Build the 95 % and 90 % energy variants; with return_rank=True compress_model
# returns a (model, rank_map) pair, where rank_map is keyed by module name.
model_95, rank_dict95 = compress_model(base_clean, 0.95, return_rank=True)
model_90, rank_dict90 = compress_model(base_clean, 0.90, return_rank=True)


In [None]:
import torch, time

# Enable cuDNN's benchmark mode for the timing runs below.
torch.backends.cudnn.benchmark = True


def gpu_latency(model, input_size=(256,3,224,224), reps=200, warmup=30):
    """Measure GPU inference latency and return it in milliseconds per image.

    The model is moved to CUDA and put in eval mode, fed a random batch of
    shape ``input_size`` for ``warmup`` untimed passes, then timed over
    ``reps`` passes with CUDA events. The summed time is divided by ``reps``
    and by the batch size ``input_size[0]``.
    """
    model = model.to('cuda').eval()
    x = torch.randn(*input_size, device='cuda')

    tic = torch.cuda.Event(enable_timing=True)
    toc = torch.cuda.Event(enable_timing=True)
    total_ms = 0.0

    with torch.no_grad():
        # Untimed warm-up passes.
        for _ in range(warmup):
            model(x)
        torch.cuda.synchronize()

        for _ in range(reps):
            tic.record()
            model(x)
            toc.record()
            # Wait for the GPU so the end event has been recorded before reading it.
            torch.cuda.synchronize()
            total_ms += tic.elapsed_time(toc)

    return total_ms / reps / input_size[0]


In [None]:
# 1) Timing utility
import torch, time, matplotlib.pyplot as plt
# Enable cuDNN's benchmark mode for the timing runs below.
torch.backends.cudnn.benchmark = True

def gpu_latency(model, input_size=(256,3,224,224), reps=200, warmup=30):
    """Measure GPU inference latency and return it in milliseconds per image.

    Args:
        model: Module to time. It is moved to CUDA and switched to eval mode,
            so a CPU-resident model no longer fails against the CUDA input.
        input_size: Shape of the random input batch; ``input_size[0]`` is the
            batch size used to normalise the result.
        reps: Number of timed forward passes.
        warmup: Number of untimed forward passes run first.

    Returns:
        Summed CUDA-event time divided by ``reps`` and by the batch size.
    """
    model = model.to('cuda').eval()
    x = torch.randn(*input_size, device='cuda')

    with torch.no_grad():
        for _ in range(warmup):
            _ = model(x)
    torch.cuda.synchronize()

    st = torch.cuda.Event(enable_timing=True)
    ed = torch.cuda.Event(enable_timing=True)
    t = 0.0
    with torch.no_grad():
        for _ in range(reps):
            st.record()
            _ = model(x)
            ed.record()
            # Wait for the GPU so the end event has been recorded before reading it.
            torch.cuda.synchronize()
            t += st.elapsed_time(ed)
    return t / reps / input_size[0]

# 2) Build the model bank: the baseline plus one compressed model per energy level
energies    = [0.99, 0.95, 0.90, 0.85, 0.80, 0.75]
bench_shape = (256, 3, 224, 224)   # single source of truth for the benchmark batch
model_bank  = {'Baseline': torch.jit.script(base_clean).eval().to('cuda')}

for e in energies:
    mdl = compress_model(base_clean, e)
    # Compile with TorchScript (the original note says this is meant to fuse kernels).
    mdl = torch.jit.script(mdl).eval().to('cuda')
    # round() rather than int(): e*100 is a float and truncation could drop a unit.
    model_bank[f'SVD-{round(e*100)}'] = mdl

# 3) Time every model
records = []
for name, mdl in model_bank.items():
    ms = gpu_latency(mdl, bench_shape)
    records.append((name, ms))
    print(f"{name:<9}: {ms:.3f} ms/img")

# 4) Plot
labels, vals = zip(*records)
plt.figure(figsize=(6,3.5))
plt.bar(labels, vals)
# The label is derived from the batch size actually benchmarked (it used to
# say 128 while the runs used 256).
plt.ylabel(f'ms / image (batch{bench_shape[0]})')
plt.title('GPU Latency – ResNet-18 (TorchScript fused)')
plt.grid(axis='y', ls=':')
plt.tight_layout(); plt.show()


In [None]:
# NOTE(review): these values are hard-coded, presumably transcribed from an
# earlier run of the cells above — confirm they still match before reusing.
comp_ratio = [1.00, 0.99, 0.81, 0.68, 0.59]
accs       = [83.8, 83.3, 79.6, 76.0, 67.5]
gpu_lat_ms = [0.185, 0.123, 0.127, 0.143, 0.166]

fig, ax1 = plt.subplots(figsize=(6,4))

# Left axis: accuracy.
ax1.plot(comp_ratio, accs, 'o-', color='tab:blue')
ax1.set_xlabel('Compression Ratio')
ax1.set_ylabel('Top-1 Acc (%)', color='tab:blue')
ax1.tick_params(axis='y', labelcolor='tab:blue')

# Right axis: latency, sharing the x axis with the accuracy curve.
ax2 = ax1.twinx()
ax2.plot(comp_ratio, gpu_lat_ms, 's--', color='tab:red')
ax2.set_ylabel('GPU latency (ms / img)', color='tab:red')
ax2.tick_params(axis='y', labelcolor='tab:red')

# Set the title on the primary axes before tight_layout so the layout pass
# reserves room for it.
ax1.set_title('Accuracy vs GPU Latency vs Compression')
fig.tight_layout()
plt.show()

