In [1]:
import os

BASE = "/kaggle/working/semantic-llm-pruning"
os.makedirs(f"{BASE}/results/ablation_studies", exist_ok=True)
os.makedirs(f"{BASE}/models", exist_ok=True)
os.makedirs(f"{BASE}/artifacts", exist_ok=True)  # optionnel


In [2]:
!pip -q install -U "transformers>=4.44" "accelerate>=0.33" "datasets>=2.20" einops peft evaluate sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m85.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
BASE = "/kaggle/working/semantic-llm-pruning"
os.environ["HF_HOME"] = f"{BASE}/.hf"
os.environ["TRANSFORMERS_CACHE"] = f"{BASE}/.hf/transformers"
os.environ["HF_DATASETS_CACHE"] = f"{BASE}/.hf/datasets"


In [4]:
import os

# Empêche transformers d'aller chercher TF/JAX/Flax (cause de ton crash)
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"
os.environ["USE_TF"] = "0"
os.environ["USE_FLAX"] = "0"
os.environ["JAX_PLATFORM_NAME"] = "cpu"

# Cache HF (suffisant, pas besoin de TRANSFORMERS_CACHE)
BASE = "/kaggle/working/semantic-llm-pruning"
os.environ["HF_HOME"] = f"{BASE}/.hf"
os.environ["HF_DATASETS_CACHE"] = f"{BASE}/.hf/datasets"


In [5]:
import torch
import transformers
import datasets
import accelerate

print("CUDA:", torch.cuda.is_available(), torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("accelerate:", accelerate.__version__)




CUDA: True Tesla T4
transformers: 4.57.3
datasets: 4.4.2
accelerate: 1.12.0


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

MODEL_ID = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.float16,
    device_map="auto",
    attn_implementation="eager",   # <-- IMPORTANT
)
model.eval()


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rot

In [7]:
import math, json, time
from datasets import load_dataset

def perplexity_wikitext2(model, tokenizer, max_samples=256, seq_len=1024, stride=512):
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    # concat simple
    text = "\n\n".join(ds["text"])
    enc = tokenizer(text, return_tensors="pt")
    input_ids = enc["input_ids"][0]

    device = next(model.parameters()).device
    nlls = []
    n_tokens = 0

    t0 = time.time()
    model.eval()
    with torch.no_grad():
        # sliding window
        for start in range(0, min(len(input_ids) - 2, max_samples * stride), stride):
            end = min(start + seq_len, len(input_ids))
            trg_len = end - start
            if trg_len < 2:
                break

            x = input_ids[start:end].unsqueeze(0).to(device)
            # labels = x, mais on masque le prefix si stride < seq_len
            labels = x.clone()
            if start > 0:
                # on ne compte que la nouvelle partie
                labels[:, :-min(stride, labels.size(1))] = -100

            out = model(input_ids=x, labels=labels)
            neg_log_likelihood = out.loss * (labels != -100).sum()

            nlls.append(neg_log_likelihood.detach().float().cpu())
            n_tokens += (labels != -100).sum().item()

    ppl = torch.exp(torch.stack(nlls).sum() / max(n_tokens, 1)).item()
    return ppl, time.time() - t0

ppl, elapsed = perplexity_wikitext2(model, tokenizer, max_samples=256, seq_len=1024, stride=512)
print(f"Perplexity (WikiText-2 test) = {ppl:.3f} | time={elapsed:.1f}s")

# métriques hardware simples
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else None
max_mem = torch.cuda.max_memory_allocated() / (1024**3) if torch.cuda.is_available() else None

baseline = {
    "model_id": MODEL_ID,
    "dataset": "wikitext-2-raw-v1:test",
    "metric": "perplexity",
    "perplexity": ppl,
    "eval_seconds": elapsed,
    "gpu": gpu_name,
    "max_memory_allocated_gb": max_mem,
    "dtype": "float16",
}

out_path = f"{BASE}/results/perplexity_baseline.json"
with open(out_path, "w") as f:
    json.dump(baseline, f, indent=2)

print("Saved:", out_path)


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Perplexity (WikiText-2 test) = 7.382 | time=52.7s
Saved: /kaggle/working/semantic-llm-pruning/results/perplexity_baseline.json


In [8]:
import time
import torch

@torch.no_grad()
def measure_latency_ms_per_token(model, tokenizer, prompt="Write a short paragraph about pruning.", gen_tokens=64, warmup=1, runs=3):
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # warmup
    for _ in range(warmup):
        _ = model.generate(**inputs, max_new_tokens=gen_tokens, do_sample=False)

    torch.cuda.synchronize() if torch.cuda.is_available() else None

    times = []
    for _ in range(runs):
        t0 = time.time()
        out = model.generate(**inputs, max_new_tokens=gen_tokens, do_sample=False)
        torch.cuda.synchronize() if torch.cuda.is_available() else None
        dt = time.time() - t0
        times.append(dt)

    avg = sum(times)/len(times)
    ms_per_token = (avg * 1000) / gen_tokens
    return ms_per_token

ms_tok = measure_latency_ms_per_token(model, tokenizer)
print(f"Latency ≈ {ms_tok:.2f} ms/token")


Latency ≈ 37.81 ms/token


In [9]:
import os, json
import numpy as np
import torch
from datasets import load_dataset

BASE = "/kaggle/working/semantic-llm-pruning"
ART = f"{BASE}/artifacts"
os.makedirs(ART, exist_ok=True)

ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

def make_batches(tokenizer, ds, batch_size=1, seq_len=256, max_batches=200):
    # concat léger + chunking
    text = "\n\n".join(ds["text"][:5000])
    ids = tokenizer(text, return_tensors="pt")["input_ids"][0]
    # découpe en chunks seq_len
    chunks = []
    for i in range(0, min(len(ids) - seq_len - 1, max_batches * seq_len), seq_len):
        chunks.append(ids[i:i+seq_len])
        if len(chunks) >= batch_size * max_batches:
            break

    # yield batches
    for i in range(0, len(chunks), batch_size):
        b = chunks[i:i+batch_size]
        if len(b) < batch_size:
            break
        yield torch.stack(b, dim=0)

batches = list(make_batches(tokenizer, ds, batch_size=1, seq_len=256, max_batches=200))
len(batches), batches[0].shape


(200, torch.Size([1, 256]))

In [10]:
x0 = batches[0].to(next(model.parameters()).device)
out0 = model(input_ids=x0, output_attentions=True, return_dict=True, use_cache=False)
print(out0.attentions is not None, len(out0.attentions) if out0.attentions is not None else None)


True 22


In [11]:
import json
import numpy as np
import torch

device = next(model.parameters()).device
n_layers = model.config.num_hidden_layers
n_heads  = model.config.num_attention_heads

head_entropy_sum = torch.zeros((n_layers, n_heads), dtype=torch.float64)
head_count = 0

@torch.no_grad()
def attention_entropy(attn):  # attn: [B, H, T, S]
    eps = 1e-9
    p = attn.clamp_min(eps)
    ent = -(p * p.log()).sum(dim=-1)   # [B, H, T]
    return ent.mean(dim=(0, 2))        # [H]

model.eval()
for x in batches:
    x = x.to(device)
    out = model(input_ids=x, output_attentions=True, return_dict=True, use_cache=False)
    # out.attentions: tuple length L, each [B,H,T,S]
    for l, attn in enumerate(out.attentions):
        head_entropy_sum[l] += attention_entropy(attn).double().cpu()
    head_count += 1

head_entropy = (head_entropy_sum / max(head_count, 1)).numpy()  # [L,H]

# ranking: plus faible entropie = plus "focus" (souvent + important)
ranking = [{"layer": l, "head": h, "entropy": float(head_entropy[l, h])}
           for l in range(n_layers) for h in range(n_heads)]
ranking_sorted = sorted(ranking, key=lambda d: d["entropy"])

np.save(f"{ART}/head_importance.npy", head_entropy)
with open(f"{ART}/head_ranking.json", "w") as f:
    json.dump({
        "metric": "attention_entropy_mean",
        "lower_is_more_important": True,
        "num_layers": n_layers,
        "num_heads": n_heads,
        "num_batches": head_count,
        "seq_len": int(batches[0].shape[1]),
        "ranking": ranking_sorted
    }, f, indent=2)

print("Saved:", f"{ART}/head_importance.npy")
print("Saved:", f"{ART}/head_ranking.json")
print("Top-10 heads (lowest entropy):")
print(ranking_sorted[:10])


Saved: /kaggle/working/semantic-llm-pruning/artifacts/head_importance.npy
Saved: /kaggle/working/semantic-llm-pruning/artifacts/head_ranking.json
Top-10 heads (lowest entropy):
[{'layer': 0, 'head': 0, 'entropy': nan}, {'layer': 0, 'head': 1, 'entropy': nan}, {'layer': 0, 'head': 2, 'entropy': nan}, {'layer': 0, 'head': 3, 'entropy': nan}, {'layer': 0, 'head': 4, 'entropy': nan}, {'layer': 0, 'head': 5, 'entropy': nan}, {'layer': 0, 'head': 6, 'entropy': nan}, {'layer': 0, 'head': 7, 'entropy': nan}, {'layer': 0, 'head': 8, 'entropy': nan}, {'layer': 0, 'head': 9, 'entropy': nan}]


In [12]:
import json
import numpy as np
import torch

device = next(model.parameters()).device
n_layers = model.config.num_hidden_layers

state = {
    "mlp_sum": None,    # tensor [L, intermediate_size]
    "count": 0
}

handles = []

def make_mlp_hook(layer_idx):
    def hook(module, inputs, output):
        # inputs[0]: [B, T, intermediate_size]
        x = inputs[0].detach()
        s = x.abs().mean(dim=(0, 1)).cpu()  # [intermediate_size]

        if state["mlp_sum"] is None:
            state["mlp_sum"] = torch.zeros(
                (n_layers, s.numel()),
                dtype=torch.float64
            )

        state["mlp_sum"][layer_idx] += s.double()
    return hook

# Register hooks
for l in range(n_layers):
    down_proj = model.model.layers[l].mlp.down_proj
    handles.append(down_proj.register_forward_hook(make_mlp_hook(l)))

model.eval()
with torch.no_grad():
    for x in batches:
        x = x.to(device)
        _ = model(input_ids=x, return_dict=True, use_cache=False)
        state["count"] += 1

# Remove hooks
for h in handles:
    h.remove()

mlp_importance = (state["mlp_sum"] / max(state["count"], 1)).numpy()  # [L, intermediate_size]

ranking = [
    {"layer": l, "neuron": i, "mean_abs_act": float(mlp_importance[l, i])}
    for l in range(n_layers)
    for i in range(mlp_importance.shape[1])
]
ranking_sorted = sorted(ranking, key=lambda d: d["mean_abs_act"], reverse=True)

np.save(f"{ART}/mlp_importance.npy", mlp_importance)
with open(f"{ART}/mlp_ranking.json", "w") as f:
    json.dump({
        "metric": "mean_abs_activation_before_down_proj",
        "higher_is_more_important": True,
        "num_layers": n_layers,
        "intermediate_size": int(mlp_importance.shape[1]),
        "num_batches": state["count"],
        "seq_len": int(batches[0].shape[1]),
        "ranking": ranking_sorted[:2000]
    }, f, indent=2)

print("Saved:", f"{ART}/mlp_importance.npy")
print("Saved:", f"{ART}/mlp_ranking.json")
print("Top-10 neurons:", ranking_sorted[:10])


Saved: /kaggle/working/semantic-llm-pruning/artifacts/mlp_importance.npy
Saved: /kaggle/working/semantic-llm-pruning/artifacts/mlp_ranking.json
Top-10 neurons: [{'layer': 20, 'neuron': 3279, 'mean_abs_act': 3.2506298828125}, {'layer': 21, 'neuron': 4696, 'mean_abs_act': 2.2972216796875}, {'layer': 21, 'neuron': 1940, 'mean_abs_act': 2.2022802734375}, {'layer': 21, 'neuron': 4132, 'mean_abs_act': 2.0382861328125}, {'layer': 21, 'neuron': 5002, 'mean_abs_act': 1.92822998046875}, {'layer': 21, 'neuron': 1955, 'mean_abs_act': 1.882861328125}, {'layer': 21, 'neuron': 3171, 'mean_abs_act': 1.756611328125}, {'layer': 21, 'neuron': 596, 'mean_abs_act': 1.7434521484375}, {'layer': 21, 'neuron': 3053, 'mean_abs_act': 1.7325732421875}, {'layer': 21, 'neuron': 3201, 'mean_abs_act': 1.627783203125}]


In [13]:
print(mlp_importance.shape)   # (22, intermediate_size)
print("Mean score:", mlp_importance.mean())


(22, 5632)
Mean score: 0.031226783112684366


In [14]:
import json, time
import torch

BASE = "/kaggle/working/semantic-llm-pruning"

latency = {
    "model_id": MODEL_ID,
    "metric": "ms_per_token",
    "ms_per_token": float(ms_tok),
    "prompt": "Write a short paragraph about pruning.",
    "gen_tokens": 64,
    "gpu": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
    "dtype": str(next(model.parameters()).dtype).replace("torch.", ""),
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}

out_path = f"{BASE}/results/latency_baseline.json"
with open(out_path, "w") as f:
    json.dump(latency, f, indent=2)

print("Saved:", out_path)


Saved: /kaggle/working/semantic-llm-pruning/results/latency_baseline.json


In [15]:
import os, time
import pandas as pd
import torch

BASE = "/kaggle/working/semantic-llm-pruning"
OUT_CSV = f"{BASE}/results/ablation_studies/baseline.csv"
os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)

def count_params(m):
    return sum(p.numel() for p in m.parameters())

row = {
    "model": str(MODEL_ID),
    "input_model": "baseline",
    "tag": "baseline",
    "prune_ratio_heads": 0.0,
    "prune_ratio_mlp": 0.0,
    "ppl": float(ppl),          # <-- variable EXISTANTE
    "latency_ms_per_token": float(ms_tok),
    "params": int(count_params(model)),
    "gpu": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
    "dtype": str(next(model.parameters()).dtype).replace("torch.", ""),
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}

df_new = pd.DataFrame([row])

if os.path.exists(OUT_CSV):
    df_old = pd.read_csv(OUT_CSV)
    df = pd.concat([df_old, df_new], ignore_index=True)
else:
    df = df_new

df.to_csv(OUT_CSV, index=False)
print("Saved baseline CSV ->", OUT_CSV)
print(df.tail(1))


Saved baseline CSV -> /kaggle/working/semantic-llm-pruning/results/ablation_studies/baseline.csv
                                               model input_model       tag  \
0  TinyLlama/TinyLlama-1.1B-intermediate-step-143...    baseline  baseline   

   prune_ratio_heads  prune_ratio_mlp       ppl  latency_ms_per_token  \
0                0.0              0.0  7.382191             37.809968   

       params       gpu    dtype            timestamp  
0  1100048384  Tesla T4  float16  2026-01-07 11:22:39  


In [16]:
# --- Save baseline model for later reuse (Notebook 3 / 4) ---

BASELINE_SAVE_PATH = "/kaggle/working/semantic-llm-pruning/models/baseline"

os.makedirs(BASELINE_SAVE_PATH, exist_ok=True)

model.save_pretrained(BASELINE_SAVE_PATH)
tokenizer.save_pretrained(BASELINE_SAVE_PATH)

print("Baseline model saved to:", BASELINE_SAVE_PATH)


Baseline model saved to: /kaggle/working/semantic-llm-pruning/models/baseline
