Chemins d’entrée/sortie et création des dossiers

In [1]:
import os

# Root of the previous notebook's output (adjust if your Kaggle dataset name differs).
BASE_IN = "/kaggle/input/02-structured-head-pruning-qo-gqa/semantic-llm-pruning"
# Writable root for everything this notebook produces.
BASE_OUT = "/kaggle/working/semantic-llm-pruning"

# Input sub-folders. ART_IN is only valid if the artifacts were kept in that
# output; otherwise point it to the notebook 01 output instead.
ART_IN = BASE_IN + "/artifacts"
MOD_IN = BASE_IN + "/models"

# Create the output folders up front so later cells can write into them.
for _subdir in ("models", "results/ablation_studies"):
    os.makedirs(f"{BASE_OUT}/{_subdir}", exist_ok=True)

print("BASE_IN:", BASE_IN)
print("BASE_OUT:", BASE_OUT)


BASE_IN: /kaggle/input/02-structured-head-pruning-qo-gqa/semantic-llm-pruning
BASE_OUT: /kaggle/working/semantic-llm-pruning


Variables d’environnement « anti TF/JAX » (à définir avant tout import de transformers)

In [2]:
import os
# Backend switches set before `transformers` is imported in the next cell.
# NOTE(review): judging by their names these disable the TensorFlow / Flax
# backends and pin JAX to CPU — confirm against the library versions in use.
os.environ.update(
    {
        "TRANSFORMERS_NO_TF": "1",
        "TRANSFORMERS_NO_FLAX": "1",
        "USE_TF": "0",
        "USE_FLAX": "0",
        "JAX_PLATFORM_NAME": "cpu",
    }
)


Imports

In [3]:
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


Charger le modèle d’entrée (têtes d’attention déjà prunées à −20 %)

In [4]:
# Checkpoint saved by the head-pruning notebook; it is the input of this one.
PRUNED_HEADS_PATH = f"{MOD_IN}/pruned_heads_20"

tokenizer = AutoTokenizer.from_pretrained(PRUNED_HEADS_PATH, use_fast=True)
# Fall back to EOS for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load with default settings, then move to the GPU and switch to inference mode.
model = AutoModelForCausalLM.from_pretrained(PRUNED_HEADS_PATH)
model = model.to("cuda")
model.eval()

print("Loaded:", PRUNED_HEADS_PATH)
print("CUDA:", torch.cuda.is_available(), torch.cuda.get_device_name(0))


Loaded: /kaggle/input/02-structured-head-pruning-qo-gqa/semantic-llm-pruning/models/pruned_heads_20
CUDA: True Tesla T4


Charger l’importance MLP (depuis Notebook 01 output)

In [5]:
# The importance scores come from the output of notebook 00/01, not from BASE_IN.
BASE_IN1 = "/kaggle/input/00-setup-01-baseline-and-importance/semantic-llm-pruning"
mlp_importance = np.load(BASE_IN1 + "/artifacts/mlp_importance.npy")
print("mlp_importance:", mlp_importance.shape)


mlp_importance: (22, 5632)


Choisir les neurones MLP à garder (−10% et −20%)

In [6]:
import numpy as np

# Neuron count comes from the score matrix, layer count from the loaded model.
intermediate_size = mlp_importance.shape[1]
num_layers = model.config.num_hidden_layers

# The score matrix must have exactly one row per decoder layer of this model.
assert num_layers == mlp_importance.shape[0], (mlp_importance.shape, num_layers)

def build_mlp_keep_indices(mlp_importance, keep_ratio):
    """Select, for every layer, the MLP neurons with the highest importance.

    The layer count and neuron count are taken from ``mlp_importance`` itself,
    so the function does not depend on notebook-level globals.

    Args:
        mlp_importance: 2-D array-like of scores, one row per layer and one
            column per intermediate neuron (higher score = more important).
        keep_ratio: fraction of neurons to keep per layer, e.g. ``0.90`` to
            remove 10%.

    Returns:
        ``(keep, k)`` where ``keep`` maps each layer index to a sorted
        ``np.int64`` array of the kept neuron indices and ``k`` is the number
        of neurons kept in every layer.

    Raises:
        ValueError: if ``mlp_importance`` is not 2-D or has no neurons.
    """
    scores = np.asarray(mlp_importance)
    if scores.ndim != 2 or scores.shape[1] == 0:
        raise ValueError(
            f"mlp_importance must be a non-empty 2-D array, got shape {scores.shape}"
        )

    n_layers, n_neurons = scores.shape

    # Floor to an integer count, but always keep at least one neuron and never
    # ask for more neurons than the layer has.
    k = min(max(1, int(n_neurons * keep_ratio)), n_neurons)

    keep = {}
    for layer in range(n_layers):
        # argpartition puts the k largest scores in the last k slots (unordered);
        # sorting afterwards restores the original neuron order.
        top_k = np.argpartition(scores[layer], -k)[-k:]
        keep[layer] = np.sort(top_k).astype(np.int64)

    return keep, k

# Two pruning budgets: keep 90% of the MLP neurons (-10%) and keep 80% (-20%).
keep_10, k10 = build_mlp_keep_indices(mlp_importance=mlp_importance, keep_ratio=0.90)
keep_20, k20 = build_mlp_keep_indices(mlp_importance=mlp_importance, keep_ratio=0.80)

# Quick look at the resulting sizes and the first kept indices of layer 0.
print("intermediate_size:", intermediate_size)
print("k10:", k10, "k20:", k20)
print("layer0 sample idx (10%):", keep_10[0][:10])


intermediate_size: 5632
k10: 5068 k20: 4505
layer0 sample idx (10%): [ 1  2  3  4  5  6  7  8  9 10]


Pruner physiquement le MLP (gate/up/down)

In [7]:
import torch
import torch.nn as nn

def prune_out_features_like(linear: nn.Linear, keep_out_idx: torch.Tensor):
    """Build a smaller ``nn.Linear`` that keeps only selected output features.

    Args:
        linear: source layer; it is read but not modified.
        keep_out_idx: index tensor selecting the rows of ``linear.weight``
            (and the entries of ``linear.bias``, if present) to keep.

    Returns:
        A new ``nn.Linear`` on the same device and with the same dtype as the
        source weight, with ``keep_out_idx.numel()`` output features.
    """
    has_bias = linear.bias is not None
    src_weight = linear.weight

    pruned = nn.Linear(
        linear.in_features,
        keep_out_idx.numel(),
        bias=has_bias,
        device=src_weight.device,
        dtype=src_weight.dtype,
    )

    # Output features are rows of the weight matrix.
    kept_rows = src_weight.data[keep_out_idx, :]
    pruned.weight.data.copy_(kept_rows)
    if has_bias:
        pruned.bias.data.copy_(linear.bias.data[keep_out_idx])
    return pruned

def prune_in_features_like(linear: nn.Linear, keep_in_idx: torch.Tensor):
    """Build a smaller ``nn.Linear`` that keeps only selected input features.

    Args:
        linear: source layer; it is read but not modified.
        keep_in_idx: index tensor selecting the columns of ``linear.weight``
            to keep.

    Returns:
        A new ``nn.Linear`` on the same device and with the same dtype as the
        source weight, with ``keep_in_idx.numel()`` input features. The bias,
        if any, is copied unchanged because it is indexed by output feature.
    """
    has_bias = linear.bias is not None
    src_weight = linear.weight

    pruned = nn.Linear(
        keep_in_idx.numel(),
        linear.out_features,
        bias=has_bias,
        device=src_weight.device,
        dtype=src_weight.dtype,
    )

    # Input features are columns of the weight matrix.
    kept_cols = src_weight.data[:, keep_in_idx]
    pruned.weight.data.copy_(kept_cols)
    if has_bias:
        pruned.bias.data.copy_(linear.bias.data)
    return pruned


Fonction de pruning MLP sur un layer

In [8]:
def prune_mlp_layer(mlp, keep_idx_np):
    """Shrink the intermediate dimension of one gated MLP block in place.

    Args:
        mlp: module exposing ``gate_proj``, ``up_proj`` and ``down_proj``
            linear layers (the notebook passes each decoder layer's ``.mlp``).
        keep_idx_np: indices along the intermediate dimension to keep.

    Returns:
        The number of intermediate neurons kept.
    """
    target_device = mlp.gate_proj.weight.device
    keep_idx = torch.tensor(keep_idx_np, dtype=torch.long, device=target_device)

    # gate_proj and up_proj produce the intermediate activations: drop outputs.
    for proj_name in ("gate_proj", "up_proj"):
        shrunk = prune_out_features_like(getattr(mlp, proj_name), keep_idx)
        setattr(mlp, proj_name, shrunk)

    # down_proj consumes the intermediate activations: drop the matching inputs.
    setattr(mlp, "down_proj", prune_in_features_like(mlp.down_proj, keep_idx))

    return keep_idx.numel()


Créer et sauvegarder 2 modèles : MLP -10% et MLP -20%

In [9]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

def quick_generate(model, tokenizer, prompt="Explain MLP pruning.", max_new_tokens=40):
    """Greedy-decode a short continuation of ``prompt`` as a smoke test.

    Args:
        model: object with ``parameters()`` and ``generate(...)``.
        tokenizer: callable returning a batch with ``.to(device)``, plus ``decode``.
        prompt: text to continue.
        max_new_tokens: generation budget passed to ``model.generate``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Put the inputs on whatever device the model's first parameter lives on.
    target_device = next(model.parameters()).device
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(target_device)

    generated = model.generate(**encoded, max_new_tokens=max_new_tokens, do_sample=False)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def prune_and_save_mlp(pruned_heads_path, keep_dict, tag, base_out):
    """Prune the MLP of every decoder layer of a checkpoint and save the result.

    The checkpoint is reloaded from disk on every call, moved to CUDA and cast
    to fp16; the pruned model is therefore saved in fp16 as well. The whole
    keep plan is validated before any weight is touched, so a bad plan cannot
    leave a half-pruned model or write a config that disagrees with the weights.

    Args:
        pruned_heads_path: directory of the input checkpoint (tokenizer + model).
        keep_dict: mapping ``layer index -> array of intermediate indices to keep``.
        tag: name of the output sub-folder under ``{base_out}/models``.
        base_out: output root directory.

    Returns:
        The directory the pruned model and tokenizer were saved to.

    Raises:
        KeyError: if ``keep_dict`` has no entry for one of the model's layers.
        ValueError: if the model has no layers, an index set is empty or out of
            range, or the layers would not all keep the same number of neurons
            (the config stores a single ``intermediate_size``).
    """
    # Reload fresh: pruning replaces the MLP projections in place.
    tok = AutoTokenizer.from_pretrained(pruned_heads_path, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    m = AutoModelForCausalLM.from_pretrained(pruned_heads_path).to("cuda").half().eval()

    num_layers = m.config.num_hidden_layers
    if num_layers < 1:
        raise ValueError("model has no hidden layers to prune")

    # Validate on the CPU side first: an out-of-range index would otherwise
    # only fail inside the GPU indexing, after earlier layers were already pruned.
    kept_sizes = set()
    for l in range(num_layers):
        idx = np.asarray(keep_dict[l])
        width = m.model.layers[l].mlp.gate_proj.out_features
        if idx.size == 0:
            raise ValueError(f"layer {l}: empty keep index set")
        if idx.max() >= width or idx.min() < -width:
            raise ValueError(
                f"layer {l}: keep indices out of range for intermediate size {width}"
            )
        kept_sizes.add(int(idx.size))

    if len(kept_sizes) != 1:
        raise ValueError(
            f"all layers must keep the same number of neurons, got {sorted(kept_sizes)}"
        )

    # Apply pruning.
    new_int = None
    for l in range(num_layers):
        new_int = prune_mlp_layer(m.model.layers[l].mlp, keep_dict[l])

    # Patch the config so the saved checkpoint reloads with the pruned shapes.
    m.config.intermediate_size = new_int

    # Sanity check: the pruned model must still be able to generate text.
    txt = quick_generate(m, tok, prompt=f"Give 2 bullet points about {tag}.", max_new_tokens=40)
    print(f"[{tag}] generate ok:\n", txt[:300], "\n")

    # Save model and tokenizer side by side.
    save_dir = f"{base_out}/models/{tag}"
    os.makedirs(save_dir, exist_ok=True)
    m.save_pretrained(save_dir)
    tok.save_pretrained(save_dir)

    print("Saved:", save_dir)
    return save_dir

# =========================================================
# (A) MLP pruning on top of the head-pruned checkpoint
# =========================================================
# -10% of MLP neurons
save_mlp10 = prune_and_save_mlp(
    pruned_heads_path=PRUNED_HEADS_PATH,
    keep_dict=keep_10,
    tag="pruned_heads20_mlp10",
    base_out=BASE_OUT,
)

# -20% of MLP neurons
save_mlp20 = prune_and_save_mlp(
    pruned_heads_path=PRUNED_HEADS_PATH,
    keep_dict=keep_20,
    tag="pruned_heads20_mlp20",
    base_out=BASE_OUT,
)

# =========================================================
# (B) MLP pruning on BASELINE (normal model)
# =========================================================
BASELINE_MODEL_PATH = "/kaggle/input/00-setup-01-baseline-and-importance/semantic-llm-pruning/models/baseline"

# IMPORTANT: the same keep_10 / keep_20 index sets are reused here so that the
# comparison with (A) is apples-to-apples.
save_base_mlp10 = prune_and_save_mlp(
    pruned_heads_path=BASELINE_MODEL_PATH,
    keep_dict=keep_10,
    tag="baseline_mlp10",
    base_out=BASE_OUT,
)
save_base_mlp20 = prune_and_save_mlp(
    pruned_heads_path=BASELINE_MODEL_PATH,
    keep_dict=keep_20,
    tag="baseline_mlp20",
    base_out=BASE_OUT,
)

print("Saved baseline_mlp10 ->", save_base_mlp10)
print("Saved baseline_mlp20 ->", save_base_mlp20)


[pruned_heads20_mlp10] generate ok:
 Give 2 bullet points about pruned_heads20_mlp10.
The first is that the head is a head of pruned heads.
The second is that the head is pruned.
The third is that the head is pruned. 

Saved: /kaggle/working/semantic-llm-pruning/models/pruned_heads20_mlp10
[pruned_heads20_mlp20] generate ok:
 Give 2 bullet points about pruned_heads20_mlp20.
The 2018-2019 is the 2018-2019.
The 2018-2019 is the  

Saved: /kaggle/working/semantic-llm-pruning/models/pruned_heads20_mlp20
[baseline_mlp10] generate ok:
 Give 2 bullet points about baseline_mlp10.

### 2.1.1. Baseline_mlp10

Baseline_mlp10 is a baseline that uses the MLP10 model. 

Saved: /kaggle/working/semantic-llm-pruning/models/baseline_mlp10
[baseline_mlp20] generate ok:
 Give 2 bullet points about baseline_mlp20.

### 2.1.1. Baseline_mlp20

Baseline_mlp20 is a 20-year baseline dataset. It is 

Saved: /kaggle/working/semantic-llm-pruning/models/baseline_mlp20
Saved baseline_mlp10 -> /kaggle/working/semanti

Vérifier que les dossiers existent

In [10]:
# List the models folder, then the files of the two head+MLP pruned checkpoints
# (output truncated with `head`).
!ls -lah /kaggle/working/semantic-llm-pruning/models | head -n 50
!ls -lah /kaggle/working/semantic-llm-pruning/models/pruned_heads20_mlp10 | head -n 30
!ls -lah /kaggle/working/semantic-llm-pruning/models/pruned_heads20_mlp20 | head -n 30


total 24K
drwxr-xr-x 6 root root 4.0K Jan  7 11:25 .
drwxr-xr-x 4 root root 4.0K Jan  7 11:24 ..
drwxr-xr-x 2 root root 4.0K Jan  7 11:25 baseline_mlp10
drwxr-xr-x 2 root root 4.0K Jan  7 11:25 baseline_mlp20
drwxr-xr-x 2 root root 4.0K Jan  7 11:24 pruned_heads20_mlp10
drwxr-xr-x 2 root root 4.0K Jan  7 11:24 pruned_heads20_mlp20


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


total 1.9G
drwxr-xr-x 2 root root 4.0K Jan  7 11:24 .
drwxr-xr-x 6 root root 4.0K Jan  7 11:25 ..
-rw-r--r-- 1 root root  668 Jan  7 11:24 config.json
-rw-r--r-- 1 root root  124 Jan  7 11:24 generation_config.json
-rw-r--r-- 1 root root 1.9G Jan  7 11:24 model.safetensors
-rw-r--r-- 1 root root  551 Jan  7 11:24 special_tokens_map.json
-rw-r--r-- 1 root root  978 Jan  7 11:24 tokenizer_config.json
-rw-r--r-- 1 root root 3.5M Jan  7 11:24 tokenizer.json
-rw-r--r-- 1 root root 489K Jan  7 11:24 tokenizer.model
total 1.7G
drwxr-xr-x 2 root root 4.0K Jan  7 11:24 .
drwxr-xr-x 6 root root 4.0K Jan  7 11:25 ..
-rw-r--r-- 1 root root  668 Jan  7 11:24 config.json
-rw-r--r-- 1 root root  124 Jan  7 11:24 generation_config.json
-rw-r--r-- 1 root root 1.7G Jan  7 11:24 model.safetensors
-rw-r--r-- 1 root root  551 Jan  7 11:24 special_tokens_map.json
-rw-r--r-- 1 root root  978 Jan  7 11:24 tokenizer_config.json
-rw-r--r-- 1 root root 3.5M Jan  7 11:24 tokenizer.json
-rw-r--r-- 1 root root 489K

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
import os, time, math, json
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Output root (same value as at the top of the notebook) and the CSV file that
# collects one row per evaluated model.
BASE_OUT = "/kaggle/working/semantic-llm-pruning"
OUT_CSV = BASE_OUT + "/results/ablation_studies/mlp_pruning.csv"

# Make sure the CSV's parent folder exists before anything is written.
os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)


In [12]:
@torch.no_grad()
def compute_ppl_wikitext2_zip_style(
    model,
    tokenizer,
    split="test",
    seq_len=1024,
    stride=512,
    max_windows=256,
    # Notebook-1 aliases:
    num_samples=None,
    max_length=None,
):
    """Sliding-window perplexity on WikiText-2 (raw), joined into one long text.

    Windows of ``seq_len`` tokens start every ``stride`` tokens. Each window
    scores only the tokens that no earlier window has scored; the rest serve
    as context (label ``-100``). The mean loss of each window is weighted by
    the number of tokens that actually contributed to it.

    Differences from the previous version of this function:
      * a final, shorter window no longer re-scores tokens already scored by
        the window before it;
      * the weight of a window is the number of predicted tokens (labels after
        the one-position shift applied by Hugging Face causal-LM heads), not
        the number of unmasked labels — this differs by one on the first window.
    With the defaults the effect on the reported value is tiny, but numbers
    are not bit-identical to those produced by the older code.

    Args:
        model: causal LM called as ``model(input_ids=..., labels=...)``.
        tokenizer: tokenizer matching ``model``.
        split: WikiText-2 split to evaluate.
        seq_len: window length in tokens.
        stride: step between window starts, in tokens.
        max_windows: maximum number of windows to evaluate.
        num_samples: alias of ``max_windows`` (overrides it when given).
        max_length: alias of ``seq_len`` (overrides it when given).

    Returns:
        The perplexity as a Python float.
    """
    if max_length is not None:
        seq_len = int(max_length)
    if num_samples is not None:
        max_windows = int(num_samples)

    model.eval()
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split=split)
    text = "\n\n".join(ds["text"])
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"][0]

    device = next(model.parameters()).device
    input_ids = input_ids.to(device)
    total_tokens = input_ids.numel()

    nll_sum = 0.0
    tok_count = 0
    prev_end = 0  # end (exclusive) of the previous window

    for window_idx, start in enumerate(range(0, total_tokens - 1, stride), start=1):
        end = min(start + seq_len, total_tokens)
        x = input_ids[start:end].unsqueeze(0)
        window_len = x.size(1)

        # Tokens in [prev_end, end) are new; everything before is context only.
        # Capped at the window length for the case stride > seq_len.
        n_new = min(end - prev_end, window_len)
        labels = x.clone()
        labels[:, : window_len - n_new] = -100

        out = model(input_ids=x, labels=labels, use_cache=False, return_dict=True)

        # Position 0 has no preceding token to be predicted from, so it never
        # contributes to the loss even when its label is not masked.
        n_eval = (labels[:, 1:] != -100).sum().item()
        if n_eval > 0:
            nll_sum += out.loss.detach().float().item() * n_eval
            tok_count += n_eval

        prev_end = end
        if window_idx >= max_windows or end == total_tokens:
            break

    ppl = float(torch.exp(torch.tensor(nll_sum / max(tok_count, 1))).item())
    return ppl


In [13]:
@torch.no_grad()
def measure_latency_ms_per_token(model, tokenizer, prompt="Explain MLP pruning.", gen_tokens=64, runs=3):
    """Average greedy-generation latency in milliseconds per generated token.

    One untimed warm-up generation is followed by ``runs`` timed ones. Each
    run's wall time is divided by the number of tokens that were actually
    produced (generation may stop before ``gen_tokens``), and the per-run
    values are averaged.

    Args:
        model: model with ``parameters()`` and ``generate(...)``.
        tokenizer: tokenizer whose output has ``.to(device)`` and ``"input_ids"``.
        prompt: text to continue.
        gen_tokens: ``max_new_tokens`` for every generation.
        runs: number of timed generations.

    Returns:
        Mean latency in ms per generated token, as a float.
    """
    model.eval()
    device = next(model.parameters()).device
    inp = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = int(inp["input_ids"].shape[-1])
    on_cuda = device.type == "cuda"

    def _sync():
        # CUDA kernels run asynchronously; wait for them so the timer is honest.
        # Skipped on CPU, where torch.cuda.synchronize() would raise.
        if on_cuda:
            torch.cuda.synchronize(device)

    # Warm-up run (not timed).
    model.generate(**inp, max_new_tokens=gen_tokens, do_sample=False)

    per_token_ms = []
    for _ in range(runs):
        _sync()
        t0 = time.perf_counter()  # monotonic, high-resolution clock
        out = model.generate(**inp, max_new_tokens=gen_tokens, do_sample=False)
        _sync()
        elapsed_ms = (time.perf_counter() - t0) * 1000

        # Divide by what was really generated, not by what was requested.
        n_new = max(int(out.shape[-1]) - prompt_len, 1)
        per_token_ms.append(elapsed_ms / n_new)

    return sum(per_token_ms) / len(per_token_ms)


In [14]:
def count_params(model):
    """Return the total number of elements across ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total


In [15]:
def eval_and_log_model(model_path, tag, prune_ratio_mlp, input_model_name):
    """Evaluate one saved checkpoint and append its metrics to ``OUT_CSV``.

    The checkpoint is loaded on CUDA in fp16, then perplexity, generation
    latency and parameter count are measured and written as one CSV row.

    Args:
        model_path: directory of the checkpoint (tokenizer + model).
        tag: identifier stored in the ``tag`` column and used in the log line.
        prune_ratio_mlp: fraction of MLP neurons removed, stored as a float.
        input_model_name: name of the model the checkpoint was derived from.

    Returns:
        The row that was appended, as a dict.
    """
    eval_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    if eval_tokenizer.pad_token is None:
        eval_tokenizer.pad_token = eval_tokenizer.eos_token

    eval_model = AutoModelForCausalLM.from_pretrained(model_path)
    eval_model = eval_model.to("cuda").half().eval()

    # Metrics.
    ppl = compute_ppl_wikitext2_zip_style(
        eval_model,
        eval_tokenizer,
        split="test",
        seq_len=1024,
        stride=512,
        max_windows=256,
    )
    lat = measure_latency_ms_per_token(eval_model, eval_tokenizer)
    params = count_params(eval_model)

    row = {
        "model": "TinyLlama-1.1B",
        "input_model": input_model_name,
        "tag": tag,
        "prune_ratio_mlp": float(prune_ratio_mlp),
        "ppl": float(ppl),
        "latency_ms_per_token": float(lat),
        "params": int(params),
    }

    # Append: existing rows (if the file is there) first, the new row last.
    frames = [pd.DataFrame([row])]
    if os.path.exists(OUT_CSV):
        frames.insert(0, pd.read_csv(OUT_CSV))
    pd.concat(frames, ignore_index=True).to_csv(OUT_CSV, index=False)

    print(f"[{tag}] PPL={ppl:.3f} | latency={lat:.2f} ms/token | params={params:,}")
    print("Appended ->", OUT_CSV)
    return row


In [16]:
BASE_MODELS = "/kaggle/working/semantic-llm-pruning/models"

# (A) Head-pruned checkpoints with additional MLP pruning.
path_mlp10 = f"{BASE_MODELS}/pruned_heads20_mlp10"
path_mlp20 = f"{BASE_MODELS}/pruned_heads20_mlp20"
eval_and_log_model(
    model_path=path_mlp10,
    tag="pruned_heads20_mlp10",
    prune_ratio_mlp=0.10,
    input_model_name="pruned_heads_20",
)
eval_and_log_model(
    model_path=path_mlp20,
    tag="pruned_heads20_mlp20",
    prune_ratio_mlp=0.20,
    input_model_name="pruned_heads_20",
)

# (B) Baseline (unpruned heads) with MLP-only pruning.
base_mlp10_path = f"{BASE_MODELS}/baseline_mlp10"
base_mlp20_path = f"{BASE_MODELS}/baseline_mlp20"

eval_and_log_model(
    model_path=base_mlp10_path,
    tag="baseline_mlp10",
    prune_ratio_mlp=0.10,
    input_model_name="baseline",
)
# Left as the last bare expression so the returned row is shown as the cell output.
eval_and_log_model(
    model_path=base_mlp20_path,
    tag="baseline_mlp20",
    prune_ratio_mlp=0.20,
    input_model_name="baseline",
)


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

[pruned_heads20_mlp10] PPL=15.424 | latency=27.09 ms/token | params=971,909,120
Appended -> /kaggle/working/semantic-llm-pruning/results/ablation_studies/mlp_pruning.csv
[pruned_heads20_mlp20] PPL=19.438 | latency=27.14 ms/token | params=895,809,536
Appended -> /kaggle/working/semantic-llm-pruning/results/ablation_studies/mlp_pruning.csv
[baseline_mlp10] PPL=8.020 | latency=26.96 ms/token | params=1,023,813,632
Appended -> /kaggle/working/semantic-llm-pruning/results/ablation_studies/mlp_pruning.csv
[baseline_mlp20] PPL=9.194 | latency=27.05 ms/token | params=947,714,048
Appended -> /kaggle/working/semantic-llm-pruning/results/ablation_studies/mlp_pruning.csv


{'model': 'TinyLlama-1.1B',
 'input_model': 'baseline',
 'tag': 'baseline_mlp20',
 'prune_ratio_mlp': 0.2,
 'ppl': 9.194290161132812,
 'latency_ms_per_token': 27.04836552341779,
 'params': 947714048}

In [17]:
# Print the results CSV written by the evaluation cell above.
!cat /kaggle/working/semantic-llm-pruning/results/ablation_studies/mlp_pruning.csv


model,input_model,tag,prune_ratio_mlp,ppl,latency_ms_per_token,params
TinyLlama-1.1B,pruned_heads_20,pruned_heads20_mlp10,0.1,15.424039840698242,27.08899850646655,971909120
TinyLlama-1.1B,pruned_heads_20,pruned_heads20_mlp20,0.2,19.438156127929688,27.138307690620422,895809536
TinyLlama-1.1B,baseline,baseline_mlp10,0.1,8.019798278808594,26.96259195605914,1023813632
TinyLlama-1.1B,baseline,baseline_mlp20,0.2,9.194290161132812,27.04836552341779,947714048


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
