# Telemetry Pipeline - Model Compression Workflow
This notebook takes the fine_tuned_gpt2_telemetry model, performs model compression using knowledge distillation, pruning and quantization.

                    GNU AFFERO GENERAL PUBLIC LICENSE
                       Version 3, 19 November 2007

Copyright (C) 2025 Shaji R. Nathan  
IP Infusion Inc.  
Email: shaji.nathan@ipinfusion.com  

This program is free software: you can redistribute it and/or modify  
it under the terms of the GNU Affero General Public License as  
published by the Free Software Foundation, either version 3 of the  
License, or (at your option) any later version.  

This program is distributed in the hope that it will be useful,  
but WITHOUT ANY WARRANTY; without even the implied warranty of  
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
GNU Affero General Public License for more details.  

You should have received a copy of the GNU Affero General Public License  
along with this program. If not, see <https://www.gnu.org/licenses/>.  

As per AGPLv3, if you modify this software and make it available over a  
network, you must provide the source code of your modifications under the  
same license.  

For inquiries, please contact:  
Shaji R. Nathan  
IP Infusion Inc.  
Email: shaji.nathan@ipinfusion.com  


## Load Fine Tuned Model for Distillation

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")

try:
    teacher = AutoModelForCausalLM.from_pretrained("./fine_tuned_gpt2_telemetry", device_map=None)
    print("✅ Model loaded successfully to CPU.")

    teacher = teacher.to("cuda")
    print("✅ Model moved to GPU successfully.")
except Exception as e:
    print(f"❌ Error during model load/move: {e}")


CUDA Available: True
CUDA Device: Quadro M1000M
CUDA Version: 11.7
PyTorch Version: 2.0.0+cu117




✅ Model loaded successfully to CPU.
✅ Model moved to GPU successfully.


In [2]:
import torch
import torch.nn.functional as F
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset

# --- Load Tokenizer (from fine-tuned teacher) ---
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_gpt2_telemetry")

# GPT-2 does not have a native padding token, so ensure we set one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Load Teacher and Student (Both on CPU to avoid GPU issues) ---
teacher = AutoModelForCausalLM.from_pretrained("fine_tuned_gpt2_telemetry", device_map=None).to("cpu")
student = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Ensure the student has the same vocabulary size as the teacher
student.resize_token_embeddings(len(tokenizer))

# --- Load Dataset ---
dataset = load_dataset('json', data_files={'train': 'train.jsonl'})
train_test_split = dataset['train'].train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# --- Tokenization + Labels Preparation ---
# GPT-2 uses causal language modeling, so "labels" = "input_ids"
def process_examples(examples):
    combined = [
        f"prompt: {p}\nresponse: {r}" for p, r in zip(examples['prompt'], examples['response'])
    ]
    tokenized = tokenizer(combined, truncation=True, max_length=512, padding="max_length")
    tokenized["labels"] = tokenized["input_ids"].copy()  # Causal LM needs labels = input_ids
    return tokenized

tokenized_train = train_dataset.map(process_examples, batched=True)
tokenized_eval = eval_dataset.map(process_examples, batched=True)

# --- Data Collator for Dynamic Padding ---
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=student,
    padding=True
)

# --- TrainingArguments (force CPU-only via no_cuda=True) ---
training_args = TrainingArguments(
    output_dir="./distilled_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,  # Small batch size (safer for CPU)
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=False,  # Mixed precision is disabled (no GPU)
    logging_dir="./logs",
    logging_steps=10,
    report_to="tensorboard",
    no_cuda=True  # Fully disable CUDA (critical fix for your GPU issues)
)

# --- Correct Distillation Loss (KL Divergence + CrossEntropy) ---
# ✅ This is token prediction across 50257 tokens — so we need:
#    - CrossEntropyLoss (hard target from dataset labels)
#    - KLDivLoss (soft target from teacher logits)

def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    """
    KL Divergence loss between soft probabilities of student and teacher.
    """
    student_probs = F.log_softmax(student_logits / temperature, dim=-1)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    return F.kl_div(student_probs, teacher_probs, reduction='batchmean') * (temperature ** 2)

# --- Custom Trainer for Knowledge Distillation ---
class DistillationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute combined loss:
        1. CrossEntropyLoss (hard label matching against ground truth tokens)
        2. KLDivLoss (soft label matching against teacher probabilities)
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        student_logits = outputs.logits

        # Forward pass through teacher (always on CPU to avoid GPU issues)
        with torch.no_grad():
            teacher_outputs = teacher(**inputs)
            teacher_logits = teacher_outputs.logits

        # Debugging check (should always match)
        assert student_logits.shape == teacher_logits.shape, (
            f"Logits shape mismatch! Student: {student_logits.shape}, Teacher: {teacher_logits.shape}"
        )

        # CrossEntropyLoss — this is the core loss for language modeling
        ce_loss = F.cross_entropy(
            student_logits.view(-1, student_logits.size(-1)),
            labels.view(-1),
            ignore_index=tokenizer.pad_token_id  # Ignore padding tokens during loss calc
        )

        # KLDivLoss — this is the distillation component (soft target matching)
        distill_loss = distillation_loss(student_logits, teacher_logits)

        # Final combined loss (50% CE, 50% distillation)
        total_loss = 0.5 * ce_loss + 0.5 * distill_loss

        return (total_loss, outputs) if return_outputs else total_loss

# --- Instantiate Trainer ---
trainer = DistillationTrainer(
    model=student,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# --- Start Training ---
trainer.train()

# --- Save Distilled Student Model ---
student.save_pretrained("distilled_gpt2_telemetry")
tokenizer.save_pretrained("distilled_gpt2_telemetry")

print("✅ CPU-only Distillation complete — Student saved to 'distilled_gpt2_telemetry'")





Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

  trainer = DistillationTrainer(


Epoch,Training Loss,Validation Loss
1,33.6667,26.494558
2,26.0895,19.545992
3,23.7152,19.037495


✅ CPU-only Distillation complete — Student saved to 'distilled_gpt2_telemetry'


## Load Distilled Model into CPU and GPU to check for corruption

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")

try:
    teacher = AutoModelForCausalLM.from_pretrained("./distilled_gpt2_telemetry", device_map=None)
    print("✅ Model loaded successfully to CPU.")

    teacher = teacher.to("cuda")
    print("✅ Model moved to GPU successfully.")
except Exception as e:
    print(f"❌ Error during model load/move: {e}")

CUDA Available: True
CUDA Device: Quadro M1000M
CUDA Version: 11.7
PyTorch Version: 2.0.0+cu117
✅ Model loaded successfully to CPU.
✅ Model moved to GPU successfully.


# Model Pruning

In [7]:

from torch.nn.utils import prune

model = AutoModelForCausalLM.from_pretrained("./distilled_gpt2_telemetry")

for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        prune.l1_unstructured(module, name='weight', amount=0.3)

model.save_pretrained("distilled_pruned_gpt2_telemetry")
print("✅ Model pruned and saved as 'distilled_pruned_gpt2_telemetry'.")
    

✅ Model pruned and saved as 'distilled_pruned_gpt2_telemetry'.


## Check Distilled and Pruned Model for corruption 

In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")

try:
    teacher = AutoModelForCausalLM.from_pretrained("./distilled_pruned_gpt2_telemetry", device_map=None)
    print("✅ Model loaded successfully to CPU.")

    teacher = teacher.to("cuda")
    print("✅ Model moved to GPU successfully.")
except Exception as e:
    print(f"❌ Error during model load/move: {e}")

CUDA Available: True
CUDA Device: Quadro M1000M
CUDA Version: 11.7
PyTorch Version: 2.0.0+cu117


Some weights of the model checkpoint at ./distilled_pruned_gpt2_telemetry were not used when initializing GPT2LMHeadModel: ['lm_head.weight_mask']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✅ Model loaded successfully to CPU.
✅ Model moved to GPU successfully.
