# Telemetry Pipeline - Model Fine Tuning Workflow
This notebook generates synthetic switch telemetry data, converts it into prompt/response training examples, fine-tunes GPT-2 on them, and verifies that the saved model can be reloaded.

                    GNU AFFERO GENERAL PUBLIC LICENSE
                       Version 3, 19 November 2007

Copyright (C) 2024 Shaji R. Nathan  
IP Infusion Inc.  
Email: shaji.nathan@ipinfusion.com  

This program is free software: you can redistribute it and/or modify  
it under the terms of the GNU Affero General Public License as  
published by the Free Software Foundation, either version 3 of the  
License, or (at your option) any later version.  

This program is distributed in the hope that it will be useful,  
but WITHOUT ANY WARRANTY; without even the implied warranty of  
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
GNU Affero General Public License for more details.  

You should have received a copy of the GNU Affero General Public License  
along with this program. If not, see <https://www.gnu.org/licenses/>.  

As per AGPLv3, if you modify this software and make it available over a  
network, you must provide the source code of your modifications under the  
same license.  

For inquiries, please contact:  
Shaji R. Nathan  
IP Infusion Inc.  
Email: shaji.nathan@ipinfusion.com  


In [1]:

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_synthetic_switch_telemetry(num_samples=100, seed=42, start_time=None,
                                        output_path='synthetic_switch_telemetry.csv'):
    """Generate synthetic transceiver telemetry and optionally save it as CSV.

    Each sample gets a timestamp (5-minute steps going back from ``start_time``),
    a module name cycling through ``QSFP-1`` .. ``QSFP-32``, a temperature, a
    transceiver voltage and, for channels 1-4, input power, output power and
    laser bias current. All numeric values are drawn from uniform distributions.

    Args:
        num_samples: Number of telemetry rows to generate.
        seed: Seed for the private random generator. The default (42) reproduces
            the values the original global-seed implementation produced.
        start_time: ``datetime`` of the newest sample. Defaults to
            ``datetime.now()``; pass a fixed value for fully reproducible output.
        output_path: Where to write the CSV. Pass ``None`` to skip writing.

    Returns:
        pandas.DataFrame with one row per sample; ``timestamp`` is stored as str.
    """
    # A private RandomState yields the same stream as np.random.seed(seed) followed
    # by np.random.uniform, but does not clobber the global NumPy RNG state that
    # other cells may rely on.
    rng = np.random.RandomState(seed)

    if start_time is None:
        start_time = datetime.now()
    timestamps = [start_time - timedelta(minutes=5 * i) for i in range(num_samples)]
    names = [f"QSFP-{i % 32 + 1}" for i in range(num_samples)]

    # Draw order (temp, voltage, then per-channel in/out/bias) is kept fixed so a
    # given seed always maps to the same table.
    data = {
        'timestamp': timestamps,
        'name': names,
        'temp': rng.uniform(20, 80, num_samples),
        'trans-volt': rng.uniform(3.2, 3.5, num_samples)
    }

    for ch in range(1, 5):
        data[f'channel_{ch}_in_pwr'] = rng.uniform(-3, 3, num_samples)
        data[f'channel_{ch}_out_pwr'] = rng.uniform(-3, 3, num_samples)
        data[f'channel_{ch}_laser_bias_cur'] = rng.uniform(5, 10, num_samples)

    df = pd.DataFrame(data)
    # Store timestamps as plain strings so they serialise cleanly to CSV/JSON.
    df['timestamp'] = df['timestamp'].astype(str)
    if output_path is not None:
        df.to_csv(output_path, index=False)
    return df

# Build a 200-sample telemetry frame (the call also writes the CSV) and preview it.
df = generate_synthetic_switch_telemetry(num_samples=200)
df.head()
    

Unnamed: 0,timestamp,name,temp,trans-volt,channel_1_in_pwr,channel_1_out_pwr,channel_1_laser_bias_cur,channel_2_in_pwr,channel_2_out_pwr,channel_2_laser_bias_cur,channel_3_in_pwr,channel_3_out_pwr,channel_3_laser_bias_cur,channel_4_in_pwr,channel_4_out_pwr,channel_4_laser_bias_cur
0,2025-03-08 00:44:21.737018,QSFP-1,42.472407,3.392609,-2.381257,-1.98639,8.536193,-1.889202,1.549579,5.83521,-1.848796,2.739008,6.308528,2.860802,-0.956375,8.147788
1,2025-03-08 00:39:21.737018,QSFP-2,77.042858,3.225242,2.415317,-1.328458,5.762695,0.251406,-2.852479,5.838096,-1.059771,1.42505,6.234894,-2.335676,-2.569729,5.27166
2,2025-03-08 00:34:21.737018,QSFP-3,63.919637,3.248489,0.031514,-1.937937,7.881442,2.237675,-2.867259,5.183357,-1.640062,-0.880492,9.531273,-0.464709,-0.542227,8.743226
3,2025-03-08 00:29:21.737018,QSFP-4,55.919509,3.469566,1.958745,-2.467785,8.033575,1.393349,-1.058339,8.68201,-0.870022,-1.220787,6.247731,-2.747852,-1.132695,6.587934
4,2025-03-08 00:24:21.737018,QSFP-5,29.361118,3.381929,-1.079702,-2.276185,7.120653,1.839367,-0.068141,8.319023,-2.583457,-0.901781,6.359749,1.439425,1.062724,5.000673


In [2]:

import json

def classify_anomaly(row):
    """Label one telemetry row as "Normal" or list the anomalies it exhibits.

    Checks, in order: temperature above 75, transceiver voltage outside
    [3.1, 3.5], and for each of channels 1-4 a laser bias current above 9
    followed by an output power below -3.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by column name.

    Returns:
        "Normal" when nothing is flagged, otherwise "Anomalous - " followed by
        the comma-separated anomaly names in the order they were detected.
    """
    findings = []

    if row['temp'] > 75:
        findings += ["Overheating"]

    supply_voltage = row['trans-volt']
    # Written as two one-sided tests so a NaN reading is not flagged.
    if supply_voltage > 3.5 or supply_voltage < 3.1:
        findings += ["Voltage Drift"]

    for lane in (1, 2, 3, 4):
        bias_column = f'channel_{lane}_laser_bias_cur'
        tx_power_column = f'channel_{lane}_out_pwr'
        if row[bias_column] > 9:
            findings.append(f"Channel {lane} Bias Current Spike")
        if row[tx_power_column] < -3:
            findings.append(f"Channel {lane} Power Loss")

    if not findings:
        return "Normal"
    return "Anomalous - " + ', '.join(findings)

def row_to_prompt(row):
    """Render one telemetry row as a plain-text report ending with a question.

    The report has a header, the timestamp, the module name, the temperature
    and transceiver voltage, then three lines (input power, output power,
    laser bias current) for each of channels 1-4. Numeric values are formatted
    with two decimals.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by column name.

    Returns:
        The prompt string, terminated by "Is this normal or anomalous?".
    """
    # Collect the fragments and join once instead of growing a string.
    pieces = [
        "Telemetry Report:\n",
        f"- Timestamp: {row['timestamp']}\n",
        f"- Module: {row['name']}\n",
        f"- Temperature: {row['temp']:.2f}°C\n",
        f"- Transceiver Voltage: {row['trans-volt']:.2f}V\n",
    ]

    for lane in (1, 2, 3, 4):
        rx_power = row[f'channel_{lane}_in_pwr']
        tx_power = row[f'channel_{lane}_out_pwr']
        bias_current = row[f'channel_{lane}_laser_bias_cur']
        pieces.append(f"- Channel {lane} Input Power: {rx_power:.2f} dBm\n")
        pieces.append(f"- Channel {lane} Output Power: {tx_power:.2f} dBm\n")
        pieces.append(f"- Channel {lane} Laser Bias Current: {bias_current:.2f} mA\n")

    pieces.append("\nIs this normal or anomalous?")
    return "".join(pieces)

# Write the training set as JSON Lines: one {"prompt", "response"} object per row.
with open('train.jsonl', 'w') as jsonl_file:
    for _, telemetry_row in df.iterrows():
        record = {
            "prompt": row_to_prompt(telemetry_row),
            "response": classify_anomaly(telemetry_row),
        }
        jsonl_file.write(json.dumps(record) + '\n')

print("✅ train.jsonl created.")
    

✅ train.jsonl created.


# Safe Fine-Tuning and Saving 

In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# --- Debug GPU Information ---
# CUDA_LAUNCH_BLOCKING is only honoured when it is set before CUDA is initialised,
# so export it before the first torch.cuda call (get_device_name below initialises
# CUDA). It makes kernel launches synchronous, which gives clearer CUDA error
# locations at the cost of speed.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Force clearer error reporting from CUDA

cuda_available = torch.cuda.is_available()
print(f"CUDA Available: {cuda_available}")
if cuda_available:
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")

# --- Load Dataset ---
dataset = load_dataset('json', data_files={'train': 'train.jsonl'})
# Fixed seed so the 80/20 train/eval split is the same on every run.
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# --- Load Tokenizer ---
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure tokenizer has padding token (GPT-2 does not have one by default)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Load Model Safely ---
# Keep the weights in float32 even when mixed-precision (fp16) training is enabled:
# the Trainer's automatic mixed precision expects float32 master weights.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

# Move to GPU after verifying load works; fall back to CPU when no GPU is present
# instead of crashing on an unconditional .to("cuda").
model = model.to("cuda" if cuda_available else "cpu")

# --- Tokenization Helper ---
def concatenate_prompt_response(examples):
    """Join each prompt/response pair into one training text and tokenize the batch.

    Args:
        examples: Batch mapping with parallel 'prompt' and 'response' sequences
            (the form ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the merged texts,
        called with truncation enabled and a 512-token maximum length.
    """
    merged_texts = []
    for prompt_text, response_text in zip(examples['prompt'], examples['response']):
        merged_texts.append(f"prompt: {prompt_text}\nresponse: {response_text}")
    return tokenizer(merged_texts, truncation=True, max_length=512)

# Tokenize the train and eval splits with the same batched helper
tokenized_train, tokenized_eval = (
    split.map(concatenate_prompt_response, batched=True)
    for split in (train_dataset, eval_dataset)
)

# --- Data Collator (dynamic padding) ---
# mlm=False selects causal-LM collation rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- Training Arguments ---
# Evaluate and checkpoint once per epoch, keeping at most two checkpoints on disk.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes the
    # cell fail on CPU-only machines, so turn it on only when a GPU is present.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=10,
    report_to="tensorboard"
)

# --- Trainer Setup ---
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version before
# switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# --- Train Model ---
trainer.train()

# --- Safe Save (CPU-based) ---
print("✅ Training complete. Saving model to CPU...")

# Move the weights to CPU before serialising them.
model = model.to("cpu")
model.save_pretrained("fine_tuned_gpt2_telemetry")
tokenizer.save_pretrained("fine_tuned_gpt2_telemetry")

print("✅ Model and tokenizer saved safely to 'fine_tuned_gpt2_telemetry'.")

# --- Post-save Reload Test ---
print("✅ Reloading saved model for sanity check...")

# Run the check on the GPU when one exists, otherwise on CPU, so the reload test
# does not fail merely because CUDA is unavailable.
reload_device = "cuda" if torch.cuda.is_available() else "cpu"

reloaded_model = AutoModelForCausalLM.from_pretrained("fine_tuned_gpt2_telemetry", torch_dtype=torch.float32)
reloaded_model = reloaded_model.to(reload_device)

reloaded_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_gpt2_telemetry")

# Quick inference test to confirm save/load worked. Only the logits shape is
# inspected, so the prompt content itself is irrelevant here.
test_input = "prompt: What is knowledge distillation?\nresponse:"
inputs = reloaded_tokenizer(test_input, return_tensors="pt").to(reload_device)

with torch.no_grad():
    outputs = reloaded_model(**inputs)

print(f"✅ Reloaded model test passed. Output shape: {outputs.logits.shape}")





CUDA Available: True
CUDA Device: Quadro M1000M
CUDA Version: 11.7
PyTorch Version: 2.0.0+cu117


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.5241,0.480681
2,0.4732,0.461769


# Test for Model Corruption

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Report the CUDA environment so a failure below can be told apart from a setup issue.
cuda_available = torch.cuda.is_available()
print(f"CUDA Available: {cuda_available}")
if cuda_available:
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")

try:
    # Load the saved checkpoint on CPU first.
    teacher = AutoModelForCausalLM.from_pretrained("./fine_tuned_gpt2_telemetry", device_map=None)
    print("✅ Model loaded successfully to CPU.")

    # Only attempt the GPU move when CUDA exists; otherwise a missing GPU would be
    # reported as a model load/move error and be mistaken for a corrupt checkpoint.
    if cuda_available:
        teacher = teacher.to("cuda")
        print("✅ Model moved to GPU successfully.")
    else:
        print("⚠️ CUDA not available; skipping GPU move.")
except Exception as e:
    # Deliberately broad: this cell is a diagnostic and should print any failure.
    print(f"❌ Error during model load/move: {e}")
