# GPT OSS MXFP4 (20B) Inference - T4 Safe Pipeline

This notebook provides a T4-safe pipeline for running GPT-style causal language models with 4-bit quantized inference.

**Features:**
- 20B parameter model support
- 4-bit weight quantization (bitsandbytes NF4) with float16 compute
- T4 GPU compatible memory management
- Safe inference with error handling

In [None]:
# Install required packages
!pip install -q torch transformers accelerate bitsandbytes
!pip install -q peft trl datasets

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import warnings
warnings.filterwarnings('ignore')

In [None]:
# T4-safe configuration. The default checkpoint below is only a small
# placeholder; swap in the 20B model you actually want to run.
model_name = "microsoft/DialoGPT-large"  # Replace with your 20B model

# bitsandbytes 4-bit quantization settings.
# NOTE: the quant type selected here is "nf4", not plain "fp4".
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # float16 compute dtype; presumably chosen because the T4 lacks native
    # bfloat16 support -- TODO confirm against the target hardware.
    bnb_4bit_compute_dtype=torch.float16,
    # Nested quantization: the quantization constants are quantized again.
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

gpu_available = torch.cuda.is_available()

print(f"Loading model: {model_name}")
print(f"GPU available: {gpu_available}")
# get_device_properties(0) raises when no CUDA device is present, so only
# query the memory size once a GPU has actually been detected.
if gpu_available:
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("GPU memory: unavailable (no CUDA device detected)")

In [None]:
# Load model with T4-safe settings
# Drop references left over from an earlier run of this cell first, so that a
# re-run does not keep the previous model alive while the new one loads and a
# failed load can never leave a stale model/tokenizer pair behind.
tokenizer = None
model = None

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16,
        # NOTE(review): trust_remote_code=True allows Python code shipped in
        # the checkpoint's repository to run locally. Keep it only for model
        # repositories you trust.
        trust_remote_code=True,
    )

    print("Model loaded successfully!")
    print(f"Model device: {next(model.parameters()).device}")

except Exception as e:
    # Reset both names so later cells see a clearly "not loaded" state rather
    # than a tokenizer without a matching model.
    tokenizer = None
    model = None
    print(f"Error loading model: {e}")
    print("Please check model name and GPU memory availability")

In [None]:
# Safe inference pipeline
def safe_generate(prompt, max_length=512, temperature=0.7, top_p=0.9):
    """Generate text for ``prompt`` using the notebook's global model.

    Relies on the module-level ``tokenizer`` and ``model`` objects created by
    the model-loading cell.

    Args:
        prompt: Text to feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``. Per the
            transformers documentation this bounds prompt tokens plus newly
            generated tokens together.
        temperature: Sampling temperature. ``None`` or a value <= 0 selects
            greedy decoding, because sampling requires a strictly positive
            temperature.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The decoded first output sequence with special tokens stripped, or a
        string starting with ``"Error during generation: "`` if anything in
        the pipeline raised.
    """
    try:
        encoded = tokenizer(prompt, return_tensors="pt")

        # Follow the model's actual placement instead of hard-coding CUDA:
        # device_map="auto" decides where the weights live, and a bare
        # .cuda() fails outright on a machine without a GPU.
        target_device = next(model.parameters()).device
        encoded = {
            name: tensor.to(target_device) for name, tensor in encoded.items()
        }

        # Sampling needs temperature > 0; otherwise fall back to greedy
        # decoding instead of letting generate() reject the arguments.
        use_sampling = temperature is not None and temperature > 0
        generation_kwargs = {
            "max_length": max_length,
            "do_sample": use_sampling,
            "pad_token_id": tokenizer.eos_token_id,
        }
        if use_sampling:
            generation_kwargs["temperature"] = temperature
            generation_kwargs["top_p"] = top_p

        with torch.no_grad():
            outputs = model.generate(**encoded, **generation_kwargs)

        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    except Exception as e:
        # Deliberate best-effort boundary: callers get an error string rather
        # than an exception, so notebook cells keep running.
        return f"Error during generation: {e}"

# Smoke-test the pipeline: run one short prompt and show both the prompt and
# whatever safe_generate returned (generated text or an error string).
test_prompt = "The future of artificial intelligence is"
result = safe_generate(test_prompt)
print(f"Prompt: {test_prompt}", f"Generated: {result}", sep="\n")