# GPT and ViT Model Deployment with FastAPI

This notebook implements and tests FastAPI endpoints for GPT-2 and ViT models.

In [None]:
# Install required packages
!pip install fastapi uvicorn transformers torch python-multipart pytest httpx python-dotenv numpy Pillow pydantic nest-asyncio

In [None]:
# Import necessary libraries
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import GPT2LMHeadModel, GPT2Tokenizer, ViTModel, ViTImageProcessor
import torch
from PIL import Image
import io
import base64
from typing import Optional
import time
import nest_asyncio
import uvicorn
from fastapi.testclient import TestClient
import numpy as np
import json
from datetime import datetime

# Enable nested async loops (required for Colab)
nest_asyncio.apply()

In [None]:
# Create FastAPI application
app = FastAPI()

# Load the GPT-2 and ViT checkpoints together with their preprocessors and
# move both networks onto the selected device. A failure at any step is
# printed and then re-raised, so the cell stops instead of continuing with
# partially defined globals.
print("Loading models...")
try:
    # GPT-2 language model and its tokenizer
    gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
    gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # ViT encoder and its image processor
    vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224')
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

    # Prefer CUDA when torch reports it as available, otherwise use the CPU
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(f"Using device: {device}")

    gpt_model.to(device)
    vit_model.to(device)

except Exception as exc:
    print(f"Error loading models: {exc}")
    raise

In [None]:
# Define request models
class TextRequest(BaseModel):
    """Request body for POST /generate_text."""

    # Prompt text that is passed to the GPT-2 tokenizer.
    text: str
    # Forwarded unchanged as `max_length` to `gpt_model.generate`;
    # 100 is used when the client omits the field.
    max_length: Optional[int] = 100

class ImageRequest(BaseModel):
    """Request body for POST /process_image."""

    # Base64-encoded bytes of an image file; the handler decodes them with
    # base64.b64decode and opens the result with PIL.
    image: str  # base64 encoded image

# Define API endpoints
@app.post("/generate_text")
async def generate_text(request: TextRequest):
    """Generate a continuation of ``request.text`` with the GPT model.

    Returns:
        dict with ``generated_text`` (the decoded first output sequence,
        special tokens skipped) and ``processing_time`` (seconds elapsed
        inside this handler).

    Raises:
        HTTPException: status 500 whose detail is the message of any
            exception raised while tokenizing, generating or decoding.
    """
    started = time.time()
    try:
        encoded = gpt_tokenizer(request.text, return_tensors="pt")
        encoded = encoded.to(device)

        generation_options = {
            "max_length": request.max_length,
            "num_return_sequences": 1,
            # The tokenizer's EOS id is supplied as the padding id.
            "pad_token_id": gpt_tokenizer.eos_token_id,
        }
        sequences = gpt_model.generate(**encoded, **generation_options)

        text = gpt_tokenizer.decode(sequences[0], skip_special_tokens=True)
        elapsed = time.time() - started
        return {"generated_text": text, "processing_time": elapsed}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))

@app.post("/process_image")
async def process_image(request: ImageRequest):
    """Embed a base64-encoded image with the ViT model.

    The payload is base64-decoded, opened with PIL, converted to RGB,
    preprocessed by ``vit_processor`` and run through ``vit_model``. The
    hidden state at sequence position 0 (the [CLS] token) is used as the
    embedding.

    Returns:
        dict with ``embedding_shape`` (shape of the [CLS] embedding array)
        and ``processing_time`` (seconds elapsed inside this handler).

    Raises:
        HTTPException: status 500 whose detail is the message of any
            exception raised while decoding, preprocessing or running the
            model.
    """
    start_time = time.time()
    try:
        # Decode base64 image
        image_bytes = base64.b64decode(request.image)
        # Normalise to three channels so RGBA, grayscale and palette images
        # are handled the same way as plain RGB uploads.
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Process image
        inputs = vit_processor(images=image, return_tensors="pt").to(device)
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = vit_model(**inputs)

        # Get the [CLS] token representation
        cls_token = outputs.last_hidden_state[:, 0].cpu().numpy()

        process_time = time.time() - start_time
        return {
            "embedding_shape": cls_token.shape,
            "processing_time": process_time
        }
    except Exception as e:
        # Chain the cause so server-side tracebacks show the real failure.
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.get("/health")
async def health_check():
    """Liveness probe: always answers with a fixed "healthy" status body."""
    status_payload = {"status": "healthy"}
    return status_payload

In [None]:
# Start the server in a separate thread
import threading


def run_server():
    """Run uvicorn for `app` on 127.0.0.1:8000."""
    uvicorn.run(app, host="127.0.0.1", port=8000)


# daemon=True: the interpreter does not wait for this thread at shutdown.
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()
print("Server started at http://127.0.0.1:8000")

# Wait a bit for the server to start
time.sleep(2)

In [None]:
# Testing functions
def create_test_image():
    """Return a 224x224 solid-red RGB image as base64-encoded PNG text."""
    buffer = io.BytesIO()
    Image.new('RGB', (224, 224), color='red').save(buffer, format='PNG')
    png_bytes = buffer.getvalue()
    return base64.b64encode(png_bytes).decode()

# In-process client used by all tests below to call `app` directly.
client = TestClient(app)


def test_health():
    """Check that GET /health answers 200 with the expected JSON body."""
    resp = client.get("/health")
    assert resp.status_code == 200
    expected_body = {"status": "healthy"}
    assert resp.json() == expected_body
    print("Health check passed!")

def _benchmark_endpoint(path, payload, label, num_requests):
    """POST `payload` to `path` `num_requests` times and time every call.

    Returns a list with one dict per request holding the request number, the
    client-side round-trip time and the server-reported processing time.
    """
    results = []
    for i in range(num_requests):
        # perf_counter is monotonic and high resolution, unlike time.time().
        start_time = time.perf_counter()
        response = client.post(path, json=payload)
        end_time = time.perf_counter()

        # Include the body so a failing request explains itself.
        assert response.status_code == 200, response.text

        result = {
            "request_number": i + 1,
            "total_time": end_time - start_time,
            "server_processing_time": response.json()["processing_time"],
        }
        results.append(result)
        print(f"{label} Request {i+1}/{num_requests} completed in {result['total_time']:.3f} seconds")
    return results


def _calculate_stats(results, model_name):
    """Print and return summary statistics for one endpoint's timings."""
    total_times = [r["total_time"] for r in results]
    processing_times = [r["server_processing_time"] for r in results]

    # Convert NumPy scalars to plain floats so the dict is safe to pass to
    # json.dump regardless of the scalar type NumPy returns.
    stats = {
        "average_total_time": float(np.mean(total_times)),
        "average_processing_time": float(np.mean(processing_times)),
        "min_time": float(np.min(total_times)),
        "max_time": float(np.max(total_times)),
        "std_dev": float(np.std(total_times)),
    }

    print(f"\n{model_name} Performance Statistics:")
    print(f"Average Total Time: {stats['average_total_time']:.3f} seconds")
    print(f"Average Processing Time: {stats['average_processing_time']:.3f} seconds")
    print(f"Min Time: {stats['min_time']:.3f} seconds")
    print(f"Max Time: {stats['max_time']:.3f} seconds")
    print(f"Standard Deviation: {stats['std_dev']:.3f} seconds")

    return stats


def run_performance_tests(num_requests=100):
    """Benchmark the text and image endpoints and save a JSON report.

    Runs the health check, sends `num_requests` requests to
    ``/generate_text`` and then to ``/process_image`` through the module's
    test client, prints per-request timings and summary statistics, and
    writes everything to ``test_results_<timestamp>.json`` in the current
    working directory.

    Args:
        num_requests: Number of requests sent to each endpoint (default 100).

    Raises:
        ValueError: If `num_requests` is smaller than 1, because the
            statistics are undefined for an empty sample.
        AssertionError: If the health check fails or any request does not
            return HTTP 200.
    """
    if num_requests < 1:
        raise ValueError("num_requests must be at least 1")

    # Test health endpoint
    test_health()

    # Test GPT endpoint
    print("\nTesting GPT API Performance...")
    gpt_results = _benchmark_endpoint(
        "/generate_text",
        {"text": "Once upon a time", "max_length": 50},
        "GPT",
        num_requests,
    )

    # Test ViT endpoint
    print("\nTesting ViT API Performance...")
    vit_results = _benchmark_endpoint(
        "/process_image",
        {"image": create_test_image()},
        "ViT",
        num_requests,
    )

    # Calculate and display statistics
    gpt_stats = _calculate_stats(gpt_results, "GPT")
    vit_stats = _calculate_stats(vit_results, "ViT")

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results = {
        "gpt": {
            "individual_results": gpt_results,
            "statistics": gpt_stats,
        },
        "vit": {
            "individual_results": vit_results,
            "statistics": vit_stats,
        },
    }

    with open(f"test_results_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)
    print(f"\nDetailed results saved to test_results_{timestamp}.json")

In [None]:
# Run the performance tests: 100 requests per endpoint by default, with the
# timings written to a timestamped test_results_*.json file.
run_performance_tests()