In [None]:
# Productionizing LLM applications

# Install necessary libraries
!pip install transformers flask flask-ngrok

# Import libraries
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok
import threading
import time
import os

# --- Step 1: Loading a Pre-trained LLM ---

# Choose a small model for demonstration
MODEL_NAME = "gpt2"

# Load the tokenizer and model. Both calls may hit the network or the local
# cache, so any failure here is treated as fatal for the whole application.
try:
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
except Exception as e:
    print(f"Error loading model: {e}")
    # Abort with a non-zero status so callers (shell scripts, CI, container
    # orchestrators) can detect the failure. The bare `exit()` helper is only
    # injected by the `site` module and would report success (status 0).
    raise SystemExit(1) from e
else:
    print(f"Successfully loaded model and tokenizer for {MODEL_NAME}")

# Set padding token for GPT2, as it doesn't have one by default
# This is useful for batching requests later, though not strictly needed for this simple example
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Evaluation mode: disables training-only behavior such as dropout

print(f"Using device: {device}")

# --- Step 2 & 3: Serving the Model with Flask and Creating an API Endpoint ---

# Initialize the Flask application object; the route decorators below attach
# their endpoints to this instance.
app = Flask(__name__)

# Use ngrok to expose the Flask app running in Colab to the internet
# Note: This is for demonstration. In production, you would use a proper web server like Gunicorn/uWSGI
# behind a reverse proxy like Nginx/Apache.
# NOTE(review): presumably this must be called before app.run() is invoked for
# the tunnel to be set up - confirm against the flask-ngrok documentation.
run_with_ngrok(app)

def _parse_generate_payload(payload):
    """
    Validate the decoded JSON body of a /generate request.

    Args:
        payload: The decoded JSON body, or None if the body was missing or
            could not be parsed as JSON.

    Returns:
        A (prompt, max_length, error) tuple. On success `error` is None;
        on failure `prompt` and `max_length` are None and `error` is a
        human-readable message suitable for a 400 response.
    """
    # Only a JSON object can carry a 'prompt' key; lists, strings and numbers
    # are rejected here instead of failing later with a TypeError.
    if not isinstance(payload, dict) or 'prompt' not in payload:
        return None, None, "Invalid request. Please provide a JSON payload with a 'prompt' key."

    prompt = payload['prompt']
    max_length = payload.get('max_length', 50)  # Default max_length to 50

    # bool is a subclass of int, so `true`/`false` must be rejected explicitly.
    length_ok = (
        isinstance(max_length, int)
        and not isinstance(max_length, bool)
        and max_length > 0
    )
    if not isinstance(prompt, str) or not length_ok:
        return None, None, "Invalid prompt (must be string) or max_length (must be a positive integer)."

    # An empty prompt tokenizes to zero tokens, which generation cannot start from.
    if not prompt:
        return None, None, "Invalid prompt (must not be empty)."

    return prompt, max_length, None


@app.route("/generate", methods=["POST"])
def generate_text():
    """
    API endpoint to generate text based on a prompt.

    Expects a JSON payload with a 'prompt' key (non-empty string) and an
    optional 'max_length' key (positive integer, default 50). 'max_length'
    is the total sequence length in tokens, i.e. it includes the prompt.

    Returns:
        200 with {"generated_text": ...} containing only the continuation
        (the prompt itself is not echoed back),
        400 with {"error": ...} when the request is invalid,
        500 with {"error": ...} when tokenization or generation fails.
    """
    # --- Step 4: Adding Basic Error Handling ---
    # silent=True makes a missing/malformed JSON body come back as None so we
    # can answer with our own JSON error instead of Flask's default error page.
    prompt, max_length, error = _parse_generate_payload(request.get_json(silent=True))
    if error is not None:
        return jsonify({"error": error}), 400

    print(f"Received prompt: '{prompt}' with max_length: {max_length}")

    try:
        # Tokenize the prompt; calling the tokenizer (rather than .encode)
        # also yields the attention mask that generate() needs.
        encoded = tokenizer(prompt, return_tensors='pt').to(device)
        input_ids = encoded["input_ids"]
        prompt_tokens = input_ids.shape[-1]

        # max_length counts the prompt tokens too, so it must leave room for
        # at least one generated token.
        if prompt_tokens >= max_length:
            return jsonify({
                "error": f"max_length ({max_length}) must be greater than the prompt length in tokens ({prompt_tokens})."
            }), 400

        # Requests beyond the model's positional capacity cannot succeed;
        # report that as a client error rather than a generic 500.
        context_limit = getattr(model.config, "max_position_embeddings", None)
        if context_limit is not None and max_length > context_limit:
            return jsonify({
                "error": f"max_length ({max_length}) exceeds the model's maximum sequence length ({context_limit})."
            }), 400

        # Generate text
        # You might want to add other generation parameters like do_sample, temperature, top_k, top_p etc.
        output = model.generate(
            input_ids,
            # Passed explicitly: pad and EOS share one token id here, so the
            # mask cannot be reliably inferred from the input ids alone.
            attention_mask=encoded["attention_mask"],
            max_length=max_length,
            num_return_sequences=1,  # Generate only one sequence
            no_repeat_ngram_size=2,  # Avoid repeating n-grams
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode only the newly generated tokens. Dropping the prompt by token
        # count is more reliable than a string prefix match, because decoding
        # may normalize whitespace so the text no longer starts with the prompt.
        generated_text = tokenizer.decode(
            output[0][prompt_tokens:], skip_special_tokens=True
        ).strip()

        print(f"Generated text: '{generated_text}'")

        return jsonify({"generated_text": generated_text})

    except Exception as e:
        # Top-level request boundary: log the failure and hide internals from the client.
        print(f"Error during text generation: {e}")
        return jsonify({"error": "An internal error occurred during text generation."}), 500

# Function to run the Flask app in a separate thread
# This is needed in Colab/Jupyter to keep the notebook interactive
def run_flask():
    """Start the Flask server for `app` using its default run() settings."""
    # app.run() does not return while the server is up, which is why this
    # function is meant to be used as a thread target rather than called inline.
    app.run()

# Start the Flask app in a new thread
# Daemon thread: the main program may exit even while the server is still running.
thread = threading.Thread(target=run_flask, daemon=True)
thread.start()

# Pause briefly so Flask/ngrok have a chance to come up and establish the tunnel
print("Starting Flask server...")
time.sleep(5)

# You can test the API by sending a POST request to the ngrok URL.
# The ngrok URL will be printed by the `run_with_ngrok` function output.
# Example using `requests` (run this in a separate cell after the server starts):
#
# import requests
#
# # Replace with the actual ngrok URL printed above
# ngrok_url = "YOUR_NGROK_URL"
#
# data = {"prompt": "Tell me a short story about a cat and a dog.", "max_length": 100}
#
# try:
#     response = requests.post(f"{ngrok_url}/generate", json=data)
#     response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
#     result = response.json()
#     print("API Response:", result)
# except requests.exceptions.RequestException as e:
#     print(f"Error calling API: {e}")
#

# --- Step 5: Containerization (Conceptual) ---

# While we can't demonstrate full containerization in Colab,
# the typical next step for production is to package this application
# into a Docker container.

# A basic Dockerfile would look something like this:
#
# FROM python:3.9-slim
#
# WORKDIR /app
#
# COPY requirements.txt .
# RUN pip install --no-cache-dir -r requirements.txt
#
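# # Copy your Python scripts (e.g., app.py).
# # Note: Dockerfile comments must be on their own line; a trailing "# ..." after
# # an instruction is treated as part of that instruction's arguments.
# COPY . .
#
# # Expose the port your Flask app listens on (5000 is Flask's default port)
# EXPOSE 5000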
#
# # Command to run the application using a production-ready WSGI server
# CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:5000", "app:app"]
#
# You would need a `requirements.txt` containing:
# transformers
# torch
# flask
# gunicorn # for production server

# To build and run the Docker container (outside of Colab):
# docker build -t llm-api .
# docker run -p 5000:5000 llm-api
#

# For production deployment, you would then push this image to a container registry
# (like Docker Hub, Google Container Registry, AWS ECR) and deploy it to:
# - Kubernetes (GKE, EKS, AKS)
# - Serverless platforms (Cloud Run, AWS Lambda behind API Gateway)
# - Virtual Machines

# Considerations for Production LLMs:
# - Model Size: Large models require significant memory and compute. Consider smaller models, quantization, or model serving solutions optimized for large models (like NVIDIA Triton, or cloud-specific AI platforms).
# - Latency and Throughput: Optimizing model inference speed is crucial. Techniques include batching requests, using optimized runtimes (TensorRT, OpenVINO), and choosing appropriate hardware (GPUs, TPUs).
# - Scalability: Your serving infrastructure must be able to handle varying load. This is where Kubernetes or serverless platforms shine.
# - Cost: Running powerful GPUs for LLMs can be expensive. Choose cost-effective instances and scale down when not needed.
# - Monitoring: Implement logging and metrics to track API usage, errors, and model performance.
# - Security: Secure your API endpoints.
# - Model Updates: Plan how you will update the model without downtime.

# Final reminder for the notebook user, emitted as two lines in a single call.
print(
    "\nFlask server is running. Use the ngrok URL printed above to send POST requests to the /generate endpoint.",
    "Remember to replace 'YOUR_NGROK_URL' in the example testing code.",
    sep="\n",
)



In [None]:
#
# --- Step 6: Basic Load Testing and Performance Considerations ---

# In a real production scenario, you'd want to perform load testing
# to understand how your API behaves under stress and identify bottlenecks.
# Tools like `locust`, `Apache JMeter`, or `bombardier` can be used.

# Example (Conceptual):
# Using `requests` to send multiple requests:
#
# import requests
# import time
# import concurrent.futures
#
# ngrok_url = "YOUR_NGROK_URL" # Replace with the actual ngrok URL
# endpoint = f"{ngrok_url}/generate"
# num_requests = 10 # Number of concurrent requests
#
# def send_request(prompt, max_length=50):
#     try:
#         data = {"prompt": prompt, "max_length": max_length}
#         response = requests.post(endpoint, json=data, timeout=60) # Add timeout
#         response.raise_for_status()
#         return response.json().get("generated_text", "Error")
#     except requests.exceptions.RequestException as e:
#         return f"Request Error: {e}"
#
# # Example prompts
# prompts = [
#     "The quick brown fox jumps over the lazy dog.",
#     "Once upon a time in a land far, far away,",
#     "Explain the concept of machine learning.",
#     "Write a short poem about stars.",
# ]
#
# print(f"\nStarting basic load test with {num_requests} concurrent requests...")
# start_time = time.time()
#
# # Use ThreadPoolExecutor for concurrent requests
# with concurrent.futures.ThreadPoolExecutor(max_workers=num_requests) as executor:
#     # Map prompts to the send_request function
#     # Cycle through prompts if num_requests is greater than the number of prompts
#     results = list(executor.map(send_request, [prompts[i % len(prompts)] for i in range(num_requests)]))
#
# end_time = time.time()
# print("Load test finished.")
#
# # Print results (optional, can be verbose)
# # for i, result in enumerate(results):
# #     print(f"Request {i+1}: {result[:100]}...") # Print first 100 chars
#
# print(f"Total time taken for {num_requests} requests: {end_time - start_time:.2f} seconds")
# print(f"Average time per request: {(end_time - start_time) / num_requests:.2f} seconds")
#
# # In a real test, you would also measure:
# # - Success rate (how many requests returned 2xx status)
# # - Latency distribution (min, max, average, percentiles)
# # - Error rate
# # - Server-side metrics (CPU/GPU usage, memory, request queue length)


# --- Step 7: Adding Logging and Monitoring ---

# Implement robust logging to understand what's happening in your application
# and for debugging. In production, you'd integrate with a centralized logging
# system (like Stackdriver, ELK stack, Splunk).

import logging

# Configure basic logging.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it, basicConfig() silently does nothing when handlers exist,
# which is common in notebook environments, so the level and format below
# would never take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Replace print statements with logging calls
# Example in the generate_text function:
# logging.info(f"Received prompt: '{prompt}' with max_length: {max_length}")
# logging.info(f"Generated text: '{generated_text}'")
# logging.error(f"Error during text generation: {e}", exc_info=True) # Log exception traceback

# For monitoring, you would expose metrics about your application (e.g., number of requests,
# latency, error counts) using a library like `prometheus_client` and integrate with
# a monitoring system (Prometheus, Datadog, Cloud Monitoring).

# --- Step 8: Caching (Consideration) ---

# For frequently requested prompts, consider implementing a caching layer
# (e.g., using Redis or a simple in-memory cache) to return pre-computed
# responses and reduce inference time and cost.

# Example (Conceptual simple in-memory cache):
#
# cache = {}
# CACHE_TTL = 3600 # Cache time-to-live in seconds
#
# @app.route("/generate", methods=["POST"])
# def generate_text_with_cache():
#     # ... (input validation as before)
#
#     cache_key = f"{prompt}_{max_length}"
#
#     # Check cache
#     if cache_key in cache and (time.time() - cache[cache_key]['timestamp']) < CACHE_TTL:
#         logging.info(f"Cache hit for key: {cache_key}")
#         return jsonify({"generated_text": cache[cache_key]['text']})
#
#     logging.info(f"Cache miss for key: {cache_key}. Generating text...")
#
#     try:
#         # ... (Text generation logic as before)
#
#         # Store in cache
#         cache[cache_key] = {'text': generated_text, 'timestamp': time.time()}
#         logging.info(f"Cached result for key: {cache_key}")
#
#         return jsonify({"generated_text": generated_text})
#
#     except Exception as e:
#         # ... (Error handling as before)
#         pass
#
# # Note: A simple dictionary cache like this is not thread-safe and
# # doesn't handle cache eviction well. Use dedicated caching libraries
# # or services in production.

# --- Step 9: Asynchronous Processing / Task Queues (Consideration) ---

# For requests that might take a long time (e.g., generating very long texts),
# processing them synchronously in the web server can block the server and
# lead to timeouts. Consider using a task queue (like Celery with Redis/RabbitMQ)
# to process these requests asynchronously.

# The web server would accept the request, queue the task, and immediately
# return a response (e.g., a job ID). A separate worker process would pick
# up the task from the queue, perform the generation, and store the result.
# The client could then poll an endpoint with the job ID to retrieve the result
# or receive it via a webhook.

# --- Step 10: Model Quantization and Optimization ---

# For models that are too large or slow, techniques like quantization,
# pruning, or using optimized model formats (like ONNX) can significantly
# reduce model size and improve inference speed on various hardware.
# Libraries like `Hugging Face Optimum`, `PyTorch Mobile`, or cloud AI platforms
# offer tools for this.

# Example (Conceptual with basic quantization - requires specific libraries):
#
# from torch.quantization import quantize_dynamic
#
# # Apply dynamic quantization to the model (post-training)
# # This is a simple example, more advanced techniques exist.
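# # Dynamic quantization only supports a limited set of module types (e.g. nn.Linear,
# # nn.LSTM). nn.Conv1d is not supported, and nn.Embedding requires a dedicated
# # weight-only quint8 qconfig rather than dtype=torch.qint8.
# # Note: Hugging Face GPT-2 implements most of its projections with its own Conv1D
# # class instead of nn.Linear, so only the remaining nn.Linear layers are quantized here.
# quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)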
#
# # Use quantized_model for inference
# # This might reduce model size and potentially improve CPU inference speed.
# # model = quantized_model # Replace the original model with the quantized one if suitable

# --- Step 11: A/B Testing and Model Versioning ---

# In production, you'll often have different versions of your model or
# generation parameters. You'll want to deploy these side-by-side and
# direct a portion of traffic to each version to compare performance
# (e.g., response quality, latency, cost) through A/B testing.
# Deployment platforms and API gateways often provide routing capabilities
# for this.

# --- Step 12: Edge Deployment (Consideration) ---

# For applications requiring very low latency or offline capabilities,
# deploying a smaller, optimized version of the model directly to
# user devices (mobile phones, browsers via WebAssembly/WebGPU) might
# be an option. This requires model conversion and integration with
# mobile/web frameworks.

# --- Step 13: Cost Management ---

# Running LLMs, especially large ones on GPUs, can be expensive.
# Monitor your infrastructure costs closely. Optimize model size and inference
# speed. Consider using spot instances or reserved instances for predictable
# workloads. Scale down infrastructure during low-traffic periods.



In [None]:
#
# --- Step 14: Security Considerations ---

# Protecting your API and your LLM model from malicious use is crucial.

# - API Authentication and Authorization: Use API keys, OAuth, or other
#   authentication mechanisms to ensure only authorized clients can access
#   your /generate endpoint.
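# - Input Sanitization: Validate and bound user inputs (type, length, encoding).
#   Classic injection attacks (e.g., SQL injection) are less common in text
#   generation APIs than in database-backed ones, but prompt injection - input
#   crafted to override your instructions or extract hidden context - is a
#   real risk whenever user text is combined with system prompts or tools.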
# - Output Moderation: LLMs can sometimes generate inappropriate, biased,
#   or toxic content. Implement post-processing filters or integrate with
#   content moderation services to filter harmful outputs before returning
#   them to the user.
# - Rate Limiting: Protect your API from abuse and denial-of-service attacks
#   by implementing rate limiting based on IP address, API key, or user ID.
#   Flask-Limiter is a library that can help with this.

# Example (Conceptual rate limiting with Flask-Limiter):
#
# !pip install Flask-Limiter
#
# from flask_limiter import Limiter
# from flask_limiter.util import get_remote_address
#
# # Initialize Limiter
# # Apply a global limit of 10 requests per minute per IP address
# # In a real app, use a more robust storage like Redis instead of in-memory
# limiter = Limiter(
#     get_remote_address,
#     app=app,
#     default_limits=["10 per minute"],
#     storage_uri="memory://", # Use "redis://localhost:6379" for Redis
# )
#
# # Apply the limit to the generate_text endpoint
# @app.route("/generate", methods=["POST"])
# @limiter.limit("2 per second", override_defaults=False) # Add a stricter limit for this endpoint
# def generate_text_with_security():
#     # ... (existing generate_text logic)
#     pass
#
# # Note: Remember to apply rate limiting appropriate for your expected traffic.


# --- Step 15: Integrating with Data Pipelines and Feedback Loops ---

# In production, your LLM might be part of a larger data pipeline.
# - Data Ingestion: Where does the input prompt come from?
# - Output Storage: Where do you store the generated text? For analysis,
#   auditing, or further processing?
# - Feedback Loops: How do you collect user feedback on the quality of
#   generated text? This feedback is crucial for monitoring model performance
#   over time and for fine-tuning/improving the model.

# Example (Conceptual - logging feedback):
#
# @app.route("/feedback", methods=["POST"])
# def receive_feedback():
#     """
#     API endpoint to receive user feedback on generated text.
#     Expects a JSON payload with 'generated_text_id', 'rating', 'comment'.
#     """
#     if not request.json or 'generated_text_id' not in request.json or 'rating' not in request.json:
#         return jsonify({"error": "Invalid request."}), 400
#
#     feedback_data = request.json
#     logging.info(f"Received feedback: {feedback_data}")
#
#     # In a real application, you would store this feedback in a database
#     # or a data warehouse for analysis.
#
#     return jsonify({"message": "Feedback received."}), 200

# --- Step 16: Handling Compliance and Regulatory Requirements ---

# Depending on your industry and region, you might have compliance requirements
# regarding data privacy (e.g., GDPR, CCPA), content restrictions, or
# explainability/fairness of AI models. Ensure your deployment strategy and
# data handling practices comply with relevant regulations.
# - Data Storage: Anonymize or pseudonymize sensitive data if prompts or
#   generated text contain it.
# - Model Explainability: While LLMs are often black boxes, some techniques
#   (like attention visualization, LIME, SHAP - though harder for generation)
#   can provide insights if explainability is required.
# - Bias Mitigation: Continuously evaluate your model for biases in its
#   output and implement strategies to mitigate them.

# --- Step 17: Disaster Recovery and Business Continuity ---

# What happens if your primary deployment region goes down?
# Implement a disaster recovery plan:
# - Redundancy: Deploy your application across multiple availability zones
#   or regions.
# - Backups: Regularly back up your model weights and application configuration.
# - Failover: Have a mechanism to automatically or manually failover to a
#   secondary deployment if the primary fails.
# - Monitoring: Set up alerts to notify you of outages.

# --- Step 18: Cost Optimization Strategies ---

# Revisiting cost (see Step 13): beyond choosing instance types, consider:
# - Spot Instances/Preemptible VMs: Use cheaper, interruptible instances for
#   batch processing or less critical workloads.
# - Auto-scaling: Configure your infrastructure to automatically scale up
#   during peak load and scale down during off-peak times.
# - Model Serving Platforms: Cloud providers' managed AI platforms often
#   offer optimized serving and pricing models (e.g., pay-per-inference).
# - Model Choice: Re-evaluate if a smaller, less expensive model can meet
#   your requirements.

# --- Step 19: Documentation and API Management ---

# Provide clear documentation for your API. Use tools like OpenAPI (Swagger)
# to define your API contract. Consider using an API Gateway for managing
# multiple endpoints, handling authentication, rate limiting, and monitoring
# in a centralized way.

# --- Step 20: Continuous Integration/Continuous Deployment (CI/CD) ---

# Automate the process of building, testing, and deploying your LLM application.
# - Version Control: Store your code and model files in a version control system (Git).
# - Automated Testing: Implement unit tests, integration tests, and potentially
#   model performance tests.
# - CI Pipeline: Automatically build and test your application on every code change.
# - CD Pipeline: Automate deployment to staging and production environments
#   after successful testing.

# This is a more comprehensive overview of considerations for productionizing LLM applications.
# Each step can be a complex topic in itself, and the specific implementation
# details will depend heavily on your chosen cloud provider, infrastructure,
# and specific application requirements.


In [None]:
#
# --- Step 21: Leveraging Cloud-Specific AI Platforms ---

# Instead of building everything from scratch using VMs and Flask,
# consider using managed AI/ML platforms provided by cloud providers.
# These platforms offer features optimized for model deployment and serving,
# often including:
# - Easy model deployment from various frameworks (TensorFlow, PyTorch, Hugging Face).
# - Auto-scaling based on traffic.
# - Integrated monitoring and logging.
# - Optimized inference hardware (GPUs, TPUs, custom chips).
# - A/B testing and model versioning capabilities.
# - Serverless inference options (pay-per-prediction).

# Examples:
# - Google Cloud AI Platform Prediction / Vertex AI
# - AWS SageMaker Endpoints
# - Azure Machine Learning Endpoints

# Using these platforms can significantly reduce the operational overhead
# compared to managing your own infrastructure. You would typically upload
# your trained model artifacts and define the serving container/environment.

# --- Step 22: Using Specialized LLM Serving Frameworks ---

# For serving large and complex LLMs specifically, there are specialized
# open-source and commercial serving frameworks designed for high throughput
# and low latency. These frameworks often implement advanced techniques like:
# - Continuous Batching: Efficiently processes multiple requests simultaneously.
# - Quantization and Sparsity optimizations.
# - Efficient attention mechanisms.
# - Speculative Decoding.

# Examples:
# - vLLM: Highly optimized for serving large LLMs.
# - NVIDIA Triton Inference Server: Supports various models and frameworks with optimizations.
# - Text Generation Inference (TGI) by Hugging Face.

# These frameworks often require more complex setup but can yield significant
# performance improvements for demanding LLM workloads. You would typically
# deploy these frameworks on powerful GPU instances or clusters.

# --- Step 23: Fine-tuning and Adaptation ---

# While loading a pre-trained model is a good starting point, for many
# applications, you'll need to fine-tune the LLM on a domain-specific dataset
# or adapt it to a specific task (e.g., summarization, translation).
# This involves training the model further on your own data.

# Considerations for Fine-tuning:
# - Data Preparation: Curate and format your training data correctly.
# - Compute Resources: Fine-tuning often requires significant compute (GPUs).
# - Hyperparameter Tuning: Experiment with learning rates, epochs, etc.
# - Model Size: Fine-tuning a large model is resource-intensive. Consider
#   Parameter-Efficient Fine-Tuning (PEFT) techniques like LoRA.
# - Deployment: Deploy the fine-tuned model as a new version.

# --- Step 24: Natural Language Understanding (NLU) Integration ---

# Before passing user input to the LLM, you might need to perform NLU tasks:
# - Intent Recognition: Understand the user's goal.
# - Entity Extraction: Identify key information in the prompt.
# - Input Validation: Check if the prompt is relevant or appropriate.

# This pre-processing step can help route requests, provide better context
# to the LLM, or filter out irrelevant inputs.

# --- Step 25: Post-processing the Output ---

# The raw output from the LLM might need post-processing:
# - Formatting: Ensure the text is formatted correctly (paragraphs, bullet points).
# - Length Truncation: Limit the output length.
# - Content Moderation: As mentioned before, filter inappropriate content.
# - Adding Structure: If the expected output is structured (e.g., JSON),
#   parse the LLM's text output into the desired format.

# --- Step 26: Ethical Considerations and Responsible AI ---

# Deploying LLMs comes with significant ethical responsibilities:
# - Bias: LLMs can inherit biases from their training data, leading to unfair
#   or discriminatory outputs. Continuously evaluate and mitigate bias.
# - Fairness: Ensure the model performs equally well for different demographic groups.
# - Transparency: While not always possible for LLMs, aim for explainability where
#   needed.
# - Safety: Prevent the model from generating harmful, illegal, or dangerous content.
#   Implement robust safety filters and monitoring.
# - Privacy: Handle user data and prompts responsibly, especially if they contain
#   sensitive information.

# --- Step 27: Team and Process ---

# Productionizing and maintaining an LLM application requires a multidisciplinary team:
# - ML Engineers: For model development, training, and optimization.
# - Software Engineers: For building the serving infrastructure, API, and integrations.
# - DevOps/SRE: For deployment, monitoring, scaling, and reliability.
# - Data Engineers: For data pipelines and feedback loops.
# - Product Managers/Domain Experts: To define requirements and evaluate output quality.

# Establish clear processes for model development lifecycle, deployment, monitoring,
# and incident response.

# --- Step 28: Cost Monitoring and Optimization ---

# Implement detailed cost monitoring to understand where resources are being spent.
# Use cloud provider cost management tools. Identify and optimize expensive
# operations (e.g., large GPU instances running idle, inefficient inference).

# --- Step 29: Data Governance and Compliance ---

# Establish clear policies for handling the data used for training, fine-tuning,
# and inference. Ensure compliance with data privacy regulations and industry-specific
# requirements. Track data lineage and model versions.

# --- Step 30: Continuous Improvement and Model Retraining ---

# LLMs and their performance can degrade over time due to shifting user behavior
# or changes in the underlying data distribution (data drift).
# - Monitor Model Performance: Track metrics like output quality (can be subjective
#   and require human evaluation), user satisfaction, and relevance.
# - Collect Feedback: Use feedback loops (Step 15) to identify areas for improvement.
# - Retraining Strategy: Plan for periodically retraining your model on new data
#   or fine-tuning it to address performance degradation or incorporate new requirements.
# - A/B Testing Retrained Models: Always test new model versions against the current
#   production version before fully rolling them out.

# This extended list covers more advanced and operational aspects of taking
# an LLM application from development to a robust, scalable, and maintainable
# production service.

In [None]:
#
# --- Step 31: Configuration Management ---

# Avoid hardcoding configuration values (model names, API keys, database
# connections, etc.) directly in your code. Use configuration files (e.g.,
# YAML, JSON) or environment variables.

# Example using environment variables (Conceptual):
#
# import os
#
# MODEL_NAME = os.environ.get("MODEL_NAME", "gpt2") # Default to gpt2
# API_KEY = os.environ.get("API_KEY") # For external service integration
#
# # Access configuration values later in the code
# print(f"Using model: {MODEL_NAME}")
#
# # To set environment variables in Colab for demonstration:
# # os.environ['MODEL_NAME'] = 'gpt2-medium'
#
# # In production, you would set environment variables in your deployment environment
# # (e.g., Dockerfile, Kubernetes deployment YAML, Cloud Run settings).


# --- Step 32: API Versioning ---

# As your API evolves, you may need to introduce changes that are not
# backward-compatible. Implement API versioning to allow clients to continue
# using older versions while new versions are being rolled out.

# Common versioning strategies:
# - URL Path Versioning: e.g., `/v1/generate`, `/v2/generate`
# - Header Versioning: e.g., `Accept: application/json; version=1.0`
# - Query Parameter Versioning: e.g., `/generate?api-version=1`

# Example (Conceptual URL versioning):
#
# @app.route("/v1/generate", methods=["POST"])
# def generate_text_v1():
#     # ... (logic for version 1)
#     pass
#
# @app.route("/v2/generate", methods=["POST"])
# def generate_text_v2():
#     # ... (updated logic for version 2)
#     pass

# --- Step 33: Documentation and SDKs ---

# Good documentation is essential for developers using your API.
# - API Reference: Detail endpoints, parameters, request/response formats,
#   and error codes (e.g., using OpenAPI/Swagger).
# - Guides and Examples: Provide tutorials and code snippets in different
#   programming languages.
# - Client Libraries (SDKs): Consider generating or manually creating client
#   libraries to simplify integration for users.

# --- Step 34: Observability (Metrics, Logging, Tracing) ---

# Beyond basic logging and monitoring, aim for observability:
# - Metrics: Collect detailed metrics (request count, latency percentiles,
#   error rates, GPU utilization, model inference time) using tools like
#   Prometheus and visualize them in dashboards (Grafana).
# - Distributed Tracing: If your application involves multiple services or
#   components (e.g., API gateway, your service, caching layer, database),
#   use distributed tracing (e.g., OpenTelemetry, Jaeger) to track requests
#   end-to-end and identify bottlenecks.
# - Structured Logging: Use structured logging (JSON format) for easier
#   parsing and analysis in centralized logging systems.

# --- Step 35: Disaster Recovery Plan Testing ---

# It's not enough to *have* a disaster recovery plan; you need to *test* it
# regularly. Simulate failures (e.g., bringing down a database, a server,
# or a whole region) to ensure your failover mechanisms work as expected
# and your recovery time objectives (RTO) and recovery point objectives (RPO)
# are met.

# --- Step 36: Capacity Planning ---

# Understand the capacity limits of your deployed infrastructure and model.
# - Determine how many requests per second your current setup can handle
#   while meeting latency requirements.
# - Plan for scaling based on anticipated future traffic growth.
# - Consider different instance types and their performance characteristics
#   for your specific model.

# --- Step 37: Model Explainability and Interpretability ---

# Depending on the application domain (e.g., healthcare, finance), explainability
# might be a legal or ethical requirement. While complex for generative models,
# research in this area is ongoing. Techniques might include:
# - Attention visualization.
# - Analyzing model activations.
# - Using simpler surrogate models.

# --- Step 38: User Authentication and Authorization ---

# For applications where users interact directly or where requests are associated
# with specific users, implement robust user authentication (verifying user identity)
# and authorization (controlling what actions a user can perform).

# Example (Conceptual with basic token auth):
#
# import uuid
#
# # Simple in-memory "database" of valid API keys
# valid_api_keys = {
#     str(uuid.uuid4()): "user1",
#     str(uuid.uuid4()): "user2",
# }
#
# @app.route("/generate", methods=["POST"])
# def generate_text_with_auth():
#     api_key = request.headers.get("X-API-Key") # Get API key from header
#
#     if api_key not in valid_api_keys:
#         return jsonify({"error": "Unauthorized. Invalid API Key."}), 401
#
#     # ... (rest of the generate_text logic)
#
#     # You can potentially use the user ID (e.g., valid_api_keys[api_key])
#     # for logging, rate limiting, or tracking usage per user.
#
#     return jsonify({"generated_text": generated_text})
#
# # Note: Use a secure method for managing API keys in production, like
# # environment variables or a secret management system.


# --- Step 39: Securing Model Artifacts ---

# Your trained model weights are valuable intellectual property and could
# potentially be misused if accessed by unauthorized parties.
# - Store model files securely (e.g., in private cloud storage buckets with
#   strict access controls).
# - Use encrypted storage.
# - Control access to your deployment environment where the model is loaded.

# --- Step 40: Legal and Compliance Review ---

# Before deploying to production, especially in regulated industries, have a
# legal and compliance review of your application, data handling practices,
# and model usage. Ensure you understand and comply with all relevant laws
# and regulations (e.g., data privacy, intellectual property, content moderation,
# AI ethics guidelines).

# This extended list provides a more comprehensive view of the complexities
# involved in successfully productionizing LLM applications, moving beyond
# just serving the model to building a reliable, scalable, secure, and
# maintainable service.

In [None]:
#
# --- Step 41: Implementing Health Checks ---

# Essential for monitoring and orchestration systems (like Kubernetes)
# to determine if your application instance is healthy and can serve traffic.

@app.route("/healthz", methods=["GET"])
def health_check():
    """
    Liveness probe for monitoring and orchestration systems.

    Performs no model or dependency checks: if the Flask process can answer
    at all, it replies with HTTP 200 and the JSON body {"status": "ok"}.
    """
    # Ideas for a deeper probe, none of which are implemented here:
    # - push a tiny input through the model,
    # - ping backend databases or caches,
    # - confirm that required dependencies can be imported.

    # Being able to execute this line is the whole check.
    ok_payload = {"status": "ok"}
    return jsonify(ok_payload), 200

# Monitoring systems or orchestration platforms will periodically hit this endpoint.
# If it returns a non-200 status or times out, the system knows the instance
# is unhealthy and can take action (e.g., restart the instance, route traffic away).

# --- Step 42: Graceful Shutdown ---

# Ensure your application can shut down cleanly without dropping active requests.
# This is important during deployments or scaling events.
# - Flask's development server doesn't handle this well, but production WSGI
#   servers like Gunicorn or uWSGI have mechanisms for graceful shutdown.
# - They typically stop accepting new connections but continue processing
#   existing ones for a configurable timeout period.

# Example (Conceptual - relies on the WSGI server):
#
# # In a production WSGI server command (like Gunicorn):
# # gunicorn -w 4 -b 0.0.0.0:5000 --timeout 30 --graceful-timeout 30 app:app
# # '--timeout' is for request duration, '--graceful-timeout' is for shutdown.

# --- Step 43: Handling Out-of-Memory Errors ---

# LLMs, especially large ones, consume significant memory (GPU and CPU).
# - Monitor memory usage closely.
# - Choose instances with sufficient RAM and GPU memory.
# - Implement techniques like quantization, model parallelism, or offloading
#   to reduce memory footprint if needed.
# - Be prepared for OOM errors; logs should capture these, and monitoring
#   should alert you. Orchestration systems can automatically restart instances.

# --- Step 44: Input and Output Token Limits ---

# LLMs have context windows (maximum number of input tokens they can process)
# and practical limits on the number of output tokens they can generate due to
# computational cost and latency.
# - Define and enforce maximum limits for both input prompt size and generated
#   text length in your API.
# - Handle cases where the user prompt exceeds the context window (e.g., truncate
#   the prompt, return an error).
# - Consider adding input validation for token count, not just character count.

# Example modification in generate_text (Conceptual token count check):
#
# @app.route("/generate", methods=["POST"])
# def generate_text_with_limits():
#     # ... (initial validation and prompt/max_length extraction)
#
#     try:
#         input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
#         input_token_count = input_ids.shape[-1]
#
#         MAX_INPUT_TOKENS = 1024 # Define a reasonable max input length for your model
#
#         if input_token_count > MAX_INPUT_TOKENS:
#              return jsonify({"error": f"Prompt exceeds maximum allowed input tokens ({MAX_INPUT_TOKENS})."}), 400
#
#         # ... (rest of generation logic)
#
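#         # Cap the number of generated tokens at a global maximum. The prompt plus the
#         # generated tokens must also fit in the model's context window (1024 tokens
#         # for gpt2), so MAX_INPUT_TOKENS + MAX_GENERATION_TOKENS should not exceed it.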
#         MAX_GENERATION_TOKENS = 512 # Define a reasonable max generation length
#         effective_max_length = min(max_length, MAX_GENERATION_TOKENS)
#
#         output = model.generate(
#             input_ids,
#             max_length=input_token_count + effective_max_length, # Total sequence length
#             # ... other parameters
#         )
#         # ... (decoding and return)
#
#     except Exception as e:
#          # ... (error handling)
#          pass

# --- Step 45: Handling Bias and Fairness ---

# Explicitly address potential biases in LLM outputs.
# - Bias Detection: Use libraries or human evaluation to identify if the model
#   generates biased or unfair content based on sensitive attributes.
# - Bias Mitigation: Techniques include:
#     - Data Augmentation/Filtering during fine-tuning.
#     - Prompt Engineering to guide the model away from biased responses.
#     - Post-processing filters to detect and modify/remove biased outputs.
# - Documentation: Be transparent with users about the potential for bias
#   in the model's output.

# --- Step 46: Handling PII (Personally Identifiable Information) ---

# If user prompts might contain PII:
# - Anonymization/Pseudonymization: Implement techniques to detect and remove or
#   mask PII from prompts before sending them to the model.
# - Data Retention Policies: Define how long prompts and generated outputs
#   are stored and when they are deleted.
# - Compliance: Ensure your handling of PII complies with regulations like GDPR, CCPA.

# --- Step 47: Experiment Tracking and Model Registry ---

# For managing multiple model versions, fine-tuning experiments, and hyperparameters:
# - Use MLflow, Comet ML, Weights & Biases, or cloud-specific services (Vertex AI
#   Experiments/Model Registry, SageMaker MLflow integration).
# - Track hyperparameters, metrics (perplexity, generation quality scores),
#   datasets used, and model artifacts for each experiment.
# - Maintain a model registry to store and manage registered model versions
#   ready for deployment.

# --- Step 48: Prompt Engineering Best Practices ---

# How users phrase their prompts significantly impacts LLM output.
# - Provide clear guidelines or templates for users on how to write effective prompts.
# - Consider adding an internal "prompt augmentation" step in your API to
#   automatically add context or instructions to the user's raw prompt before
#   passing it to the model.

# --- Step 49: Integrating with Other Services ---

# LLM applications often need to interact with other parts of your system:
# - Databases: To store prompts, outputs, feedback, or user data.
# - Search Engines/Knowledge Bases: To retrieve relevant information to augment
#   the prompt (Retrieval Augmented Generation - RAG).
# - Downstream applications: That consume the generated text.
# - Monitoring and Alerting Systems.
# - Logging Aggregation Systems.

# Design your application and infrastructure to facilitate these integrations.

# --- Step 50: User Experience (UX) Considerations ---

# The quality and latency of the generated text directly impact the user experience.
# - Manage user expectations regarding generation time.
# - Implement streaming responses if possible (sending back text as it's generated)
#   to improve perceived latency.
# - Handle errors gracefully and provide informative messages to the user.
# - Design the user interface (if any) to effectively capture prompts and display outputs.


In [None]:
#
# Example of using Flask-Limiter (requires installation: !pip install Flask-Limiter)
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

# Set up rate limiting for the whole app.
# Requests are counted per client IP address (get_remote_address), and every
# route gets the default limit below unless a route-specific limit replaces it.
# The in-memory store is per-process and is lost on restart; point storage_uri
# at a shared backend such as "redis://localhost:6379" in production.
limiter = Limiter(
    key_func=get_remote_address,
    app=app,
    storage_uri="memory://",
    default_limits=["60 per minute"],
)

# Apply the limit to the generate_text endpoint
@app.route("/generate", methods=["POST"])
# NOTE(review): an earlier cell already registers a view for POST /generate
# (generate_text). URL rules are matched in registration order, so this
# rate-limited view is presumably shadowed unless the app object is recreated
# first -- confirm before relying on the limit below.
# Apply a stricter limit specifically for the generate endpoint, e.g., 10 requests per minute
# Use `override_defaults=False` to keep the global limit if needed, or `True` to replace it
@limiter.limit("10 per minute", override_defaults=False)
def generate_text_with_security():
    """
    API endpoint to generate text with rate limiting and token limits.

    Expects a JSON object with:
      - 'prompt' (str, required, non-empty): text to continue.
      - 'max_length' (int > 0, optional, default 50): number of tokens to
        generate; silently capped at MAX_GENERATION_TOKENS.

    Returns:
      - 200 with {"generated_text": <continuation without the prompt>}.
      - 400 with {"error": ...} when the body is not a JSON object, a field
        has the wrong type, the prompt is empty, or the prompt is longer than
        MAX_INPUT_TOKENS tokens.
      - 500 with a generic {"error": ...} when tokenization or generation
        fails; the details are only written to the log.
    """
    # silent=True turns a missing/invalid JSON body into None instead of an
    # exception, and the dict check rejects JSON arrays/strings, for which
    # "'prompt' in payload" could be true while payload['prompt'] would raise.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'prompt' not in payload:
        return jsonify({"error": "Invalid request. Please provide a JSON payload with a 'prompt' key."}), 400

    prompt = payload['prompt']
    max_length = payload.get('max_length', 50)

    # bool is a subclass of int, so JSON true/false must be rejected explicitly.
    max_length_ok = (
        isinstance(max_length, int)
        and not isinstance(max_length, bool)
        and max_length > 0
    )
    if not isinstance(prompt, str) or not max_length_ok:
        return jsonify({"error": "Invalid prompt (must be string) or max_length (must be a positive integer)."}), 400

    # An empty prompt encodes to zero tokens, which gives the model nothing to
    # continue from; treat it as a client error rather than an internal one.
    if not prompt:
        return jsonify({"error": "Invalid prompt (must not be empty)."}), 400

    logging.info("Received prompt: '%s...' with max_length: %s", prompt[:50], max_length)  # Log truncated prompt

    try:
        # --- Input token limit check ---
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        input_token_count = input_ids.shape[-1]

        MAX_INPUT_TOKENS = 512  # Define a reasonable max input length for gpt2
        if input_token_count > MAX_INPUT_TOKENS:
            logging.warning("Prompt exceeds max input tokens: %s > %s", input_token_count, MAX_INPUT_TOKENS)
            return jsonify({"error": f"Prompt exceeds maximum allowed input tokens ({MAX_INPUT_TOKENS})."}), 400

        input_ids = input_ids.to(device)  # Move to device only after the cheap length check

        # --- Global max generation token limit ---
        MAX_GENERATION_TOKENS = 200  # Upper bound on generated tokens, regardless of what the client asks for
        effective_max_length = min(max_length, MAX_GENERATION_TOKENS)

        logging.info("Generating with effective max_length: %s tokens", effective_max_length)

        # Generate text
        output = model.generate(
            input_ids,
            # Total sequence length = input + generated
            max_length=input_token_count + effective_max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
            # Single unpadded sequence: every position is a real token. Passing
            # the mask explicitly is needed because pad and eos share an id, so
            # it cannot be inferred from the input.
            attention_mask=torch.ones_like(input_ids),
        )

        # Decode only the newly generated tokens. Slicing by token count drops
        # the prompt reliably, whereas comparing decoded text to the prompt
        # fails whenever decoding does not reproduce the prompt exactly.
        continuation_ids = output[0][input_token_count:]
        generated_text = tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()

        logging.info("Generated text: '%s...'", generated_text[:100])  # Log truncated output

        return jsonify({"generated_text": generated_text})

    except Exception as e:
        # Top-level boundary for the request: log the traceback, hide details from the client.
        logging.error("Error during text generation: %s", e, exc_info=True)
        return jsonify({"error": "An internal error occurred during text generation."}), 500

# --- Add a Health Check Endpoint ---
def health_check():
    """
    Health check endpoint that also verifies the model can run inference.

    Encodes a one-word prompt, generates a single extra token and decodes it.

    Returns:
      - 200 with {"status": "ok", "model_status": "responsive"} on success.
      - 500 with {"status": "error", "model_status": "unresponsive",
        "error": <exception text>} if any step raises.
    """
    logging.info("Health check received.")
    try:
        # Smallest useful round trip: encode -> generate one token -> decode.
        test_input = tokenizer.encode("hello", return_tensors="pt").to(device)
        test_output = model.generate(test_input, max_length=test_input.shape[-1] + 1, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
        tokenizer.decode(test_output[0], skip_special_tokens=True)
        logging.info("Health check: Model is responsive.")
        return jsonify({"status": "ok", "model_status": "responsive"}), 200
    except Exception as e:
        logging.error(f"Health check failed: {e}", exc_info=True)
        return jsonify({"status": "error", "model_status": "unresponsive", "error": str(e)}), 500


# An earlier cell registers a simpler view under the same endpoint name
# ("health_check") at /healthz. Registering a different function for an
# existing endpoint name makes Flask raise an AssertionError, so swap the view
# function in place when the endpoint already exists and only add the URL rule
# when it does not. This also keeps the cell safe to re-run.
if "health_check" in app.view_functions:
    app.view_functions["health_check"] = health_check
else:
    app.add_url_rule("/healthz", endpoint="health_check", view_func=health_check, methods=["GET"])


# --- Add a Feedback Endpoint (Conceptual) ---
@app.route("/feedback", methods=["POST"])
def receive_feedback():
    """
    API endpoint to receive user feedback on generated text.

    Expects a JSON object with:
      - 'prompt' (str, required)
      - 'generated_text' (str, required)
      - 'rating' (int or float, required; booleans are rejected)
      - 'comment' (str, optional; null is treated as an empty comment)

    Returns:
      - 200 with {"message": ...} once the feedback has been accepted. A
        failure to append it to feedback.log is logged but does not change
        the response.
      - 400 with {"error": ...} when the body is not a JSON object, a
        required field is missing, or a field has the wrong type.
    """
    import json  # Local import: this handler is the only user in this cell

    # silent=True yields None for a missing/invalid JSON body; the dict check
    # also rejects JSON arrays/strings, which would break the key lookups below.
    feedback_data = request.get_json(silent=True)
    required_fields = ('prompt', 'generated_text', 'rating')
    if not isinstance(feedback_data, dict) or any(field not in feedback_data for field in required_fields):
        return jsonify({"error": "Invalid request. Missing required fields."}), 400

    # bool is a subclass of int, so JSON true/false must be rejected explicitly.
    rating = feedback_data['rating']
    if isinstance(rating, bool) or not isinstance(rating, (int, float)):
        return jsonify({"error": "Invalid rating format."}), 400

    prompt = feedback_data['prompt']
    generated_text = feedback_data['generated_text']
    comment = feedback_data.get('comment')
    if comment is None:
        comment = ''
    # The log line below slices these values, which only works for strings.
    if not all(isinstance(value, str) for value in (prompt, generated_text, comment)):
        return jsonify({"error": "Invalid request. 'prompt', 'generated_text' and 'comment' must be strings."}), 400

    logging.info(
        "Received feedback: Prompt='%s...', Generated='%s...', Rating=%s, Comment='%s...'",
        prompt[:50], generated_text[:50], rating, comment[:50],
    )

    # In a real application, store this feedback in a database or data lake
    # for model monitoring and potential retraining data.
    # Example: store in a file for this demo (not production-ready)
    try:
        with open("feedback.log", "a", encoding="utf-8") as f:
            f.write(json.dumps(feedback_data) + "\n")
        logging.info("Feedback logged to feedback.log")
    except OSError as e:
        # Best effort: the client still gets a success response.
        logging.error(f"Failed to write feedback to file: {e}")

    return jsonify({"message": "Feedback received. Thank you!"}), 200


# Re-starting Flask app thread with updated routes
# Stop the existing thread if it's running (optional, but good practice)
# if 'thread' in locals() and thread.is_alive():
#     print("Attempting to stop existing Flask thread...")
#     # There's no clean way to stop a Python thread from outside,
#     # In a real app, you'd manage the server process differently.
#     # For Colab, often just running the cell again is sufficient
#     # but might leave the old server running in the background.
#     # A more robust approach for development might involve signaling
#     # or checking a flag in the request handler (not shown here).
#     # For this example, we'll just start a new one.

print("Starting Flask server with updated endpoints...")


def _run_flask_server():
    """Run the Flask app in the current thread, tolerating a replaced app.run."""
    import inspect

    # Bind to 0.0.0.0 so the tunnel can reach the server.
    # use_reloader=False is important for Colab, otherwise it might start multiple processes.
    run_options = {"host": "0.0.0.0", "port": 5000, "debug": False, "use_reloader": False}

    # NOTE(review): run_with_ngrok(app) appears to replace app.run with a
    # wrapper that takes no arguments (confirm against the installed
    # flask_ngrok version). Passing keyword arguments to such a wrapper raises
    # TypeError inside this thread and the server never starts, so only pass
    # the options when app.run can actually accept them. In the no-argument
    # case the server runs with Flask's own defaults.
    if inspect.signature(app.run).parameters:
        app.run(**run_options)
    else:
        app.run()


# Start the Flask app in a new thread.
# daemon=True allows the main program to exit even if the thread is running.
thread = threading.Thread(target=_run_flask_server, daemon=True)
thread.start()

# Give Flask/ngrok some time to start
print("Giving Flask server time to start...")
time.sleep(10)

print("\nFlask server is running. Use the ngrok URL printed above to send POST requests to the /generate endpoint.")
print("Also available: /healthz (GET) and /feedback (POST).")
print("Remember to replace 'YOUR_NGROK_URL' in the example testing code.")


In [None]:
#
# --- Step 51: Environment-Specific Configuration ---

# Production deployments require different configurations than development
# (e.g., database URLs, API keys, logging levels). Use a library like
# `python-dotenv` or `Dynaconf` to manage environment-specific settings.

# Example using a simple config dictionary (Conceptual):
#
# import os
#
# # Define different configurations
# CONFIG = {
#     "development": {
#         "LOG_LEVEL": logging.DEBUG,
#         "STORAGE_URI": "memory://",
#         "MAX_INPUT_TOKENS": 1024,
#     },
#     "production": {
#         "LOG_LEVEL": logging.INFO,
#         "STORAGE_URI": "redis://redis:6379", # Use a real Redis instance
#         "MAX_INPUT_TOKENS": 512, # Potentially stricter in production
#         "API_KEY_HEADER": "X-API-Key",
#     }
# }
#
# # Determine the current environment
# ENV = os.environ.get("FLASK_ENV", "development") # Default to development
# current_config = CONFIG.get(ENV, CONFIG["development"])
#
# # Apply configuration
# logging.basicConfig(level=current_config["LOG_LEVEL"], format='%(asctime)s - %(levelname)s - %(message)s')
# limiter = Limiter(
#     get_remote_address,
#     app=app,
#     default_limits=["60 per minute"],
#     storage_uri=current_config["STORAGE_URI"],
# )
# MAX_INPUT_TOKENS = current_config["MAX_INPUT_TOKENS"]
#
# # Access other config values as needed, e.g.:
# # api_key = request.headers.get(current_config.get("API_KEY_HEADER", "Authorization"))

# In a real application, you would load these configurations from files
# or environment variables using a proper configuration management library.

# --- Step 52: Infrastructure as Code (IaC) ---

# Manage your cloud infrastructure (VMs, containers, load balancers, databases)
# using code. This ensures consistency, repeatability, and makes it easier to
# manage different environments (dev, staging, prod).

# Tools for IaC:
# - Terraform
# - CloudFormation (AWS)
# - Cloud Deployment Manager (GCP)
# - Azure Resource Manager (ARM) templates
# - Kubernetes YAML manifests

# Example (Conceptual Terraform structure):
#
# /terraform
#   /modules
#     /llm-service
#       main.tf # Defines container deployment, scaling, load balancer
#     /redis
#       main.tf # Defines Redis instance
#   /environments
#     /dev
#       main.tf # Uses llm-service and redis modules, defines dev-specific vars
#       variables.tf
#     /prod
#       main.tf # Uses llm-service and redis modules, defines prod-specific vars
#       variables.tf

# --- Step 53: Secrets Management ---

# Don't hardcode sensitive information (database passwords, API keys for
# external services, private keys) in your code or configuration files.
# Use a dedicated secrets management system.

# Tools:
# - HashiCorp Vault
# - AWS Secrets Manager
# - Google Cloud Secret Manager
# - Azure Key Vault
# - Kubernetes Secrets (with caution and potentially encryption)

# Example (Conceptual - fetching a secret in code):
#
# import os
# # Assuming a library to fetch from a secret manager based on environment variables
# # For example, using Google Cloud Secret Manager client library
# # from google.cloud import secretmanager
#
# # def access_secret_version(project_id, secret_id, version_id="latest"):
# #     client = secretmanager.SecretManagerServiceClient()
# #     name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
# #     response = client.access_secret_version(request={"name": name})
# #     return response.payload.data.decode("UTF-8")
#
# # REDIS_PASSWORD = access_secret_version("my-gcp-project", "redis-password")
# # DB_PASSWORD = access_secret_version("my-gcp-project", "database-password")

# You would configure your deployment environment to provide necessary
# credentials or roles for your application to access the secrets manager.

# --- Step 54: Centralized Log Management and Analysis ---

# As your application scales, logs from different instances and services need
# to be collected, aggregated, and analyzed in a centralized system.
# - Logging Agents: Use agents (e.g., Filebeat, Fluentd, Cloud Logging agents)
#   to collect logs from application instances and send them to a central system.
# - Centralized Systems:
#   - ELK stack (Elasticsearch, Logstash, Kibana)
#   - Splunk
#   - Cloud Logging (GCP)
#   - CloudWatch Logs (AWS)
#   - Azure Monitor Logs

# This allows for searching, filtering, analyzing, and visualizing logs from
# your entire application stack, crucial for debugging and monitoring.

# --- Step 55: Performance Monitoring and Profiling ---

# Beyond just latency metrics, understand where time is spent within your
# application and model inference.
# - Application Performance Monitoring (APM): Use tools (e.g., Datadog, New Relic,
#   Prometheus + Grafana) to monitor request traces, function call times, database
#   queries, and external service calls.
# - Model Profiling: Use profiling tools specific to your ML framework (e.g.,
#   PyTorch profiler, TensorFlow profiler) to analyze the performance of
#   model layers and operations. Identify bottlenecks (e.g., CPU-bound pre-processing,
#   GPU underutilization).

# --- Step 56: Data Versioning and Provenance ---

# If you are fine-tuning your LLM, managing datasets is crucial.
# - Data Versioning: Use tools like DVC (Data Version Control) to version
#   datasets alongside your code.
# - Data Provenance: Track which dataset version was used to train or fine-tune
#   each model version. This is essential for reproducibility and debugging.

# --- Step 57: Cost Optimization Strategies (Advanced) ---

# - Right-sizing Instances: Continuously evaluate if your instances are
#   appropriately sized for the workload. Avoid over-provisioning.
# - Autoscaling Optimization: Fine-tune autoscaling policies to respond
#   quickly to load changes while avoiding excessive costs from frequent scaling.
# - Reserved Instances/Savings Plans: For predictable base loads, purchase
#   reserved instances or savings plans from cloud providers for discounted rates.
# - Spot Instances: Use spot instances for fault-tolerant workloads like batch
#   inference or model training/fine-tuning if cost is a major driver and
#   interruptions are acceptable.
# - Model Optimization: As mentioned in Step 10, optimize the model itself
#   (quantization, pruning) to potentially run on cheaper or fewer resources.

# --- Step 58: Handling Dependency Management ---

# Use a dependency manager (`pipenv`, `poetry`, `conda`) and a `requirements.txt`
# (or similar lock file) to specify exact dependencies. This ensures consistency
# across development, testing, and production environments and avoids "it works
# on my machine" issues.

# Example requirements.txt:
# transformers==4.30.2
# torch==2.0.1
# flask==2.3.2
# flask-ngrok==0.0.25
# Flask-Limiter==3.5.1
# gunicorn==21.2.0 # for production server

# Ensure your Dockerfile or deployment process installs these dependencies correctly.

# --- Step 59: Integration Testing ---

# Write automated integration tests that verify the interaction between different
# components of your system (e.g., testing the API endpoint, ensuring it
# successfully loads the model, performs inference, and returns a valid response).
# These tests are crucial for catching issues that unit tests miss.

# --- Step 60: Chaos Engineering (Advanced) ---

# Intentionally inject failures into your system (e.g., delay requests, shut down
# instances, introduce network latency) in a controlled environment to test the
# resilience of your application and infrastructure. Tools like "Chaos Monkey"
# or "Gremlin" can be used. This helps identify weak points before they cause
# production outages.

# These additional steps cover more advanced topics in building robust,
# scalable, and maintainable production systems for machine learning models,
# including LLMs. The specific steps you implement will depend on the
# complexity and criticality of your application.
