# Install Required Libraries
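Use pip to install the Python dependencies (Transformers, LightLLM, and the Google Cloud Build client library), then authenticate with Google Cloud and Hugging Face and download the model into `./model`. The `gcloud` CLI (Google Cloud SDK) is not installed by pip and must already be available on the machine.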

In [None]:
from huggingface_hub import login
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Install required libraries
!pip install transformers lightllm google-cloud-build

# Authenticate with Google Cloud
!gcloud auth login

# Set Google Cloud project
!gcloud config set project YOUR_PROJECT_ID

# Set up Hugging Face authentication

# Replace 'YOUR_HUGGING_FACE_TOKEN' with your actual Hugging Face token
login(token='YOUR_HUGGING_FACE_TOKEN')

# Import required libraries

# Download model and tokenizer from Hugging Face
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, use_auth_token=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

# Save the model and tokenizer locally
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

# Set Up Flask Application
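Create a Flask application that loads the model saved in `./model` (via LightLLM and Transformers) and exposes a `/predict` endpoint, which returns class probabilities for the text sent in the request's `text` field.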

In [None]:
# Import Required Libraries
from flask import Flask, request, jsonify
from lightllm import LLMPredictor
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Directory the model and tokenizer were saved to with save_pretrained().
MODEL_DIR = "./model"

# app.run() blocks until interrupted, which would stop "Run All" at this cell.
# Flip this to True only when you deliberately want to serve from the notebook.
RUN_SERVER = False

# Initialize Flask Application
app = Flask(__name__)

# Load LightLLM Model
# NOTE(review): `predictor` is not used by the /predict route below, which runs
# the Transformers model directly. Confirm that the installed lightllm package
# exposes `LLMPredictor`, and either use it for inference or drop it.
predictor = LLMPredictor(model_path=MODEL_DIR)

# Load the tokenizer and model explicitly from disk so this cell does not rely
# on variables left in the kernel by an earlier cell.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)


def classify(text):
    """Return softmax class probabilities for ``text`` as a torch tensor.

    The input is truncated to the tokenizer's maximum length so that very long
    requests do not exceed the model's position limit. Gradients are disabled
    because this is inference only.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return torch.nn.functional.softmax(outputs.logits, dim=-1)


# Define Route for Text Prediction
@app.route('/predict', methods=['POST'])
def predict():
    """Handle POST /predict.

    Expects a JSON object with a non-empty string field ``text`` and returns
    the class probabilities as a nested JSON list. Responds with HTTP 400 when
    the body is not a JSON object or ``text`` is missing/empty, instead of
    failing with an unhandled exception (HTTP 500).
    """
    # silent=True yields None for a missing or malformed JSON body
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    text = data.get('text')
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Field 'text' must be a non-empty string."}), 400

    return jsonify(classify(text).tolist())


# Example usage
text = "Hello, how are you?"
predictions = classify(text)

print(predictions)

# Run Flask Application (opt-in inside a notebook, see RUN_SERVER above)
if __name__ == '__main__' and RUN_SERVER:
    app.run(host='0.0.0.0', port=8080)

# Create Dockerfile
Write a Dockerfile to containerize the Flask application.

In [None]:
%%writefile Dockerfile
# Dockerfile for the Flask serving application.
# (The %%writefile cell magic above must stay on the first line of the cell;
# IPython rejects a cell magic that is preceded by any other line.)

# Use the official Python image from the Docker Hub.
# Python 3.8 is end-of-life; 3.10 is supported by current torch/transformers.
FROM python:3.10-slim

# Set the working directory in the container
WORKDIR /app

# Install the packages the app imports. They are listed here directly because
# the notebook cells shown never create a requirements.txt, so
# `pip install -r requirements.txt` would fail the build. Installing before
# COPY lets Docker reuse this layer when only the application code changes.
# NOTE(review): pin versions once the working set is known.
RUN pip install --no-cache-dir flask transformers torch lightllm

# Copy the current directory contents (app.py and ./model) into /app
COPY . /app

# Make port 8080 available to the world outside this container
EXPOSE 8080

# Define environment variable (key=value form; the space-separated form is legacy)
ENV NAME=World

# Run app.py when the container launches
CMD ["python", "app.py"]

# Build and Push Docker Image
Build the Docker image with Cloud Build and push it to Google Container Registry (`gcr.io`).

In [None]:
# Authenticate with Google Cloud
!gcloud auth login

# Set Google Cloud project
!gcloud config set project YOUR_PROJECT_ID

# Create a cloudbuild.yaml file
cloudbuild_yaml = """
steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', 'gcr.io/YOUR_PROJECT_ID/lightllm-app', '.']
images:
- 'gcr.io/YOUR_PROJECT_ID/lightllm-app'
"""
with open('cloudbuild.yaml', 'w') as file:
    file.write(cloudbuild_yaml)

# Submit build to Google Cloud Build
!gcloud builds submit --config cloudbuild.yaml .

# Deploy to Cloud Run
Deploy the Docker image to Cloud Run using the gcloud command-line tool.

In [None]:
# Deploy the image as the Cloud Run service "distilbert-app" in us-central1.
# Replace YOUR_PROJECT_ID with the project the image was pushed to.
# NOTE(review): --allow-unauthenticated makes the endpoint publicly callable;
# confirm that is intended before using this outside a demo.
!gcloud run deploy distilbert-app --image gcr.io/YOUR_PROJECT_ID/lightllm-app --platform managed --region us-central1 --allow-unauthenticated

# Test the Deployed Service
Send a request to the deployed Cloud Run service to test the LightLLM integration.

In [None]:
# Test the Deployed Service
import requests

# Replace with your Cloud Run service URL
cloud_run_url = "https://YOUR_CLOUD_RUN_SERVICE_URL"

# Define the prompt to send to the service
prompt = "Hello, how are you?"

# Send a POST request to the deployed service.
# The /predict route reads the input from the "text" field of the JSON body,
# so the payload must use that key (sending "prompt" leaves the input empty).
# The timeout keeps the cell from hanging forever if the service is unreachable.
response = requests.post(
    f"{cloud_run_url}/predict",
    json={"text": prompt},
    timeout=60,
)

# Fail loudly on a 4xx/5xx reply instead of trying to parse an error page
response.raise_for_status()

# Print the response from the service
print(response.json())

# Monitoring your instances in production
Use Google Cloud Monitoring to create a dashboard that charts GPU utilization for the Cloud Run service.

In [None]:
import json

# Enable Cloud Monitoring API
!gcloud services enable monitoring.googleapis.com

# Create a monitoring dashboard
dashboard_config = {
    "displayName": "Cloud Run GPU Utilization",
    "gridLayout": {
        "widgets": [
            {
                "title": "GPU Utilization",
                "xyChart": {
                    "dataSets": [
                        {
                            "timeSeriesQuery": {
                                "timeSeriesFilter": {
                                    "filter": 'metric.type="custom.googleapis.com/gpu/utilization" AND resource.type="cloud_run_revision"',
                                    "aggregation": {
                                        "alignmentPeriod": "60s",
                                        "perSeriesAligner": "ALIGN_MEAN"
                                    }
                                }
                            }
                        }
                    ],
                    "timeshiftDuration": "0s",
                    "yAxis": {
                        "label": "Utilization",
                        "scale": "LINEAR"
                    }
                }
            }
        ]
    }
}

# Save the dashboard configuration to a file
with open('dashboard.json', 'w') as f:
    json.dump(dashboard_config, f)

# Create the dashboard using gcloud
!gcloud monitoring dashboards create --config-from-file=dashboard.json

# Add GPUs to Your Cloud Run Services

In [None]:
# Update Cloud Run service to use GPUs
!gcloud run services update distilbert-app --platform managed --region us-central1 --update-env-vars=GPU_TYPE=nvidia-tesla-t4,GPU_COUNT=1

# Push a change to your serving application
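After the first deployment you will usually find refinements that need to be made. Rewrite `app.py` with the updated serving code so that the image can be rebuilt and redeployed.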



In [None]:
# Update Flask Application
%%writefile app.py
from flask import Flask, request, jsonify
from lightllm import LLMPredictor
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Initialize Flask Application
app = Flask(__name__)

# Load LightLLM Model
predictor = LLMPredictor(model_path="./model")

# Define Route for Text Prediction
@app.route('/predict', methods=['POST'])
def predict():
    # Get JSON data from the POST request
    data = request.get_json()
    
    # Extract the text input from the JSON data
    text = data.get('text')
    
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt")
    
    # Perform inference using the model
    outputs = model(**inputs)
    
    # Apply softmax to get probabilities
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    # Return the predictions as a JSON response
    return jsonify(predictions.tolist())

# Example usage
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

print(predictions)

# Run Flask Application
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)

# Traffic Splitting 
Cloud Run can split traffic between two revisions of a service. Take advantage of this to roll the change out gradually.

In [None]:
# Update Cloud Run service to split traffic between revisions:
# 50% of requests go to each of the two named revisions.
# NOTE(review): "revision-1" and "revision-2" look like placeholders; replace
# them with the actual revision names of the distilbert-app service before
# running (presumably obtainable with `gcloud run revisions list` - confirm).
!gcloud run services update-traffic distilbert-app --platform managed --region us-central1 --to-revisions revision-1=50,revision-2=50

# Nirvana
Welcome to paradise.


In [None]:
from IPython.display import Image, display

# Remote picture to show; replace with the actual URL of the image
image_url = "https://example.com/path/to/buddha_image.jpg"

# Build the image object first, then render it at 400x300 in the cell output
buddha_image = Image(url=image_url, width=400, height=300)
display(buddha_image)

# Closing caption
print("nirvana")