In [None]:
try:
	__import__('lmdeploy')
	__import__('fastapi')
except ImportError:
	%%capture
	%pip install lmdeploy fastapi

In [None]:
import os
from lmdeploy import pipeline, TurbomindEngineConfig

from pydantic import BaseModel
from fastapi import FastAPI
from starlette.responses import JSONResponse
from dotenv import load_dotenv
from typing import List

# Read a local .env file (if any) so its entries are visible through os.getenv.
load_dotenv()
# Model identifier handed to the lmdeploy pipeline below as both model_path and
# model_name; os.getenv returns None when FINETUNE_MODEL_NAME is not set.
small_model = os.getenv('FINETUNE_MODEL_NAME')

## Hosting with FastAPI

In [None]:
# reference: https://github.com/InternLM/lmdeploy/blob/main/docs/en/inference/pipeline.md
# Build the inference pipeline once, at cell execution, so the request handler can reuse it.
# small_model comes from the FINETUNE_MODEL_NAME environment variable.
# NOTE(review): tp=2 is presumably the tensor-parallel degree and would need two
# visible GPUs — confirm against the target host before running.
pipe = pipeline(model_path=small_model, model_name=small_model, backend_config=TurbomindEngineConfig(tp=2))

# response = pipe(['Hi, please introduce yourself', 'USA is'])
# print(response)

class Message(BaseModel):
    '''A single chat message inside the request body.'''
    role: str  # speaker label supplied by the client (the usage example sends 'user')
    content: str  # message text; the request handler uses this as the prompt

class MessageList(BaseModel):
    '''Request body for the chat endpoint: a JSON object with a "messages" array.'''
    messages: List[Message]  # each entry is validated as a Message

app = FastAPI()

@app.post("/v1/chat/completions")
async def predict(message_list: MessageList):
    '''Generate one completion per incoming message.

    Args:
        message_list: Validated request body. Only each message's `content`
            is used as a prompt; `role` is not consulted.

    Returns:
        A JSON array with one inner list per input message, each holding the
        generated text for that message (an empty array for no messages).
    '''
    prompts = [message.content for message in message_list.messages]
    if not prompts:
        # Nothing to generate; skip the pipeline call for an empty batch.
        return JSONResponse(content=[])

    # One batched pipeline call instead of one call per message.
    # NOTE(review): this call is synchronous, so it occupies the event loop
    # for the duration of generation.
    responses = pipe(prompts)

    # The pipeline yields lmdeploy Response objects, which the JSON encoder
    # cannot serialize; return their generated text instead.
    return JSONResponse(content=[[response.text] for response in responses])

# Start the HTTP server when this code runs as the main module.
# uvicorn.run blocks until the server is stopped, so nothing after it executes meanwhile.
# NOTE(review): inside a Jupyter kernel an event loop is already running, which
# uvicorn.run may refuse to start under — confirm, or launch this as a script.
if __name__ == "__main__":
	import uvicorn
	uvicorn.run(app, host='0.0.0.0', port=8000)  # 0.0.0.0: listen on every interface

In [None]:
### Usage:
# Requests code
# Run this from a separate process/kernel while the FastAPI server is up.
import requests
response = requests.post(
	'http://localhost:8000/v1/chat/completions',
	json={
		'messages': [{'role': 'user', 'content': 'Say this is a test!'}]
	},
	# (connect, read) timeouts in seconds: without one, requests waits forever
	# if the server is unreachable or hangs. The read limit is generous because
	# text generation can be slow.
	timeout=(5, 300),
)
if response.status_code == 200:
	print(response.json())
else:
	print(f"Error: {response.status_code}, {response.text}")

# CURL command
'''
curl http://localhost:8000/v1/chat/completions \
	-H "Content-Type: application/json" \
	-d '{
		"messages": [{"role": "user", "content": "Say this is a test!"}]
	}'
'''

## PyTorch deployment

In [None]:
import torch
from pathlib import Path

def save_model(model: torch.nn.Module,
			   target_dir: str,
			   model_name: str):
	'''Saves a PyTorch model's state_dict to a target directory.

	Args:
		model: A target PyTorch model to save.
		target_dir: A directory for saving the model to. It is created
			(including parents) when it does not exist yet.
		model_name: A filename for the saved model. Should include
			either '.pth' or '.pt' as the file extension.

	Raises:
		AssertionError: If model_name does not end with '.pt' or '.pth'.

	Example usage:
		save_model(model=model_0,
				target_dir='models',
				model_name='test.pth')
	'''
	# Validate the filename before touching the filesystem, so a bad name does
	# not leave a freshly created directory behind. Raised explicitly (instead
	# of via `assert`) so the check is not stripped under `python -O`.
	if not model_name.endswith(('.pth', '.pt')):
		raise AssertionError('model_name should end with .pt or .pth')

	# Create target directory
	target_dir_path = Path(target_dir)
	target_dir_path.mkdir(parents=True, exist_ok=True)

	# Create model save path
	model_save_path = target_dir_path / model_name

	# Save the model state_dict()
	print(f'[INFO] Saving model to: {model_save_path}')
	torch.save(obj=model.state_dict(), f=model_save_path)

# Fetch EfficientNet-B2 with its default pretrained weights
from torchvision import models
model = models.efficientnet_b2(weights='DEFAULT')

# Freeze every parameter so no gradients are tracked for them
for param in model.parameters():
	param.requires_grad = False

# Switch to evaluation mode
model.eval()

# Optional: drop the classifier head to use the network as a feature extractor
# model.classifier = torch.nn.Identity()

# Show the architecture
print(model)

# Write the weights to models/test.pth
save_model(model, 'models', 'test.pth')

# Use the GPU when one is present
if torch.cuda.is_available():
	model = model.to('cuda')
	print('[INFO] Model moved to GPU.')

Downloading: "https://download.pytorch.org/models/efficientnet_b2_rwightman-c35c1473.pth" to /home/praneeth/.cache/torch/hub/checkpoints/efficientnet_b2_rwightman-c35c1473.pth
100%|██████████| 35.2M/35.2M [00:40<00:00, 904kB/s] 


EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [None]:
# Load the model from the saved file
def load_model(model_path: 'str | Path', model_class: 'Callable[[], torch.nn.Module]') -> torch.nn.Module:
	'''Loads a PyTorch model from a saved state_dict file.

	Args:
		model_path: Path to the saved state_dict file.
		model_class: A zero-argument callable (a model class or a factory
			function such as a torchvision model builder) that returns a
			freshly constructed model with a matching architecture.

	Returns:
		A PyTorch model instance with the loaded state_dict, in eval mode.
	'''
	model = model_class()
	# map_location='cpu': the new model lives on the CPU, so load the tensors
	# there; this also makes checkpoints saved from a GPU loadable on hosts
	# without CUDA.
	# weights_only=True: a state_dict holds only tensors, so restrict
	# unpickling accordingly instead of executing arbitrary pickled code.
	state_dict = torch.load(model_path, map_location='cpu', weights_only=True)
	model.load_state_dict(state_dict)
	model.eval()  # Set the model to evaluation mode
	return model

# Report the on-disk size of the checkpoint at models/test.pth
model_path = Path('models') / 'test.pth'
if not model_path.exists():
	print('[ERROR] Model file does not exist.')
else:
	print(f'[INFO] Model size: {model_path.stat().st_size / (1024 * 1024):.2f} MB')

# Rebuild EfficientNet-B2 and restore the saved weights
loaded_model = load_model(model_class=models.efficientnet_b2, model_path=model_path)

# Count the elements across all parameter tensors
effnetb2_total_params = sum(param.numel() for param in loaded_model.parameters())
print(f'[INFO] Total parameters in the model: {effnetb2_total_params}')

# Smoke-test a forward pass on random data
with torch.inference_mode():
	# One RGB image of 224x224 pixels (batch size 1)
	input_tensor = torch.randn(1, 3, 224, 224)

	# Keep model and input on the same device when a GPU is present
	if torch.cuda.is_available():
		loaded_model = loaded_model.to('cuda')
		input_tensor = input_tensor.to('cuda')

	output = loaded_model(input_tensor)
	print(f'[INFO] Output shape: {output.shape}')  # Should be [1, 1000] for EfficientNet-B2

[INFO] Model size: 35.16 MB
[INFO] Total parameters in the model: 9109994
[INFO] Output shape: torch.Size([1, 1000])


In [None]:
# load pytorch model
# model_path = Path('models/test.pth')
# loaded_model = load_model(model_path=model_path, model_class=models.efficientnet_b2)

# deploy using flask
from flask import Flask, request, jsonify

# NOTE: this rebinds `app`, which an earlier cell bound to the FastAPI instance.
app = Flask(__name__)

@app.post('/predict')
def predict_flask():
	'''Run the loaded model on a tensor supplied as nested JSON lists.

	Expects a JSON object of the form {"input": <nested list of numbers>}.

	Returns:
		200 with {"output": <nested list>} on success, or 400 with
		{"error": "Invalid input"} when the body is not an object with an
		"input" key or the value cannot be converted to a tensor.
	'''
	data = request.get_json()
	# Accept only a JSON object: a string or list body would otherwise pass
	# the membership test and then fail on data['input'].
	if not isinstance(data, dict) or 'input' not in data:
		return jsonify({'error': 'Invalid input'}), 400

	try:
		# Force float32 so integer payloads do not produce an int64 tensor.
		input_tensor = torch.tensor(data['input'], dtype=torch.float32)
	except (TypeError, ValueError, RuntimeError):
		# Ragged lists, strings, nulls, etc. are client errors, not server errors.
		return jsonify({'error': 'Invalid input'}), 400

	# Follow the model's actual device instead of assuming it was moved to the
	# GPU whenever CUDA is available; this avoids a device mismatch.
	model_device = next(loaded_model.parameters()).device
	input_tensor = input_tensor.to(model_device)

	with torch.inference_mode():  # or torch.no_grad()
		output = loaded_model(input_tensor)
	return jsonify({'output': output.tolist()})

# Start Flask's built-in server when this code runs as the main module.
# app.run blocks until the server is stopped (CTRL+C), so the client code that
# follows in this cell only executes afterwards; send requests from a separate
# process or kernel while the server is running.
if __name__ == "__main__":
	app.run(host='0.0.0.0', port=5000)  # 0.0.0.0: listen on every interface

### Usage
# Run this from a separate process/kernel while the Flask server is up.
import requests
import torch

# Random batch of one 3x224x224 input, sent as nested JSON lists
input_tensor = torch.randn(1, 3, 224, 224)
data = { 'input': input_tensor.tolist() }

url = 'http://localhost:5000/predict'
# (connect, read) timeouts in seconds: without one, requests waits forever
# if the server is unreachable or stops responding.
response = requests.post(url, json=data, timeout=(5, 60))
if response.status_code == 200:
	print('Response:', response.json())
else:
	print('Error:', response.status_code, response.text)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.155.191:5000
Press CTRL+C to quit
127.0.0.1 - - [28/May/2025 11:27:51] "POST /predict HTTP/1.1" 200 -


## For more projects, open [README.md](/README.md)

___