<h1 style="color:#05988A" id='Install-Libraries'> Install Libraries </h1>

<h2 style="color:#05988A"> Required</h2>

In [1]:
%%capture
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"

!pip install fastapi nest-asyncio pyngrok uvicorn 
!pip install groq -q

!pip install pip3-autoremove
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu124
!pip install unsloth
!pip install --upgrade transformers==4.52.4

<h1 style="color:#05988A" id='FastAPI-App-Skeleton'> FastAPI App Skeleton </h1>

In [2]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from fastapi import FastAPI, Depends, HTTPException
import os
from groq import Groq
import re
from typing import List, Optional, Dict
from kaggle_secrets import UserSecretsClient

# Load the Groq API key from the Kaggle secret store (never hardcode it).
user_secrets = UserSecretsClient()
groq_api_key = user_secrets.get_secret("GROQ_API_KEY")

app = FastAPI()

# Middlewares
app.add_middleware(
    CORSMiddleware,  # https://fastapi.tiangolo.com/tutorial/cors/
    allow_origins=['*'],  # wildcard to allow all, more here - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Origin
    # Credentials stay disabled: the FastAPI CORS docs state that origins,
    # methods and headers cannot be wildcards when allow_credentials is True,
    # and this API defines no cookie- or auth-based flow.
    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Credentials
    allow_credentials=False,
    allow_methods=['*'],  # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Methods
    allow_headers=['*'],  # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Headers
)

# Shared Groq client used by language_identify().
groq = Groq(api_key=groq_api_key)

def language_identify(chat_msg):
    """
    Ask the Groq-hosted LLM which language a chat message is written in.

    Args:
        chat_msg (str): The user's message.

    Returns:
        str: The text the model placed inside <category></category> tags,
        with surrounding whitespace removed, or "UnIdentified" when the reply
        contains no non-empty category tag. The value is model-generated, so
        callers should compare it case-insensitively.
    """
    prompt = f'''Identify the chat message into one of these languages: 
    (1) English, (2) Bengali.
    If you can't figure out a category, use "UnIdentified".
    Put the category inside <category> </category> tags. 
    Chat message: {chat_msg}'''

    chat_completion = groq.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        # model="llama-3.3-70b-versatile",
        model="deepseek-r1-distill-llama-70b",
        temperature=0.5
    )

    # The message content may be missing; treat that like an empty reply.
    content = chat_completion.choices[0].message.content or ""

    # Non-greedy match: a reply can mention the tag several times (for
    # example while reasoning before the final answer). A greedy pattern
    # would swallow everything between the first opening and last closing tag.
    found = re.findall(r'<category>(.*?)</category>', content, flags=re.DOTALL)
    categories = [text.strip() for text in found if text.strip()]

    # The final answer comes last, after any intermediate reasoning.
    return categories[-1] if categories else "UnIdentified"


def filter_unique_bengali_sentences_and_rejoin(paragraph):
    """
    Drop repeated sentences from a (mostly Bengali) paragraph and rejoin it.

    Sentences are delimited by the danda ('।') or by '?', '!' and '.'.
    A '.' directly followed by a digit is not a boundary, so decimals such
    as "2.5" stay intact. The first occurrence of each sentence is kept, in
    the original order.

    Args:
        paragraph (str): The paragraph to clean up.

    Returns:
        str: The unique sentences joined by single spaces. A sentence that
        already ends in '?', '!' or '.' keeps that mark; every other sentence
        is terminated with '।'. Empty or whitespace-only input gives ''.
    """
    if not paragraph:
        return ''

    # Boundaries: a danda (consumed by the split), or the zero-width position
    # right after a run of ?/!/. characters. The punctuation stays attached
    # to its sentence so it is not doubled up with a danda on rejoin.
    boundary = r'।|(?<=[?!])(?![?!.])|(?<=\.)(?![\d?!.])'
    pieces = (piece.strip() for piece in re.split(boundary, paragraph))

    # dict preserves insertion order, so this de-duplicates while keeping the
    # first occurrence of each sentence in place.
    unique_sentences = dict.fromkeys(piece for piece in pieces if piece)

    return ' '.join(
        sentence if sentence[-1] in '?.!' else sentence + '।'
        for sentence in unique_sentences
    )


<h1 style="color:#05988A" id='ngrok-API-setup'> ngrok API setup </h1>

#### We will use `ngrok` to temporarily expose our FastAPI app through a public URL

#### [https://ngrok.com](https://ngrok.com)

In [3]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn


# specify a port
port = 8004

# Read the ngrok authtoken from Kaggle secrets instead of embedding it in the
# notebook. Add a secret named NGROK_AUTHTOKEN to the notebook, and revoke any
# token that was previously committed in plain text.
ngrok_authtoken = UserSecretsClient().get_secret("NGROK_AUTHTOKEN")
ngrok.set_auth_token(ngrok_authtoken)

# Open a public tunnel that forwards to the local port.
ngrok_tunnel = ngrok.connect(port)

# where we can visit our fastAPI app
print('Public URL:', ngrok_tunnel.public_url)


# Patch asyncio so uvicorn can run inside the notebook's already-running event loop.
nest_asyncio.apply()

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml                                
Public URL: https://3825d77f03d3.ngrok-free.app


<h1 style="color:#05988A" id="LLM-App"> LLM App </h1>

#### Now let's run an LLM FastAPI app

In [4]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch

# Loader settings; QnaModel.load_model() forwards all three to
# FastLanguageModel.from_pretrained under the same keyword names.
max_seq_length = 2048
dtype = None  # presumably lets the loader choose the dtype itself — TODO confirm
load_in_4bit = True  # presumably loads 4-bit quantized weights — TODO confirm

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-08-22 10:52:40.247017: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755859960.452208      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755859960.510003      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
# Request/Response models

class MedicalQuery(BaseModel):
    # Input model consumed by QnaModel.infer.
    # `question` is a required string (the Ellipsis default marks it required).
    question: str = Field(
        ...,
        description="Medical question to ask",
    )

class MedicalResponse(BaseModel):
    # Output model produced by QnaModel.infer; `response` carries the
    # generated answer text.
    response: str
    

class QnaModel:
    """
    QnaModel: Fine-Tuned LLAMA Model

    Holds the fine-tuned model and its tokenizer. Construct the object first,
    call load_model() once (the app does this on startup), then use infer()
    as the request handler dependency.
    """

    # Generation budget (in new tokens) depending on the detected language.
    ENGLISH_MAX_NEW_TOKENS = 128
    DEFAULT_MAX_NEW_TOKENS = 256

    def __init__(self):
        # Start unloaded so infer() can detect a missing model instead of
        # failing with AttributeError.
        self.model = None
        self.tokenizer = None

    def load_model(self):
        """Loads the model and tokenizer and switches the model to inference mode."""
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name="samanta-scratch/bilingual-health-qna",  # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
        )

        self.tokenizer = get_chat_template(
            self.tokenizer,
            chat_template="llama-3.1",
        )
        FastLanguageModel.for_inference(self.model)  # Enable native 2x faster inference

    def _generate_answer(self, inputs, max_new_tokens):
        """Generate from the tokenized prompt and return only the assistant's text."""
        outputs = self.model.generate(
            input_ids=inputs,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            temperature=0.1,
            min_p=0.1,
        )
        decoded = self.tokenizer.batch_decode(outputs)
        # The decoded sequence still contains the prompt: keep what follows
        # the last header marker and cut at the end-of-turn marker.
        answer = decoded[0].split('<|end_header_id|>')[-1].split('<|eot_id|>')[0]
        return answer.strip()

    async def infer(self, input_data: MedicalQuery) -> MedicalResponse:  # dependency
        """
        Runs a prediction.

        Args:
            input_data: Request body holding the user's question.

        Returns:
            MedicalResponse: The generated answer. For anything not detected
            as English, repeated sentences are removed from the answer.

        Raises:
            HTTPException: 503 when load_model() has not been called yet.
        """
        if self.model is None or self.tokenizer is None:
            # The service is not ready yet; this is not a client error.
            raise HTTPException(status_code=503, detail="Model is not loaded")

        messages = [
            {"role": "user", "content": input_data.question},
        ]
        inputs = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
        ).to("cuda")

        lang = language_identify(input_data.question)
        if "english" in lang.lower():
            response_text = self._generate_answer(inputs, self.ENGLISH_MAX_NEW_TOKENS)
        else:
            # Bengali or unidentified: allow a longer answer, then drop the
            # repeated sentences.
            raw_text = self._generate_answer(inputs, self.DEFAULT_MAX_NEW_TOKENS)
            response_text = filter_unique_bengali_sentences_and_rejoin(raw_text)

        print(response_text)

        return MedicalResponse(response=response_text)

In [None]:
# Single shared model wrapper used by the routes below; the model itself is
# loaded later by the startup handler.
qna_model = QnaModel()


@app.get('/')
def index():
    # Root route: returns a fixed sample chat payload containing one user message.
    sample_message = {
        "role": "user",
        "content": "Explain the importance of staying hydrated and its benefits for overall health.",
    }
    return {"messages": [sample_message]}


# POST endpoint: takes a medical question and returns the model's answer.
# The work is done by the qna_model.infer dependency; this handler only
# returns the result that dependency produced.
@app.post('/infer')
async def infer(output: MedicalResponse = Depends(qna_model.infer, )) -> MedicalResponse:
    """
    example post request body:
    
    {
        "question": "ভিশন থেরাপি কি?"
    }
    
    example response:
    {
    "response": "ভিশন থেরাপি হল একটি প্রক্রিয়া যা আপনার দৃষ্টি এবং দৃষ্টিভঙ্গি উন্নত করতে সাহায্য করে। এটি প্রায়শই একজন চিকিৎসক বা একজন পেশাদার দৃষ্টিভঙ্গি পরিচালক দ্বারা পরিচালিত হয়"
    }
    """
    return output


# Load the model once when the application starts. The handler is declared
# async, but load_model() is an ordinary blocking call.
# NOTE(review): the recorded cell output shows FastAPI reporting on_event as
# deprecated in favour of lifespan event handlers.
@app.on_event("startup")
async def startup():
    qna_model.load_model()

        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        
  @app.on_event("startup")


<h1 style="color:#05988A" id="Run-API"> Run API </h1>

In [None]:
# Finally run the app on the same port the ngrok tunnel was opened for.
uvicorn.run(app, port=port)

# it will take some moment (~10 sec) to load the model