In [None]:
%%capture
# Environment setup for the notebook. `%%capture` (which must stay the first
# line of the cell) silences the output of everything below.

# `--no-deps` installs exactly the listed packages without resolving their
# dependencies. Only xformers is pinned (0.0.29.post3); the rest float.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
# Regular installs (dependencies resolved normally).
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
# unsloth itself is also installed without dependency resolution.
!pip install --no-deps unsloth
# NOTE(review): these three are unpinned and installed with dependency
# resolution, so they may replace versions chosen above - confirm the
# resulting environment is the one intended.
!pip install transformers  wandb vllm


In [None]:
from unsloth import FastLanguageModel
import torch
# Settings shared by the two FastLanguageModel calls below.
max_seq_length = 2048  # forwarded as `max_seq_length` to from_pretrained
lora_rank = 32  # used for `max_lora_rank`, `r` and `lora_alpha` below
dtype = None  # forwarded as-is; presumably lets the library pick a dtype - TODO confirm

# Load the checkpoint and its tokenizer. The keyword names suggest 4-bit
# loading and an accelerated inference backend capped at 60% of GPU memory;
# NOTE(review): confirm the exact semantics against the Unsloth documentation.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "rj25031/psyco-counsil-ai",
    max_seq_length = max_seq_length,
    load_in_4bit = True,
    fast_inference = True,
    max_lora_rank = lora_rank,
    dtype = dtype,
    gpu_memory_utilization = 0.6,
)

# Rebind `model` to the PEFT-wrapped version with LoRA settings for the listed
# projection modules.
# NOTE(review): nothing visible in this notebook trains the model - confirm
# that wrapping it with adapters is actually needed for serving.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank,  # alpha is set equal to the rank
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Install the packages used by the API cell below (`-q` keeps pip quiet).
# NOTE(review): `transformers` and `torch` are reinstalled here, after the
# model-loading cell has already imported them - confirm this does not change
# versions underneath the running kernel. `pymongo`, `sentence-transformers`
# and `scikit-learn` are not used by any code visible in this notebook.
!pip install fastapi uvicorn pyngrok transformers torch nest_asyncio pymongo sentence-transformers scikit-learn -q

# Register the ngrok auth token. `your_ngrock_api_key` is a placeholder that
# must be replaced with a real token before running.
# NOTE(review): avoid committing the real token with the notebook.
!ngrok config add-authtoken your_ngrock_api_key


In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from pyngrok import ngrok
import uvicorn
import re
import nest_asyncio

# The FastAPI application that serves the counseling model.
app = FastAPI()

# Allow browser clients on any origin to call the API.
# `allow_credentials` is deliberately False: the FastAPI CORS documentation
# states that origins, methods and headers cannot be the wildcard "*" when
# credentials are enabled, and this API uses neither cookies nor auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Prompt sent to the model for every request; `{user_input}` is filled in with
# the client's message via `str.format`. The final "### Your Response as the
# Therapist:" header marks where the model's reply begins.
app.prompt_template = """You are a licensed mental health therapist conducting a private counseling session. Your tone should be warm, empathetic, and non-judgmental. Provide thoughtful, supportive responses based on the client’s concerns.

### Current Client Message:
{user_input}

### Your Response as the Therapist:
"""

# Request body schema for the /chat endpoint.
# (Documented with comments rather than a class docstring so the generated
# API schema stays exactly as it was.)
class ChatInput(BaseModel):
    # The client's message; substituted into the prompt template.
    prompt: str

def generate_response(user_input: str, max_new_tokens: int = 150) -> str:
    """Generate the therapist's reply to a single client message.

    The message is substituted into ``app.prompt_template``, tokenized, and
    passed to ``model.generate``. Only the tokens produced *after* the prompt
    are decoded, so text inside the client's own message (for example a copy
    of the "### Your Response as the Therapist:" header) can never be mistaken
    for the model's reply, and the result does not depend on the prompt
    surviving a tokenize/decode round trip unchanged.

    Args:
        user_input: The client's message.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The generated reply, cut off at the first "###" (where the model starts
        a new section) and stripped of surrounding whitespace. If that leaves
        nothing, a generic supportive fallback sentence is returned instead.
    """
    fallback = "I'm here to support you—could you tell me more?"

    full_prompt = app.prompt_template.format(user_input=user_input)
    inputs = tokenizer([full_prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Assumes `generate` returns the prompt tokens followed by the new tokens
    # (the original implementation relied on the prompt being present in the
    # output as well); drop the prompt part before decoding.
    prompt_length = inputs["input_ids"].shape[1]
    generated = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )[0]

    # Keep only the text before the model opens another "###" section.
    reply = generated.partition("###")[0].strip()
    return reply or fallback

# POST /chat: takes a JSON body matching ChatInput and answers with
# {"response": <model reply>}.
@app.post("/chat")
async def chat(data: ChatInput):
    # Hand the client's prompt to the generation helper and wrap its reply.
    return {"response": generate_response(data.prompt)}

# Port shared by the ngrok tunnel and the uvicorn server.
API_PORT = 8000

# Open a public ngrok tunnel to the local port and show its URL.
ngrok_tunnel = ngrok.connect(API_PORT)
print(f"Public API URL: {ngrok_tunnel.public_url}")

# Patch asyncio so uvicorn can start its loop inside the notebook's
# already-running event loop.
nest_asyncio.apply()
try:
    # Blocks until the server is stopped (e.g. by interrupting the cell).
    uvicorn.run(app, host="0.0.0.0", port=API_PORT, reload=False)
finally:
    # Close the tunnel when the server stops so that re-running this cell does
    # not accumulate open tunnels. Cleanup is best-effort: a failure here is
    # reported but must not mask whatever stopped the server.
    try:
        ngrok.disconnect(ngrok_tunnel.public_url)
    except Exception as exc:
        print(f"Could not close ngrok tunnel: {exc}")
