# LLM Whisper HTTP Server
## with pyngrok tunnel

In [None]:
# Speech-to-text model, installed straight from the GitHub repository.
!pip install git+https://github.com/openai/whisper.git
# Web framework and ASGI server used for the HTTP endpoints.
!pip install fastapi uvicorn
# Python wrapper used to open the ngrok tunnel.
!pip install pyngrok
# NOTE(review): presumably needed by transformers for `device_map` loading — confirm.
!pip install accelerate
# NOTE(review): not imported anywhere in this notebook; presumably a dependency
# of some of the selectable models — confirm.
!pip install einops
# Allows nested asyncio event loops (applied before the server is started).
!pip install nest-asyncio
# NOTE(review): presumably required by FastAPI to parse the multipart file
# upload accepted by the /audio/ endpoint — confirm.
!pip install python-multipart

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-s51_pln3
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-s51_pln3
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
ERROR: Operation cancelled by user


In [None]:
import whisper
# Load the "base" Whisper checkpoint and move it to the CPU. The /audio/
# handler calls WhisperModel.transcribe() on each uploaded file.
WhisperModel = whisper.load_model("base").to("cpu")

## LLM model

In [None]:
import torch
import transformers
from transformers import AutoModelForCausalLM , AutoTokenizer

# Checkpoint selection: exactly one `model_name` assignment below should be
# left uncommented; the commented lines are alternative Hugging Face model ids.
# NOTE(review): the trailing "zh" / "cn" tags presumably mark Chinese-language
# models — confirm.
### https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
#model_name = "Q-bert/Mamba-130M"
#model_name = "Q-bert/Mamba-370M"
#model_name = "Q-bert/Mamba-790M"
#model_name = "Q-bert/Mamba-1B"
#model_name = "Q-bert/Mamba-3B"
#model_name = "Q-bert/Mamba-3B-slimpj"
#model_name = "ckip-joint/bloom-3b-zh" # zh
model_name = "google/gemma-2b-it"
#model_name = "microsoft/phi-2"
#model_name = "Qwen/Qwen1.5-7B-Chat" # cn
#model_name = "lmsys/vicuna-7b-v1.5-16k" # zh/cn
#model_name = "yentinglin/Taiwan-LLM-7B-v2.0.1-chat" # zh
#model_name = "mistralai/Mistral-7B-Instruct-v0.2"
#model_name = "MediaTek-Research/Breeze-7B-Instruct-v0.1" # zh/cn
#model_name = "FelixChao/Severus-7B" #

# Load the causal-LM weights onto the CUDA device. `trust_remote_code=True`
# allows model code shipped in the checkpoint repository to be executed, so
# only select repositories you trust.
#LLM = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto", device_map="cuda") # for Mamba
LLM = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda") # for the rest models

# Tokenizer for the same checkpoint; the /audio/ handler uses it to encode the
# prompt and decode the generated ids.
tokenizer = AutoTokenizer.from_pretrained(model_name)

## HTTP Server with Ngrok

In [None]:
import getpass
import os
import threading

from pyngrok import ngrok, conf


## set ngrok authtoken
# The token is read with getpass so it is not echoed into the cell output,
# then stored on pyngrok's default configuration object.
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken")
conf.get_default().auth_token = getpass.getpass()

In [None]:
# Open a ngrok tunnel to the HTTP server
# Port 5000 must match the port the uvicorn server is started on.
public_url = ngrok.connect(5000).public_url
print(" * ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}/\"".format(public_url, 5000))

# ... Update inbound traffic via APIs to use the public-facing ngrok URL


In [None]:
import nest_asyncio
# Patch asyncio so event loops can be nested.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop when uvicorn is started — confirm.
nest_asyncio.apply()

In [None]:
import json
import os
import tempfile

from fastapi import FastAPI, File, UploadFile
from fastapi.responses import Response
from pydantic import BaseModel
import uvicorn

# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()

@app.get("/")
def root():
    """Return a fixed greeting; usable as a simple reachability check."""
    return Response("Hello World!")

@app.post("/audio/")
def post_audio(audio: UploadFile = File(...)):
    """Transcribe an uploaded audio file with Whisper and feed the text to the LLM.

    The upload is written to a temporary file, transcribed via
    ``WhisperModel.transcribe``, and the transcript is used verbatim as the
    prompt for ``LLM.generate``.

    Args:
        audio: The multipart file field named ``audio``.

    Returns:
        A ``Response`` whose body is the decoded first sequence returned by
        ``LLM.generate``.
    """
    print(audio.filename)

    # The client controls `audio.filename`, so never use it as a path: keep
    # only its extension and let tempfile choose a unique, safe location.
    # This also copes with a missing filename and with concurrent uploads
    # that share the same name.
    suffix = os.path.splitext(os.path.basename(audio.filename or ""))[1]
    tmp = tempfile.NamedTemporaryFile(prefix="tmp_", suffix=suffix, delete=False)
    try:
        # Close the file before transcribing so the data is flushed to disk
        # and the path can be reopened by the transcriber.
        with tmp:
            tmp.write(audio.file.read())

        # Whisper transcribe
        result = WhisperModel.transcribe(tmp.name)
    finally:
        # Always remove the temporary file, even if transcription fails.
        os.remove(tmp.name)
    print("Whisper: "+result["text"])

    prompt = result["text"]
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    # Budget the *generated* tokens: `max_length` would also count the prompt,
    # leaving nothing to generate once the transcript reaches that length.
    output = LLM.generate(input_ids, max_new_tokens=128, num_beams=5, no_repeat_ngram_size=2)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print("LLM: "+generated_text)
    return Response(generated_text)

# Start the server on a new thread so this cell returns immediately.
# Thread must receive the callable and its arguments separately: calling
# uvicorn.run(...) inline would run the server on the current thread (blocking
# here) and pass its return value to Thread as the positional `group` argument.
server_thread = threading.Thread(
    target=uvicorn.run,
    args=(app,),
    kwargs={"host": "127.0.0.1", "port": 5000, "log_level": "info"},
)
server_thread.start()