# LLM Whisper HTTP Server
## with a pyngrok tunnel

In [1]:
!pip install git+https://github.com/openai/whisper.git
!pip install fastapi uvicorn
!pip install pyngrok
!pip install accelerate
!pip install einops
!pip install nest-asyncio
!pip install python-multipart

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-q09ojz8o
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-q09ojz8o
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Installing backend dependencies ... done
  Preparing metadata (pyproject.toml) ... done


In [2]:
import whisper

# Speech recognizer: load the Whisper "base" checkpoint, then move it to the CPU.
_whisper_base = whisper.load_model("base")
ASR = _whisper_base.to("cpu")

## LLM model

In [3]:
import torch
import transformers
from transformers import AutoModelForCausalLM , AutoTokenizer

# Candidate checkpoints (see the Open LLM Leaderboard):
# https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
# Leave exactly one `model_name` assignment uncommented.
model_name = "Q-bert/Mamba-130M"
#model_name = "Q-bert/Mamba-370M"
#model_name = "Q-bert/Mamba-790M"
#model_name = "Q-bert/Mamba-1B"
#model_name = "Q-bert/Mamba-3B"
#model_name = "Q-bert/Mamba-3B-slimpj"
#model_name = "ckip-joint/bloom-3b-zh" # Chinese (zh)
#model_name = "google/gemma-1.1-7b-it"
#model_name = "microsoft/phi-2"
#model_name = "microsoft/Orca-2-7b"
#model_name = "mistralai/Mistral-7B-Instruct-v0.2"
#model_name = "openlm-research/open_llama_3b_v2"
#model_name = "openlm-research/open_llama_7b_v2"
#model_name = "meta-llama/Llama-2-7b-hf"
#model_name = "meta-llama/Llama-2-7b-chat-hf"
#model_name = "lmsys/vicuna-7b-v1.5"
#model_name = "lmsys/vicuna-7b-v1.5-16k"
#model_name = "Nexusflow/Starling-LM-7B-beta"

#model_name = "Qwen/Qwen1.5-7B-Chat" # Tongyi Qianwen
#model_name = "01-ai/Yi-6B-Chat" # 01.AI (Lingyi Wanwu)
#model_name = "yentinglin/Taiwan-LLM-7B-v2.0.1-chat" # National Taiwan University
#model_name = "MediaTek-Research/Breeze-7B-Instruct-v0.1" # MediaTek ("Da-Ge")
#model_name = "INX-TEXT/Bailong-instruct-7B" # Chinese (zh), Bailong ("White Dragon")
#model_name = "taide/TAIDE-LX-7B-Chat" # TAIDE


# Loader settings used for the Mamba checkpoints.
LLM = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)
# Loader settings for the remaining checkpoints (bfloat16 weights):
#LLM = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda")

# Tokenizer that matches the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## HTTP Server with Ngrok

In [4]:
import getpass
import os
import threading

from pyngrok import ngrok, conf


# Ask for the ngrok authtoken interactively (getpass keeps it out of the
# notebook source) and store it on pyngrok's default configuration.
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken")
ngrok_config = conf.get_default()
ngrok_config.auth_token = getpass.getpass()

Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken


 ········


In [5]:
# Expose the local HTTP server through an ngrok tunnel and report its public URL.
port = 5000
tunnel = ngrok.connect(port)
public_url = tunnel.public_url
print(" * ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}/\"".format(public_url, port))

# ... Update inbound traffic via APIs to use the public-facing ngrok URL


 * ngrok tunnel "https://ecf1-2407-4d00-8d00-00-f8.ngrok-free.app" -> "http://127.0.0.1:5000/"


In [6]:
# nest_asyncio patches asyncio so that an event loop can be re-entered.
# NOTE(review): presumably required because uvicorn.run() is invoked from a
# notebook kernel that already has a running event loop — confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
import json
import os
import tempfile

import uvicorn
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import Response
from pydantic import BaseModel

# Application object; the route decorators register handlers on it.
app = FastAPI()


@app.get("/")
def root():
    """Answer GET / with the fixed body "Hello World!"."""
    greeting = Response("Hello World!")
    return greeting

@app.post("/audio")
def post_audio(audio: UploadFile = File(...)):
    """Transcribe an uploaded recording with Whisper and reply with LLM output.

    The upload is written to a uniquely named temporary file, transcribed via
    ``ASR.transcribe``, and the transcript is used verbatim as the prompt for
    ``LLM.generate``. The temporary file is removed once transcription ends.

    Args:
        audio: Multipart form field named ``audio`` carrying the recording.

    Returns:
        A ``Response`` whose body is the decoded text generated by the LLM.
    """
    print(audio.filename)

    # The client-supplied filename is untrusted: it may be missing or contain
    # path separators. Only its extension is reused, and the file itself gets
    # a unique name so concurrent uploads cannot overwrite each other.
    suffix = os.path.splitext(os.path.basename(audio.filename or ""))[1]
    with tempfile.NamedTemporaryFile(prefix="tmp_", suffix=suffix, delete=False) as tmp:
        tmp.write(audio.file.read())
        fname = tmp.name

    try:
        # Whisper transcribe
        result = ASR.transcribe(fname)
    finally:
        # Clean up the temporary file even if transcription raises.
        os.remove(fname)
    print("ASR: "+result["text"])

    # The transcript is the LLM prompt; max_length bounds prompt + generated tokens.
    prompt = result["text"]
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    output = LLM.generate(input_ids, max_length=128, num_beams=5, no_repeat_ngram_size=2)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print("LLM: "+generated_text)
    return Response(generated_text)

# Start uvicorn in a background thread so this cell returns and the kernel
# stays usable. The callable must be handed over via `target=`: writing
# threading.Thread(uvicorn.run(...)) executes the server inline (blocking the
# cell) and never gives the thread anything to run.
server_thread = threading.Thread(
    target=uvicorn.run,
    args=(app,),
    kwargs={"host": "127.0.0.1", "port": 5000, "log_level": "info"},
    daemon=True,  # do not keep the kernel process alive at shutdown
)
server_thread.start()

INFO:     Started server process [22523]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:5000 (Press CTRL+C to quit)


gTTS.mp3




Whisper:  Hello, how are you?
LLM:  Hello, how are you?

I’m so glad you’re here. I’ve been waiting for you for a long time, and it’s been a pleasure to meet you. We’d like to talk a little bit about what we do and how we can help you, so let me start by saying that we are a small team, but we have a lot of work to do. So, we want to make sure that you get the most out of your time with us. And we’ll be happy to answer any questions you may have. If you need anything, just let us know.

INFO:     2407:4d00:8d00::f8:0 - "POST /audio HTTP/1.1" 200 OK
