# Llama 2 FastAPI Service Example

In [None]:
# import json
# import logging
# from huggingface_hub import hf_hub_download
# from llama_cpp import Llama

# GGML_HUGGINGFACE_REPO = "audreyt/Taiwan-LLaMa-v1.0-GGML"
# GGML_HUGGINGFACE_BIN_FILE = "Taiwan-LLaMa-13b-1.0.ggmlv3.q5_1.bin"

# 設定日誌
# logging.basicConfig(level=logging.INFO,
#                     format='%(asctime)s [%(levelname)s] %(message)s',
#                     datefmt='%Y-%m-%d %H:%M:%S')
# logger = logging.getLogger(__name__)

# class Model:
#     def __init__(self):
#         self.loaded = False        # 模型是否已經加載的標志
#         self.lcpp_llm = None       # 儲存 Llama 模型的變數
#         self.model_path = ""       # 模型的路徑

#     def load(self, model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML", model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin"):
#         # 從 Hugging Face Model Hub 下載模型並設定其路徑
#         self.model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
#         logger.info("Finish: Load Llama 2 model.")  # 輸出模型加載完成的信息

#     def predict(self, data):
#         # 如果模型還沒有被加載，則加載模型
#         if not self.loaded:
#             self.loaded = True
#             self.lcpp_llm = Llama(
#                 model_path=self.model_path,
#                 n_threads=2,             # 使用的執行緒數量
#                 n_batch=1024,            # 批次大小
#                 n_gpu_layers=32          # 使用的GPU層數
#             )
#         logger.info("========== Start ==========")
#         # 將 JSON 字符串反序列化成字典
#         data_dict = json.loads(data)
#         logger.info("Input: {}.".format(data_dict))
#         # 使用 Llama 模型進行預測
#         response = self.lcpp_llm(prompt=data_dict['prompt'], max_tokens=data_dict['max_tokens'], temperature=0.5, top_p=0.95, repeat_penalty=1.2, top_k=150, echo=True)
#         logger.info("Response: {}.".format(response))
#         logger.info("==========  End  ==========")

#         return {"answer": response["choices"][0]["text"]}  # 返回模型的預測結果


In [1]:
import openai

import pandas as pd

# from langchain.vectorstores import FAISS
# from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
# from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever



# Model wrapper: retrieves reference passages and asks ChatGPT to answer from them
class Model():
    """Retrieval-augmented question answering answered by ChatGPT.

    A query is matched against a FAISS index, each retrieved snippet is
    widened using the full page text stored in a CSV file, and the resulting
    passages are sent to ChatGPT together with the question.

    The retriever and the URL -> page-text table are built lazily on first use
    and then reused, so the embedding model, the FAISS index and the CSV are
    loaded once per instance instead of once per query. Changes to the files
    on disk are therefore not picked up until a new instance is created.
    """

    # Data locations and retrieval settings used by the pipeline.
    CSV_PATH = 'document/clean_content_4.csv'
    INDEX_PATH = 'embeddings/all_bge_large_chatgpt'
    EMBEDDING_MODEL = "BAAI/bge-large-zh-v1.5"
    TOP_K = 5                 # number of documents requested from FAISS
    PARAGRAPH_WIDTH = 128     # characters of page text kept per passage

    def __init__(self):
        self._retriever = None   # cached ContextualCompressionRetriever
        self._contents = None    # cached {url: page content} mapping

    def ask_chatgpt(self, prompt):
        """Send ``prompt`` to gpt-3.5-turbo and return the raw completion.

        The API key is taken from the ``OPENAI_API_KEY`` environment variable
        when it is set; otherwise whatever key the ``openai`` module already
        holds is left untouched (instead of being overwritten with an empty
        string on every call).

        Args:
            prompt: The user message sent after the fixed system message.

        Returns:
            The object returned by ``openai.ChatCompletion.create``.
        """
        import os  # local import keeps the class self-contained

        api_key = os.environ.get("OPENAI_API_KEY")
        if api_key:
            openai.api_key = api_key

        # NOTE(review): ``openai.ChatCompletion`` is the pre-1.0 client
        # interface; confirm the pinned openai version before upgrading.
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            temperature=0,
            messages=[
                {"role": "system", "content": "You are an artificial intelligence assistant and National Taiwan University campus guide"},
                {"role": "user", "content": prompt},
            ],
        )
        return completion

    def _get_retriever(self):
        """Return the cached retriever, building it on first use."""
        retriever = getattr(self, '_retriever', None)
        if retriever is None:
            embeddings = HuggingFaceBgeEmbeddings(model_name=self.EMBEDDING_MODEL)
            # Only the redundancy filter is active; a similarity-threshold
            # EmbeddingsFilter was tried previously and is disabled.
            redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
            pipeline_compressor = DocumentCompressorPipeline(
                transformers=[redundant_filter]
            )
            db = FAISS.load_local(self.INDEX_PATH, embeddings)
            base_retriever = db.as_retriever(search_kwargs={"k": self.TOP_K})
            retriever = ContextualCompressionRetriever(
                base_compressor=pipeline_compressor,
                base_retriever=base_retriever,
            )
            self._retriever = retriever
        return retriever

    def _get_contents(self):
        """Return the cached ``{url: content}`` table read from the CSV."""
        contents = getattr(self, '_contents', None)
        if contents is None:
            web_page = pd.read_csv(self.CSV_PATH)
            # When a URL occurs more than once the last row wins.
            contents = dict(zip(web_page['url'], web_page['content']))
            self._contents = contents
        return contents

    @staticmethod
    def _expand_snippet(content, snippet, width=128):
        """Return ``width`` characters of ``content`` starting at ``snippet``.

        Falls back to ``snippet`` itself when the page text is unavailable
        (URL not in the CSV, empty/NaN cell) or does not contain the snippet.
        """
        if not isinstance(content, str):
            return snippet
        start = content.find(snippet)
        if start == -1:
            return snippet
        return content[start:start + width]

    @staticmethod
    def _build_prompt(paragraphs, query):
        """Format the numbered reference passages and the question as one prompt."""
        numbered = [f'\n文檔{number}:{text}' for number, text in enumerate(paragraphs, start=1)]
        return '以下是參考資料，請忽略不相關的文件，回答盡量簡短精要，切勿重複輸出一樣文句子:{}\n請問:{}'.format(','.join(numbered), query)

    def RAG(self, query):
        """Retrieve passages for ``query`` and ask ChatGPT to answer from them.

        Args:
            query: The user's question.

        Returns:
            A tuple ``(retrieve_docs, response)``: the retrieved documents and
            the text of the first ChatGPT choice.
        """
        retrieve_docs = self._get_retriever().get_relevant_documents(query)
        contents = self._get_contents()

        paragraphs = [
            self._expand_snippet(
                contents.get(doc.metadata.get('url')),
                doc.page_content,
                self.PARAGRAPH_WIDTH,
            )
            for doc in retrieve_docs
        ]

        prompt = self._build_prompt(paragraphs, query)
        response = self.ask_chatgpt(prompt)['choices'][0]['message']['content']

        return retrieve_docs, response

    def predict(self, query):
        """Answer ``query``, print the answer and its source URLs, and return both.

        Args:
            query: The user's question.

        Returns:
            ``{"answer": <text>, "urls": [<url>, ...]}`` with one URL per
            retrieved document, in retrieval order (duplicates are kept).
        """
        retrieve_docs, response = self.RAG(query)
        urls = [doc.metadata['url'] for doc in retrieve_docs]

        print("回覆:")
        print(response)

        print('參考網頁：')
        for url in urls:
            print(url)

        return {"answer": response, "urls": urls}

In [2]:
# model_instance = Model()
# model_instance.predict("台大校長是誰？")

## Step 3: Build the FastAPI service

In [3]:
import json
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# One shared model wrapper serves every request handled by the application.
model_instance = Model()
app = FastAPI()

# CORS settings: accept cross-origin requests from any origin, with any HTTP
# method and any header, and allow credentials (cookies, HTTP auth) to be sent.
# NOTE(review): wildcard origins combined with credentials is very permissive;
# confirm this is intended before exposing the service publicly.
_CORS_OPTIONS = {
    "allow_origins": ['*'],
    "allow_credentials": True,
    "allow_methods": ['*'],
    "allow_headers": ['*'],
}
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)

@app.post("/predict")  # POST route that runs a model prediction
def predict_text(input: dict):  # the JSON request body arrives as a plain dict
    """Answer the question found under the ``prompt`` key of the request body.

    Args:
        input: Parsed JSON body; must contain a string ``prompt``.

    Returns:
        The dict produced by ``model_instance.predict``.

    Raises:
        HTTPException: 422 when ``prompt`` is missing or is not a string, so
            the client gets a clear validation error instead of an HTTP 500
            caused by a ``KeyError``.
    """
    from fastapi import HTTPException  # local import; fastapi is already a dependency of this file

    print(f'API input: {input}')
    prompt = input.get('prompt')
    if not isinstance(prompt, str):
        raise HTTPException(
            status_code=422,
            detail="Request body must contain a string field 'prompt'.",
        )
    result = model_instance.predict(prompt)  # run retrieval + ChatGPT
    print(result)
    return result


## Step 4: Start the FastAPI service

In [4]:
import os

import nest_asyncio
from pyngrok import ngrok
import uvicorn

# The ngrok auth token is a credential and must not be committed with the
# notebook; it is read from the NGROK_AUTHTOKEN environment variable instead.
NGROK_TOKEN = os.environ.get('NGROK_AUTHTOKEN')

# Local port shared by the uvicorn server and the ngrok tunnel.
PORT = 8000

# Register the auth token with ngrok when one was provided.
if NGROK_TOKEN:
    ngrok.set_auth_token(NGROK_TOKEN)

# Open an ngrok tunnel so the local port is reachable from outside.
ngrok_tunnel = ngrok.connect(PORT)
public_url = ngrok_tunnel.public_url

print('Public URL:', public_url)  # show the public URL
print("You can use {}/predict to get the assistant result.".format(public_url))


# nest_asyncio lets uvicorn start inside the notebook's already-running event loop.
nest_asyncio.apply()

try:
    # Serve the FastAPI application on the local port (blocks until stopped).
    uvicorn.run(app, port=PORT)
finally:
    # Close the tunnel when the server stops so re-running the cell does not
    # leave a stale tunnel behind.
    ngrok.disconnect(public_url)


t=2024-01-29T15:58:17+0800 lvl=warn msg="ngrok config file found at both XDG and legacy locations, using XDG location" xdg_path=/home/ai2lab/.config/ngrok/ngrok.yml legacy_path=/home/ai2lab/.ngrok2/ngrok.yml
INFO:     Started server process [814261]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://9fad-140-112-90-16.ngrok-free.app
You can use https://9fad-140-112-90-16.ngrok-free.app/predict to get the assistant result.
API input: {'prompt': '如何申請口試？'}
回覆:
口試申請需填寫申請表，並根據口試類型檢附相應文件，申請需提前二週進行。具體口試方式可選擇視訊口試、實體口試或部分視訊部分實體。
參考網頁：
https://www.nd.ntu.edu.tw/News_Content.aspx?n=70&s=662
https://www.nd.ntu.edu.tw/News_Content.aspx?n=31&s=495
https://www.nd.ntu.edu.tw/News_Content.aspx?n=70&s=4167
https://www.nd.ntu.edu.tw/News_Content.aspx?n=31&s=495
https://www.nd.ntu.edu.tw/News_Content.aspx?n=3&s=4374
{'answer': '口試申請需填寫申請表，並根據口試類型檢附相應文件，申請需提前二週進行。具體口試方式可選擇視訊口試、實體口試或部分視訊部分實體。', 'urls': ['https://www.nd.ntu.edu.tw/News_Content.aspx?n=70&s=662', 'https://www.nd.ntu.edu.tw/News_Content.aspx?n=31&s=495', 'https://www.nd.ntu.edu.tw/News_Content.aspx?n=70&s=4167', 'https://www.nd.ntu.edu.tw/News_Content.aspx?n=31&s=495', 'https://www.nd.ntu.edu.tw/News_Content.aspx?n=3&s=4374']}
INFO:     140.112.90.16:0 - "POST /predict HTTP/1.1" 200 OK
API input: {'prompt': '台大校訓？'}
回覆

t=2024-01-30T07:38:29+0800 lvl=eror msg="session closed, starting reconnect loop" obj=tunnels.session obj=csess id=f2122ad316b0 err="read EOF from remote peer"
t=2024-01-30T07:38:53+0800 lvl=eror msg="heartbeat timeout, terminating session" obj=tunnels.session obj=csess id=71879c54da93 clientid=95088e6b786171802451251a08cb59d2


API input: {'prompt': '台灣大學口試截止日'}
回覆:
從提供的資料中，無法得知台灣大學口試的截止日。請參考台灣大學相關官方網站或聯繫台灣大學相關單位以獲取正確的口試截止日資訊。
參考網頁：
https://philo.ntu.edu.tw
https://management.ntu.edu.tw/cm/board_file/2/0/000000_34.pdf
http://www.oc.ntu.edu.tw/?cat=11
{'answer': '從提供的資料中，無法得知台灣大學口試的截止日。請參考台灣大學相關官方網站或聯繫台灣大學相關單位以獲取正確的口試截止日資訊。', 'urls': ['https://philo.ntu.edu.tw', 'https://management.ntu.edu.tw/cm/board_file/2/0/000000_34.pdf', 'http://www.oc.ntu.edu.tw/?cat=11']}
INFO:     54.221.131.211:0 - "POST /predict HTTP/1.1" 200 OK


t=2024-01-31T03:24:08+0800 lvl=eror msg="session closed, starting reconnect loop" obj=tunnels.session obj=csess id=f2122ad316b0 err="read EOF from remote peer"
t=2024-01-31T03:24:24+0800 lvl=eror msg="heartbeat timeout, terminating session" obj=tunnels.session obj=csess id=3336cd996a11 clientid=95088e6b786171802451251a08cb59d2


API input: {'prompt': 'hi'}
回覆:
你好！有什麼我可以幫助你的嗎？
參考網頁：
https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2009overview_cht.pdf
https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2009overview_cht.pdf
https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2009overview_cht.pdf
https://oiainternship.ntu.edu.tw/
{'answer': '你好！有什麼我可以幫助你的嗎？', 'urls': ['https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2009overview_cht.pdf', 'https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2009overview_cht.pdf', 'https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2009overview_cht.pdf', 'https://oiainternship.ntu.edu.tw/']}
INFO:     107.20.17.88:0 - "POST /predict HTTP/1.1" 200 OK
API input: {'prompt': '你會說英文嗎？'}
回覆:
是的，我會說英文。
參考網頁：
https://fltc.fltc.ntu.edu.tw/index.php?action=news-detail&cid=3&id=385
https://fltc.fltc.ntu.edu.tw/index.php?action=news-detail&cid=3&id=385
https://ntubeats.ntu.edu.tw/pdf/039fulltext.pdf
{'answer': '是的，我會說英文。', 'urls': ['https://fltc.fltc.ntu.edu.tw/index.php?action=news-detail&cid=3&id=385', 'https://fltc.fltc.nt

t=2024-02-02T13:58:08+0800 lvl=eror msg="session closed, starting reconnect loop" obj=tunnels.session obj=csess id=f2122ad316b0 err="read EOF from remote peer"
t=2024-02-02T13:58:24+0800 lvl=eror msg="heartbeat timeout, terminating session" obj=tunnels.session obj=csess id=b065c736426f clientid=95088e6b786171802451251a08cb59d2


API input: {'prompt': '台大調酒社有開放校外人士參加'}
回覆:
很抱歉，我無法回答您的問題。
參考網頁：
https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2014overview_cht.pdf
http://www.bioagri.ntu.edu.tw/bulletin.php
{'answer': '很抱歉，我無法回答您的問題。', 'urls': ['https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2014overview_cht.pdf', 'http://www.bioagri.ntu.edu.tw/bulletin.php']}
INFO:     54.196.191.99:0 - "POST /predict HTTP/1.1" 200 OK
API input: {'prompt': '嗎'}
回覆:
嗎是一個用來提問的助詞。
參考網頁：
http://play.google.com/store/apps/details?id=org.coursera.android
{'answer': '嗎是一個用來提問的助詞。', 'urls': ['http://play.google.com/store/apps/details?id=org.coursera.android']}
INFO:     54.196.191.99:0 - "POST /predict HTTP/1.1" 200 OK
API input: {'prompt': '台大調酒社有開放校外人士參加嗎'}
回覆:
很抱歉，根據提供的資料，無法確定台大調酒社是否開放校外人士參加。
參考網頁：
https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2014overview_cht.pdf
https://ntuweb.cloud.ntu.edu.tw/ntuebook/cht/2013overview_cht.pdf
http://www.bioagri.ntu.edu.tw/bulletin.php
{'answer': '很抱歉，根據提供的資料，無法確定台大調酒社是否開放校外人士參加。', 'urls': ['https://ntuweb

t=2024-02-08T08:02:24+0800 lvl=eror msg="heartbeat timeout, terminating session" obj=tunnels.session obj=csess id=73e0410dedfb clientid=95088e6b786171802451251a08cb59d2
t=2024-02-08T08:02:24+0800 lvl=eror msg="session closed, starting reconnect loop" obj=tunnels.session obj=csess id=f2122ad316b0 err="session closed"
t=2024-02-08T08:02:34+0800 lvl=eror msg="failed to reconnect session" obj=tunnels.session obj=csess id=f2122ad316b0 err="failed to dial ngrok server with address \"connect.us.ngrok-agent.com:443\": dial tcp 3.134.73.173:443: i/o timeout"
t=2024-02-08T08:06:00+0800 lvl=eror msg="heartbeat timeout, terminating session" obj=tunnels.session obj=csess id=5048918d17b5 clientid=95088e6b786171802451251a08cb59d2
t=2024-02-08T08:06:00+0800 lvl=eror msg="session closed, starting reconnect loop" obj=tunnels.session obj=csess id=f2122ad316b0 err="session closed"
t=2024-02-08T08:06:10+0800 lvl=eror msg="failed to reconnect session" obj=tunnels.session obj=csess id=f2122ad316b0 err="faile