In [None]:
# Install the Python packages used by app.py and this notebook (-q keeps pip quiet).
!pip install -q transformers
!pip install -q bitsandbytes # Quantization backend needed for the 4-bit BitsAndBytesConfig in app.py
!pip install -q accelerate # Needed by transformers for low_cpu_mem_usage / device_map model loading
!pip install -q langchain
!pip install -q langchainhub
!pip install -q langchain-chroma # Chroma vector store integration for LangChain
!pip install -q langchain_experimental
!pip install -q langchain-community
!pip install -q langchain_huggingface
!pip install -q python-dotenv==1.1.0
!pip install -q pypdf
!pip install -q streamlit

In [None]:
%%writefile app.py
import torch
import re
import tempfile
import os
import streamlit as st

from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_chroma import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain import hub

@st.cache_resource
def load_embeddings(model_name: str):
    """Create the Hugging Face embedding model identified by ``model_name``.

    The function is decorated with ``st.cache_resource`` so that Streamlit
    can reuse the returned object instead of rebuilding it on each call
    with the same ``model_name``.

    Args:
        model_name: Hugging Face model identifier handed to
            ``HuggingFaceEmbeddings``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return embedding_model

@st.cache_resource
def load_llm(model_name: str, config):
    """Load a quantized causal LM and wrap it as a LangChain LLM.

    The function is decorated with ``st.cache_resource`` so that Streamlit
    can reuse the loaded model across calls with the same arguments.

    Args:
        model_name: Hugging Face model identifier, used for both the model
            weights and the tokenizer.
        config: Quantization configuration forwarded to
            ``from_pretrained`` as ``quantization_config``.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline
        that generates at most 512 new tokens per call.
    """
    # Device placement is decided while the weights are loaded, so
    # device_map belongs here. Passing it to pipeline() together with an
    # already-instantiated model has no effect on where the model lives.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=config,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map="auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_pipeline = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        # Reuse the end-of-sequence token as the padding token for generation.
        pad_token_id=tokenizer.eos_token_id,
    )
    return HuggingFacePipeline(pipeline=model_pipeline)

# Matches any code point in the UTF-16 surrogate range (U+D800..U+DFFF).
_SURROGATE_PATTERN = re.compile(r'[\ud800-\udfff]')


def remove_invalid_surrogates(text):
    """Return ``text`` with every surrogate code point removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without characters in the range U+D800..U+DFFF;
        all other characters are kept unchanged and in order.
    """
    cleaned = _SURROGATE_PATTERN.sub('', text)
    return cleaned

def process_pdf(uploaded_file, session_state):
    """Index an uploaded PDF and build a retrieval-augmented QA chain for it.

    Args:
        uploaded_file: Uploaded file object. Only ``getvalue()`` is called
            on it, and the result is written out as the PDF's raw bytes.
        session_state: Object exposing ``embeddings`` (used for chunking
            and for the vector store) and ``llm`` (used to generate the
            answer).

    Returns:
        A ``(rag_chain, num_chunks)`` tuple: the runnable chain that takes
        a question string and yields the model output as a string, and the
        number of chunks the PDF was split into.
    """
    import uuid  # Local import: only needed to name the per-upload collection.

    # PyPDFLoader is given a file path, so the upload is spilled to disk.
    tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with tmpf:
            tmpf.write(uploaded_file.getvalue())
        documents = PyPDFLoader(tmpf.name).load()
    finally:
        # delete=False means nobody else removes the file; clean it up even
        # when writing or parsing fails. Nothing below reads the file again.
        os.unlink(tmpf.name)

    semantic_splitter = SemanticChunker(
        embeddings=session_state.embeddings,
        buffer_size=1,
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=95,
        min_chunk_size=500,
        add_start_index=True
    )

    docs = semantic_splitter.split_documents(documents=documents)
    for doc in docs:
        # Strip lone surrogate code points from the extracted text before
        # it is embedded and stored.
        doc.page_content = remove_invalid_surrogates(doc.page_content)

    # Give each upload its own collection. Without an explicit name the
    # default collection is used, so a second PDF processed in the same
    # process could be retrieved together with chunks of the first one.
    # NOTE(review): confirm against the installed langchain-chroma/chromadb.
    vec_db = Chroma.from_documents(
        documents=docs,
        embedding=session_state.embeddings,
        collection_name=f"pdf_{uuid.uuid4().hex}",
    )
    retriever = vec_db.as_retriever()

    prompt = hub.pull("rlm/rag-prompt")

    def format_docs(docs):
        """Join the retrieved chunks into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    # The question is passed through unchanged and also sent to the
    # retriever, whose results become the prompt's context.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | session_state.llm
        | StrOutputParser()
    )

    return rag_chain, len(docs)

# Quantization settings handed to load_llm() when the chat model is loaded.
nf4_config = BitsAndBytesConfig(
    # Load the weights as 4-bit values ...
    load_in_4bit=True,
    # ... using the NF4 quantization type ...
    bnb_4bit_quant_type="nf4",
    # ... run the computation in bfloat16 ...
    bnb_4bit_compute_dtype=torch.bfloat16,
    # ... and enable double quantization.
    bnb_4bit_use_double_quant=True,
)


# Seed the per-session defaults once. Streamlit re-executes this script on
# every interaction, and only values kept in st.session_state survive a rerun,
# so existing entries must not be overwritten here.
_SESSION_DEFAULTS = {
    "rag_chain": None,       # RAG chain built from the uploaded PDF
    "models_loaded": False,  # set to True once both models are available
    "embeddings": None,
    "llm": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# Page chrome and the short usage guide shown at the top of the app.
st.set_page_config(page_title="RAG Assistant", layout="wide")
st.title("RAG PDF Assistant")

st.markdown("""
**AI allows you to directly ask questions and get answers from the content of PDF documents in Vietnamese**

**Simple use:**
1. **Upload PDF:** Choose a PDF file to upload and click "Process PDF"
2. **Question:** Type your question about the content of file that you've just uploaded
""")

# Load both models once per session and remember them in session state, so
# later reruns of the script skip this branch entirely.
if not st.session_state.models_loaded:
    st.info("Downloading models...")
    st.session_state.embeddings = load_embeddings("bkai-foundation-models/vietnamese-bi-encoder")
    st.session_state.llm = load_llm("lmsys/vicuna-7b-v1.5", nf4_config)
    st.session_state.models_loaded = True
    st.success("Models loaded successfully!")
    # Restart the script so the page is redrawn without the loading notice.
    st.rerun()

# Step 1: upload a PDF and, on request, build the RAG chain for it.
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
if uploaded_file and st.button("Process PDF"):
    with st.spinner("Processing PDF..."):
        st.session_state.rag_chain, num_chunks = process_pdf(uploaded_file, st.session_state)
        st.success(f"PDF processed successfully! Number of chunks: {num_chunks}")

# Step 2: once a chain exists, answer questions about the processed PDF.
if st.session_state.rag_chain:
    question = st.text_input("Ask a question:")
    if question:
        with st.spinner("Answering..."):
            output = st.session_state.rag_chain.invoke(question)
            # Keep only the text after the "Answer:" marker when it is
            # present (presumably the output echoes the prompt before it).
            # The membership test and the split use the same marker, so an
            # "Answer:" that is not followed by a space can no longer pass
            # the test and then fail the split with an IndexError.
            answer_marker = "Answer:"
            if answer_marker in output:
                answer = output.split(answer_marker)[1].strip()
            else:
                answer = output
            st.write("Answer: ", answer)

In [None]:
# `import urllib` alone does not load the `urllib.request` submodule, so
# import it explicitly instead of relying on another module having done so.
import urllib.request

# Look up this machine's public IPv4 address and print it as the value to use
# for the localtunnel password prompt. The response is closed after reading.
with urllib.request.urlopen('https://ipv4.icanhazip.com') as response:
    public_ip = response.read().decode('utf8').strip("\n")
print("Password/Endpoint IP for localtunnel is:", public_ip)

In [None]:
# Install the localtunnel npm package that the final cell runs through npx.
!npm install localtunnel

In [None]:
# Start the Streamlit app written to app.py. `&>` sends both stdout and stderr
# to /content/logs.txt, and the trailing `&` runs it in the background so the
# cell returns immediately.
!streamlit run app.py &>/content/logs.txt &

In [None]:
# Expose local port 8501 through localtunnel. 8501 is Streamlit's default
# port, and the `streamlit run` cell does not override it.
!npx localtunnel --port 8501