In [None]:
# Install the app's dependencies. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel.
# NOTE(review): langchain, streamlit, docx2txt and faiss-gpu are unpinned; app.py
# uses the legacy `langchain.*` import paths, so pin them to known-good versions.
%pip install -q langchain streamlit docx2txt  PyPDF2==3.0.1 openai==0.27.6 faiss-gpu altair==4 tiktoken==0.4.0 huggingface-hub==0.14.1 InstructorEmbedding==1.0.1 sentence-transformers==2.2.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m709.0/709.0 kB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.5 MB/s[0m e

In [None]:
%%writefile htmlTemplate.py
# Stylesheet injected once per page render (via st.write(..., unsafe_allow_html=True)).
# The <style> element is explicitly closed so the markup is well-formed and
# nothing rendered after it can be swallowed into the stylesheet.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}

.chat-message .message {
  width: 80%;
  padding: 0 1.5rem;
  color: #fff;
}
</style>
'''

# One chat exchange: {{MSG1}} and {{MSG2}} are placeholders that the app
# substitutes with the two messages of a question/answer pair.
bot_template = '''
<div class="chat-message bot">
    <div class="message">{{MSG1}}<br><br>{{MSG2}}</div>
</div>
'''

Writing htmlTemplate.py


In [None]:
%%writefile app.py
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.embeddings import  HuggingFaceInstructEmbeddings , HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplate import css, bot_template
from langchain.llms import HuggingFaceHub
from langchain.chat_models import ChatOpenAI
from configparser import ConfigParser
import docx2txt


# Load the OpenAI API key from the [authorization] section of gptConfig.ini
# (resolved relative to the current working directory).
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing.
if not config.read('gptConfig.ini'):
    raise FileNotFoundError(
        "gptConfig.ini was not found; create it with an [authorization] "
        "section containing an api_key entry."
    )
api_key = config['authorization']['api_key']

def get_text_chunks(text):
    """Split ``text`` into pieces of at most 2200 characters with no overlap.

    Args:
        text: The string to split.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    # Chunk length is measured in characters (len); consecutive chunks do not overlap.
    splitter_options = dict(chunk_size=2200, chunk_overlap=0, length_function=len)
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)



def get_pdf_text(pdf_docs):
    """Extract text chunks from the uploaded documents.

    Supported extensions (matched case-insensitively): ``pdf``, ``docx``,
    ``doc`` and ``txt``. Files with any other extension are skipped.

    Args:
        pdf_docs: Iterable of uploaded file objects. Each must expose a
            ``name`` attribute; PDF and Word files are handed to the parser
            as file-like objects, text files are read via ``getvalue()``.

    Returns:
        A list of strings: one entry per non-empty PDF page, and the
        ``get_text_chunks`` pieces for Word and text files.
    """
    text_chunks = []
    for pdf in pdf_docs:
        # Lower-case the extension so "REPORT.PDF" is treated like "report.pdf".
        extension = pdf.name.rsplit('.', 1)[-1].lower()

        if extension == "pdf":
            for page in PdfReader(pdf).pages:
                page_text = page.extract_text()
                # Pages without extractable text would only add blank entries
                # to the index, so they are skipped.
                if page_text and page_text.strip():
                    text_chunks.append(page_text)
        elif extension in ("docx", "doc"):
            # NOTE(review): docx2txt is a .docx parser; legacy binary .doc files
            # presumably fail here - confirm whether "doc" should stay accepted.
            text_chunks.extend(get_text_chunks(docx2txt.process(pdf)))
        elif extension == "txt":
            # Read the bytes from the uploaded buffer itself. Opening pdf.name
            # as a path only works if a same-named file happens to exist in the
            # server's working directory.
            data = pdf.getvalue().decode("utf-8")
            text_chunks.extend(get_text_chunks(data))

    return text_chunks





def get_vectorstore(text_chunks):
    """Embed ``text_chunks`` and index them in a FAISS vector store.

    Args:
        text_chunks: List of strings to embed.

    Returns:
        The FAISS store built from the texts with the
        ``hkunlp/instructor-xl`` instructor embeddings.
    """
    # Alternative embedding models that were tried and left disabled:
    #   HuggingFaceEmbeddings(model_name="thenlper/gte-large")
    #   HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en")
    instructor_embedder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=instructor_embedder)


def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over ``vectorstore``.

    Args:
        vectorstore: Store whose ``as_retriever()`` supplies the retriever.

    Returns:
        A ``ConversationalRetrievalChain`` using gpt-3.5-turbo and a buffer
        memory stored under the ``chat_history`` key.
    """
    chat_llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-3.5-turbo")
    # return_messages=True keeps the history as message objects rather than one string.
    chat_memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    doc_retriever = vectorstore.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        retriever=doc_retriever,
        memory=chat_memory,
    )

def handle_userinput(user_question):
    """Send a question to the conversation chain and render the chat history.

    The chain stored in ``st.session_state.conversation`` is called with the
    question; the ``chat_history`` it returns is saved in the session state and
    rendered newest exchange first, two messages per ``bot_template`` block.

    Args:
        user_question: The text the user typed into the question box.
    """
    from html import escape

    # The chain only exists after documents have been processed; calling None
    # would raise a TypeError.
    if st.session_state.conversation is None:
        st.warning("Please upload your documents and click 'Process' before asking a question.")
        return

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    history = st.session_state.chat_history
    # Walk backwards over (history[i-1], history[i]) pairs. Stopping at i == 1
    # keeps i-1 from wrapping around to the last element on an odd-length history.
    for i in range(len(history) - 1, 0, -2):
        # Both messages are rendered as raw HTML, so escape them to stop
        # user- or model-supplied markup from being injected into the page.
        first_html = escape(history[i - 1].content)
        second_html = escape(history[i].content)
        st.write(
            bot_template.replace("{{MSG1}}", first_html).replace("{{MSG2}}", second_html),
            unsafe_allow_html=True,
        )



def main():
    """Render the Streamlit page: question box, chat output and upload sidebar.

    Session state holds two entries, ``conversation`` and ``chat_history``,
    both initialised to None. Clicking "Process" in the sidebar extracts text
    from the uploaded files, builds the vector store and stores a new
    conversation chain in ``st.session_state.conversation``.
    """
    st.set_page_config(page_title="Chat with Your Document",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with Your Document :books:")
    user_question = st.text_input("Ask a question about your documents:", key="input")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your Document here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not pdf_docs:
                # Nothing uploaded: building an index from zero texts would fail.
                st.warning("Please upload at least one document before clicking 'Process'.")
            else:
                with st.spinner("Processing"):
                    # Extract the text chunks from every uploaded file.
                    text_chunks = get_pdf_text(pdf_docs)

                    if not text_chunks:
                        # Unsupported file types or documents with no extractable text.
                        st.error("No readable text was found in the uploaded documents.")
                    else:
                        # Create the vector store.
                        vectorstore = get_vectorstore(text_chunks)

                        # Create the conversation chain.
                        st.session_state.conversation = get_conversation_chain(vectorstore)

                        st.write("Process Completed")


# Build the page only when this file is executed as the main script.
if __name__ == "__main__":
    main()

Writing app.py


In [None]:
# Install the localtunnel npm package into the current directory.
!npm install localtunnel

[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[34;40mnotice[0m[35m[0m created a lockfile as package-lock.json. You should commit this file.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No license field.
[0m
+ localtunnel@2.0.2
added 22 packages from 22 contributors and audited 22 packages in 1.943s

3 packages are looking for funding
  run `npm fund` for details

found [92m0[0m vulnerabilities

[K[?25h

In [None]:
# Start the Streamlit app in the background; stdout and stderr go to /content/logs.txt.
!streamlit run /content/app.py &>/content/logs.txt &

In [None]:
# Open a localtunnel for local port 8501; the tunnel URL is printed below.
# NOTE(review): presumably 8501 is the port the Streamlit app listens on - confirm.
! npx localtunnel --port 8501

[K[?25hnpx: installed 22 in 2.166s
your url is: https://sweet-papers-follow.loca.lt
^C
