In [1]:
pip install --quiet streamlit langchain_community sentence_transformers pypdf tiktoken chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [2]:
import streamlit as st
import os
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import torch

  from tqdm.autonotebook import tqdm, trange


In [3]:
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()
    return data

In [4]:
def chunk_data(data, chunk_size=256, chunk_overlap=20):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(data)
    return chunks

In [11]:
def create_embeddings(chunks):
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = Chroma.from_documents(chunks, embeddings)

    # if you want to use a specific directory for chromadb
    # vector_store = Chroma.from_documents(chunks, embeddings, persist_directory='./mychroma_db')
    return vector_store

In [21]:

def ask_and_get_answer(vector_store, q, k=1):
    # Assuming vector_store and relevant imports are handled elsewhere

    # Set up retriever and fetch relevant documents
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
    documents = retriever.get_relevant_documents(q)

    # Prepare context from relevant documents
    context = " ".join([doc.page_content for doc in documents])

    model_name = "deepset/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

    # Perform question answering on the context
    result = qa_pipeline(question=q, context=context)

    return result['answer']

In [15]:
def calculate_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # print(f'Total Tokens: {total_tokens}')
    # print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.0004:.6f}')
    return total_tokens, total_tokens / 1000 * 0.0004

In [8]:
def clear_history():
    global history
    history = ''

In [24]:
# File upload simulation
from ipywidgets import FileUpload, Textarea, Text, IntSlider, Button, VBox, HBox, Output
from IPython.display import display

vector_store = None
history = ''

# Widgets setup
upload = FileUpload(accept='.pdf, .docx, .txt', multiple=True)
chunk_size_input = IntSlider(value=512, min=100, max=2048, description='Chunk size:')
add_data_button = Button(description='Add Data')

output = Output()
history_output = Output()

def combine_pdf_files(input_files, output_file):
    with open(output_file, 'wb') as output_pdf:
        for input_file in input_files:
            with open(input_file, 'rb') as input_pdf:
                output_pdf.write(input_pdf.read())
                # Optionally, add a newline or separator between files
                output_pdf.write(b"\n\n")

def handle_upload(change):
    global vector_store

    if upload.value:
        combined_file = 'combined.pdf'
        combine_pdf_files(upload.value, combined_file)
        uploaded_filename = combined_file

        data = load_document(uploaded_filename)
        chunks = chunk_data(data, chunk_size=chunk_size_input.value)
        tokens, embedding_cost = calculate_embedding_cost(chunks)

        output.clear_output()
        with output:
            print(f'Chunk size: {chunk_size_input.value}, Chunks: {len(chunks)}')
            print(f'Embedding cost: ${embedding_cost:.4f}')

        vector_store = create_embeddings(chunks)
        with output:
            print('File uploaded, chunked, and embedded successfully.')

add_data_button.on_click(handle_upload)

question_input = Text(description='Question:')
submit_button = Button(description='Submit Question')
answer_output = Textarea(description='LLM Answer:', disabled=True, layout={'width': 'auto', 'height': '100px'})

def handle_question(change):
    global history

    if question_input.value and vector_store:
        answer = ask_and_get_answer(vector_store, question_input.value)
        answer_output.value = answer

        value = f'Q: {question_input.value} \nA: {answer}'
        history = f'{value} \n {"-" * 100} \n {history}'
        with history_output:
            history_output.clear_output()
            print(history)

submit_button.on_click(handle_question)

clear_history_button = Button(description='Clear History')
clear_history_button.on_click(lambda _: clear_history())

display(VBox([HBox([upload, chunk_size_input]), add_data_button, output, question_input, submit_button, answer_output, clear_history_button, history_output]))

VBox(children=(HBox(children=(FileUpload(value={}, accept='.pdf, .docx, .txt', description='Upload', multiple=…



Loading combined.pdf
