In [None]:
!pip install tiktoken
!pip install faiss-cpu
!pip install --upgrade openai
!pip install PyPDF2
!pip install openai
#!pip install PyPDF2
!pip install langchain
!pip install PyPDF2
!pip install tabula-py

In [None]:
#NEW CODE with history
import os
import pickle
from PyPDF2 import PdfReader
import tabula
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback

def setup_openai_api_key():
    """Prompt for the OpenAI API key and export it as ``OPENAI_API_KEY``.

    The key is read with ``getpass`` so it is not echoed to the terminal or
    stored in notebook cell output. Surrounding whitespace (a common
    copy/paste artifact) is removed. If the user enters nothing and
    ``OPENAI_API_KEY`` is already set in the environment, the existing value
    is kept.

    Raises:
        ValueError: If no key is entered and none is already configured.
    """
    # Local import keeps this function self-contained; call through the
    # module attribute so environments that replace getpass.getpass still work.
    import getpass

    api_key = getpass.getpass("Enter your OpenAI API key: ").strip()
    if not api_key:
        if os.environ.get("OPENAI_API_KEY"):
            # Blank entry: keep the key that is already configured.
            return
        raise ValueError("An OpenAI API key is required.")
    os.environ["OPENAI_API_KEY"] = api_key

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        A single string with the text of all pages joined without a
        separator. Pages that yield no text contribute nothing.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        # Collect per-page text and join once instead of growing a string
        # with += in a loop. The `or ""` guards against a page returning
        # None (e.g. image-only pages in some PyPDF2 versions -- TODO confirm),
        # which would otherwise raise TypeError during concatenation.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    return "".join(page_texts)


def extract_table_info(pdf_path):
    """Extract every table in a PDF and render each one as plain text.

    Tables are read with tabula-py from all pages. Each table becomes one
    string: one line per row, with the row's cell values joined by ", ".
    Column headers are not included in the rendered text.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        A list of strings, one per table that has at least one non-blank
        row. Tables that render to empty text are skipped so they are not
        passed on for embedding as empty strings.
    """
    tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
    table_info = []

    for table in tables:
        # One dict per row, keyed by column name; only the values are kept.
        rows = table.to_dict(orient="records")
        table_text = "\n".join(
            ", ".join(str(val) for val in row.values()) for row in rows
        )
        # A header-only table has no rows and renders to "", which is
        # useless as a search document.
        if table_text.strip():
            table_info.append(table_text)

    return table_info

def _build_text_chunks(pdf_path):
    """Return the PDF's text split into overlapping chunks for embedding."""
    text = extract_text_from_pdf(pdf_path)

    # Drop every non-ASCII character; this is lossy for non-English text but
    # matches how previously cached embeddings were produced.
    text = text.encode("ascii", "ignore").decode()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_text(text=text)


def _load_or_build_store(store_name, make_texts, label):
    """Load a pickled FAISS store from ``<store_name>.pkl`` or build and cache it.

    ``make_texts`` is a zero-argument callable that is only invoked when no
    cache file exists, so the PDF is not re-parsed unnecessarily. Returns
    None when there is nothing to embed.
    """
    cache_path = f"{store_name}.pkl"
    if os.path.exists(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code. Only run
        # this against cache files that this script itself wrote.
        with open(cache_path, "rb") as f:
            store = pickle.load(f)
        print(f"{label} embeddings loaded from disk.")
        return store

    texts = make_texts()
    if not texts:
        # FAISS.from_texts cannot build an index from an empty list.
        print(f"No {label.lower()} content found in the PDF; skipping.")
        return None

    store = FAISS.from_texts(texts, embedding=OpenAIEmbeddings())
    with open(cache_path, "wb") as f:
        pickle.dump(store, f)
    print(f"{label} embeddings created and saved to disk.")
    return store


def main():
    """Run an interactive question-answering session over a PDF.

    Prompts for a PDF path, builds (or loads from ``<name>_text.pkl`` and
    ``<name>_tables.pkl`` in the current directory) FAISS vector stores for
    the PDF's text chunks and tables, then answers questions in a loop until
    the user types ``exit``. For each question the three closest documents
    from each store are passed to a "stuff" QA chain, and the token count,
    cost and answer are printed.

    The running conversation (questions and answers) is used as the
    similarity-search query so follow-up questions retrieve related context;
    only the current question is sent to the QA chain as the question.
    """
    pdf_path = input("Enter the path to your PDF file: ").strip()

    # splitext handles any extension length, unlike slicing off 4 characters.
    store_prefix = os.path.splitext(os.path.basename(pdf_path))[0]

    text_store = _load_or_build_store(
        f"{store_prefix}_text", lambda: _build_text_chunks(pdf_path), "Text"
    )
    table_store = _load_or_build_store(
        f"{store_prefix}_tables", lambda: extract_table_info(pdf_path), "Table"
    )

    # Text results come first, then table results, as before.
    stores = [store for store in (text_store, table_store) if store is not None]
    if not stores:
        print("Nothing could be extracted from the PDF; exiting.")
        return

    # The LLM and chain do not depend on the query, so build them once.
    # NOTE(review): "gpt-4" is a chat model; confirm the installed langchain
    # OpenAI wrapper routes it correctly, or switch to its chat-model class.
    llm = OpenAI(temperature=0, model_name="gpt-4")
    chain = load_qa_chain(llm=llm, chain_type="stuff")

    # Alternating list of user questions and model answers.
    conversation = []

    while True:
        query = input("Ask a question about the PDF file (or type 'exit' to quit): ")
        if query.strip().lower() == "exit":
            break
        if not query.strip():
            # Nothing to search for or ask; prompt again.
            continue

        conversation.append(query)

        # Search with the whole history so follow-ups keep their context.
        conversation_context = "\n".join(conversation)

        all_docs = []
        for store in stores:
            all_docs.extend(
                store.similarity_search(query=conversation_context, k=3)
            )

        with get_openai_callback() as cb:
            response = chain.run(input_documents=all_docs, question=query)
            tokens = cb.total_tokens
            cost = cb.total_cost

        conversation.append(response)

        print("Cost:", cost)
        print("Number of Tokens:", tokens)
        print(response)

# Script entry point: ask for the API key first so OPENAI_API_KEY is set
# before main() creates embeddings and queries the LLM.
if __name__ == '__main__':
    setup_openai_api_key()
    main()
