In [None]:
%pip install langchain
%pip install openai
%pip install PyPDF2
%pip install faiss-cpu

In [2]:
import os
import openai
import pinecone
from rate_limiter import Api
from uuid import uuid4
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

In [3]:
# Get your API keys from openai, you will need to create an account. 
load_dotenv()
openai.organization = os.getenv("org_openai")
openai.api_key = os.getenv("key_openai")

pinecone.init(api_key=os.getenv("key_pinecone"), environment=os.getenv("env_pinecone"))
vdb = pinecone.Index(os.getenv("idx_pinecone"))

In [8]:
# location of the pdf file/files.
locations = [
  'C:/Users/shaj6/Documents/Programming/repositories/chairGPT/assets/2022 Annual Report.pdf',
  'C:/Users/shaj6/Documents/Programming/repositories/chairGPT/assets/2021 Annual Report.pdf',
  'C:/Users/shaj6/Documents/Programming/repositories/chairGPT/assets/2021-lululemon-impact-report-03-09-22.pdf',
  'C:/Users/shaj6/Documents/Programming/repositories/chairGPT/assets/code-of-conduct-november-2021-english.pdf'
]
readers = []
for loc in locations:
    readers.append(PdfReader(loc))

In [9]:
readers

[<PyPDF2._reader.PdfReader at 0x14b73a6e710>,
 <PyPDF2._reader.PdfReader at 0x14b70e23f10>,
 <PyPDF2._reader.PdfReader at 0x14b60586350>,
 <PyPDF2._reader.PdfReader at 0x14b73a6fbe0>]

In [10]:
# read data from the file and put them into a variable called raw_text
files = []
for idx, reader in enumerate(readers):
    raw_text = ''
    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        if text:
            raw_text += text
    obj = {'raw_text':raw_text, 'file_location':locations[idx]}
    files.append(obj)

In [11]:
print(len(files[0]['raw_text']))

97262


In [26]:
# Chunk input text
text_splitter = CharacterTextSplitter(      
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
)
for file in files:
    raw_text = file['raw_text']
    file['chunks'] = text_splitter.split_text(raw_text)

In [27]:
chunk_count = 0
char_count = 0
for file in files:
  chunk_count += len(file['chunks'])
  for chunk in file['chunks']:
    char_count += len(chunk)
print(chunk_count)
print(char_count / chunk_count)

338
951.0118343195267


In [14]:
# Convert to embeddings using OpenAI

# OpenAI embedding rate limit with 50% buffer
rpm_limit_openai = 3000 * 0.75
rpm_limit_pinecone = 3000 * 0.75
openai_request = Api(average_rate_limit=rpm_limit_openai, max_retries=5)
pinecone_request = Api(average_rate_limit=rpm_limit_pinecone, max_retries=3)
for file in files:
   doc_id = str(uuid4())
   payload = []
   for i, chunk in enumerate(file['chunks']):
      # create vector id
      vec_id = str(uuid4())

      # get vector representation of text chunk
      chunk = chunk.encode(encoding='ASCII',errors='ignore').decode()  # fix any UNICODE errors
      response = openai_request.send_request(openai.Embedding.create,input=chunk,engine='text-embedding-ada-002')
      vector_value = response['data'][0]['embedding']  # this is a normal list

      # vector metadata as dictionary
      metadata = {
         'document_id':doc_id,
         'file_location': file['file_location'],
         'chunk_index': i
      }

      # create and append vector obj to the payload
      vector_obj = (vec_id, vector_value, metadata)
      payload.append(vector_obj)

      # status update
      if (i % 50 == 0):
         print(f"File {doc_id}, chunk {i}")
   
   # push batched payload to pinecone ensuring payload contains less than 80 vectors (abide by 2MB Pinecone limit)
   pinecone_request.send_payload(vdb.upsert, payload, payload_length_limit=80)
   print("submitted pinecone")

File 9936b7ef-1470-4138-87ee-7de1557ea66e, chunk 0
File 9936b7ef-1470-4138-87ee-7de1557ea66e, chunk 50
File 9936b7ef-1470-4138-87ee-7de1557ea66e, chunk 100
submitted pinecone
File 72ce801e-9ecb-4e3d-96ff-de93dacf5ce8, chunk 0
File 72ce801e-9ecb-4e3d-96ff-de93dacf5ce8, chunk 50
File 72ce801e-9ecb-4e3d-96ff-de93dacf5ce8, chunk 100
submitted pinecone
File 30fd37c3-3f0f-41ef-8428-dbc1573801c3, chunk 0
File 30fd37c3-3f0f-41ef-8428-dbc1573801c3, chunk 50
submitted pinecone
File b455d3bd-5f60-4b61-8fc9-e69e2c78a7e0, chunk 0
submitted pinecone


In [None]:
docsearch = FAISS.from_texts(texts, embeddings)

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [None]:
query = "who are the authors of the article?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt and Andriy Mulyar.'

In [None]:
query = "What was the cost of training the GPT4all model?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' $100'

In [None]:
query = "How was the model trained?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The model was trained with LoRA (Hu et al., 2021) on the 437,605 post-processed examples for four epochs. Detailed model hyper-parameters and training code can be found in the associated repository and model training log.'

In [None]:
query = "what was the size of the training dataset?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The final training dataset contains 437,605 prompt-generation pairs.'

In [None]:
query = "How is this different from other models?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' This model is different from other models because it is based on LLaMA, it is licensed only for research purposes, and it is trained on a dataset of post-processed examples. It also has a TSNE visualization of the final training data, and a zoomed-in view to show generations related to personal health and wellness.'

In [None]:
query = "What is Google Bard?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

" I don't know."