In [None]:
#import libraries

import openai
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI

In [None]:
from google.colab import userdata
from getpass import getpass
import os

In [None]:
def read_doc(directory):
  """Load the documents found in ``directory``.

  Args:
    directory: Path handed to ``PyPDFDirectoryLoader``.

  Returns:
    Whatever ``PyPDFDirectoryLoader(directory).load()`` returns.
  """
  return PyPDFDirectoryLoader(directory).load()

In [None]:
# Load the PDFs found under /content/ into `doc`; later cells read this name.
doc = read_doc("/content/")
# Bare last expression: the notebook renders the full list of loaded Document
# objects (the saved output shows one entry per PDF page).
doc

[Document(metadata={'source': '/content/constitution .pdf', 'page': 0}, page_content=' \nNATIONAL  CONSTITUTION  CENTER   \n   \n \n \n \n \n  \n \nTHE  \nCONSTITUTION  \nof the United  States \n \n \n \n \n \n  \n \n  \n \n   \n \n  \n \n  \n \n  \n \n  \n \n  \n \n   \n '),
 Document(metadata={'source': '/content/constitution .pdf', 'page': 1}, page_content='C O N S T I T U T I O N O F T H E U N I T E D S T A T E S   \n \n \n \nWe the People of the United States, in Order to form a \nmore perfect Union, establish Justice, insure domestic \nTranquility, provide for the common defence, promote \nthe general  Welfare, and secure the Blessings of Liberty to \nourselves  and our Posterity,  do ordain  and establish  this \nConstitution for the United States of America  \n \n \nArticle.   I. \nSECTION.  1 \nAll legislative Powers herein granted shall be vested in a \nCongress of the United States, which shall consist of a Sen-  \nate and House of Representatives. \nSECTI ON. 2 \nThe House 

In [None]:
#Divide the docs into chunks

def chunk_data(docs, chunk_size=1000, chunk_overlap=20):
  """Split documents into smaller chunks.

  Args:
    docs: Documents to split; passed unchanged to
      ``RecursiveCharacterTextSplitter.split_documents``.
    chunk_size: Forwarded to the splitter as ``chunk_size``.
    chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

  Returns:
    The chunks produced by the splitter.
  """
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
  )
  # Return the splitter's output rather than the input `docs`; returning the
  # input would hand callers the unsplit documents.
  return text_splitter.split_documents(docs)

In [None]:
# Pass the loaded pages through chunk_data with its default chunk_size and
# chunk_overlap; `documents` is what gets indexed in the vector store later.
documents = chunk_data(docs=doc)
# Bare last expression so the notebook displays the result.
documents

[Document(metadata={'source': '/content/constitution .pdf', 'page': 0}, page_content=' \nNATIONAL  CONSTITUTION  CENTER   \n   \n \n \n \n \n  \n \nTHE  \nCONSTITUTION  \nof the United  States \n \n \n \n \n \n  \n \n  \n \n   \n \n  \n \n  \n \n  \n \n  \n \n  \n \n   \n '),
 Document(metadata={'source': '/content/constitution .pdf', 'page': 1}, page_content='C O N S T I T U T I O N O F T H E U N I T E D S T A T E S   \n \n \n \nWe the People of the United States, in Order to form a \nmore perfect Union, establish Justice, insure domestic \nTranquility, provide for the common defence, promote \nthe general  Welfare, and secure the Blessings of Liberty to \nourselves  and our Posterity,  do ordain  and establish  this \nConstitution for the United States of America  \n \n \nArticle.   I. \nSECTION.  1 \nAll legislative Powers herein granted shall be vested in a \nCongress of the United States, which shall consist of a Sen-  \nate and House of Representatives. \nSECTI ON. 2 \nThe House 

In [None]:
#Embedding initialization

# Copy the OpenAI key from the Colab `userdata` store into the environment.
# NOTE(review): OpenAIEmbeddings() is built with no arguments, so it presumably
# reads OPENAI_API_KEY from the environment — confirm against langchain_openai docs.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

embeddings = OpenAIEmbeddings()


In [None]:
# Embed one sample sentence and show the length of the returned vector.
# The saved output (1536) matches the `dimension` used when the Pinecone index
# is created further down.
vectors = embeddings.embed_query("How's the weather today?")
len(vectors)

1536

In [None]:
#Pinecone DB vector search
# NOTE: this import rebinds the name `Pinecone`, replacing the
# langchain.vectorstores class of the same name imported in the first cell.
from pinecone import Pinecone, ServerlessSpec


# The Pinecone key is read from the Colab secret named "DB_API_KEY" and exported
# as PINECONE_API_KEY.
# NOTE(review): Pinecone() is called with no arguments, so it presumably picks
# the key up from that environment variable — confirm against the client docs.
os.environ["PINECONE_API_KEY"] = userdata.get("DB_API_KEY")

pc = Pinecone()
index_name = "vector-index"

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second creation.
if not pc.has_index(index_name):
  pc.create_index(
      name=index_name,
      dimension=1536,  # same length as the sample embedding checked earlier
      metric="cosine",
      spec=ServerlessSpec(
          cloud='aws',
          region='us-east-1'
      )
  )



In [None]:
from langchain_pinecone import PineconeVectorStore

# Build the vector store from `documents` and `embeddings`, targeting the
# Pinecone index named by `index_name`.
# NOTE(review): this presumably embeds and uploads every document each time the
# cell runs, so re-running may add duplicate vectors — confirm.
vectorstore = PineconeVectorStore.from_documents(documents, embeddings, index_name=index_name)

In [None]:
#Similarity search: return the top-k matches from the vector store (the index metric is cosine)

def retrieve_query(query, k=3):
  """Run a similarity search against the module-level ``vectorstore``.

  Args:
    query: Query passed to ``vectorstore.similarity_search``.
    k: Forwarded as the ``k`` keyword argument (default 3).

  Returns:
    The value returned by ``vectorstore.similarity_search``.
  """
  return vectorstore.similarity_search(query, k=k)

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI

In [None]:
# `OpenAI` here is the langchain_openai class imported in the previous cell
# (it rebinds the langchain.llms name imported in the first cell).
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.5)

# Question-answering chain used by retrieve_answers.
# NOTE(review): chain_type="stuff" presumably places all retrieved documents
# into a single prompt — confirm against the LangChain docs.
chain = load_qa_chain(llm, chain_type="stuff")

In [53]:
def retrieve_answers(query):
  """Answer ``query`` from the documents retrieved for it.

  Looks up matching documents with ``retrieve_query`` and feeds them, together
  with the question, to the module-level QA ``chain``.

  Args:
    query: The question; used both for retrieval and as the chain's
      ``question`` input.

  Returns:
    The chain's ``output_text`` value (the answer).
  """
  doc_search = retrieve_query(query)
  # Chain.run(**kwargs) is deprecated; invoke() takes the same inputs as a dict
  # and returns a dict whose "output_text" entry is what run() used to return.
  result = chain.invoke({"input_documents": doc_search, "question": query})
  return result["output_text"]

In [54]:
# Example question run end to end: retrieval followed by the QA chain.
my_query = "How shall the House of Representatives be composed?"
answer = retrieve_answers(my_query)
print(answer)

 The House of Representatives shall be composed of Members chosen every second Year by the People of the several States, and the Electors in each State shall have the Qualifications requisite for Electors of the most numerous Branch of the State Legislature.
