In [1]:
import os
import sys
from dotenv import load_dotenv
from pathlib import Path
import time
from tqdm import tqdm
import math

from langchain.document_loaders import TextLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain import hub
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import textwrap

# from langchain.indexes import VectorstoreIndexCreator
# from langchain.chat_models import ChatOpenAI

In [2]:
# dotenv_path = Path('../.env')
# load_dotenv(dotenv_path=dotenv_path)

load_dotenv()

True

In [3]:
vectorstore = Chroma(embedding_function=OpenAIEmbeddings(), persist_directory='../vectorstore/crescent')

In [4]:
# loader = TextLoader('../data/txt_samples/cbc_news.txt')

loader = DirectoryLoader('../data/crescent_docs/', glob="**/*.pdf", loader_cls=PyPDFLoader, show_progress=True)

docs = loader.load()

100%|██████████| 7/7 [00:35<00:00,  5.07s/it]


In [5]:
chunk_size = 1000
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=200, add_start_index=True)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=200, add_start_index=True)
splits = text_splitter.split_documents(docs)
print(len(splits))

batch_size = int((150000 / 3 - 0.001) // chunk_size)
batch_num = int(math.ceil((len(splits) / batch_size)))
print(batch_size)
print(batch_num)
batches = []
for i in range(batch_num):
    batch = splits[batch_size*i:batch_size*(i+1)]

    batches += [batch]

409
49
9


In [6]:
n = 0
for batch in tqdm(batches):
    t1 = time.time()
    vectorstore.add_documents(documents=batch, embedding=OpenAIEmbeddings(), persist_directory='../vectorstore/crescent')

    t2 = time.time()
    if 22 - (t2-t1) > 0:
        time.sleep(22 - (t2-t1))

100%|██████████| 9/9 [03:18<00:00, 22.01s/it]


In [4]:
# retriever = vectorstore.as_retriever()
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
# prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.0)

In [5]:
template = """Answer the question based on the following context.

{context}

Question: {question}

If there is no information in the context, think rationally and provide an answer based on your own knowledge.
"""

prompt = ChatPromptTemplate.from_template(template)

In [6]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain_answer = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_sources = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_answer)

In [10]:
def format_response(resp):
    span_start = resp["context"][0].metadata["start_index"]
    span_end = span_start + len(resp["context"][0].page_content)
    
    # s = f"""{textwrap.fill(resp['answer'], width=130)}
    s = f"""{resp['answer']}

    
Source: {resp['context'][0].metadata['source']}

Page: {resp['context'][0].metadata['page']}, span: {span_start}--{span_end}
"""
    return s

In [8]:
resp = rag_chain_with_sources.invoke("What is the contact information for Crescent Point?")
# resp = rag_chain_with_sources.invoke("What are growth prospects for crescent point in the next years?")
# resp = rag_chain_with_sources.invoke("What revenue is declared in the last quarter statement?")
resp

{'context': [Document(page_content='FOR MORE INFORMATION ON CRESCENT POINT ENERGY, PLEASE CONTACT:  \nShant Madian,  Vice President,  Capital Markets, or  \nSarfraz Somani , Manager , Investor Relati ons \nTelephone: (403) 693 -0020 Toll -free (US and Canada): 888 -693-0020  Fax: (403) 693 -0070  \nAddress: Crescent Point Energy Corp. Suite 2000, 585 - 8th Avenue S.W. Calgary AB  T2P 1G1  \nwww.crescentpointenergy.com  \nCrescent Point shares are traded on the Toronto Stock Exchange and New York Stock Exchange under the symbol CPG .', metadata={'page': 4, 'source': '..\\data\\crescent_docs\\crescent_point_announces_disposition_of_its_north_dakota_assets_-_august_24_2023.pdf', 'start_index': 3}),
  Document(page_content='Crescent Point and  the Offering.  \nCopies of the Prospectus and the Prospectus Supplement are available free of charge on the SEC website (http://www.sec.gov). \nAlternatively, copies may be obtained upon request in Canada by contacting BMO Nesbitt Burns Inc. (“BMO Ca

In [11]:
print(format_response(resp))

The contact information for Crescent Point Energy is as follows:

- Shant Madian, Vice President, Capital Markets
- Sarfraz Somani, Manager, Investor Relations
- Telephone: (403) 693-0020
- Toll-free (US and Canada): 888-693-0020
- Fax: (403) 693-0070
- Address: Crescent Point Energy Corp. Suite 2000, 585 - 8th Avenue S.W. Calgary AB T2P 1G1

    
Source: ..\data\crescent_docs\crescent_point_announces_disposition_of_its_north_dakota_assets_-_august_24_2023.pdf

Page: 4, span: 3--499



UsageError: Line magic function `%ai` not found.
