In [2]:
import os
import langchain
import openai
import sys
import numpy as np
from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv() into the process
# environment; the boolean return value is deliberately discarded.
_ = load_dotenv(dotenv_path=find_dotenv())

# Indexing os.environ (rather than .get) fails fast with KeyError when the key
# is not configured.
openai_api_key = os.environ["OPENAI_API_KEY"]

In [3]:
# Load a PDF file with a LangChain document loader. Several documents can be
# loaded the same way, one loader per file.

from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("data/Southern Region CP7 Strategic Business Plan.pdf")

# NOTE(review): load_and_split() is documented to run a text splitter over the
# loaded documents, so entries in `pages` may not map one-to-one to PDF pages —
# confirm whether plain loader.load() was intended.
pages = loader.load_and_split()

# Number of Document objects produced by the loader
len(pages)

78

In [None]:
# TODO: add sample code for loading multiple documents.
# Placeholder list — add one entry per file path here.
loaders = []

In [None]:
# Inspect one entry of `pages` (index 7) to check the extracted text and metadata
pages[7]

In [5]:
# Re-split the loaded documents into small, slightly overlapping chunks with
# RecursiveCharacterTextSplitter. No `separators` argument is given, so the
# library defaults apply ("\n\n", "\n", " ", "" — tried in that order).
# Experiment with chunk_size / chunk_overlap and compare the resulting count.
from langchain.text_splitter import RecursiveCharacterTextSplitter

r_spliter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
splits = r_spliter.split_documents(pages)

# Number of chunks produced
len(splits)



457

In [6]:
# Use a Chroma vector store to hold all chunks and persist them to a directory.
# To rebuild from scratch, remove the old database directory first (see the
# commented-out shell command below; keep its path in sync with persist_directory).

from langchain.vectorstores import Chroma
persist_directory = 'data/mysplits_11Jan'
# !rm -rf ./data/mysplits_11Jan

In [8]:
# Use OpenAI embeddings to embed all splits: create an embeddings model object
# here and pass it to the Chroma store when the store is built.
from langchain.embeddings import OpenAIEmbeddings
embeddings_model = OpenAIEmbeddings()

In [9]:


# Embed every split and index it in a Chroma collection.
# persist_directory is passed so the collection is backed by on-disk storage;
# without it the later vectordb.persist() call has nowhere to write (LangChain's
# Chroma wrapper raises ValueError when no directory was given at creation).
# Caution: re-running this cell against an existing directory adds the same
# chunks again — clear the directory first to avoid duplicated search results.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings_model,
    persist_directory=persist_directory,
)

In [9]:
# Sanity check: the number of stored embeddings should equal len(splits).
# NOTE(review): `_collection` is a private attribute of the wrapper and may
# change between library versions.
print(vectordb._collection.count())

457


In [13]:
# Write the collection to disk.
# NOTE(review): LangChain's Chroma wrapper is documented to require that the
# store was created with a persist_directory for this call to succeed — confirm
# the from_documents() call provides one.
vectordb.persist()

In [10]:
# First test query: a factual lookup plus a request for a list.
question = (
    "How many freight terminal are there?, "
    "Please list of first 5 terminals."
)

In [11]:
# Second test query: asks whether a topic is covered by the document at all.
question1 = (
    "Does this document have information on Covid "
    "and its impact on revenue?"
)

In [12]:
# Retrieve the 3 stored chunks most similar to the freight-terminal question
docs = vectordb.similarity_search(question, k=3)

In [13]:
# Retrieve the 5 stored chunks most similar to the Covid/revenue question
docs1 = vectordb.similarity_search(question1, k=5)

In [14]:
# Inspect the third hit for the Covid/revenue question (text and page metadata)
docs1[2]

Document(page_content='and to forecast  how this will develop over the \nduration of this plan through  to 2029.   \nWe have s een an improvement in train service \nperformance as a result of reduced passenger \nnumbers during the Covid period. We have also seen \nthe impact on performance of returning passenger \ncrowding.  \nOverall infrastructure reliability has improved over', metadata={'page': 6, 'source': 'data/Southern Region CP7 Strategic Business Plan.pdf'})

In [15]:
# Inspect the top hit for the freight-terminal question (text and page metadata)
docs[0]

Document(page_content='asset  renewals and improvements in the \nmanagement of external events . \n \nFreight  \nThere are 32 ac tive freight terminals on the Southern \nregion . Colas Rail , DB Cargo , DC Rail , Freightliner , GB \nRailfreight , Rail Operations (UK) Limited , Victa \nRailfreight all operate in the Southern region \ndelivering services for end users such as  Associated', metadata={'page': 12, 'source': 'data/Southern Region CP7 Strategic Business Plan.pdf'})

In [16]:
# Chat model that generates the answers.
# temperature=0 is presumably chosen to keep answers as repeatable as possible.
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model= "gpt-3.5-turbo", temperature=0)

In [17]:
from langchain.chains import RetrievalQA

# Baseline retrieval QA chain: the vector store is exposed as a retriever with
# its default settings, and the chain's built-in prompt is used.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [18]:
# Run the baseline chain on the freight-terminal question
result = qa_chain({"query" : question})

# Display the returned dict (it carries 'query' and 'result' keys)
result

{'query': 'How many freight terminal are there?, Please list of first 5 terminals.',
 'result': "There are 32 active freight terminals on the Southern region. However, I don't have the specific information about the first five terminals."}

In [19]:
# Run the baseline chain on the Covid/revenue question
result1 = qa_chain({"query" : question1})

# Display the returned dict (it carries 'query' and 'result' keys)
result1


{'query': 'Does this document have information on Covid and its impact on revenue?',
 'result': 'Yes, the document mentions that the Covid pandemic had a major impact on industry revenues. It states that revenue has recovered to approximately 85% of pre-Covid levels.'}

In [20]:
# Custom prompt for the QA chain

from langchain.prompts import PromptTemplate
# Build the prompt. The template has two placeholders: {context} and {question}.
# It instructs the model to append the name of the source document, taken from
# the context data, to its answer.

template = """ Use the following pieces of context to answer the question at the end. Add name of the documents at the end. take it from the context data. {context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [26]:
# Second chain: same LLM and retriever construction as qa_chain, but using the
# custom prompt and asking the chain to return the source documents it used
# along with the answer.
qa_chain1 = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [22]:
# New query for the custom-prompt chain.
# Note: this rebinds `question`, replacing the freight-terminal query assigned earlier.
question = "Does this document have any information on COVID impact?"

In [29]:
# Run the custom-prompt chain.
# Note: this rebinds `result1`, replacing the baseline chain's answer stored earlier.
result1 = qa_chain1({"query" : question})

In [30]:
# Answer text produced by the custom-prompt chain
result1["result"]

'Yes, this document provides information on the impact of the Covid pandemic on industry revenues, changes in working patterns, and the recovery of revenue to approximately 85% of pre-Covid levels.'

In [31]:
# Show the chunks that were retrieved as context for the answer.
# Only qa_chain1 was built with return_source_documents=True, so the
# "source_documents" key lives on `result1`. `result` comes from the plain
# qa_chain and carries only 'query' and 'result', so indexing it here raises
# KeyError on a fresh Restart & Run All.
result1["source_documents"]

[Document(page_content='Railway. The Covid pandemic had a major impact on \nindustry revenues. It has also been a catalyst for \nchanges in working patterns. Revenue  has recovered \nto approximately 85% of pre -Covid levels but this \nhas been complicated by Industrial Relations \nchallenges. Considerable uncertainty  remains which \nmakes it  difficult to define  the post -Covid base case ,', metadata={'page': 6, 'source': 'data/Southern Region CP7 Strategic Business Plan.pdf'}),
 Document(page_content='and to forecast  how this will develop over the \nduration of this plan through  to 2029.   \nWe have s een an improvement in train service \nperformance as a result of reduced passenger \nnumbers during the Covid period. We have also seen \nthe impact on performance of returning passenger \ncrowding.  \nOverall infrastructure reliability has improved over', metadata={'page': 6, 'source': 'data/Southern Region CP7 Strategic Business Plan.pdf'}),
 Document(page_content='the pandemic no