# Question Answering

## PDF reader

https://www.impromptubook.com/

In [None]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): versions are unpinned — consider pinning them so the llama_index /
# langchain calls used below keep working. The PDF split cell also imports PyPDF2,
# which is not listed here (presumably installed transitively — confirm).
%pip install langchain openai chromadb tiktoken pypdf llama-index tqdm

In [3]:
import os
import tomli
import openai
# Read the Streamlit secrets file (TOML) that holds the API credentials.
with open('../.streamlit/secrets.toml', 'rb') as secrets_file:
    toml_dict = tomli.load(secrets_file)

# OpenAI key: set it on the openai module and export it through the environment.
openai.api_key = toml_dict['OPEN_AI_KEY']
os.environ['OPENAI_API_KEY'] = toml_dict['OPEN_AI_KEY']
# os.environ['PINECONE_API_KEY'] = toml_dict['PINECONE_API_KEY']
# os.environ['PINECONE_API_ENV'] = toml_dict['PINECONE_API_ENV']

# AWS credentials live in the [aws] table; export both under the same (lowercase) names.
for aws_key in ('aws_access_key_id', 'aws_secret_access_key'):
    os.environ[aws_key] = toml_dict['aws'][aws_key]

### PDF split

In [2]:
# extract the text of the PDF, one string per page
import pypdf

def pdf_to_pages(file):
    """Extract the text of every page of a PDF file.

    Args:
        file: passed unchanged to ``pypdf.PdfReader``.

    Returns:
        A list holding one ``extract_text()`` result per page, in page order.
    """
    reader = pypdf.PdfReader(file)
    return [page.extract_text() for page in reader.pages]

In [3]:
# Path of the full book PDF, relative to the notebook; the page-splitting cell below reuses `file`.
file = "../book/impromptu-rh.pdf"
# Display the per-page text list returned by pdf_to_pages.
pdf_to_pages(file)

['',
 'Impromptu\nAmplifying Our Humanity \nThrough AI\nBy Reid Hoffman  \nwith GPT-4',
 'Impromptu: AmplIfyIng our HumAnIty tHrougH AI  \nby Reid Hoffman with GPT-4\nISBNs:  979-8-9878319-1-5 Trade Paperback  \n979-8-9878319-2-2 Hardcover \n979-8-9878319-0-8 Ebook\nCopyright 2023 Dallepedia LLC\nPublished by Dallepedia LLC. All rights reserved. No portion \nof this work may be reproduced in any form, with the limited \nexception of brief quotations in editorial reviews, without express \nwritten permission from the publisher.',
 '“The analytical engine weaves algebraic patterns, like \nthe loom weaves flowers and leaves. Artificial intel-\nligence can embroider this fabric of logic with the \ncolours of imagination and creativity.”\n—ADA LOVELACE, AS IMAGINED BY GPT-4\n“Artificial intelligence is not a separate entity from us, \nbut a reflection of our own mind. By cultivating it with \nskillful means and ethical values, we can enhance our \nown enlightenment and benefit all beings.”\

In [13]:
import os
from PyPDF2 import PdfReader, PdfWriter 

# Write each page of the book to its own single-page PDF in ../book/impromptu/.
# The output folder must exist before the per-page files can be opened for writing.
os.makedirs('../book/impromptu', exist_ok=True)

# Keep the source PDF open only while its pages are being copied, so the handle
# is closed afterwards (a bare open(...) passed to PdfReader is never closed).
with open(file, 'rb') as input_file:
    input_pdf = PdfReader(input_file)

    # Split the PDF file into multiple files
    for page_num, page in enumerate(input_pdf.pages):
        # Create a new PDF file for each page
        output = PdfWriter()
        output.add_page(page)

        # Save the new PDF file (file names are 1-based: impromptu-1.pdf, impromptu-2.pdf, ...)
        output_filename = '../book/impromptu/impromptu-%s.pdf' % (page_num + 1)
        with open(output_filename, 'wb') as output_file:
            output.write(output_file)

### Document loaders

In [1]:
# load document with langchain
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("../book/impromptu-rh.pdf")
# load_and_split() returns Document objects carrying `page_content` and `metadata` (see output below).
documents = loader.load_and_split()
# NOTE(review): `documents` is rebound by the llama_index loader cells further down;
# later langchain cells that read `documents` depend on which cell ran last.
documents

[Document(page_content='Impromptu\nAmplifying Our Humanity \nThrough AI\nBy Reid Hoffman  \nwith GPT-4', metadata={'source': '../book/impromptu-rh.pdf', 'page': 1}),
 Document(page_content='Impromptu: AmplIfyIng our HumAnIty tHrougH AI  \nby Reid Hoffman with GPT-4\nISBNs:  979-8-9878319-1-5 Trade Paperback  \n979-8-9878319-2-2 Hardcover \n979-8-9878319-0-8 Ebook\nCopyright 2023 Dallepedia LLC\nPublished by Dallepedia LLC. All rights reserved. No portion \nof this work may be reproduced in any form, with the limited \nexception of brief quotations in editorial reviews, without express \nwritten permission from the publisher.', metadata={'source': '../book/impromptu-rh.pdf', 'page': 2}),
 Document(page_content='“The analytical engine weaves algebraic patterns, like \nthe loom weaves flowers and leaves. Artificial intel-\nligence can embroider this fabric of logic with the \ncolours of imagination and creativity.”\n—ADA LOVELACE, AS IMAGINED BY GPT-4\n“Artificial intelligence is not a se

In [52]:
# load single PDF
from llama_index import download_loader
# Look up the loader class by name, then instantiate it.
PDFReader = download_loader("PDFReader")
loader = PDFReader()
# NOTE(review): rebinds `loader` and `documents`; these Document objects expose `text`
# (see output below), unlike the langchain ones loaded earlier.
documents = loader.load_data("../book/impromptu-rh.pdf")
documents

[Document(text='\nImpromptu\nAmplifying Our Humanity \nThrough AI\nBy Reid Hoffman  \nwith GPT-4\nImpromptu: AmplIfyIng our HumAnIty tHrougH AI  \nby Reid Hoffman with GPT-4\nISBNs:  979-8-9878319-1-5 Trade Paperback  \n979-8-9878319-2-2 Hardcover \n979-8-9878319-0-8 Ebook\nCopyright 2023 Dallepedia LLC\nPublished by Dallepedia LLC. All rights reserved. No portion \nof this work may be reproduced in any form, with the limited \nexception of brief quotations in editorial reviews, without express \nwritten permission from the publisher.\n“The analytical engine weaves algebraic patterns, like \nthe loom weaves flowers and leaves. Artificial intel-\nligence can embroider this fabric of logic with the \ncolours of imagination and creativity.”\n—ADA LOVELACE, AS IMAGINED BY GPT-4\n“Artificial intelligence is not a separate entity from us, \nbut a reflection of our own mind. By cultivating it with \nskillful means and ethical values, we can enhance our \nown enlightenment and benefit all bein

In [54]:
# load folder
from llama_index import SimpleDirectoryReader
# Read the folder that the page-splitting cell wrote the single-page PDFs into.
# The first Document in the output below has empty text.
documents = SimpleDirectoryReader('../book/impromptu').load_data()
documents

[Document(text='', doc_id='d004c8ca-3750-4141-8fcc-aae0f96f23a1', embedding=None, doc_hash='dc937b59892604f5a86ac96936cd7ff09e25f18ae6b758e8014a24c7fa039e91', extra_info=None),
 Document(text='3Introduction: Moments of Enlightenment\ninspector needs to document or report the change for \ncompliance purposes. \n- A joke answer: The number of restaurant inspectors \nneeded to change a light bulb is four. One to hold the \nladder, one to unscrew the old bulb, one to screw in the \nnew bulb, and one to write a citation for using the wrong \nwattage.\nConsider all that’s going on here. First, while GPT-4 appar-\nently “assumes” that I may be asking it to complete a joke, it \nalso wants to cover all the bases—thus, the initial “factual” \nanswer. And while this answer isn’t actually factual—a restau-\nrant inspector’s job is to inspect, not to perform repairs—it does \nsuggest a fair degree of “knowledge” about the conditions and \nconventions of an inspection process. \nThe joke answer con

### Connect to S3

In [5]:
import boto3
# Build an S3 client from the credentials exported by the configuration cell.
s3_client = boto3.client('s3',aws_access_key_id = os.environ['aws_access_key_id'],
                    aws_secret_access_key = os.environ['aws_secret_access_key'])
s3_bucket = 'book48'
object_name = 'index/index-impromptu.json'
# Download to the path that the "Create index" section loads from
# ('../index/index-impromptu.json'); saving into the working directory left the
# downloaded copy unused by that load.
file_name = '../index/index-impromptu.json'
# download_file writes to a local path, so make sure the target folder exists first.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
s3_client.download_file(s3_bucket, object_name,file_name)

### Create index

In [6]:
from llama_index import LLMPredictor, ServiceContext
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
# Chat model used for direct prompts below and, wrapped, by the llama_index index.
chat = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
# Wrap the chat model in an LLMPredictor and bundle it into the service context
# that is passed to the index loading/building calls below.
llm_predictor = LLMPredictor(llm=chat)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)



In [7]:
# Sanity check: send one human message straight to the chat model and show the reply text.
res = chat([HumanMessage(content='What does impromptu mean?')])
res.content

'Impromptu means done or said without preparation or planning; spontaneous.'

In [8]:
from llama_index import GPTSimpleVectorIndex

# load from disk
# The service context built above is passed so the index uses the configured chat model.
index = GPTSimpleVectorIndex.load_from_disk('../index/index-impromptu.json',service_context=service_context)

In [58]:
# Inspect the first loaded document as a plain dict (its text is empty, see output below).
documents[0].to_dict()

{'text': '',
 'doc_id': 'd004c8ca-3750-4141-8fcc-aae0f96f23a1',
 'embedding': None,
 'doc_hash': 'dc937b59892604f5a86ac96936cd7ff09e25f18ae6b758e8014a24c7fa039e91',
 'extra_info': None}

In [59]:
from llama_index import GPTSimpleVectorIndex

# Build a new vector index from the loaded documents; this rebinds `index`.
# The log below reports embedding token usage only (0 LLM tokens) for this step.
index = GPTSimpleVectorIndex.from_documents(documents,service_context=service_context)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 89246 tokens
INFO:chromadb.db.duckdb:Exiting: Cleaning up .chroma directory


In [60]:
# save to disk
index.save_to_disk('../index/index-impromptu2.json')
# load from disk, passing the service context again (as the earlier load_from_disk
# call does) so the reloaded index keeps using the configured gpt-3.5-turbo predictor
# instead of whatever default the library would construct
index = GPTSimpleVectorIndex.load_from_disk('../index/index-impromptu2.json', service_context=service_context)

In [None]:
# Show the LLM object attached to the index's service context.
index.service_context.llm_predictor.llm

### Query index

In [69]:
# query = 'what is the opinion of the author?'
query = 'what are the opportunities of AI?'
# Ask the vector index; the cells below read the answer text and the source nodes from `response`.
response = index.query(query)

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 534 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 7 tokens


In [70]:
# Answer text returned for the query above.
response.response

'\nThe opportunities of AI for Opportunity@Work are to help employers discover skilled technical workers who have not gone through traditional routes such as college or university. AI can help match employers with workers who have acquired their skills through alternative routes such as community college, workforce training, bootcamps, certificate programs, military service, or learning in frontline jobs. AI can also help employers visualize skills data to better understand the skills of potential employees.'

### Add follow-up questions

In [71]:
# Prompt asking the chat model for three follow-up questions; it embeds the
# original query and the index's answer.
# NOTE(review): the first prompt sentence ("a list of of 3 follow-up questions the
# initial query ...") looks garbled — consider rewording it. The recorded reply below
# also returned `initial_query` and `result` fields besides `follow-up`.
follow_up = f'''
Provide a list of of 3 follow-up questions the initial query and the result associated. 
Here is the initial query:
{query}
Here is the result associated:
{response.response}
Format the output as a JSON file with a list of the 3 follow-up questions in a field called follow-up
'''
res = chat([HumanMessage(content=follow_up)])
# Raw reply text; the next cell parses it as JSON.
jason = res.dict()['content']

In [73]:
import json
# Parse the model's reply; json.loads raises json.JSONDecodeError if the reply is not valid JSON.
json.loads(jason)

{'initial_query': 'what are the opportunities of AI?',
 'result': 'The opportunities of AI for Opportunity@Work are to help employers discover skilled technical workers who have not gone through traditional routes such as college or university. AI can help match employers with workers who have acquired their skills through alternative routes such as community college, workforce training, bootcamps, certificate programs, military service, or learning in frontline jobs. AI can also help employers visualize skills data to better understand the skills of potential employees.',
 'follow-up': ['How does AI specifically match employers with skilled workers?',
  'What are some examples of industries that can benefit from AI matching workers with alternative skill acquisition routes?',
  'How does AI visualize skills data for employers?']}

### Get sources

In [74]:
# One-line summary of the source node(s) behind the answer (doc id + text excerpt, see output).
response.get_formatted_sources()

'> Source (Doc id: 5a4f53d9-5cc8-452b-8ef4-a371b0e4abb1): 148Impromptu: Amplifying Our Humanity Through AI\ndegrees, it has unfortunately become far too com...'

In [81]:
# Full record of the first source node as a dict.
response.source_nodes[0].node.to_dict()

{'text': '148Impromptu: Amplifying Our Humanity Through AI\ndegrees, it has unfortunately become far too common to pre-\nemptively exclude anyone without a bachelors’ degree from \nconsideration of a vast range of jobs, even ones they have the \nskills to do well.\nThat’s not fair to millions of job seekers and harms employ -\ners who miss skilled talent. It’s bad for our economy and \nsociety. The opportunity to attend, fund, and complete college \nis unevenly distributed. Geography, poverty, family obliga-\ntions, conflicts between class and work schedules, and many \nother factors prevent many ambitious and capable people \nfrom getting a college degree. Those affected tend to be dis-\nproportionately from Black and Hispanic communities, not to \nmention rural Americans and military veterans of all racial and \nethnic backgrounds.\nOpportunity@Work addresses this unfairness and underval-\nued talent by providing the business case, visualized skills data, \nand tools to “tear the pap

In [79]:
# print() is used here so the newlines in the page text render as line breaks.
print(response.source_nodes[0].node.get_text())

148Impromptu: Amplifying Our Humanity Through AI
degrees, it has unfortunately become far too common to pre-
emptively exclude anyone without a bachelors’ degree from 
consideration of a vast range of jobs, even ones they have the 
skills to do well.
That’s not fair to millions of job seekers and harms employ -
ers who miss skilled talent. It’s bad for our economy and 
society. The opportunity to attend, fund, and complete college 
is unevenly distributed. Geography, poverty, family obliga-
tions, conflicts between class and work schedules, and many 
other factors prevent many ambitious and capable people 
from getting a college degree. Those affected tend to be dis-
proportionately from Black and Hispanic communities, not to 
mention rural Americans and military veterans of all racial and 
ethnic backgrounds.
Opportunity@Work addresses this unfairness and underval-
ued talent by providing the business case, visualized skills data, 
and tools to “tear the paper ceiling” so “if you can 

## QA

### load_qa_chain

In [39]:
# load document with langchain
from langchain.document_loaders import PyPDFLoader
# Chapter-1 PDF only; note this rebinds `loader`, which the VectorstoreIndexCreator cell below reuses.
loader = PyPDFLoader("../book/impromptu-chap1.pdf")
chap1 = loader.load_and_split()
chap1

[Document(page_content='23\nEDUCATION\nIf Hollywood central casting ever wants to portray a \nbeloved instructor from an idealized past, they could do worse \nthan University of Texas at Austin Professor Steven Mintz. Over \nfour decades of teaching, Professor Mintz has published books \nand articles on topics as diverse as the psychology of prominent \nAnglo-American literary families and political good vs. evil.\nIn collared shirts, with graying hair, Mintz can’t suppress his \nsmile as he teaches. Students adore him: among hundreds who \nhave anonymously rated Mintz online, his average rating is a \nperfect five out of five, with posts such as “easily the best orator \nI’ve ever witnessed,” “his lectures feel more like storytelling \nthan class,” and “passionate about what he teaches.”\nProfessor Mintz, frankly, excelled as a professor long before the \ndevelopment of LLMs. So you might have expected him to have \nreacted with indifference or hostility to the late 2022 public \nrele

In [41]:
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
# NOTE(review): this rebinds `chat` without the temperature=0 used when it was
# first created — confirm that is intended.
chat = ChatOpenAI(model_name='gpt-3.5-turbo')

# Question-answering chain that receives the chapter-1 documents directly (no retriever).
chain = load_qa_chain(llm=chat, chain_type="map_reduce")
query = 'what is the potential of AI in education?'
chain.run(input_documents=chap1, question=query)

'The potential of AI in education is to improve learning outcomes by providing personalized feedback, adaptive content, data analysis, and interactive simulations that can help students develop their skills and curiosity. AI can also be used as a tool for inquiry to generate multiple possible answers or solutions and foster critical thinking and problem-solving skills. Furthermore, AI can be used to create detailed and specific lesson plans for diverse students, such as learners with special needs or different levels of prior knowledge. The use of AI in education may change what skills are valuable for students to learn, as they may need to focus more on higher-order thinking skills, such as critical analysis, synthesis, evaluation, and creativity, rather than on memorization, repetition, or imitation. However, the human elements of education, such as empathy, motivation, and socialization, cannot be replaced by AI, and teachers should use AI as a complementary resource, not a substitu

### RetrievalQA

In [32]:
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [33]:
# Load the book with the langchain loader inside this cell instead of reading the
# global `documents`: that name is also assigned by the llama_index loader cells,
# whose objects expose `text` rather than the `page_content`/`metadata` that the
# langchain splitter works on, so a top-to-bottom run would hand it the wrong type.
book_documents = PyPDFLoader("../book/impromptu-rh.pdf").load_and_split()
# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(book_documents)
# select which embeddings we want to use
embeddings = OpenAIEmbeddings()
# create the vectorstore to use as the index
db = Chroma.from_documents(texts, embeddings)

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
INFO:chromadb:Running Chroma using direct local API.
INFO:clickhouse_connect.driver.ctypes:Successfully imported ClickHouse Connect C data optimizations
INFO:clickhouse_connect.driver.ctypes:Successfully import ClickHouse Connect C/Numpy optimizations
INFO:clickhouse_connect.json_impl:Using python library for writing JSON byte strings


In [36]:
# expose this index in a retriever interface
# (similarity search; k=3 presumably means three chunks are retrieved per query)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":3})
# create a chain to answer questions 
# return_source_documents=True asks the chain to include the retrieved chunks in its result
qa = RetrievalQA.from_chain_type(
    llm=chat, chain_type="stuff", retriever=retriever, return_source_documents=True)
query = 'what are the opportunities of using AI?'
# The result is a dict; the cells below read its 'query' and 'result' entries.
result = qa({"query": query})

In [37]:
# Show the whole result dict.
result

{'query': 'what are the opportunities of using AI?',
 'result': 'According to the given context, there are several opportunities of using AI, including:\n\n- AI can help identify opportunities for improvement and cost savings in businesses by analyzing data sets and recommending strategies and tactics.\n- AI can help with forecasting and predictive analysis to better anticipate market trends, customer behavior, and other economic conditions.\n- AI can generate insights from customer feedback and employee surveys, allowing for more targeted and effective management consulting activities.\n- AI can help in automated skills assessment, job matching, resume optimization, and job interview preparation, enabling individuals to stay updated with the latest industry trends and develop the skills required to stay competitive.\n- AI can lead to a reduction in the overall number of jobs in some professions, but it can also enable the remaining professionals to become more productive and effective

In [38]:
# print() so the bullet list in the answer renders with real line breaks.
print(result['result'])

According to the given context, there are several opportunities of using AI, including:

- AI can help identify opportunities for improvement and cost savings in businesses by analyzing data sets and recommending strategies and tactics.
- AI can help with forecasting and predictive analysis to better anticipate market trends, customer behavior, and other economic conditions.
- AI can generate insights from customer feedback and employee surveys, allowing for more targeted and effective management consulting activities.
- AI can help in automated skills assessment, job matching, resume optimization, and job interview preparation, enabling individuals to stay updated with the latest industry trends and develop the skills required to stay competitive.
- AI can lead to a reduction in the overall number of jobs in some professions, but it can also enable the remaining professionals to become more productive and effective.
- AI can create new opportunities for growth and advancement, and it 

### VectorstoreIndexCreator

Wrapper for the logic above

Source:

https://python.langchain.com/en/latest/modules/chains/getting_started.html
https://github.com/hwchase17/langchain/blob/master/langchain/indexes/vectorstore.py#L21-L74

In [42]:
# NOTE(review): this rebinds `index` (previously the llama_index vector index) and
# uses whichever `loader` and `query` were assigned last — in top-to-bottom order,
# the chapter-1 PyPDFLoader and the RetrievalQA question.
index = VectorstoreIndexCreator(
    # split the documents into chunks
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
    # select which embeddings we want to use
    embedding=OpenAIEmbeddings(),
    # use Chroma as the vectorstore to index and search embeddings
    vectorstore_cls=Chroma
).from_loaders([loader])
index.query(llm=chat, question=query, chain_type="stuff")

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
INFO:chromadb:Running Chroma using direct local API.


'According to the context provided, AI has the potential to transform the way we learn and deliver instruction. AI-driven tools will be used to automate and streamline some of the more mundane aspects of teaching, such as grading and content creation, but their potential for more meaningful applications will be limited by cost, access, and privacy concerns. If these issues can be addressed, then AI will become a powerful tool in education, transforming the way we learn and deliver instruction. AI-driven tools will be used to provide personalized, individualized learning experiences, as well as to create and curate content, giving teachers more time to focus on engaging and inspiring their students. AI can also develop customized lesson plans and activity guides based on the preferences of teachers, suggest personalized interventions or strategies to address specific learning challenges, and synthesize a wide range of resources and materials that suit their curriculum goals and pedagogi

### ConversationalRetrievalChain

conversation memory + RetrievalQAChain

Allows passing in the chat history, which can then be used to answer follow-up questions.

Source: https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html

In [43]:
from langchain.chains import ConversationalRetrievalChain

In [44]:
# Load the book with the langchain loader inside this cell instead of reading the
# global `documents`: that name is also assigned by the llama_index loader cells,
# whose objects expose `text` rather than the `page_content`/`metadata` that the
# langchain splitter works on, so a top-to-bottom run would hand it the wrong type.
book_documents = PyPDFLoader("../book/impromptu-rh.pdf").load_and_split()
# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(book_documents)
# select which embeddings we want to use
embeddings = OpenAIEmbeddings()
# create the vectorstore to use as the index
db = Chroma.from_documents(texts, embeddings)
# expose this index in a retriever interface
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":2})
# create a chain to answer questions
qa = ConversationalRetrievalChain.from_llm(chat, retriever)
# First turn of the conversation. The question is set explicitly (it is the one
# recorded in the chat_history output below) so the cell does not depend on which
# earlier cell last assigned `query`.
query = 'what is the potential of AI in education?'
chat_history = []
result = qa({"question": query, "chat_history": chat_history})

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
INFO:chromadb:Running Chroma using direct local API.
INFO:chromadb.db.duckdb:Exiting: Cleaning up .chroma directory


In [45]:
# Answer text of the first conversational turn.
result["answer"]

"AI has the potential to transform the way we learn and deliver instruction in education. With AI-driven tools, teachers can provide personalized, individualized learning experiences that are tailored to each student's needs and interests. AI can identify the topics and skills that students need to focus on, and provide guidance and support as needed. It can also be used to automate and streamline some of the more mundane aspects of teaching, such as grading and content creation, giving teachers more time to focus on engaging and inspiring their students. However, the potential for more meaningful applications of AI in education will depend on a range of factors, including cost, access, and privacy concerns."

In [46]:
# Record the first exchange as a (question, answer) tuple, then ask a second
# question with that history passed along.
chat_history = [(query, result["answer"])]
query = 'what are the risks of using AI?'
result = qa({"question": query, "chat_history": chat_history})


In [47]:
# History that was passed with the second question: one (question, answer) pair.
chat_history

[('what is the potential of AI in education?',
  "AI has the potential to transform the way we learn and deliver instruction in education. With AI-driven tools, teachers can provide personalized, individualized learning experiences that are tailored to each student's needs and interests. AI can identify the topics and skills that students need to focus on, and provide guidance and support as needed. It can also be used to automate and streamline some of the more mundane aspects of teaching, such as grading and content creation, giving teachers more time to focus on engaging and inspiring their students. However, the potential for more meaningful applications of AI in education will depend on a range of factors, including cost, access, and privacy concerns.")]

In [48]:
# Answer to the second question.
result['answer']

'According to the context given, the technology may create an educational system that is less equitable and accessible. Additionally, the potential for more meaningful applications of AI in education will be limited by cost, access, and privacy concerns. While AI-driven tools may be used to automate and streamline some aspects of teaching, such as grading and content creation, it is important to ensure that AI is not used as a replacement for human interaction with students.'