# First, install and import the dependencies ☝

In [61]:
!pip3 install langchain
!pip install pypdf
!pip3 install pinecone-client
!pip install python-dotenv
!pip install openai
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.5.1


In [52]:
import os
from dotenv import load_dotenv

from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Next let's load the textbook we want to query 📚

In [44]:
# This step has to be redone every time the Colab session is closed.
textbook_path = "/content/Introduction_to_the_theory_of_computation_third_edition_-_Michael_Sipser.pdf"
loader = PyPDFLoader(textbook_path)

In [45]:
# PyPDFLoader already returns the PDF split into separate Documents, so no extra
# splitting is done here. With a loader that does not do this, run the result
# through RecursiveCharacterTextSplitter instead.
data = loader.load()

In [46]:
# Sanity check: how many Documents were loaded, and how long one sample
# Document (index 30) is.
print(f'You have {len(data)} document(s) in your data')
sample_doc = data[30]
print(f'There are {len(sample_doc.page_content)} characters in your document')

You have 482 document(s) in your data
There are 2529 characters in your document


In [47]:
# Display one loaded Document (index 30) to inspect the extracted text.
data[30]

Document(page_content='0.2 MATHEMATICAL NOTIONS AND TERMINOLOGY 7\nEXAMPLE 0.6\nIfAandBare as in Example 0.5,\nA×B×A={\n(1,x ,1),(1,x ,2),(1,y,1),(1,y,2),(1,z,1),(1,z,2),\n(2,x ,1),(2,x ,2),(2,y,1),(2,y,2),(2,z,1),(2,z,2)}\n.\nIf we have the Cartesian product of a set with itself, we use the shorthand\nk\ued17\n \ued1a\ued19\n \ued18\nA×A×···× A=Ak.\nEXAMPLE 0.7\nThe set N2equals N×N .I t c o n s i s t s o f a l l o r d e r e d p a i r s o f n a t u r a l n u m b e r s .\nWe also may write it as {(i, j)|i, j≥1}.\nFUNCTIONS AND RELATIONS\nFunctions are central to mathematics. A function is an object that sets up an\ninput–output relationship. A function takes an input and produces an output.\nIn every function, the same input always produces the same output. If fis a\nfunction whose output value is bwhen the input value is a,w ew r i t e\nf(a)=b.\nAf u n c t i o na l s oi sc a l l e da mapping ,a n d ,i f f(a)=b,w es a yt h a t fmaps atob.\nFor example, the absolute value function absta

# Now let's create embeddings 🧠


In [48]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [57]:
# Load variables from a local .env file (if present) into the process environment.
# `load_dotenv` is imported at the top of the notebook but was not called in any
# visible cell, so values stored in .env were never picked up.
load_dotenv()

# Fail fast when the key is missing instead of falling back to a placeholder
# string, which would only surface later as an authentication error.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file before running this cell."
    )

# Embedding client passed to the vector store when indexing the textbook.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [59]:
# Credentials are read from the environment rather than embedded in the notebook:
# a key written into a cell is visible to anyone who can read the file.
# NOTE(review): the API key that was previously hard-coded in this cell should be
# treated as leaked and rotated in the Pinecone console.
load_dotenv()  # picks up a local .env if present; already-set environment variables win

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to a .env file before running this cell."
    )

# The environment keeps its previous value as the default so existing setups keep working.
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT', 'us-west4-gcp-free')

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT,
)

# Name of the Pinecone index that stores the textbook embeddings.
index_name = "tb-chatbot"

In [63]:
# Embed every loaded Document and upsert it into the Pinecone index.
# Two things are passed along that a bare list of texts would lose:
#   * metadatas — the loader's per-Document metadata, so a search hit can be
#     traced back to where it came from (presumably source path and page number
#     for PyPDFLoader — verify against your loader). The dicts are copied because
#     the vector store adds the text to the metadata it is given.
#   * ids — stable, position-based ids, so re-running this cell overwrites the
#     same vectors instead of appending a fresh duplicate set each time.
# NOTE(review): the ids are positional, so indexing a different document into the
# same index would overwrite these entries; use a separate index or namespace for that.
page_texts = [doc.page_content for doc in data]
page_metadatas = [dict(doc.metadata) for doc in data]
page_ids = [f"{index_name}-{position}" for position in range(len(data))]

docsearch = Pinecone.from_texts(
    page_texts,
    embeddings,
    metadatas=page_metadatas,
    ids=page_ids,
    index_name=index_name,
)

# Query to find where the chatbot is getting its info from 📑

In [65]:
# Ask the vector store directly for the stored passages most similar to a
# sample question; no LLM is involved at this point.
query = "What is a DFA?"
docs = docsearch.similarity_search(query=query)

In [66]:
# Display the Documents returned by the similarity search above.
docs

[Document(page_content='48 CHAPTER 1 / REGULAR LANGUAGES\nFIGURE 1.27\nThe nondeterministic ﬁnite automaton N1\nThe difference between a deterministic ﬁnite automaton, abbreviated DFA,\nand a nondeterministic ﬁnite automaton, abbreviated NFA,i si m m e d i a t e l ya p -\nparent. First, every state of a DFAalways has exactly one exiting transition arrow\nfor each symbol in the alphabet. The NFAshown in Figure 1.27 violates that\nrule. State q1has one exiting arrow for 0,b u ti th a st w of o r 1;q2has one arrow\nfor0,b u ti th a sn o n ef o r 1.I n a n NFA,as t a t em a yh a v ez e r o ,o n e ,o rm a n y\nexiting arrows for each alphabet symbol.\nSecond, in a DFA,l a b e l so nt h et r a n s i t i o na r r o w sa r es y m b o l sf r o mt h ea l p h a -\nbet. This NFAhas an arrow with the label ε.I ng e n e r a l ,a n NFAmay have arrows\nlabeled with members of the alphabet or ε.Z e r o ,o n e ,o rm a n ya r r o w sm a ye x i t\nfrom each state with the label ε.\nHow does an NFAcompute?

# The grand finale ✅

In [67]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [68]:
# Completion model used to write the answers; temperature is set to 0.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

# Question-answering chain of type "stuff", built on top of that model.
chain = load_qa_chain(llm=llm, chain_type="stuff")

In [73]:
# Retrieve the passages most similar to the question that will be put to the chain.
query = "What is a GNFA?"
docs = docsearch.similarity_search(query=query)

In [74]:
# Run the QA chain with the retrieved Documents and the question; the returned
# answer is displayed as the cell output.
chain.run(input_documents=docs, question=query)

' A generalized nondeterministic finite automaton (GNFA) is a type of finite automaton wherein the transition arrows may have any regular expressions as labels, instead of only members of the alphabet or ε. A GNFA reads blocks of symbols from the input, not necessarily just one symbol at a time as in an ordinary NFA. A GNFA is nondeterministic and so may have several different ways to process the same input string. It accepts its input if its processing can cause the GNFA to be in an accept state at the end of the input.'