In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI, VectorDBQA
from langchain.document_loaders import DirectoryLoader
import magic
import os
import nltk
from io import StringIO

In [2]:
documents=[]
relevant='./data/relevant/'
generic='./data/generic'

for text_file in os.listdir(relevant):
    with open(os.path.join(relevant,text_file), 'r') as File:
        documents.append(File.read())
for text_file in os.listdir(generic):
    with open(os.path.join(generic,text_file), 'r') as File:
        documents.append(File.read())

In [3]:
len(documents)

20

In [4]:
text_splitter=CharacterTextSplitter(chunk_size=256,chunk_overlap=0)

In [5]:
texts=text_splitter.create_documents(documents)

Created a chunk of size 439, which is longer than the specified 256
Created a chunk of size 997, which is longer than the specified 256
Created a chunk of size 2735, which is longer than the specified 256
Created a chunk of size 288, which is longer than the specified 256
Created a chunk of size 1233, which is longer than the specified 256
Created a chunk of size 422, which is longer than the specified 256
Created a chunk of size 1102, which is longer than the specified 256
Created a chunk of size 1118, which is longer than the specified 256
Created a chunk of size 1371, which is longer than the specified 256
Created a chunk of size 797, which is longer than the specified 256
Created a chunk of size 1413, which is longer than the specified 256
Created a chunk of size 681, which is longer than the specified 256
Created a chunk of size 1171, which is longer than the specified 256
Created a chunk of size 812, which is longer than the specified 256
Created a chunk of size 1554, which is lo

In [6]:
embeddings= OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))

In [7]:
docsearch = Chroma.from_documents(texts, embeddings)

Using embedded DuckDB without persistence: data will be transient


In [8]:
qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=docsearch)



In [9]:
query = "Where was Shakespeare born?"
qa.run(query)

' Shakespeare was born in Stratford-upon-Avon.'

In [10]:
query= "Which year was Shakespeare born?"
qa.run(query)

' 1564'

In [11]:
query= "What was Shakespeare's most famous work?"
qa.run(query)

' Hamlet.'

In [12]:
matches=docsearch.similarity_search(query)
len(matches) # Number of chunks that match the query

4

### ChatBot: Baseline version

In [2]:
from IPython.display import display
import ipywidgets as widgets

In [3]:
from langchain.chains import ConversationalRetrievalChain

In [4]:
chat_history=[]
def on_submit(_):
    query=input_box.value
    input_box.value=""

    if query.lower()=="exit":
        print("Exiting.....thanks :)")
        return 
    result=qa({"question":query, "chat_history":chat_history})
    chat_history.append((query,result['answer']))

    display(widgets.HTML(f'<b>User: </b> {query}'))
    display(widgets.HTML(f'<b><font color="blue">Chatbot:</font></b> {result["answer"]}'))
print("Welcome to the Shakespeare chatbot! Type 'exit' to come back to the real world! ")

input_box=widgets.Text(placeholder='Hark! Pose thy query forthwith! ')
input_box.on_submit(on_submit)

Welcome to the Shakespeare chatbot! Type 'exit' to come back to the real world! 


  input_box.on_submit(on_submit)
