In [1]:
import os
import shutil
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain.docstore.document import Document

from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.globals import set_llm_cache
from langchain.cache import InMemoryCache

In [2]:
load_dotenv()
chat = ChatOpenAI()
set_llm_cache(InMemoryCache())

In [3]:
pdf_filepath = "../datasets/the_importance_of_being_earnest.pdf"

loader = PyPDFLoader(file_path=pdf_filepath)

data = loader.load()

data

[Document(page_content='The\tImportance\tof\tBeing\tEarnest\nA\tTrivial\tComedy\tfor\tSerious\tPeople\n\t\n\t\nOscar\tWilde\n\t\n\t\n\t\nTHE\tPERSONS\tIN\tTHE\tPLAY\n\t\nJohn\tWorthing,\tJ.P.\nAlgernon\tMoncrieff\nRev.\tCanon\tChasuble,\tD.D.\nMerriman,\tButler\nLane,\tManservant\nLady\tBracknell\nHon.\tGwendolen\tFairfax\nCecily\tCardew\nMiss\tPrism,\tGoverness\n\t\n\t\nTHE\tSCENES\tOF\tTHE\tPLAY\n\t\nACT\tI.\t\tAlgernon\tMoncrieff’s\tFlat\tin\tHalf-Moon\tStreet,\tW.\nACT\tII.\t\tThe\tGarden\tat\tthe\tManor\tHouse,\tWoolton.\nACT\tIII.\t\tDrawing-Room\tat\tthe\tManor\tHouse,\tWoolton.\nTIME:\tThe\tPresent.\nFIRST\tACT\nSCENE\nMorning-room\tin\tAlgernon’s\tflat\tin\tHalf-Moon\tStreet.\t\tThe\troom\tis\tluxuriously\nand\tartistically\tfurnished.\t\tThe\tsound\tof\ta\tpiano\tis\theard\tin\tthe\tadjoining\nroom.\n[Lane\tis\tarranging\tafternoon\ttea\ton\tthe\ttable,\tand\tafter\tthe\tmusic\thas\tceased,\nAlgernon\tenters.]\n\t\nAlgernon.\t\t\nDid\tyou\thear\twhat\tI\twas\tplaying,\tLane?\

In [4]:
full_data = "\n".join([doc.page_content.replace('\t', " ") for doc in data])
full_data = "\n".join([line.strip() for line in full_data.split('\n')])

In [5]:
print(full_data)

The Importance of Being Earnest
A Trivial Comedy for Serious People


Oscar Wilde



THE PERSONS IN THE PLAY

John Worthing, J.P.
Algernon Moncrieff
Rev. Canon Chasuble, D.D.
Merriman, Butler
Lane, Manservant
Lady Bracknell
Hon. Gwendolen Fairfax
Cecily Cardew
Miss Prism, Governess


THE SCENES OF THE PLAY

ACT I.  Algernon Moncrieff’s Flat in Half-Moon Street, W.
ACT II.  The Garden at the Manor House, Woolton.
ACT III.  Drawing-Room at the Manor House, Woolton.
TIME: The Present.
FIRST ACT
SCENE
Morning-room in Algernon’s flat in Half-Moon Street.  The room is luxuriously
and artistically furnished.  The sound of a piano is heard in the adjoining
room.
[Lane is arranging afternoon tea on the table, and after the music has ceased,
Algernon enters.]

Algernon.
Did you hear what I was playing, Lane?
Lane.
I didn’t think it polite to listen, sir.
Algernon.
I’m sorry for that, for your sake.  I don’t play accurately—any one
can play accurately—but I play with wonderful expression.  As far

In [6]:
start_idx = len("""The Importance of Being Earnest
A Trivial Comedy for Serious People


Oscar Wilde



THE PERSONS IN THE PLAY

John Worthing, J.P.
Algernon Moncrieff
Rev. Canon Chasuble, D.D.
Merriman, Butler
Lane, Manservant
Lady Bracknell
Hon. Gwendolen Fairfax
Cecily Cardew
Miss Prism, Governess


THE SCENES OF THE PLAY

ACT I.  Algernon Moncrieff’s Flat in Half-Moon Street, W.
ACT II.  The Garden at the Manor House, Woolton.
ACT III.  Drawing-Room at the Manor House, Woolton.
TIME: The Present.
FIRST ACT
SCENE
Morning-room in Algernon’s flat in Half-Moon Street.  The room is luxuriously
and artistically furnished.  The sound of a piano is heard in the adjoining
room.
[Lane is arranging afternoon tea on the table, and after the music has ceased,
Algernon enters.]

""")

start_idx

761

In [7]:
end_idx = len("""


Liked This Book?
For More FREE e-Books visit
Freeditorial.com""")

end_idx

64

In [8]:
full_data = full_data[start_idx:-end_idx]

print(full_data)

Algernon.
Did you hear what I was playing, Lane?
Lane.
I didn’t think it polite to listen, sir.
Algernon.
I’m sorry for that, for your sake.  I don’t play accurately—any one
can play accurately—but I play with wonderful expression.  As far as the
piano is concerned, sentiment is my forte.  I keep science for Life.
Lane.
Yes, sir.
Algernon.
And, speaking of the science of Life, have you got the cucumber
sandwiches cut for Lady Bracknell?
Lane.
Yes, sir.  [Hands them on a salver.]
Algernon.
[Inspects them, takes two, and sits down on the sofa.]  Oh! by the
way, Lane, I see from your book that on Thursday night, when Lord Shoreman
and Mr. Worthing were dining with me, eight bottles of champagne are entered
as having been consumed.
Lane.
Yes, sir; eight bottles and a pint.
Algernon.
Why is it that at a bachelor’s establishment the servants invariably
drink the champagne?  I ask merely for information.
Lane.
I attribute it to the superior quality of the wine, sir.  I have often
observed tha

In [9]:
parent_text_splitter = CharacterTextSplitter(
    separator=r"\w+\.\n",
    chunk_size=1200,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=True,
    keep_separator=True
)

parent_docs = parent_text_splitter.create_documents(texts=[full_data])

parent_docs

[Document(page_content='Algernon.\nDid you hear what I was playing, Lane?\nLane.\nI didn’t think it polite to listen, sir.\nAlgernon.\nI’m sorry for that, for your sake.  I don’t play accurately—any one\ncan play accurately—but I play with wonderful expression.  As far as the\npiano is concerned, sentiment is my forte.  I keep science for Life.\nLane.\nYes, sir.\nAlgernon.\nAnd, speaking of the science of Life, have you got the cucumber\nsandwiches cut for Lady Bracknell?\nLane.\nYes, sir.  [Hands them on a salver.]\nAlgernon.\n[Inspects them, takes two, and sits down on the sofa.]  Oh! by the\nway, Lane, I see from your book that on Thursday night, when Lord Shoreman\nand Mr. Worthing were dining with me, eight bottles of champagne are entered\nas having been consumed.\nLane.\nYes, sir; eight bottles and a pint.\nAlgernon.\nWhy is it that at a bachelor’s establishment the servants invariably\ndrink the champagne?  I ask merely for information.\nLane.\nI attribute it to the superior qu

In [10]:
child_text_splitter = RecursiveCharacterTextSplitter(
    separators=[r'Algernon\.\n(.*?)\n(?:[A-Z][a-z]+\.\n|$)', r"\w+\.\n"],
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=True,
    keep_separator=False
)

child_docs = child_text_splitter.create_documents(texts=[full_data])

In [11]:
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {"normalize_embeddings": True}

embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

db_dirpath = "../output/the_importance_of_being_earnest.db"

if os.path.isdir(db_dirpath):
    shutil.rmtree(db_dirpath)
    
db = Chroma(persist_directory=db_dirpath, embedding_function=embedding_function)
store = InMemoryStore()

par_doc_retriever = ParentDocumentRetriever(
    vectorstore=db, 
    docstore=store, 
    child_splitter=child_text_splitter, 
    parent_splitter=parent_text_splitter
)

full_doc = Document(page_content=full_data, metadata = {"source": pdf_filepath})

par_doc_retriever.add_documents([full_doc])

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
query = "The work is getting difficult day by day"

In [13]:
# Let's create a function

def get_algernon_response(query):
    
    prompt = """Respond like Algernon

    Refer to the below in triple backticks
    ```
    {conversations}
    ```

    Here is my dialogue:
    {query}

    Just return the response as a plain string and nothing else"""
    
    matched_docs = par_doc_retriever.get_relevant_documents(query=query)
    res_conversations = [doc.page_content for doc in matched_docs if "Algernon." in doc.page_content]
    conversations = "\n\n".join(["\n".join([f"Conversation {i+1}:", conv]) for i, conv in enumerate(res_conversations)])

    human_message_prompt = HumanMessagePromptTemplate.from_template(prompt)
    chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])
    prompt = chat_prompt.format_prompt(query=query, conversations=conversations)
    response = chat(messages = prompt.to_messages()).content
    return response

In [14]:
get_algernon_response(query)

  warn_deprecated(


"I say it's perfectly heartless your eating muffins at all, under the circumstances."

In [15]:
get_algernon_response("The sky is beautiful in London")

'"The sky is beautifully overrated in London"'