In [1]:
#! pip install langchain

In [2]:
import os
import openai
import sys
# Add the directory two levels above the working directory to the import search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Must run before the os.environ lookup below so values from .env are visible.
_ = load_dotenv(find_dotenv()) # read local .env file

# Raises KeyError if OPENAI_API_KEY is not present in the environment.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [3]:
# The course will show the pip installs you would need to install packages on your own machine.
# These packages are already installed on this platform and should not be run again.
#! pip install pypdf 

In [10]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("javanotes5.pdf")
pages = loader.load()

In [11]:
len(pages)

699

In [12]:
page = pages[0]

In [13]:
print(page.page_content[0:500])

Introduction to Programming Using Java
Version 5.0, December 2006
(Version 5.0.2, with minor corrections, November 2007)
David J. Eck
Hobart and William Smith Colleges


In [14]:
page.metadata

{'source': 'javanotes5.pdf', 'page': 0}

In [15]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [16]:
docs = text_splitter.split_documents(pages)

In [17]:
len(docs)

2502

In [18]:
# `docs` holds the text-splitter chunks, not PDF pages (a single page can yield
# several chunks), so label each entry as a chunk and report the page it came
# from via its metadata. Only a short preview is printed; dumping every chunk
# floods the notebook output.
preview_count = 5
for i, doc in enumerate(docs[:preview_count], start=1):
    print(f"--- Chunk {i} of {len(docs)} (source page index: {doc.metadata.get('page')}) ---")
    print(doc)
    print()  # Blank line for readability


--- Page 1 ---
page_content='Introduction to Programming Using Java\nVersion 5.0, December 2006\n(Version 5.0.2, with minor corrections, November 2007)\nDavid J. Eck\nHobart and William Smith Colleges' metadata={'source': 'javanotes5.pdf', 'page': 0}

--- Page 2 ---
page_content='ii\nc/circlecopyrt1996–2007, David J. Eck\nDavid J. Eck (eck@hws.edu)\nDepartment of Mathematics and Computer Science\nHobart and William Smith Colleges\nGeneva, NY 14456\nThis book can be distributed in unmodiﬁed form with no restri ctions.\nModiﬁed versions can be made and distributed provided they a re distributed\nunder the same license as the original. More speciﬁcally: Th is work is\nlicensed under the Creative Commons Attribution-Share Ali ke 2.5 License.\nTo view a copy of this license, visit http://creativecommon s.org/licenses/by-\nsa/2.5/ or send a letter to Creative Commons, 543 Howard Stre et, 5th\nFloor, San Francisco, California, 94105, USA.\nThe web site for this book is: http://math.hws.edu/ja

In [19]:
print(docs[0])  # to see how each item looks


page_content='Introduction to Programming Using Java\nVersion 5.0, December 2006\n(Version 5.0.2, with minor corrections, November 2007)\nDavid J. Eck\nHobart and William Smith Colleges' metadata={'source': 'javanotes5.pdf', 'page': 0}


In [20]:
len(pages)

699

In [21]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [22]:
import numpy as np

In [23]:
from langchain.vectorstores import Chroma

In [24]:
persist_directory = 'docs/chroma/'

In [25]:
import shutil

# Remove old database files, if any. Uses the same `persist_directory` that is
# later passed to Chroma, and works on platforms without a POSIX `rm`.
# ignore_errors=True mirrors `rm -f`: a missing directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)

In [26]:
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory=persist_directory
)

In [27]:
print(vectordb._collection.count())

2502


In [28]:
question = "which web site should i refer to learn java using book"

In [29]:
docs = vectordb.similarity_search(question,k=3)

In [30]:
len(docs)

3

In [31]:
docs[0].page_content

'that you read the exercise solutions if you want to get the mos t out of this book. This is\ncertainly not a Java reference book, and it is not even close t o a comprehensive survey of all\nthe features of Java. It is notwritten as a quick introduction to Java for people who alread y\nknow another programming language. Instead, it is directed mainly towards people who are\nlearning programming for the ﬁrst time, and it is as much abou t general programming concepts\nas it is about Java in particular. I believe that Introduction to Programming using Java is\nfully competitive with the conventionally published, prin ted programming textbooks that are\navailable on the market. (Well, all right, I’ll confess that I think it’s better.)\nThere are several approaches to teaching Java. One approach uses graphical user interface\nprogramming from the very beginning. Some people believe th at object oriented programming'

In [33]:
vectordb.persist()

In [44]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [48]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
persist_directory = 'docs/chroma/'

In [49]:
def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Each entry is rendered as ``Document <n>:`` followed by a blank line and
    the document's ``page_content``; consecutive entries are separated by a
    line of 100 dashes.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, document in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + document.page_content)
    print(divider.join(sections))


In [50]:
# Build the LLM-based document compressor. The next cell combines it with the
# vectorstore retriever inside a ContextualCompressionRetriever.
llm = OpenAI(temperature=0, model="gpt-3.5-turbo-instruct")
compressor = LLMChainExtractor.from_llm(llm)

In [51]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [53]:
question = "what is class?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: HTTP code 500 from API (500 error
).
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: HTTP code 500 from API (500 error
).
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: HTTP code 500 from API (500 error
).
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: HTTP code 500 from API (500 error
).


Document 1:

A class seems like it should be a pretty important thing. A cla ss is a high-level building block
of a program, representing a potentially complex idea and it s associated data and behaviors.
However, such trivial classes are often useful an d even essential. Fortunately, in Java,
I can ease the embarrassment, because one class can be nested inside another class.
----------------------------------------------------------------------------------------------------
Document 2:

- "An object certainly doesn’t “belong” to a class in the same way that a me mber variable “belongs” to a class."
- "From the point of view of programming, it is more exac t to say that classes are used to create objects."
- "A class is a kind of factory for constructi ng objects."
- "The non-static parts of the class specify, or describe, what variables and subrouti nes the objects will contain."
- "This is part of the explanation of how objects diﬀer from classes: Ob jects are created and destroyed a

In [55]:
import datetime

# Choose the chat model name from today's date: runs before 2023-09-02 use
# "gpt-3.5-turbo-0301", runs on or after that date use "gpt-3.5-turbo".
current_date = datetime.date.today()
llm_name = (
    "gpt-3.5-turbo-0301"
    if current_date < datetime.date(2023, 9, 2)
    else "gpt-3.5-turbo"
)
print(llm_name)

gpt-3.5-turbo


In [56]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=llm_name, temperature=0)

In [57]:
from langchain.chains import RetrievalQA

In [58]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [59]:
result = qa_chain({"query": question})

In [60]:
result["result"]

'A class in programming is a high-level building block that represents a potentially complex idea and its associated data and behaviors. It is used to create objects, which are instances of the class. Classes can be nested inside another class, and they serve as a factory for constructing objects. In Java, every class is a subclass of the Object class, which is at the top of a class hierarchy that includes every other class.'

In [61]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [62]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [63]:
question = "what is class in java?"

In [64]:
result = qa_chain({"query": question})

In [65]:
result["result"]

'A class in Java is a high-level building block representing a potentially complex idea and its associated data and behaviors. It is a type that can be used to specify the type of a variable, formal parameter, or return type of a function. Thanks for asking!'

In [68]:
result["source_documents"][0]

Document(page_content='204 CHAPTER 5. OBJECTS AND CLASSES\npublic class myClass { . . .\nis exactly equivalent to\npublic class myClass extends Object { . . .\nThis means that class Object is at the top of a huge class hierarchy that includes every\nother class. (Semantially, Object is an abstract class, in fact the most abstract class of all.\nCuriously, however, it is not declared to be abstract syntactially, which means that you can\ncreate objects of type Object . What you would do with them, however, I have no idea.)\nSince every class is a subclass of Object , a variable of type Object can refer to any object\nwhatsoever, of any type. Java has several standard data stru ctures that are designed to hold\nObject s, but since every object is an instance of class Object , these data structures can actually\nhold any object whatsoever. One example is the “ArrayList” d ata structure, which is deﬁned by\nthe class ArrayList in the package java.util . (ArrayList is discussed more fully i

In [69]:
result["source_documents"][1]

Document(page_content='averages. (Again, this is what it means to say that an instanc e method belongs to an individual\nobject, not to the class.)\nIn Java, a class is a type, similar to the built-in types such as intandboolean . So, a class\nname can be used to specify the type of a variable in a declarat ion statement, the type of a\nformal parameter, or the return type of a function. For examp le, a program could deﬁne a\nvariable named stdof typeStudent with the statement\nStudent std;\nHowever, declaring a variable does notcreate an object! This is an important point, which is\nrelated to this Very Important Fact:\nIn Java, no variable can ever hold an object.\nA variable can only hold a reference to an object.\nYou should think of objects as ﬂoating around independently in the computer’s memory. In\nfact, there is a special portion of memory called the heap where objects live. Instead of holding\nan object itself, a variable holds the information necessar y to ﬁnd the object in 

In [70]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

In [71]:
def load_db(file, chain_type, k):
    """Build a conversational retrieval chain over one PDF file.

    The PDF is loaded with ``PyPDFLoader``, split with a
    ``RecursiveCharacterTextSplitter`` (chunk_size=1000, chunk_overlap=150),
    embedded with ``OpenAIEmbeddings`` and indexed in a
    ``DocArrayInMemorySearch`` store. Chat memory is not attached here; the
    caller supplies the chat history on each call.

    Relies on the module-level ``llm_name`` for the chat model name.

    Args:
        file: path of the PDF handed to ``PyPDFLoader``.
        chain_type: forwarded as ``chain_type`` to
            ``ConversationalRetrievalChain.from_llm``.
        k: forwarded to the retriever as ``search_kwargs={"k": k}``.

    Returns:
        The chain produced by ``ConversationalRetrievalChain.from_llm``,
        configured to return source documents and the generated question.
    """
    # Load the PDF and cut it into overlapping chunks.
    pages = PyPDFLoader(file).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = splitter.split_documents(pages)

    # Embed the chunks into an in-memory vector store and expose it as a retriever.
    store = DocArrayInMemorySearch.from_documents(chunks, OpenAIEmbeddings())
    similarity_retriever = store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

    # Assemble the chatbot chain around the retriever.
    return ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=llm_name, temperature=0),
        chain_type=chain_type,
        retriever=similarity_retriever,
        return_source_documents=True,
        return_generated_question=True,
    )


In [74]:
import panel as pn
import param

class cbfs(param.Parameterized):
    """Chatbot state and rendering callbacks behind the Panel dashboard.

    Holds the running chat history, the latest answer, the last generated
    question sent to the retriever and the source documents it returned, and
    exposes methods that render each of these as Panel objects.

    The methods read the module-level widgets ``file_input``, ``button_load``
    and ``inp`` and the module-level ``load_db`` helper at call time, so those
    names must be defined before the corresponding callbacks fire.
    """

    # (query, answer) tuples appended by convchain and passed back to the chain.
    chat_history = param.List([])
    # Answer text from the most recent chain call.
    answer = param.String("")
    # "generated_question" returned by the most recent chain call.
    db_query = param.String("")
    # "source_documents" returned by the most recent chain call.
    db_response = param.List([])

    def __init__(self, **params):
        """Initialise the state and build the chain for the default PDF."""
        super().__init__(**params)
        self.panels = []
        self.loaded_file = "javanotes5.pdf"
        self.qa = load_db(self.loaded_file, "stuff", 4)

    def call_load_db(self, count):
        """Rebuild the chain from the uploaded PDF and report the loaded file.

        Args:
            count: value bound to the load button's click counter; 0 means
                the initial render.

        Returns:
            A Markdown pane naming the currently loaded file.
        """
        if count == 0 or file_input.value is None:  # init or no file specified
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")
        file_input.save("temp.pdf")  # local copy
        self.loaded_file = file_input.filename
        button_load.button_style = "outline"
        try:
            self.qa = load_db("temp.pdf", "stuff", 4)
        finally:
            # Restore the button even when loading fails, so it is not left
            # stuck in the "outline" style.
            button_load.button_style = "solid"
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        """Run one chat turn and return the rendered conversation.

        Args:
            query: the user's text; a falsy value renders an empty prompt row
                without calling the chain.

        Returns:
            A scrollable ``pn.WidgetBox`` holding every exchange so far.
        """
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)
        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.append((query, result["answer"]))
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer']
        self.panels.extend([
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
            # `styles` (not `style`) matches every other pane in this class.
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))
        ])
        inp.value = ''  # clears loading indicator when cleared
        return pn.WidgetBox(*self.panels, scroll=True)

    # The dependency must name the parameter exactly; the previous
    # 'db_query ' (trailing space) did not match any parameter.
    @param.depends('db_query')
    def get_lquest(self):
        """Render the last generated question, or a placeholder if none yet."""
        if not self.db_query:
            return pn.Column(
                pn.Row(pn.pane.Markdown("Last question to DB:", styles={'background-color': '#F6F6F6'})),
                pn.Row(pn.pane.Str("no DB accesses so far"))
            )
        return pn.Column(
            pn.Row(pn.pane.Markdown("DB query:", styles={'background-color': '#F6F6F6'})),
            pn.pane.Str(self.db_query)
        )

    @param.depends('db_response')
    def get_sources(self):
        """Render the last source documents; returns None when there are none."""
        if not self.db_response:
            return None
        rlist = [pn.Row(pn.pane.Markdown("Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
        rlist.extend(pn.Row(pn.pane.Str(doc)) for doc in self.db_response)
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    @param.depends('convchain', 'clr_history')
    def get_chats(self):
        """Render the current chat history, or a placeholder when it is empty."""
        if not self.chat_history:
            return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
        rlist = [pn.Row(pn.pane.Markdown("Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
        rlist.extend(pn.Row(pn.pane.Str(exchange)) for exchange in self.chat_history)
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self, count=0):
        """Reset the chat history.

        Args:
            count: unused; accepts the argument a button click callback passes.
        """
        self.chat_history = []
        return


In [76]:
import panel as pn
pn.extension()


In [77]:
# Create the chatbot state object; its __init__ builds the initial chain via load_db.
cb = cbfs()

# Widgets. The cbfs methods look these names up as module-level globals when called.
file_input = pn.widgets.FileInput(accept='.pdf')
button_load = pn.widgets.Button(name="Load DB", button_type='primary')
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')
# Clicking "Clear History" calls cb.clr_history.
button_clearhistory.on_click(cb.clr_history)
inp = pn.widgets.TextInput( placeholder='Enter text here…')

# Bind the callbacks: the load handler receives the button's click count,
# and convchain receives the text input.
bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)
conversation = pn.bind(cb.convchain, inp) 

# Image shown on the Configure tab; the file is read from ./img relative to the working directory.
jpg_pane = pn.pane.Image( './img/convchain.jpg')

# Conversation tab: text input above the rendered conversation.
tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    pn.panel(conversation,  loading_indicator=True, height=300),
    pn.layout.Divider(),
)
# Database tab: last generated question and the source documents returned for it.
tab2= pn.Column(
    pn.panel(cb.get_lquest),
    pn.layout.Divider(),
    pn.panel(cb.get_sources ),
)
# Chat History tab: the current contents of cb.chat_history.
tab3= pn.Column(
    pn.panel(cb.get_chats),
    pn.layout.Divider(),
)
# Configure tab: file upload and load button, clear-history button, and the image.
tab4=pn.Column(
    pn.Row( file_input, button_load, bound_button_load),
    pn.Row( button_clearhistory, pn.pane.Markdown("Clears chat history. Can use to start a new topic" )),
    pn.layout.Divider(),
    pn.Row(jpg_pane.clone(width=400))
)
# Assemble the title and the four tabs; the trailing expression displays the dashboard.
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),
    pn.Tabs(('Conversation', tab1), ('Database', tab2), ('Chat History', tab3),('Configure', tab4))
)
dashboard