In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import WikipediaLoader
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [2]:
#pip install --upgrade langchain langchain_community

In [3]:
#pip install pypdf -q

In [4]:
import getpass
import os
if 'OPENAI_API_KEY' not in os.environ:
    os.environ['OPENAI_API_KEY'] = getpass.getpass("Use your own key")

In [5]:
#!pip install docx2txt -q

In [6]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


In [7]:
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    else:
        print('Document format not supported!')
        return None
        
    data = loader.load()
    return data
    
# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data
    

## Embedding and Uploading to a Vector Database(Pinecone)

In [8]:
def insert_or_fetch_embeddings(index_name, chunks):
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    pc = pinecone.Pinecone()
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)


    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        vector_store = Pinecone.from_existings_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=PodSpec(environment='gcp-starter')
        )
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
        return vector_store

In [9]:
def delete_pinecone_index(index_name='all'):
    import pinecone
    pc = pinecone.Pinecone()
    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print('Deleting all indexes ...')
        for index in indexes:
            pc.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name}...', end = '')
        pc.delete.index(index_name)
        print('Ok')

## Running Code

In [10]:
data = load_document('us_constitution.pdf')
print(data[1].page_content)
print(data[10].metadata)

print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} Characters in the page')

Loading us_constitution.pdf
The
House
of
Representatives
shall
be
composed
of
Members
chosen
every
second
Y ear
by
the
People
of
the
several
States,
and
the
Electors
in
each
State
shall
have
the
Qualifications
requisite
for
Electors
of
the
most
numerous
Branch
of
the
State
Legislature.
No
Person
shall
be
a
Representative
who
shall
not
have
attained
to
the
Age
of
twenty
five
Y ears,
and
been
seven
Y ears
a
Citizen
of
the
United
States,
and
who
shall
not,
when
elected,
be
an
Inhabitant
of
that
State
in
which
he
shall
be
chosen.
Representatives
and
direct
T axes
shall
be
apportioned
among
the
several
States
which
may
be
included
within
this
Union,
according
to
their
respective
Numbers,
which
shall
be
determined
by
adding
to
the
whole
Number
of
free
Persons,
including
those
bound
to
Service
for
a
T erm
of
Y ears,
and
excluding
Indians
not
taxed,
three
fifths
of
all
other
Persons.
The
actual
Enumeration
shall
be
made
within
three
Y ears
after
the
first
Meeting
of
the
Congress
of
the
United


In [11]:
data = load_document('cover_lt.docx')
print(data[0].page_content)

Cover Letter

To Whom so ever it may concern

February 2024

Dear Hiring Manager,

I trust this letter finds you well. As I explore new career opportunities, I am eager to express my interest in potential data related roles within your esteemed organization. As a current Master International student at the University of Texas at Dallas, pursuing Information Technology and Management I have completed the Graduate Certification’s in Applied Machine Learning, Business Intelligence, and Data Mining. Moreover, I’m simultaneously working as a Graduate Teaching Assistant.

Having worked at Anglo Eastern Shipping Management in the dual capacity of Marine Engineer and Data Analytics Specialist, I have led in significant cost savings, notably achieving $1.5M/year through insightful analysis. Proficient in Python, SQL, and AWS technologies, I executed a project that resulted in a 30% reduction in breakdowns ($300K savings). I am enthusiastic about the prospect of leveraging my unique skill set to

In [12]:
data = load_from_wikipedia('GPT-4')
print(data[0].page_content)

Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its series of GPT foundation models. It was launched on March 14, 2023, and made publicly available via the paid chatbot product ChatGPT Plus, via OpenAI's API, and via the free chatbot Microsoft Copilot.  As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4, equipped with vision capabilities (GPT-4V), is capable of taking images as input on ChatGPT. OpenAI has declined to reveal various technica

In [13]:
def chunk_data(data, chunk_size=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_documents(data)
    return chunks
    

In [14]:
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)

38
== Capabilities ==


In [15]:
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes ...
Ok


In [16]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Creating index askadocument and embeddings ...Ok


## Asking and Getting Answers

In [17]:
# We will retrieve the most relevant chunk of text from our vector database then
# we will feed those chunks to LLM to get the final answer

def ask_and_get_answer(vector_store, q):
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    
    retriever = vector_store.as_retriever(search_type='similarity', searc_kwargs={'k': 3})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    answer = chain.invoke(q)
    return answer
    
        

In [18]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

  warn_deprecated(


{'query': 'What is the whole document about?', 'result': "I don't have access to the content of the document you are referring to. If you can provide more context or specific information, I may be able to help you better."}


In [19]:
import time
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question #{i}: ')
    i = i+1
    if q.lower() in ['quit','exit']:
        print("Quitting ... bye bye !")
        time.sleep(2)
        break
    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

Write Quit or Exit to quit.


Question #1:  quit


Quitting ... bye bye !


In [20]:
delete_pinecone_index()

Deleting all indexes ...
Ok


In [21]:
data = load_from_wikipedia('ChatGPT', 'ro')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Creating index chatgpt and embeddings ...Ok


In [22]:
q = "Ce este ChatGPT?" 
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Ce este ChatGPT?', 'result': 'ChatGPT este un model de limbaj generativ bazat pe tehnologia GPT-3 (Generative Pre-trained Transformer 3), dezvoltat de OpenAI. Acesta poate genera texte care sunt coerente și rezonabile, având capacitatea de a răspunde la întrebări și de a purta conversații în mod natural.'}


In [23]:
q = "Cand a fost lansat GPT4?" 
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Cand a fost lansat GPT4?', 'result': 'GPT-4 a fost lansat pe 14 martie 2023 și este disponibil doar pentru abonații plătitori.'}


## It gives good memory but it lacks memory!

## Chroma as Vector DB

In [24]:
#pip install -q chromadb

Note: you may need to restart the kernel to use updated packages.


In [36]:
def create_embeddings_chroma(chunks, persistent_directory='./chroma_db'):
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Instantiating an embedding model to convert text to numerical representations
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
    vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=persistent_directory)
    return vector_store

# Function to load the existing embeddings from disk to a vector store object
def load_embeddings_chroma(persist_directory='./chroma_db'):
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

      # Instantiating an embedding model to convert text to numerical representations
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    return vector_store
    
    

In [37]:
# Lets load a PDF File
data = load_document('rag_powered_by_google_search.pdf')
chunks = chunk_data(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)

Loading rag_powered_by_google_search.pdf


In [38]:
q = 'What is vertex AI Search?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is vertex AI Search?', 'result': 'Vertex AI Search is a feature that offers customizable answers, search tuning, vector search, grounding, and compliance updates for enterprises. It also includes new generative AI capabilities and enterprise-ready features.'}


In [39]:
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'How many pairs of questions and answers had the StackOverflow dataset?', 'result': 'The StackOverflow dataset had 8 million pairs of questions and answers.'}


### As we can see from the pdf it answers correctly
### However there is a drawback to this, if I ask a follow up question, it will not have access to the previous chat history and will respond that it doesn't know the context

In [46]:
# For example,
q = 'Multiply that number by 2. '
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Multiply that number by 2. ', 'result': "I don't know."}


### As we can see it could not answer, next we add Memory!

### Adding Memory (Chat History)

## Common requirement for RAG is support for follow up questions.
## Follow up questions can contain references to past chat history




In [57]:
from langchain_openai import ChatOpenAI
# This chain is used to have a conversation based on the retrieved documents
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
# This acts as a buffer for storing conversation

llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature = 0)
# Retriever is a crucial component that helps LLM's find and access relevant information
# it does this by searching the relevant data and retrieving the information
# Here the retriever will search by similarity and will retrieve the top k most similar chunks
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})

# Creating a memory object that will be passed to the conversational retrieval chain as an argument
# the memory will be automatically updated with the questions and answers
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Creating the Conversational Retrieval Chains
crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff', # use all of the text from the documents
    verbose = True
)


In [59]:
def ask_question(q, chain):
    result = chain.invoke({'question': q})
    return result

In [60]:
# Using the same rag powered google search pdf
# Loading document
data = load_document('rag_powered_by_google_search.pdf')
chunks = chunk_data(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)


Loading rag_powered_by_google_search.pdf


In [61]:
#result = {}
q = "How many pairs of questions and answers had the StackOverflow dataset?"
result = ask_question(q, crc)
print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million p

In [62]:
# if only want answer
print(result['answer'])

The StackOverflow dataset had 8 million pairs of questions and answers.


In [63]:
q = 'Multiple that number by 10.'
result = ask_question(q, crc)




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How many pairs of questions and answers had the StackOverflow dataset?
Assistant: The StackOverflow dataset had 8 million pairs of questions and answers.
Follow Up Input: Multiple that number by 10.
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-a

In [64]:
print(result['answer'])

The result of multiplying the number of pairs of questions and answers in the StackOverflow dataset by 10 would be 80 million pairs.


In [None]:
#!pip install streamlit
#!pip install streamlit_chat

In [None]:
'''import streamlit as st
from streamlit_chat import message
from dotenv import load_dotenv, find_dotenv
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import  PromptTemplate
from langchain.memory import ConversationBufferMemory
#from langchain_community.chat_message_histories import StreamLitChatMessageHistory

In [None]:
'''from langchain_community.chat_message_histories import (
    StreamlitChatMessageHistory,
)

In [None]:
'''load_dotenv(find_dotenv())

template = """You are an AI chatbot having a conversation with a human

{history}
Human: {human_input}
AI: """

prompt = PromptTemplate(input_variable=["history","human_input"], template=template)

msgs = StreamlitChatMessageHistory(key="special_app_key")

memory = ConversationBufferMemory(memory_key="history", chat_memory=msgs)

def load_chain():
    llm = ChatOpenAI(temperature = 0)
    llm_chain = LLMChain(llm=llm, prompt=prompt, memory=memory)
    return llm_chain
    

In [65]:
q = 'Divide the result by 80'
result = ask_question(q, crc)
print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How many pairs of questions and answers had the StackOverflow dataset?
Assistant: The StackOverflow dataset had 8 million pairs of questions and answers.
Human: Multiple that number by 10.
Assistant: The result of multiplying the number of pairs of questions and answers in the StackOverflow dataset by 10 would be 80 million pairs.
Follow Up Input: Divide the result by 80
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
simp

In [66]:
result

{'question': 'Divide the result by 80',
 'chat_history': [HumanMessage(content='How many pairs of questions and answers had the StackOverflow dataset?'),
  AIMessage(content='The StackOverflow dataset had 8 million pairs of questions and answers.'),
  HumanMessage(content='Multiple that number by 10.'),
  AIMessage(content='The result of multiplying the number of pairs of questions and answers in the StackOverflow dataset by 10 would be 80 million pairs.'),
  HumanMessage(content='Divide the result by 80'),
  AIMessage(content='The result of dividing 80 million pairs by 80 is 1 million.')],
 'answer': 'The result of dividing 80 million pairs by 80 is 1 million.'}

In [67]:
# To check the chat history
for item in result['chat_history']:
    print(item)

content='How many pairs of questions and answers had the StackOverflow dataset?'
content='The StackOverflow dataset had 8 million pairs of questions and answers.'
content='Multiple that number by 10.'
content='The result of multiplying the number of pairs of questions and answers in the StackOverflow dataset by 10 would be 80 million pairs.'
content='Divide the result by 80'
content='The result of dividing 80 million pairs by 80 is 1 million.'


## Using a Custom Prompt

In [94]:
# Using prompt engineering technique to ask questions in a specific ways
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})
memory = ConversationBufferMemory(memory_key='chat_history', return_message=True)

# Define the template for system prompt, this sets the general behavior of the LLM
system_template = r'''' 
Use the following pieces of context to answer the user's question.
---------------
Context: ```{context}```
'''
# Now the template for the user prompt 

user_template = '''
Question: ```{question}```
Chat History: ```{chat_history}```
'''

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]

qa_prompt = ChatPromptTemplate.from_messages(messages)

crc=ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    combine_docs_chain_kwargs={'prompt': qa_prompt},
    verbose=True
)

In [88]:
print(qa_prompt)

input_variables=['chat_history', 'context', 'question'] messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="' \nUse the following pieces of context to answer the user's question.\n---------------\nContext: ```{context}```\n")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['chat_history', 'question'], template='\nQuestion: ```{question}```\nChat History: ```{chat_history}```\n'))]


In [83]:
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
result = ask_question(q, crc)
print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: ' 
Use the following pieces of context to answer the user's question.
---------------
Context: ```simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-exi

In [89]:
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
result = ask_question(q, crc)
print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: ' 
Use the following pieces of context to answer the user's question.
---------------
Context: ```simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-exi

In [95]:
q = 'When was elon musk born?'
result = ask_question(q, crc)
print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: ' 
Use the following pieces of context to answer the user's question.
---------------
Context: ```house AI processors back in 2013 — the Tensor Processing Unit (TPU).
TPUs are speci cally tailored to provide the underlying power needed
to suppo  machine learning and AI workloads, but their genesis is

house AI processors back in 2013 — the Tensor Processing Unit (TPU).
TPUs are speci cally tailored to provide the underlying power needed
to suppo  machine learning and AI workloads, but their genesis is

house AI processors back in 2013 — the Tensor Processing Unit (TPU).
TPUs are speci cally tailored to provide the underlying power needed
to suppo  machine learning and AI workloads, but their genesis is

house AI processors back in 2013 — the Tensor Processing Unit (TPU).
TPUs are speci cally tailored to provide the underlying power needed
to sup