In [22]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import WikipediaLoader
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [23]:
#pip install --upgrade langchain langchain_community

In [24]:
#pip install pypdf -q

In [25]:
import getpass
import os
# Ask for the key interactively only when the environment has not supplied one.
if os.environ.get('OPENAI_API_KEY') is None:
    os.environ['OPENAI_API_KEY'] = getpass.getpass("Use your own key")

In [26]:
#!pip install docx2txt -q

In [27]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


In [28]:
def load_document(file):
    """Load a PDF or DOCX file into a list of LangChain documents.

    Args:
        file: Path of the file to load. The loader is chosen from the file
            extension, compared case-insensitively ('.pdf' or '.docx').

    Returns:
        The value returned by the selected loader's ``load()`` call, or
        ``None`` (after printing a message) when the extension is not
        supported.
    """
    import os
    _, extension = os.path.splitext(file)
    # Normalise the case so 'REPORT.PDF' is handled like 'report.pdf'.
    extension = extension.lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    else:
        print('Document format not supported!')
        return None

    return loader.load()
    
# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch Wikipedia content for ``query`` as LangChain documents.

    Args:
        query: Search text handed to ``WikipediaLoader``.
        lang: Wikipedia language code passed to the loader (default 'en').
        load_max_docs: Document limit passed to the loader (default 2).

    Returns:
        Whatever ``WikipediaLoader.load()`` returns.
    """
    from langchain.document_loaders import WikipediaLoader
    return WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()
    

## Embedding and Uploading to a Vector Database (Pinecone)

In [29]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a Pinecone-backed vector store for ``index_name``.

    If an index with that name already exists it is opened as-is and
    ``chunks`` is not used. Otherwise a 1536-dimension cosine index is
    created and ``chunks`` are embedded and uploaded into it.

    Args:
        index_name: Name of the Pinecone index to reuse or create.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The LangChain Pinecone vector store, in both branches.
    """
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    # The embedding size and the index dimension must agree, so share one value.
    dimension = 1536

    pc = pinecone.Pinecone()
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=dimension)

    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        # The classmethod is `from_existing_index`; the previous spelling
        # (`from_existings_index`) raised AttributeError on this path.
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',
            spec=PodSpec(environment='gcp-starter')
        )
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')

    # Return from both branches; previously the existing-index path returned None.
    return vector_store

In [30]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default 'all' deletes
            every index reported by ``list_indexes()``.
    """
    import pinecone
    pc = pinecone.Pinecone()
    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print('Deleting all indexes ...')
        for index in indexes:
            pc.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name}...', end = '')
        # Was `pc.delete.index(...)`, which fails with AttributeError; use the
        # same `delete_index` method the 'all' branch already calls.
        pc.delete_index(index_name)
        print('Ok')

## Running Code

In [31]:
# Load the PDF through load_document and inspect a few entries.
# The hard-coded indices below assume the loader returned at least 21 entries.
data = load_document('us_constitution.pdf')
print(data[1].page_content)  # text of the entry at index 1
print(data[10].metadata)  # metadata of the entry at index 10

print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} Characters in the page')  # character count of the entry at index 20

Loading us_constitution.pdf
The
House
of
Representatives
shall
be
composed
of
Members
chosen
every
second
Y ear
by
the
People
of
the
several
States,
and
the
Electors
in
each
State
shall
have
the
Qualifications
requisite
for
Electors
of
the
most
numerous
Branch
of
the
State
Legislature.
No
Person
shall
be
a
Representative
who
shall
not
have
attained
to
the
Age
of
twenty
five
Y ears,
and
been
seven
Y ears
a
Citizen
of
the
United
States,
and
who
shall
not,
when
elected,
be
an
Inhabitant
of
that
State
in
which
he
shall
be
chosen.
Representatives
and
direct
T axes
shall
be
apportioned
among
the
several
States
which
may
be
included
within
this
Union,
according
to
their
respective
Numbers,
which
shall
be
determined
by
adding
to
the
whole
Number
of
free
Persons,
including
those
bound
to
Service
for
a
T erm
of
Y ears,
and
excluding
Indians
not
taxed,
three
fifths
of
all
other
Persons.
The
actual
Enumeration
shall
be
made
within
three
Y ears
after
the
first
Meeting
of
the
Congress
of
the
United


In [32]:
# Same helper, this time taking the '.docx' branch of load_document.
data = load_document('cover_lt.docx')
print(data[0].page_content)

Cover Letter

To Whom so ever it may concern

February 2024

Dear Hiring Manager,

I trust this letter finds you well. As I explore new career opportunities, I am eager to express my interest in potential data related roles within your esteemed organization. As a current Master International student at the University of Texas at Dallas, pursuing Information Technology and Management I have completed the Graduate Certification’s in Applied Machine Learning, Business Intelligence, and Data Mining. Moreover, I’m simultaneously working as a Graduate Teaching Assistant.

Having worked at Anglo Eastern Shipping Management in the dual capacity of Marine Engineer and Data Analytics Specialist, I have led in significant cost savings, notably achieving $1.5M/year through insightful analysis. Proficient in Python, SQL, and AWS technologies, I executed a project that resulted in a 30% reduction in breakdowns ($300K savings). I am enthusiastic about the prospect of leveraging my unique skill set to

In [33]:
# Load Wikipedia content for 'GPT-4' using the defaults (lang='en', load_max_docs=2).
data = load_from_wikipedia('GPT-4')
print(data[0].page_content)

Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its series of GPT foundation models. It was launched on March 14, 2023, and made publicly available via the paid chatbot product ChatGPT Plus, via OpenAI's API, and via the free chatbot Microsoft Copilot.  As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4, equipped with vision capabilities (GPT-4V), is capable of taking images as input on ChatGPT. OpenAI has declined to reveal various technica

In [34]:
def chunk_data(data, chunk_size=256):
    """Split ``data`` with a RecursiveCharacterTextSplitter.

    Args:
        data: Documents handed to ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter (default 256).
            ``chunk_overlap`` is fixed at 0.

    Returns:
        The result of ``split_documents(data)``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
    ).split_documents(data)
    

In [35]:
# Chunk the most recently loaded `data` with the default chunk_size of 256.
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)  # assumes at least 11 chunks were produced

38
== Capabilities ==


In [36]:
# Default index_name='all': deletes every index returned by list_indexes().
delete_pinecone_index()

Deleting all indexes ...
Ok


In [37]:
# Embed the current chunks into the Pinecone index named 'askadocument'.
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Creating index askadocument and embeddings ...Ok


## Asking and Getting Answers

In [38]:
# Retrieve the most relevant chunks of text from the vector database, then
# feed those chunks to the LLM to get the final answer.

def ask_and_get_answer(vector_store, q):
    """Answer ``q`` with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever``; used as the retrieval
            backend for the chain.
        q: The question passed to ``chain.invoke``.

    Returns:
        Whatever ``chain.invoke(q)`` returns.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    # The keyword is `search_kwargs`; the previous misspelling (`searc_kwargs`)
    # meant the intended k=3 was never passed under the right name.
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    return chain.invoke(q)
    
        

In [39]:
# Ask a question against the current vector store and print the chain's raw return value.
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is the whole document about?', 'result': "I'm sorry, but you haven't provided any specific document or topic to give you a specific answer. If you can provide more context or details, I'd be happy to help summarize the document for you."}


In [40]:
import time

# Simple REPL: keep answering questions until the user types "quit" or "exit"
# (case-insensitive). `i` numbers the prompts and is bumped after every input.
print('Write Quit or Exit to quit.')
i = 1
while True:
    q = input(f'Question #{i}: ')
    i += 1
    if q.lower() not in ('quit', 'exit'):
        answer = ask_and_get_answer(vector_store, q)
        print(f'\nAnswer: {answer}')
        print(f'\n {"-" * 50} \n')
        continue
    print("Quitting ... bye bye !")
    time.sleep(2)
    break

Write Quit or Exit to quit.


Question #1:  exit


Quitting ... bye bye !


In [41]:
# Clear all Pinecone indexes again (default index_name='all') before the next experiment.
delete_pinecone_index()

Deleting all indexes ...
Ok


In [43]:
# Repeat the load -> chunk -> embed pipeline for 'ChatGPT' with lang='ro',
# storing the result in a separate index named 'chatgpt'.
data = load_from_wikipedia('ChatGPT', 'ro')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Creating index chatgpt and embeddings ...Ok


In [44]:
# Question in Romanian: "What is ChatGPT?"
q = "Ce este ChatGPT?" 
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Ce este ChatGPT?', 'result': 'ChatGPT (Chat Generative Pre-trained Transformer) este un chatbot lansat de OpenAI în noiembrie 2022. Acesta este un membru al familiei de modele de limbaj generative pre-antrenate și a fost inițial bazat pe GPT-3.5. A fost creat pentru a oferi conversații articulate și răspunsuri detaliate în diferite domenii. O versiune bazată pe GPT-4 a fost lansată pe 14 martie 2023, disponibilă doar pentru abonații plătitori.'}


In [45]:
# Question in Romanian: "When was GPT-4 launched?"
q = "Cand a fost lansat GPT4?" 
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Cand a fost lansat GPT4?', 'result': 'GPT-4 a fost lansat pe 14 martie 2023.'}


## It gives good answers but it lacks memory!

## Chroma as Vector DB

In [46]:
pip install -q chromadb

Note: you may need to restart the kernel to use updated packages.


In [56]:
def create_embeddings_chroma(chunks, persistent_directory='./chroma_db'):
    """Embed ``chunks`` into a Chroma vector store.

    Args:
        chunks: Documents passed to ``Chroma.from_documents``.
        persistent_directory: Forwarded to Chroma as ``persist_directory``
            (default './chroma_db').

    Returns:
        The vector store produced by ``Chroma.from_documents``.
    """
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Embedding model that turns text into numerical vectors.
    embedder = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
    return Chroma.from_documents(
        chunks,
        embedder,
        persist_directory=persistent_directory,
    )

# Function to load the existing embeddings from disk to a vector store object
def load_embeddings_chroma(persist_directory='./chroma_db'):
    """Open the Chroma vector store located at ``persist_directory``.

    Args:
        persist_directory: Directory handed to ``Chroma`` (default './chroma_db').

    Returns:
        A ``Chroma`` instance configured with the same embedding model settings
        used by ``create_embeddings_chroma``.
    """
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Embedding model that turns text into numerical vectors.
    embedder = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
    return Chroma(
        persist_directory=persist_directory,
        embedding_function=embedder,
    )
    
    

In [57]:
# Let's load a PDF file, split it into chunks, and embed the chunks into Chroma.
data = load_document('rag_powered_by_google_search.pdf')
chunks = chunk_data(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)

Loading rag_powered_by_google_search.pdf


In [58]:
# Query the Chroma-backed store with the same helper used for Pinecone.
q = 'What is vertex AI Search?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is vertex AI Search?', 'result': "Vertex AI Search is a feature within Google's Vertex AI platform that offers new generative AI capabilities and enterprise-ready features. It provides customizable answers, search tuning, vector search, grounding, and compliance updates specifically designed for enterprises. It aims to enhance search capabilities and provide powerful search features to businesses using the Vertex AI platform."}


In [59]:
# Reopen the store persisted under ./chroma_db and query the reloaded object.
# The question previously went to `vector_store`, so the store loaded from disk
# was never exercised.
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
answer = ask_and_get_answer(db, q)
print(answer)

{'query': 'How many pairs of questions and answers had the StackOverflow dataset?', 'result': 'The StackOverflow dataset had 8 million pairs of questions and answers.'}


### As we can see, it answers correctly from the PDF.
### However, there is a drawback: if I ask a follow-up question, it has no access to the previous chat history and will respond that it doesn't know the context.

In [60]:
# For example, a follow-up question that relies on the previous answer:
q = 'Multiply that number by 2. '
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Multiply that number by 2. ', 'result': "I'm sorry, but I cannot provide a specific number to multiply by 2 based on the given context. If you have a specific number or query in mind, please provide it, and I would be happy to help further."}


### As we can see, it could not answer. Next, we add memory!

### Adding memory to RAG