In [2]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import WikipediaLoader
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [3]:
#pip install --upgrade langchain langchain_community

In [4]:
#pip install pypdf -q

In [5]:
import getpass
import os
if 'OPENAI_API_KEY' not in os.environ:
    os.environ['OPENAI_API_KEY'] = getpass.getpass("Use your own key")

In [6]:
#!pip install docx2txt -q

In [7]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


In [8]:
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    else:
        print('Document format not supported!')
        return None
        
    data = loader.load()
    return data
    
# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lag=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data
    

## Embedding and Uploading to a Vector Database(Pinecone)

In [30]:
def insert_or_fetch_embeddings(index_name, chunks):
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    pc = pinecone.Pinecone()
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)


    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        vector_store = Pinecone.from_existings_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=PodSpec(environment='gcp-starter')
        )
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
        return vector_store

In [31]:
def delete_pinecone_index(index_name='all'):
    import pinecone
    pc = pinecone.Pinecone()
    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print('Deleting all indexes ...')
        for index in indexes:
            pc.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name}...', end = '')
        pc.delete.index(index_name)
        print('Ok')

## Running Code

In [32]:
data = load_document('us_constitution.pdf')
print(data[1].page_content)
print(data[10].metadata)

print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} Characters in the page')

Loading us_constitution.pdf
The
House
of
Representatives
shall
be
composed
of
Members
chosen
every
second
Y ear
by
the
People
of
the
several
States,
and
the
Electors
in
each
State
shall
have
the
Qualifications
requisite
for
Electors
of
the
most
numerous
Branch
of
the
State
Legislature.
No
Person
shall
be
a
Representative
who
shall
not
have
attained
to
the
Age
of
twenty
five
Y ears,
and
been
seven
Y ears
a
Citizen
of
the
United
States,
and
who
shall
not,
when
elected,
be
an
Inhabitant
of
that
State
in
which
he
shall
be
chosen.
Representatives
and
direct
T axes
shall
be
apportioned
among
the
several
States
which
may
be
included
within
this
Union,
according
to
their
respective
Numbers,
which
shall
be
determined
by
adding
to
the
whole
Number
of
free
Persons,
including
those
bound
to
Service
for
a
T erm
of
Y ears,
and
excluding
Indians
not
taxed,
three
fifths
of
all
other
Persons.
The
actual
Enumeration
shall
be
made
within
three
Y ears
after
the
first
Meeting
of
the
Congress
of
the
United


In [53]:
data = load_document('cover_lt.docx')
print(data[0].page_content)

Cover Letter

To Whom so ever it may concern

February 2024

Dear Hiring Manager,

I trust this letter finds you well. As I explore new career opportunities, I am eager to express my interest in potential data related roles within your esteemed organization. As a current Master International student at the University of Texas at Dallas, pursuing Information Technology and Management I have completed the Graduate Certification’s in Applied Machine Learning, Business Intelligence, and Data Mining. Moreover, I’m simultaneously working as a Graduate Teaching Assistant.

Having worked at Anglo Eastern Shipping Management in the dual capacity of Marine Engineer and Data Analytics Specialist, I have led in significant cost savings, notably achieving $1.5M/year through insightful analysis. Proficient in Python, SQL, and AWS technologies, I executed a project that resulted in a 30% reduction in breakdowns ($300K savings). I am enthusiastic about the prospect of leveraging my unique skill set to

In [54]:
data = load_from_wikipedia('GPT-4')
print(data[0].page_content)

TypeError: WikipediaLoader.__init__() got an unexpected keyword argument 'lag'

In [55]:
def chunk_data(data, chunk_size=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_documents(data)
    return chunks
    

In [56]:
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)

15
Step Functions, I'm utilizing Computer Vision models in Fast AI and torch vision for transfer learning. Incorporating wandb, Resnet model architecture, and Efficient Net with Inception, my proficiency extends to working with Microsoft Azure Machine


In [57]:
delete_pinecone_index()

Deleting all indexes ...
Ok


In [58]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Creating index askadocument and embeddings ...Ok


## Asking and Getting Answers

In [59]:
# We will retrieve the most relevant chunk of text from our vector database then
# we will feed those chunks to LLM to get the final answer

def ask_and_get_answer(vector_store, q):
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    
    retriever = vector_store.as_retriever(search_type='similarity', searc_kwargs={'k': 3})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    answer = chain.invoke(q)
    return answer
    
        

In [60]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is the whole document about?', 'result': "I don't have access to the whole document that you are referring to. Could you please provide more specific information or context so I can assist you better?"}
