In [1]:
pip install -r ./requirements.txt -q


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip show langchain

Name: langchain
Version: 0.0.354
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/sagar/anaconda3/envs/langproj/lib/python3.12/site-packages
Requires: aiohttp, dataclasses-json, jsonpatch, langchain-community, langchain-core, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip show openai

Name: openai
Version: 0.28.1
Summary: Python client library for the OpenAI API
Home-page: https://github.com/openai/openai-python
Author: OpenAI
Author-email: support@openai.com
License: 
Location: /Users/sagar/anaconda3/envs/langproj/lib/python3.12/site-packages
Requires: aiohttp, requests, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from dotenv import load_dotenv, find_dotenv
# Load the .env file located by find_dotenv() into the process environment.
# Later cells read PINECONE_API_KEY through os.environ; presumably the OpenAI
# key is supplied the same way -- TODO confirm.
# NOTE(review): override = True presumably lets .env values replace variables
# that are already set -- verify against the python-dotenv documentation.
load_dotenv(find_dotenv(), override = True)



True

In [4]:
def load_document(file):
    """Load a PDF with PyPDFLoader and return the loaded documents.

    Args:
        file: Path of the PDF; it is handed to ``PyPDFLoader`` unchanged.

    Returns:
        The value returned by ``PyPDFLoader(file).load()``.
    """
    # Imported lazily so that defining this function does not require langchain.
    from langchain.document_loaders import PyPDFLoader

    print(f'Loading {file}')
    return PyPDFLoader(file).load()




In [5]:
def chunk_data(data, chunk_size = 256, chunk_overlap = 10):
    """Split loaded documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split; passed to ``split_documents`` unchanged.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Previously this
            argument was accepted but ignored in favour of a hard-coded 256.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks returned by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
    )
    return text_splitter.split_documents(data)



In [26]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a LangChain Pinecone vector store for ``index_name``.

    If the index is not among the names reported by the Pinecone client, it is
    created (1536 dimensions, cosine metric, 'gcp-starter' pod environment) and
    filled from ``chunks``. Otherwise the existing index is wrapped as-is and
    ``chunks`` is not used.

    Args:
        index_name: Name of the Pinecone index to create or reuse.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The vector store built by ``from_documents`` or ``from_existing_index``.
    """
    from pinecone import Pinecone, PodSpec
    from langchain.vectorstores import Pinecone as Pineconevs
    from langchain_openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    # The API key is read from the environment rather than being hard-coded.
    pc = Pinecone(api_key = os.environ.get('PINECONE_API_KEY'))

    # Guard clause: handle the "index is missing" case first.
    if index_name not in pc.list_indexes().names():
        print(f'Creating Index {index_name} and embeddings...', end = '')
        pod_spec = PodSpec(environment='gcp-starter')
        pc.create_index(index_name, dimension = 1536, metric = 'cosine', spec=pod_spec)
        created_store = Pineconevs.from_documents(chunks, embeddings, index_name = index_name)
        print('OK')
        return created_store

    print(f'Index {index_name} already exists. Loading embeddings...', end = '')
    existing_store = Pineconevs.from_existing_index(index_name, embeddings)
    print('ok')
    return existing_store




In [7]:
def delete_pinecone_index(index_name = 'all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default, 'all', deletes
            every index reported by the client.

    Returns:
        None. Progress is printed to stdout.
    """
    from pinecone import Pinecone

    pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
    if index_name == 'all':
        # delete_index expects an index *name*. Iterating list_indexes()
        # directly yields index descriptions, so ask for the names explicitly,
        # as insert_or_fetch_embeddings does.
        names = pc.list_indexes().names()
        print('Deleting all indices...')
        for name in names:
            pc.delete_index(name)
        print('OK')
    else:
        print(f'Deleting index {index_name}...',end = '')
        pc.delete_index(index_name)
        # Finish the progress line; without this, the next output is glued onto it.
        print('OK')




In [18]:
# Load the sample PDF, then show the first document's metadata and how many
# documents the loader produced.
data = load_document('files/Sagar_Tetali_CV.pdf')
print(data[0].metadata)
print(f'You have {len(data)} pages in your data')



Loading files/Sagar_Tetali_CV.pdf
{'source': 'files/Sagar_Tetali_CV.pdf', 'page': 0}
You have 2 pages in your data


In [14]:
def ask_document(vector_store, query):
    """Answer ``query`` with a RetrievalQA chain backed by ``vector_store``.

    The chain uses ChatOpenAI ('gpt-3.5-turbo', temperature 0) and a similarity
    retriever configured with ``k = 3``. Source documents are requested along
    with the answer.

    Args:
        vector_store: Object exposing ``as_retriever``.
        query: The question, sent to the chain under the 'query' key.

    Returns:
        Whatever the chain's ``invoke`` call returns.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    qa_chain = RetrievalQA.from_chain_type(
        llm = ChatOpenAI(model = 'gpt-3.5-turbo', temperature = 0),
        retriever = vector_store.as_retriever(
            search_type = 'similarity',
            search_kwargs = {'k':3},
        ),
        return_source_documents = True,
    )
    return qa_chain.invoke({'query': query})

In [10]:
def print_embedding_cost(texts, model = 'text-embedding-ada-002', price_per_1k_tokens = 0.0004):
    """Print the token count of ``texts`` and an estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to pick the tiktoken encoding.
        price_per_1k_tokens: USD price applied per 1000 tokens. The default is
            the value this notebook has always used.
            NOTE(review): provider pricing changes over time -- confirm the
            current rate before relying on the estimate.

    Returns:
        None. Both figures are printed to stdout.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no throwaway list, and an empty input sums to 0.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/ 1000 * price_per_1k_tokens:.6f}')

In [24]:
# Split the loaded documents using chunk_data's default settings.
chunks = chunk_data(data)
print(len(chunks))
# Peek at a single chunk. Index 10 only exists when the document yields at
# least 11 chunks; a shorter document raises IndexError here.
print(chunks[10].page_content)
print_embedding_cost(chunks)


16
, RNNs, LSTMS, and data augmentation.  Experience Machine Learning Engineer |  September 2019 - March 2020 BigLittle Innovations Bengaluru, India  Built modules that extract a company’s real-time business processes from event logs, check conformance
Total Tokens: 944
Embedding Cost in USD: 0.000378


In [28]:
# Drop the index first so that the call below takes its "create" branch and
# re-embeds the chunks instead of reusing what is already stored.
# NOTE(review): presumably the delete raises when 'askadocument' does not exist
# yet (e.g. on a fresh account) -- confirm.
delete_pinecone_index('askadocument')
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Deleting index askadocument...Creating Index askadocument and embeddings...OK


In [25]:
# Print the repr of the vector store returned by insert_or_fetch_embeddings.
print(vector_store)




<langchain_community.vectorstores.pinecone.Pinecone object at 0x1160deb10>


In [6]:
# No visible cell defines `pc` at notebook scope (it only exists as a local
# inside the helper functions), so build the client here; otherwise this cell
# raises NameError on a fresh kernel.
from pinecone import Pinecone

pc = Pinecone(api_key = os.environ.get('PINECONE_API_KEY'))
indexes = pc.list_indexes()
print(indexes)


{'indexes': [{'dimension': 1536,
              'host': 'askadocument-293s6kc.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'askadocument',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}


In [22]:
# Cleanup: remove the 'askadocument' index.
# NOTE(review): a later cell queries `vector_store`, which was built on this
# index; running this cell first presumably makes that query fail -- confirm
# the intended run order.
delete_pinecone_index(index_name = 'askadocument')

Deleting index askadocument...

In [1]:
pip show pinecone-client

Name: pinecone-client
Version: 3.0.0
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /Users/sagar/anaconda3/envs/langproj/lib/python3.12/site-packages
Requires: certifi, tqdm, typing-extensions, urllib3
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [32]:
# Ask a question about the indexed document. The printed value is the chain's
# full return value; in the run shown it is a dict with 'query', 'result' and
# 'source_documents' keys.
answer = ask_document(vector_store, "where did this person study?")
print(answer)

{'query': 'where did this person study?', 'result': 'This person studied at the University of Pennsylvania in Philadelphia, PA, USA for their Masters in Engineering (MSE) in Computer Graphics and Game Technology. They also studied at GITAM University in Visakhapatnam, Andhra Pradesh, India for their Bachelor of Technology.', 'source_documents': [Document(page_content='Education University of Pennsylvania, Philadelphia, PA, USA, |  2017- 2019 Masters in Engineering (MSE), Computer Graphics and Game Technology GPA: 3.61/4.00 GITAM University, Visakhapatnam, Andhra Pradesh, India |  2012 - 2016 Bachelor of Technology', metadata={'page': 0.0, 'source': 'files/Sagar_Tetali_CV.pdf'}), Document(page_content='and an MSE in Computer Graphics and Game technology from the University of Pennsylvania. After a stint as a freelance tech writer and media professional over the pandemic years, I’m looking for developer roles, preferably with an ML focus. Education', metadata={'page': 0.0, 'source': 'fil