# PINECONE 

In [97]:
import os
from getpass import getpass

# Credentials are taken from the environment and only prompted for (without
# echo) when they are missing, so no secret is stored in the notebook source
# and an already-configured key is never overwritten.
for _secret_name in ('PINECONE_API_KEY', 'OPENAI_API_KEY'):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f'{_secret_name}: ')

# Read once, after the variable is guaranteed to be set; the embeddings cell
# passes this value as `openai_api_key`.
openai_api_key = os.getenv('OPENAI_API_KEY')

In [2]:
!pip install -q pinecone-client

In [4]:
!pip install --upgrade -q pinecone-client

In [6]:
pip show pinecone-client

Name: pinecone-client
Version: 5.0.1
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /opt/anaconda3/lib/python3.12/site-packages
Requires: certifi, pinecone-plugin-inference, pinecone-plugin-interface, tqdm, typing-extensions, urllib3
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [14]:
import os

from pinecone import Pinecone, ServerlessSpec

# Initializing and authenticating the Pinecone client.
# The key is read from the environment: the notebook stores it in
# os.environ['PINECONE_API_KEY'] and never defines a Python variable named
# PINECONE_API_KEY, so referencing that bare name raises NameError on a
# fresh kernel.
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

# checking authentication (the listing is displayed as the cell output)
pc.list_indexes()

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'pdfreader-7zxjnjq.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'pdfreader',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

## Working with Pinecone Indexes

In [18]:
# List all indexes visible to the client `pc`; as the last expression of the
# cell, the result is displayed as the cell output.
pc.list_indexes()

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'pdfreader-7zxjnjq.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'pdfreader',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [20]:
# Get only the index names from the listing (displayed as the cell output).
pc.list_indexes().names()

['pdfreader']

In [24]:
# Create a serverless Pinecone index unless one with this name is already there.
# (starter free plan permits 1 project, up to 5 indexes, up to 100 namespaces per index)
index_name = 'langchain'

existing_index_names = pc.list_indexes().names()
if index_name in existing_index_names:
    print(f'Index {index_name} already exists!')
else:
    print(f'Creating index {index_name}')
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=serverless_spec,
    )
    print('Index created! 😊')


Creating index langchain
Index created! 😊


# Splitting and Embedding Text Using LangChain (Similarity Search)

This notebook uses the latest versions of the libraries OpenAI, LangChain, and Pinecone.

In [38]:
!pip install -q -r ./requirements.txt

In [40]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into a single string.
with open('churchill_speech.txt') as speech_file:
    churchill_speech = speech_file.read()

# Sizes are measured with `len`, i.e. in characters:
# chunk_size=100 with chunk_overlap=20.
splitter_options = dict(chunk_size=100, chunk_overlap=20, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [42]:
# Split the speech (passed as a one-element list) into chunk documents.
chunks = text_splitter.create_documents([churchill_speech])
# Report how many chunks the splitter produced; individual chunks can be
# inspected with chunks[i] or chunks[i].page_content.
print(f'Now you have {len(chunks)}')

Now you have 300


In [44]:
# Print the third chunk (index 2) to see what a single chunk looks like.
print(chunks[2])


page_content='From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the'


#### Embedding Cost

In [47]:
def print_embedding_cost(texts, model='text-embedding-3-small', usd_per_1k_tokens=0.00002):
    """Print the token count and the estimated embedding cost of `texts`.

    Args:
        texts: iterable of objects exposing a `page_content` string
            (e.g. the chunks produced by the text splitter).
        model: model name handed to `tiktoken.encoding_for_model` to pick
            the tokenizer.
        usd_per_1k_tokens: price in USD per 1,000 tokens for `model`;
            check prices here: https://openai.com/pricing

    Returns:
        None. The totals are printed, not returned.
    """
    # Local import: tiktoken is only needed by this helper.
    import tiktoken
    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no temporary list of per-document counts.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')
    
# Print the token total and estimated embedding cost for all chunks.
print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.000096


### Creating embeddings

In [56]:
from langchain_openai import OpenAIEmbeddings

# Embedding client. dimensions=1536 is the same value the notebook passes as
# `dimension` when it creates its Pinecone indexes.
embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536,  # 512 works as well
    openai_api_key=openai_api_key,
)

In [58]:
# Embed the text of the first chunk and display the resulting vector.
# NOTE(review): the bare `vector` below prints every component of the
# embedding; consider displaying `len(vector)` and `vector[:5]` instead.
vector = embeddings.embed_query(chunks[0].page_content)
vector

[0.021066106855869293,
 0.04208586364984512,
 0.07930497080087662,
 0.019165754318237305,
 0.00021817123342771083,
 -0.030753273516893387,
 -0.003893406130373478,
 -0.0015005836030468345,
 -0.009774071164429188,
 0.018030177801847458,
 0.03260727599263191,
 -0.0025912586133927107,
 0.016987301409244537,
 -0.0353882797062397,
 -0.004588657058775425,
 0.01667443849146366,
 -0.013603745959699154,
 0.0039165811613202095,
 0.009820421226322651,
 0.020903881639242172,
 -0.026095090433955193,
 -0.004550997633486986,
 -0.042734768241643906,
 -0.01885289140045643,
 -0.034136828035116196,
 0.007728874683380127,
 -0.0034530803095549345,
 -0.04711484909057617,
 0.023012811318039894,
 -0.012931670062243938,
 0.030544698238372803,
 -0.01813446544110775,
 -0.0016251493943855166,
 0.03682513162493706,
 0.027902742847800255,
 -0.052931781858205795,
 0.04444972053170204,
 -0.013244532980024815,
 -0.04261889308691025,
 0.002014779718592763,
 -0.006645441520959139,
 -0.048621226102113724,
 0.0299653224647

### Inserting the Embeddings into a Pinecone Index

In [None]:
# Import the libraries needed for this section and (re)initialize the Pinecone client.
import os
import pinecone

# NOTE(review): this rebinds the name `Pinecone`, which an earlier cell
# imported from the `pinecone` package as the client class. No later cell in
# this notebook appears to use the LangChain `Pinecone` class (the vector
# store cells use `PineconeVectorStore`) — confirm whether this import is
# still needed.
from langchain_community.vectorstores import Pinecone

# No api_key is passed here — presumably the client falls back to the
# PINECONE_API_KEY environment variable; TODO confirm.
pc = pinecone.Pinecone()

In [61]:
# WARNING: destructive — deletes every index returned by pc.list_indexes(),
# not only the ones created in this notebook.
indexes = pc.list_indexes().names()
for index_to_delete in indexes:
    # The progress message is printed once per index, without a newline,
    # and completed with 'Done' after the deletion call returns.
    print('Deleting all indexes ... ', end='')
    pc.delete_index(index_to_delete)
    print('Done')

Deleting all indexes ... Done
Deleting all indexes ... Done


In [101]:
# Create the index for the speech, unless an index with this name already exists.
from pinecone import ServerlessSpec

index_name = 'churchill-speech'
index_already_present = index_name in pc.list_indexes().names()
if index_already_present:
    print(f'Index {index_name} already exists!')
else:
    print(f'Creating index {index_name}')
    pc.create_index(
        name=index_name,
        dimension=1536,  # same value passed as `dimensions` to OpenAIEmbeddings
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print('Index created! 😊')

Index churchill-speech already exists!


In [99]:
# Imported here because this cell comes before the one that imports
# PineconeVectorStore; without it the cell raises NameError when the notebook
# is run top to bottom on a fresh kernel.
from langchain_pinecone import PineconeVectorStore

# Vector store bound to the index named `index_name`, using `embeddings` as
# its embedding object.
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)


In [103]:
from langchain_pinecone import PineconeVectorStore

# Process the chunk documents, generate their embeddings with the
# `OpenAIEmbeddings` instance, insert them into the index named `index_name`,
# and keep the vector store object that is returned.
# NOTE(review): re-running this cell presumably inserts the same chunks
# again — confirm before re-executing.
vectorstore_from_docs = PineconeVectorStore.from_documents(
    chunks, embedding=embeddings, index_name=index_name
)

In [111]:
# Alias: the search and retriever cells refer to the store as `vector_store`.
vector_store = vectorstore_from_docs

In [115]:
# Run a similarity search for the question against the vector store and
# print the list of matching documents.
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(page_content='streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a'), Document(page_content='number of the enemy, and fought fiercely on some of the old grounds that so many of us knew so')]


### Answering in Natural Language using an LLM

In [118]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model used to phrase the answer (model 'gpt-3.5-turbo', temperature 0.2).
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.2)

# Retriever over the vector store: similarity search, top 3 results (k=3).
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# RetrievalQA chain built from the LLM, chain type 'stuff', and the retriever.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)


In [120]:
# Ask the question through the RetrievalQA chain and print the returned
# value (the recorded output shows a dict with 'query' and 'result' keys).
query = 'Answer only from the provided input. Where should we fight?'
answer = chain.invoke(query)
print(answer)

{'query': 'Answer only from the provided input. Where should we fight?', 'result': 'We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and streets, we shall fight in the hills.'}


In [None]:
# Ask a second question through the same chain and print what it returns.
query = 'Who was the king of Belgium at that time?'
answer = chain.invoke(query)
print(answer)