### Splitting and Embedding Text Using LangChain

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# override=True asks for values from the file to replace variables that are already set.
# Later cells read PINECONE_API_KEY and PINECONE_ENV from os.environ.
load_dotenv(find_dotenv(), override=True)

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is passed explicitly so that
# decoding does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is saved as UTF-8 — confirm.
with open('files/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()


# Splitter configuration: lengths are measured with len(), i.e. in characters,
# so chunk_size=100 and chunk_overlap=20 are character counts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [4]:
# Split the speech into chunks; create_documents is given a list holding the single text.
chunks = text_splitter.create_documents([churchill_speech])
# Spot-check two chunks: the full object for one, only the text for another.
# NOTE: indices 2 and 10 require at least 11 chunks.
print(chunks[2],"\n")
print(chunks[10].page_content,"\n")
# Report how many chunks were produced.
print(f'Now you have {len(chunks)}')

page_content='From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the' metadata={} 

penetration were realized and when a new French Generalissimo, General Weygand, assumed 

Now you have 300


#### Embedding Cost

In [5]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and the estimated cost of embedding them.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to select the tiktoken encoding.
        price_per_1k_tokens: USD price per 1,000 tokens. The default keeps the
            rate that was previously hard-coded.
            NOTE(review): confirm it against current pricing for ``model``.

    Returns:
        None. Two summary lines are written to stdout.
    """
    # Imported inside the function, as before, so tiktoken is only needed when
    # an estimate is actually requested.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no intermediate list of per-document token counts.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Estimate the token count and cost for all chunks before any embeddings are requested.
print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.001928


### Creating Embeddings

In [6]:
from langchain.embeddings import OpenAIEmbeddings
# Embeddings client constructed with the library defaults.
# NOTE(review): presumably picks up the OpenAI API key from the environment — confirm.
embeddings = OpenAIEmbeddings()

In [7]:
# Embed the text of the first chunk to inspect what an embedding looks like.
vector = embeddings.embed_query(chunks[0].page_content)
# NOTE(review): this prints the entire vector; len(vector) and vector[:5] would keep the output short.
print(vector)

[-0.04451269283890724, -0.03777839615941048, -0.0028665410354733467, -0.008035242557525635, 0.015738872811198235, 0.022549696266651154, -0.028442207723855972, -0.009757080115377903, 0.0010315083200111985, 0.007206209469586611, 0.007792910095304251, 0.03277868777513504, 0.0074102794751524925, -0.01175951398909092, 0.00634847953915596, -0.005340885370969772, 0.013251773081719875, -0.002566813724115491, 0.013545122928917408, -0.011045269668102264, -0.00816278625279665, -0.026835160329937935, 0.029615608975291252, -0.0037593457382172346, -0.014476191252470016, -0.01841728575527668, 0.010898594744503498, -0.018659619614481926, 0.0030482904985547066, -0.014310384169220924, 0.007110551930963993, -0.008596434257924557, -0.016555150970816612, 0.005127250216901302, -0.01835351437330246, -0.023876149207353592, -0.022460415959358215, -0.008723977953195572, 0.022562451660633087, -0.0127734849229455, 0.013659912161529064, 0.004706356208771467, 0.008755863644182682, 0.002954227151349187, -0.027957541

### Inserting the Embeddings into a Pinecone Index

In [8]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Configure the Pinecone client from environment variables.
# A variable that is not set is passed through as None.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV'),
)

  from tqdm.autonotebook import tqdm


In [9]:
# deleting all indexes
# WARNING: destructive — every entry returned by pinecone.list_indexes() is deleted,
# not only the index this notebook creates.
indexes = pinecone.list_indexes()
for i in indexes:
    # The same progress message is printed once per deleted index.
    print('Deleting all indexes ... ', end='')
    pinecone.delete_index(i)
    print('Done')

Deleting all indexes ... Done


In [10]:
# creating an index
index_name = 'churchill-speech'
# Create the index only when it is not already listed, so the cell can be re-run.
if index_name not in pinecone.list_indexes():
    print(f'Creating index {index_name} ...')
    # NOTE(review): dimension=1536 presumably matches the embedding length — confirm with len(vector).
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done!')

Creating index churchill-speech ...
Done!


In [11]:
# Build the vector store from the chunks, using `embeddings` and the index named by index_name.
# NOTE(review): presumably this embeds every chunk and upserts it into the index — confirm.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

### Asking Questions (Similarity Search)

In [12]:
query = 'Where should we fight?'
# Retrieve the stored chunks most similar to the query; the number of results is left at the library default.
result = vector_store.similarity_search(query)
# Prints the raw list of Document objects.
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and', metadata={}), Document(page_content='front, now on that, fighting', metadata={}), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing', metadata={}), Document(page_content='When we consider how much greater would be our advantage in defending the air above this Island', metadata={})]


In [13]:
# Show only the text of each match, followed by a 50-dash separator line.
for r in result:
    print(r.page_content, '-' * 50, sep='\n')

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
When we consider how much greater would be our advantage in defending the air above this Island
--------------------------------------------------


### Answering in Natural Language using an LLM

In [14]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used to phrase the answer.
# NOTE(review): temperature=1 presumably makes answers vary between runs; a lower value may suit factual QA — confirm.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Expose the vector store as a retriever doing similarity search with k=3.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Question-answering chain that combines the retriever with the chat model ("stuff" chain type).
chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)


In [15]:
# Ask the same question as in the similarity search, this time through the QA chain.
query = 'Where should we fight?'
answer = chain.run(query)
print(answer)

We should fight on the beaches, on the landing grounds, in the fields, in France, on the seas and oceans.


In [16]:
query = 'Who was the king of Belgium at that time?'
# Alternative question to try instead: 'What about the French Armies??'
answer = chain.run(query)
print(answer)

The king of Belgium at that time was King Leopold.
