### Splitting and Embedding Text Using LangChain

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Load the variables from the .env file located by find_dotenv() into the
# process environment. override=True is passed so that values from the file
# take precedence over variables that are already set. A later cell reads
# PINECONE_API_KEY and PINECONE_ENV from os.environ.
load_dotenv(find_dotenv(), override=True)

True

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is stated explicitly so the
# decoded text does not depend on the platform's default locale encoding.
with open('./churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()


# Splitter used to cut the speech into small, slightly overlapping pieces
# that are embedded individually further down.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,       # target chunk length, measured with length_function
    chunk_overlap=20,     # amount shared between neighbouring chunks
    length_function=len   # len() on a str counts characters
)

In [5]:
# create_documents is given a list of texts; the speech is passed as a
# single-element list. The resulting items expose their text as .page_content.
chunks = text_splitter.create_documents([churchill_speech])
# Uncomment to inspect an individual chunk:
# print(chunks[2])
# print(chunks[10].page_content)
# Report how many chunks the splitter produced.
print(f'Now you have {len(chunks)}')

Now you have 84


#### Embedding Cost

In [6]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for a set of documents.

    Args:
        texts: Iterable of objects exposing their text as ``page_content``.
        model: Name of the embedding model whose tokenizer is used for
            counting. Defaults to the model used in this notebook.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default is a
            hard-coded snapshot; check the provider's current pricing before
            relying on the figure.

    Returns:
        None. The totals are printed to stdout.
    """
    # Imported here so tiktoken is only required when the estimate is requested.
    import tiktoken
    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no need to build an intermediate list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Estimate the token count and cost for all chunks produced by the splitter.
print_embedding_cost(chunks)

Total Tokens: 4621
Embedding Cost in USD: 0.001848


### Creating embeddings

In [16]:
from langchain.embeddings import OpenAIEmbeddings
# chunk_size=1 is passed explicitly: per the original author's note, the plain
# `OpenAIEmbeddings()` default did not work here because only 1 chunk was
# supported at the time.
# NOTE(review): confirm whether that limit still applies to the endpoint in use.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", chunk_size=1)

In [14]:
# Embed the text of the first chunk as a quick sanity check.
vector = embeddings.embed_query(chunks[0].page_content)
# Length of the embedding vector; compare with the dimension=1536 passed when
# the Pinecone index is created further down.
len(vector)

1536

### Inserting the Embeddings into a Pinecone Index

In [11]:
import os
import pinecone
from langchain.vectorstores import Pinecone
from tqdm.autonotebook import tqdm

# Credentials come from the environment rather than being hard-coded.
# os.environ.get returns None (it does not raise) when a variable is missing.
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

In [12]:
# Delete every index returned by pinecone.list_indexes() so the demo starts
# from a clean state.
# WARNING: destructive - this removes all listed indexes, not only the one
# created by this notebook.
indexes = pinecone.list_indexes()
for i in indexes:
    # Name the index being removed, and flush so the progress text is visible
    # before the delete call returns (end='' suppresses the newline that would
    # otherwise trigger a flush on line-buffered output).
    print(f'Deleting index {i} ... ', end='', flush=True)
    pinecone.delete_index(i)
    print('Done')

In [13]:
# Create the index only if it is not already listed, so re-running the cell
# does not attempt to create it twice.
index_name = 'churchill-speech'
if index_name not in pinecone.list_indexes():
    print(f'Creating index {index_name} ...')
    # dimension=1536 equals the embedding length displayed earlier for
    # text-embedding-ada-002; metric='cosine' selects the similarity measure.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done!')

Creating index churchill-speech ...
Done!


In [17]:
# Build the vector store from the chunks, the embeddings client and the index name.
# NOTE(review): presumably this embeds every chunk and uploads the vectors to the
# index, so re-running the cell may insert the same chunks again - confirm.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

### Asking Questions (Similarity Search)

In [18]:
# Ask the vector store for the chunks most similar to the question.
query = 'Where should we fight?'
# No result count is passed; the recorded output below contains four documents.
result = vector_store.similarity_search(query)
# Raw repr of the returned Document list.
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the\nstreets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a moment', metadata={}), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing\nconfidence and growing strength in the air, we shall defend our Island, whatever the cost may be, we', metadata={}), Document(page_content='draw all the distinctions which we should like to do. If parachute landings were attempted and fierce\nfighting attendant upon them followed, these unfortunate people would be far better out of the way,', metadata={}), Document(page_content='I return to the Army. In the long series of very fierce battles, now on this front, now on that, fighting\non three fronts at once, battles fought by two or three divisions against an equal or somewhat larger', metadata={})]


In [18]:
# Show the text of each hit, followed by a 50-character dashed rule.
for r in result:
    print(r.page_content, '-' * 50, sep='\n')

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
When we consider how much greater would be our advantage in defending the air above this Island
--------------------------------------------------


### Answering in Natural Language using an LLM

In [19]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model that phrases the final answer.
# NOTE(review): temperature=1 presumably makes answers vary between runs;
# consider a lower value for factual Q&A - confirm intent.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Retriever view of the vector store: similarity search with k=3 requested per query.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Question-answering chain wiring the retriever to the chat model.
# NOTE(review): chain_type="stuff" presumably puts all retrieved chunks into a
# single prompt - see the LangChain documentation.
chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)


In [20]:
# Same question as in the similarity search above, now sent through the
# retrieval chain so the answer is phrased by the chat model.
query = 'Where should we fight?'
answer = chain.run(query)
print(answer)

Based on the context provided, it seems that the speaker is declaring their determination to fight in various locations including beaches, landing grounds, fields, France, seas and oceans. The context doesn't provide information about a specific situation, so it's unclear who the speaker is referring to or what they are fighting for.


In [21]:
# A second question for the same chain.
query = 'Who was the king of Belgium at that time?'
# Alternative question to try instead:
# query = 'What about the French Armies??'
answer = chain.run(query)
print(answer)

The king of Belgium at that time was King Leopold.
