## Splitting and Embedding Text Using LangChain

In [5]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# override=True lets values from the .env file replace variables that are
# already set in the environment.
load_dotenv(find_dotenv(), override=True)

True

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Read the whole speech into memory. The encoding is passed explicitly so the
# file is decoded the same way on every platform instead of depending on the
# locale default (assumes the file is saved as UTF-8 — adjust if it is not).
with open('Churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()


text_splitter = RecursiveCharacterTextSplitter(
    # Maximum size of a chunk, measured with length_function.
    # Deliberately small for this demo; real workloads usually use a larger value.
    chunk_size=100,
    # Maximum overlap between consecutive chunks, kept to preserve some
    # continuity of context across chunk boundaries.
    chunk_overlap=20,
    # How the length of a piece of text is measured. len counts characters;
    # a token-counting function could be supplied instead, since LLM limits
    # and pricing are expressed in tokens.
    length_function=len,
)

In [7]:

# Split the speech with the splitter configured in the previous cell.
# create_documents takes a list of texts, hence the single-element list.
chunks = text_splitter.create_documents([churchill_speech])
# Show the first chunk as a sanity check.
print(chunks[0])

page_content='Winston Churchill Speech - We Shall Fight on the Beaches\nWe Shall Fight on the Beaches\nJune 4, 1940'


In [8]:
# Show the second chunk; comparing it with the first one makes the
# overlap between neighbouring chunks visible.
print(chunks[1])
# Report how many chunks the splitter produced.
# NOTE(review): "No you have" looks like a typo for "Now you have"; left
# untouched here because it is runtime output.
print(f'No you have {len(chunks)}')

page_content='June 4, 1940\nHouse of Commons'
No you have 304


## Cost of Text Embeddings from OpenAI
### We will use OpenAI text embeddings, which are billed per token. To avoid any surprises, we will estimate the cost of the embeddings before creating them.

In [13]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and the estimated embedding cost for documents.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string, such as
            the chunks produced by the text splitter.
        model: Model name handed to ``tiktoken.encoding_for_model`` to select
            the tokenizer used for counting.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default is the
            rate this notebook was written with; check the provider's current
            pricing before relying on the estimate.

    Returns:
        None. The token total and the estimated cost are printed.
    """
    # Imported lazily so tiktoken is only needed when an estimate is requested.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: sums the per-document token counts without
    # building a throw-away list first.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Token Takens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Estimate the cost for all chunks before any embedding is actually requested.
print_embedding_cost(chunks)

Token Takens: 4840
Embedding Cost in USD: 0.001936


In [24]:
#from langchain.embeddings import OpenAIEmbeddings
#embedding = OpenAIEmbeddings()


In [22]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings

# Do not assign a literal to OPENAI_API_KEY here: load_dotenv() in the first
# cell may already have placed the real key in the environment, and a
# hard-coded value would both clobber it and invite committing a secret.
# Only when no key is available, ask for one interactively (input is hidden).
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# No key is passed explicitly; the client is expected to read OPENAI_API_KEY
# from the environment.
embedding = OpenAIEmbeddings()


In [None]:
#vector = embedding.embed_query('abc')
#vector

In [23]:
# Embed the text of the first chunk (presumably a billed call to the
# embeddings API — confirm against the provider's documentation).
vector = embedding.embed_query(chunks[0].page_content)
# NOTE(review): this prints every component of the vector; consider showing
# len(vector) and a short slice instead to keep the notebook output readable.
print(vector)

[-0.044567573656022125, -0.0378875395789288, -0.0029496059181736843, -0.007993097388439452, 0.015743980127830393, 0.022589743056483196, -0.028581378879143218, -0.009650358895679658, 0.0010493331318405562, 0.007336567129750246, 0.007789127035119314, 0.0327882728296835, 0.00741305618686828, -0.011696439194526596, 0.006374081108250191, -0.005386098268716172, 0.013168851797818924, -0.0024986397385547204, 0.013589540447814895, -0.01096341941305807, -0.008171572320709483, -0.026847628780446265, 0.029626728860761727, -0.0038658801793480554, -0.014456415497163373, -0.018523080208145856, 0.010835938116855967, -0.018612315811635728, 0.00305477831350332, -0.014341682144316964, 0.007081604071684753, -0.008560391112316214, -0.01650886976768816, 0.005150257591345508, -0.01833185733252013, -0.023851811800438823, -0.022373024294146077, -0.008745239084941521, 0.02267898052261821, -0.012671672693721062, 0.013615037265848859, 0.004605273699663786, 0.008751613056619369, 0.00296554131302959, -0.02789297876

## Inserting the Embeddings into a Pinecone Index

In [26]:
import pinecone
from tqdm.autonotebook import tqdm
from langchain_community.vectorstores import Pinecone
# Pinecone client. No credentials are passed here, so it presumably picks its
# API key up from the environment (e.g. the .env file loaded earlier) — TODO confirm.
# Note: `pinecone.Pinecone` (the client) and the LangChain `Pinecone` imported
# above (the vector store) are different classes that share a name.
pc = pinecone.Pinecone()

In [29]:
# WARNING: destructive. Deletes every index returned by list_indexes(),
# not only the one this notebook creates.
for i in pc.list_indexes().names():
    # The message is printed once per index that is being deleted.
    print('Deleting all indexes ... ', end='' )
    pc.delete_index(i)
    print('Done')

Deleting all indexes ... Done


In [33]:
# Name of the index that will hold the embeddings of the speech.
index_name='churchill-speech'
try:
    
    # Create the index only if it does not exist yet, so the cell can be re-run.
    if index_name not in pc.list_indexes().names():
        print(f'Creating index {index_name} ...')
        pc.create_index(
            name=index_name,
            # Must equal the length of the vectors that will be stored;
            # presumably 1536 matches the embedding model used above — TODO confirm.
            dimension=1536,
            metric='cosine',
            spec=pinecone.PodSpec(
                environment='gcp-starter'
             )
        )
    # Printed whether the index was just created or already existed.
    print('Done')
except pinecone.PineconeApiException as e:
    # API errors are reported but not re-raised, so execution continues
    # with the following cells even if the index could not be created.
    print(f"Error creating index: {e}")

Creating index churchill-speech ...
Done


In [None]:
# Embed every chunk and store the resulting vectors in the Pinecone index.
# Fixes two errors in the original call:
#   * the embeddings client is named `embedding` (singular); `embeddings`
#     was never defined and raised NameError;
#   * `index_name` must be passed by keyword, because from_documents accepts
#     only the documents and the embedding object positionally.
# The returned vector store is kept so it can be queried later.
vector_store = Pinecone.from_documents(chunks, embedding, index_name=index_name)

# 