## dotenv and chatmodels setup

In [1]:
from dotenv import load_dotenv, find_dotenv
import os

load_dotenv(find_dotenv(), override=True)

True

In [2]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192", temperature=0.1)

response = llm.invoke("What is the capital of France?")
print(response.content)

The capital of France is Paris.


### Splitting and embedding text using LangChain

In [3]:
import builtins

with builtins.open("churchill_speech.txt", "r") as f:
    churchill_speech = f.read()

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
    )


In [5]:
chunk = text_splitter.create_documents([churchill_speech])
print(chunk[0].page_content)

print(f"Number of chunks: {len(chunk)}")

Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
Number of chunks: 300


In [6]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="llama3.2:1b")

#embeddings = OllamaEmbeddings(model="granite-embedding:30m")


In [7]:
embeddings.embed_query("Hello, world!")

[-0.005824115,
 0.025235519,
 -0.0047849505,
 0.0012894155,
 0.058595683,
 -0.019648818,
 0.033948883,
 0.012429378,
 -0.011544082,
 0.03133756,
 0.016796356,
 -0.012211987,
 -0.030778013,
 0.00093506044,
 0.028454887,
 -0.010865619,
 -0.010862088,
 0.0009561084,
 0.029545587,
 -0.024795,
 -0.031838525,
 0.029556815,
 -0.01299685,
 0.008947782,
 -0.021700045,
 -0.0011771823,
 -0.025649428,
 0.02433983,
 -0.01373819,
 0.039861005,
 0.006719213,
 -0.00022857972,
 -0.002905432,
 0.008477759,
 -0.031552266,
 0.018538808,
 0.03201294,
 0.022340693,
 0.009024955,
 -0.024851872,
 0.016145239,
 0.0036088203,
 -0.008739508,
 0.00759577,
 -0.0019686387,
 -0.013738286,
 -0.012259793,
 -0.018082019,
 0.016324837,
 0.005838634,
 -0.011596039,
 0.0021304032,
 0.033084217,
 -0.008224766,
 -0.037045058,
 0.0016400196,
 -0.0007109463,
 0.011312622,
 -0.006934533,
 0.042416826,
 -0.013909867,
 0.004783334,
 -0.015386162,
 -0.012617385,
 0.014932508,
 -0.008887835,
 -0.0026352298,
 0.028043238,
 -0.01385

In [8]:
vec = embeddings.embed_query("test")
print(len(vec))

2048


### Inserting embeddings into a pinecone index

In [9]:
import pinecone
from langchain.vectorstores import Pinecone

pc = pinecone.Pinecone()

In [10]:
pc.list_indexes().names()

['dotproduct-index', 'churchill-speech', 'datacamp-index', 'pinecone-datacamp']

In [76]:
index_name = "churchill-speech"
if index_name not in pc.list_indexes().names():
    print(f"Creating index {index_name}...")
    pc.create_index(
        name=index_name,
        dimension=2048,
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
    )
print(f"Index {index_name} is ready.")

Creating index churchill-speech...
Index churchill-speech is ready.


In [77]:
#pc.delete_index(index_name)

### Inserting data to pinecone index

In [13]:
from langchain_pinecone import PineconeVectorStore
index_name = "churchill-speech"

vectorstore = PineconeVectorStore.from_documents(chunk, embedding=embeddings, index_name=index_name)

### loading data from vector store

In [15]:
vector_store = PineconeVectorStore(
    index=pc.Index(index_name),
    embedding=embeddings,
)

### Asking questions - Similarity Search

In [37]:
query = "Belgium"
docs = vectorstore.similarity_search(query)
print(docs)

[Document(id='9f8df337-cda4-4543-88d4-4f4f23b39b54', metadata={}, page_content='and build up the British'), Document(id='f6362583-73d4-4a6d-bbff-13433992e9eb', metadata={}, page_content='by the fact that'), Document(id='6dac07cd-5f31-4b28-ba75-1c95564103ba', metadata={}, page_content='in all about four thousand'), Document(id='a0c74b99-0989-4ed8-bfe1-93332d3e8a38', metadata={}, page_content='will be read the next day')]


In [38]:
for doc in docs:
    print(doc.page_content)
    print('-'*50)

and build up the British
--------------------------------------------------
by the fact that
--------------------------------------------------
in all about four thousand
--------------------------------------------------
will be read the next day
--------------------------------------------------


In [39]:
from langchain.chains import RetrievalQA

retriever = vectorstore.as_retriever(seach_type="similarity", search_kwargs={"k": 3})

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever
)

In [40]:
#query = "What was the main message of the speech?"
answers = chain.run(query)
print(answers)

I don't know.
