## dotenv and chatmodels setup

In [12]:
from dotenv import load_dotenv, find_dotenv
import os

load_dotenv(find_dotenv(), override=True)

True

In [13]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192", temperature=1)

response = llm.invoke("What is the capital of France?")
print(response.content)

The capital of France is Paris.


### Splitting and embedding text using LangChain

In [14]:
import builtins

with builtins.open("churchill_speech.txt", "r") as f:
    churchill_speech = f.read()

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
    )


In [16]:
chunk = text_splitter.create_documents([churchill_speech])
print(chunk[0].page_content)

print(f"Number of chunks: {len(chunk)}")

Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
Number of chunks: 300


In [21]:
from langchain_ollama import OllamaEmbeddings

#embeddings = OllamaEmbeddings(model="llama3.2:1b")

#embeddings = OllamaEmbeddings(model="granite-embedding:30m")

embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")


In [22]:
embeddings.embed_query("Hello, world!")

[0.015951632,
 -0.00066996424,
 -0.15885885,
 -0.012657387,
 -0.017753249,
 0.061320245,
 -0.005491252,
 -0.010701116,
 -0.0055996445,
 -0.040793166,
 0.013789049,
 0.07319223,
 0.019487698,
 0.051192224,
 0.027002448,
 -0.059568644,
 0.0076662255,
 -0.06245029,
 -0.029461585,
 0.025311636,
 -0.03167875,
 -0.08911716,
 0.008606663,
 0.01984472,
 0.12291859,
 0.009061198,
 -0.037374612,
 0.07187734,
 0.012613118,
 -0.0032542283,
 -0.005120477,
 0.0077211387,
 -0.0016446928,
 0.034060054,
 0.056631163,
 0.00035218656,
 0.02197616,
 0.0075445683,
 0.024604717,
 -0.026079183,
 0.012698903,
 -0.00062393147,
 0.012742986,
 0.005797232,
 0.07582022,
 -0.017264228,
 -0.019319652,
 -0.034184746,
 0.069988206,
 -0.03556641,
 -0.04616333,
 -0.0073607224,
 -0.0038881674,
 0.05516038,
 0.04708488,
 0.01671669,
 0.053503018,
 -0.035663106,
 0.018357659,
 0.05551932,
 0.041776985,
 0.054344818,
 0.028155666,
 0.03244801,
 0.0007530078,
 -0.047532044,
 -0.01414573,
 0.07351867,
 0.023626907,
 0.009264

In [23]:
vec = embeddings.embed_query("test")
print(len(vec))

768


### Inserting embeddings into a pinecone index

In [24]:
import pinecone
from langchain.vectorstores import Pinecone

pc = pinecone.Pinecone()

In [29]:
pc.list_indexes().names()

['dotproduct-index', 'churchill-speech', 'datacamp-index', 'pinecone-datacamp']

In [28]:
index_name = "churchill-speech"
if index_name not in pc.list_indexes().names():
    print(f"Creating index {index_name}...")
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
    )
print(f"Index {index_name} is ready.")

Creating index churchill-speech...
Index churchill-speech is ready.


In [None]:
index_name = "churchill-speech"
#pc.delete_index(index_name)

### Inserting data to pinecone index

In [30]:
from langchain_pinecone import PineconeVectorStore
index_name = "churchill-speech"

vectorstore = PineconeVectorStore.from_documents(chunk, embedding=embeddings, index_name=index_name)

  from .autonotebook import tqdm as notebook_tqdm


### loading data from vector store

In [None]:
vector_store = PineconeVectorStore(
    index=pc.Index(index_name),
    embedding=embeddings,
)

### Asking questions - Similarity Search

In [31]:
query = "Belgium"
docs = vectorstore.similarity_search(query)
print(docs)

[Document(id='f48be559-9646-47da-abf4-7a924b08b807', metadata={}, page_content='and Belgium is a colossal military disaster. The French Army has been weakened, the Belgian Army'), Document(id='587a14ed-aa3b-4853-bcdc-610f17ce7d06', metadata={}, page_content='Belgium to keep on holding the right hand of the Belgians and to give their own right hand to a'), Document(id='ad6b7285-5d02-4f9a-b0c8-aae6d32fbb57', metadata={}, page_content='abandonment of the whole of Belgium. Therefore, when the force and scope of the German'), Document(id='c4932263-a993-457d-bc43-e1ae378758fd', metadata={}, page_content='French Armies who had entered Belgium at the appeal of the Belgian King; but this strategic fact')]


In [32]:
for doc in docs:
    print(doc.page_content)
    print('-'*50)

and Belgium is a colossal military disaster. The French Army has been weakened, the Belgian Army
--------------------------------------------------
Belgium to keep on holding the right hand of the Belgians and to give their own right hand to a
--------------------------------------------------
abandonment of the whole of Belgium. Therefore, when the force and scope of the German
--------------------------------------------------
French Armies who had entered Belgium at the appeal of the Belgian King; but this strategic fact
--------------------------------------------------


In [33]:
from langchain.chains import RetrievalQA

retriever = vectorstore.as_retriever(seach_type="similarity", search_kwargs={"k": 3})

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever
)

In [34]:
query = "What was the main message of the speech?"
answers = chain.run(query)
print(answers)

  answers = chain.run(query)


The main message of Winston Churchill's famous "We Shall Fight on the Beaches" speech, delivered on June 4, 1940, was that Britain would continue to fight against Nazi Germany, even in the face of overwhelming odds and despite the impending invasion of Europe. Churchill declared that Britain would not surrender, and that the British people would continue to defend their country and way of life to the end.
