# LangChain

In [None]:
# Load variables from a local .env file into the process environment.
import os
from dotenv import load_dotenv
load_dotenv()
# Example of reading a loaded variable (left disabled):
# os.environ['HF_TOKEN']

#### Document Loaders

In [None]:
from langchain_community.document_loaders import TextLoader

# Build a loader that points at the plain-text speech file.
loader = TextLoader("Data/speech.txt")

# Read the file; the result is indexable, so it holds one or more loaded documents.
speech_doc = loader.load()

# Display the first loaded document.
speech_doc[0]

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# NOTE: despite its name, `pdf_loader` ends up holding the result of .load()
# (the loaded documents), not the loader object. The name is kept because the
# text-splitter cell passes `pdf_loader` to split_documents().
pdf_loader = PyPDFLoader("Data/attention.pdf").load()
pdf_loader

In [None]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Only parse the elements carrying the post's title, header and content classes.
post_sections = bs4.SoupStrainer(
    class_=("post-title", "post-content", "post-header")
)

wb_loader = WebBaseLoader(
    web_path="https://lilianweng.github.io/posts/2023-06-23-agent/",
    bs_kwargs={"parse_only": post_sections},
)
wb_loader.load()

In [None]:
from langchain_community.document_loaders import ArxivLoader

# Query arXiv by paper id, capped at two documents.
arxiv_source = ArxivLoader(query="2304.10557", load_max_docs=2)

# `ax_loader` holds the loaded documents (the result of .load()), not the loader.
ax_loader = arxiv_source.load()
ax_loader

In [None]:
from langchain_community.document_loaders import WikipediaLoader

# Search Wikipedia for the club, capped at two documents.
wikipedia_source = WikipediaLoader(query="Chelsea FC", load_max_docs=2)

# `wp_loader` holds the loaded documents (the result of .load()), not the loader.
wp_loader = wikipedia_source.load()
wp_loader

# Text Splitters

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive character-based splitter: chunk size 500 with an overlap of 50.
char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# `pdf_loader` holds the documents loaded from the PDF in the earlier cell.
final_chunked_documents = char_splitter.split_documents(pdf_loader)
final_chunked_documents

In [None]:
# Inspect the second chunk produced from the PDF.
final_chunked_documents[1]

In [None]:
from langchain_text_splitters import CharacterTextSplitter

# Read the speech as ONE string. The previous readlines() call produced a list
# of single lines; none of them can contain the '\n\n' separator, so every line
# became its own document and chunk_size/chunk_overlap were never applied.
with open('Data/speech.txt') as file:
    speech_txt = file.read()

# CharacterTextSplitter splits on '\n\n' by default.
char_text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)

# create_documents() expects a list of texts, so wrap the single string.
speech_doc = char_text_splitter.create_documents([speech_txt])

# Show the chunks produced by THIS splitter. The cell used to pipe them through
# `char_splitter`, the recursive splitter defined in an earlier cell, which hid
# the CharacterTextSplitter result.
speech_doc

In [None]:
from langchain_text_splitters import HTMLHeaderTextSplitter

# Pair each header tag (h1..h3) with its label (H1..H3).
headers_to_split_on = [(f"h{level}", f"H{level}") for level in (1, 2, 3)]

html_head_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# NOTE(review): the "/random" URL presumably serves a different page on each
# request, so this output is unlikely to be reproducible — confirm.
html_splits = html_head_splitter.split_text_from_url("https://longform.org/random")
html_splits

In [None]:
import json
import requests

from langchain_text_splitters import RecursiveJsonSplitter

# Download the OpenAPI spec. The timeout stops a stalled connection from
# hanging the kernel, and raise_for_status() surfaces HTTP errors here instead
# of failing later while decoding an error page as JSON.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
json_data = response.json()

# Split the nested JSON with a maximum chunk size of 300.
json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
splitted_json = json_splitter.split_json(json_data)
splitted_json

In [None]:
# Two more outputs from the same splitter and data:
# create_documents() takes a list of JSON objects, split_text() takes a single one.
json_doc = json_splitter.create_documents([json_data])
json_text = json_splitter.split_text(json_data)
# Only json_text is displayed; json_doc is not shown in this cell.
json_text

## Embedding Techniques

In [None]:
# Same setup as the first cell of the notebook: load variables from .env into
# the process environment before any API keys are read.
import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Re-export the OpenAI API key into os.environ.
# os.environ only accepts strings, so assigning the None that os.getenv()
# returns for a missing key raises TypeError. Warn instead and leave the
# environment untouched.
import warnings

openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key is None:
    warnings.warn("OPENAI_API_KEY is not set; add it to your .env file before using OpenAI models.")
else:
    os.environ['OPENAI_API_KEY'] = openai_api_key

In [None]:
from langchain_openai import OpenAIEmbeddings

# The constructor call was commented out, so the bare `embedding_model`
# expression below raised NameError on a fresh kernel (Restart & Run All).
# Requires OPENAI_API_KEY to be available in the environment.
embedding_model = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1056)
embedding_model

In [None]:
# Embed a single query string and show the length of the resulting vector.
vector = embedding_model.embed_query('Hi meet cool AI Engineer Rahul')
len(vector)

In [None]:
# Display the full embedding computed in the previous cell.
vector

In [None]:
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load a single Wikipedia document, with its content capped at 150 characters.
wp_chels = WikipediaLoader(
    query='Chelsea FC',
    load_max_docs=1,
    doc_content_chars_max=150,
).load()

# Split it with chunk size 100 and overlap 10.
rct_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
splitted_chels = rct_splitter.split_documents(wp_chels)
splitted_chels

In [None]:
from langchain_openai import OpenAIEmbeddings

# This definition was commented out, yet the Vector Stores cell passes
# `embedding_model_200` to Chroma.from_documents(), which raised NameError on a
# fresh kernel. Requires OPENAI_API_KEY to be available in the environment.
embedding_model_200 = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=200)


## Vector Stores

To build a vector store, we pass it both the document chunks and the embedding model used to embed them.

In [None]:
from langchain_community.vectorstores import Chroma

# Build a Chroma store from the Chelsea chunks using `embedding_model_200`.
db = Chroma.from_documents(
    documents=splitted_chels,
    embedding=embedding_model_200,
)
db

In [None]:
# Show the chunks that were stored in `db`.
splitted_chels

## Retrieval

In [None]:
# Query the Chroma store built above for chunks similar to this text.
db.similarity_search('FIFA Club World Cup')

# Ollama

Start the model from a terminal first: `ollama run deepseek-r1`

In [None]:
from langchain_community.embeddings import OllamaEmbeddings

# Name the Ollama model explicitly; per the original note, llama2 is used when
# no model is given.
embeddings = OllamaEmbeddings(model='deepseek-r1')
embeddings

In [None]:
# Embed two short sentences in one call; `e1` is indexed per sentence below.
sample_sentences = ['Rahul is awesome', 'Rahul likes football']
e1 = embeddings.embed_documents(sample_sentences)

e1

In [None]:
# Length of the first sentence's embedding vector.
len(e1[0])

In [None]:
# Embed a single query string with the same Ollama model.
embeddings.embed_query('Rahul loves Chelsea')

A good embedding model is `mxbai-embed-large`; pull it with `ollama pull mxbai-embed-large`.<br>
Link: <a href="https://ollama.com/blog/embedding-models">Ollama Embedding Models</a>

In [None]:
from langchain_community.embeddings import OllamaEmbeddings
# Use the mxbai-embed-large model through Ollama and embed a one-word query as a check.
ollama_large_embedding_model = OllamaEmbeddings(model="mxbai-embed-large")
ollama_large_embedding_model.embed_query("Wow")

In [None]:
# Embed a sentence and show the length of the vector this model returns.
vec = ollama_large_embedding_model.embed_query("Rahul Loves Chelsea")
len(vec)

### Hugging Face Embeddings

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# Create a Hugging Face embedding model by name.
hf_embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
hf_embeddings

In [None]:
# Length of the vector the Hugging Face model returns for one query.
len(hf_embeddings.embed_query("Rahul loves Chelsea FC"))

In [None]:
from langchain_community.document_loaders import WikipediaLoader

# Load up to two Wikipedia documents for the query.
chelsea_doc = WikipediaLoader(query="Chelsea FC", load_max_docs=2).load()

# Chunk size 200 with an overlap of 20.
# RecursiveCharacterTextSplitter is imported in an earlier cell.
rct_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
)
chelsea_splitted = rct_splitter.split_documents(chelsea_doc)
chelsea_splitted

In [None]:
# Text of the first chunk.
chelsea_splitted[0].page_content

In [None]:
# Embed every chunk of the document: collect the chunk texts first, then embed
# them in one call.
chunk_texts = [chunk.page_content for chunk in chelsea_splitted]
hf_embeddings.embed_documents(chunk_texts)

## Embedding and Storing in a Vector Database

In [None]:
from langchain_community.vectorstores import Chroma

# Embed all chunks with the Hugging Face model and store them in a named
# Chroma collection.
chelsea_db = Chroma.from_documents(
    documents=chelsea_splitted,
    embedding=hf_embeddings,
    collection_name='ChelseaFC',
)

# Retrieve the single closest chunk for the question.
chelsea_db.similarity_search('who is manager', k=1)

In [None]:
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load up to ten Wikipedia documents for Chelsea FC.
chelsea_doc = WikipediaLoader(query="Chelsea FC", load_max_docs=10).load()

# Split into chunks of size 200 with an overlap of 20. chunk_size is the chunk
# length, not an embedding dimension.
rct_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
)
chelsea_splitted = rct_splitter.split_documents(chelsea_doc)

# Embed the chunks with mxbai-embed-large (via Ollama) and index them in FAISS.
ollama_large_embedding_model = OllamaEmbeddings(model="mxbai-embed-large")
chelsea_faiss_db = FAISS.from_documents(
    chelsea_splitted,
    ollama_large_embedding_model,
)

In [14]:
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS, Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load -> split -> embed -> index, all in one cell.
tesla_docs = WikipediaLoader(query="Tesla", load_max_docs=2).load()

rct_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
)
tesla_splitted = rct_splitter.split_documents(tesla_docs)

# NOTE: this rebinds `embedding_model`, the name the OpenAI embeddings section
# also uses.
embedding_model = OllamaEmbeddings(model='mxbai-embed-large')
tesla_db = FAISS.from_documents(tesla_splitted, embedding_model)

In [13]:
# Similarity search against the FAISS index built from the Tesla documents.
tesla_db.similarity_search('when was tesla born')

[Document(metadata={'title': 'Nikola Tesla', 'source': 'https://en.wikipedia.org/wiki/Nikola_Tesla', 'summary': 'Nikola Tesla (10 July 1856 – 7 January 1943) was a Serbian-American engineer, futurist, and inventor. He is known for his contributions to the design of the modern alternating current (AC) electricity supply system.\nBorn and raised in the Austrian Empire, Tesla first studied engineering and physics in the 1870s without receiving a degree. He then gained practical experience in the early 1880s working in telephony and at Continental Edison in the new electric power industry. In 1884, he immigrated to the United States, where he became a naturalized citizen. He worked for a short time at the Edison Machine Works in New York City before he struck out on his own. With the help of partners to finance and market his ideas, Tesla set up laboratories and companies in New York to develop a range of electrical and mechanical devices. His AC induction motor and related polyphase AC pa

### Loading and Saving Vector Databases

In [16]:
# Wrap the FAISS store in a retriever and run a query through it.
tesla_retriever = tesla_db.as_retriever()
query_op = tesla_retriever.invoke("Who is Tesla")
# Display the retrieved documents.
query_op

[Document(id='b5056887-baef-4a66-8cc9-fd2813d26e0b', metadata={'title': 'Tesla, Inc.', 'summary': 'Tesla, Inc. ( TEZ-lə or   TESS-lə) is an American multinational automotive and clean energy company. Headquartered in Austin, Texas, it designs, manufactures and sells battery electric vehicles (BEVs), stationary battery energy storage devices from home to grid-scale, solar panels and solar shingles, and related products and services.\nTesla was incorporated in July 2003 by Martin Eberhard and Marc Tarpenning as Tesla Motors. Its name is a tribute to inventor and electrical engineer Nikola Tesla. In February 2004, Elon Musk led Tesla\'s first funding round and became the company\'s chairman; in 2008, he was named chief executive officer. In 2008, the company began production of its first car model, the Roadster sports car, followed by the Model S sedan in 2012, the Model X SUV in 2015, the Model 3 sedan in 2017, the Model Y crossover in 2020, the Tesla Semi truck in 2022 and the Cybertruc

In [17]:
# Same kind of search, but each hit is returned as a (Document, score) tuple.
tesla_db.similarity_search_with_score('Where was Tesla born?')

[(Document(id='1fd39eca-16bc-4a3b-b8ef-1686b0b214ee', metadata={'title': 'Nikola Tesla', 'summary': 'Nikola Tesla (10 July 1856 – 7 January 1943) was a Serbian-American engineer, futurist, and inventor. He is known for his contributions to the design of the modern alternating current (AC) electricity supply system.\nBorn and raised in the Austrian Empire, Tesla first studied engineering and physics in the 1870s without receiving a degree. He then gained practical experience in the early 1880s working in telephony and at Continental Edison in the new electric power industry. In 1884, he immigrated to the United States, where he became a naturalized citizen. He worked for a short time at the Edison Machine Works in New York City before he struck out on his own. With the help of partners to finance and market his ideas, Tesla set up laboratories and companies in New York to develop a range of electrical and mechanical devices. His AC induction motor and related polyphase AC patents, licen

In [18]:
# Show the working directory that the relative paths in this notebook resolve
# against. os.getcwd() replaces the bare `pwd`, which depends on IPython's
# automagic setting and is a NameError once the notebook is exported to a
# plain Python script.
current_dir = os.getcwd()
current_dir

'f:\\LangChain'

In [19]:
# Persist the FAISS index to a path relative to the working directory.
tesla_db.save_local("VectorStore/tesla_faiss_index")

In [21]:
# Reload the index saved above, passing the `embedding_model` used to build tesla_db.
# SECURITY: allow_dangerous_deserialization=True opts in to deserializing the
# saved file. NOTE(review): this presumably unpickles data, which can execute
# arbitrary code — only load index files you created yourself and trust.
new_db = FAISS.load_local("VectorStore/tesla_faiss_index",embedding_model, allow_dangerous_deserialization=True)

In [24]:
# from langchain_chroma import Chroma
# Chroma.from_documents(chelsea_splitted, embedding_model)