### Data Ingestion

In [3]:
from langchain_core.documents import Document
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
) 

print ("Setup completed!")

Setup completed!


In [5]:
# Document structure in LangChain

doc = Document(
    page_content="Main page content can be stored here...",
    metadata={
        "source":"example.txt",
        "author":"xyz"
    }

)
print(" Document = ", doc)
print("\n\n\n Document Page content = ", doc.page_content)
print("\n\n\n Document metadata", doc.metadata)

 Document =  page_content='Main page content can be stored here...' metadata={'source': 'example.txt', 'author': 'xyz'}



 Document Page content =  Main page content can be stored here...



 Document metadata {'source': 'example.txt', 'author': 'xyz'}


### Text files (.txt)

In [6]:
import os
os.makedirs("data/text_files", exist_ok=True)

In [3]:
sample_text = {
    "data/text_files/example1.txt":"""
The LangChain framework consists of multiple open-source libraries. Read more in the Architecture page.

langchain-core: Base abstractions for chat models and other components.
Integration packages (e.g. langchain-openai, langchain-anthropic, etc.): Important integrations have been split into lightweight packages that are co-maintained by the LangChain team and the integration developers.
langchain: Chains, agents, and retrieval strategies that make up an application's cognitive architecture.
langchain-community: Third-party integrations that are community maintained.
langgraph: Orchestration framework for combining LangChain components into production-ready applications with persistence, streaming, and other key features. See LangGraph documentation.

"""
}

#(filepath, content), = sample_text.items()  
# sample_text.items() gives an iterable result
# By adding a trailing , we are defining python to unpack items first from the iterable result from sample_text.items()
#with open(filepath, 'w', encoding="utf-8") as f:
#    f.write(content)

#print("Printed Succesfully!")


# Better approach - using a loop even for single element
for filepath, content in sample_text.items():
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)




### Reading single text file

In [None]:
from langchain.document_loaders import TextLoader
#or
from langchain_community.document_loaders import TextLoader

loader = TextLoader("data/text_files/example1.txt", encoding="utf-8")

documents = loader.load() 
print(documents)
print(type(documents))
print(documents[0].page_content)

[Document(metadata={'source': 'data/text_files/example1.txt'}, page_content="\nThe LangChain framework consists of multiple open-source libraries. Read more in the Architecture page.\n\nlangchain-core: Base abstractions for chat models and other components.\nIntegration packages (e.g. langchain-openai, langchain-anthropic, etc.): Important integrations have been split into lightweight packages that are co-maintained by the LangChain team and the integration developers.\nlangchain: Chains, agents, and retrieval strategies that make up an application's cognitive architecture.\nlangchain-community: Third-party integrations that are community maintained.\nlanggraph: Orchestration framework for combining LangChain components into production-ready applications with persistence, streaming, and other key features. See LangGraph documentation.\n\n")]
<class 'list'>

The LangChain framework consists of multiple open-source libraries. Read more in the Architecture page.

langchain-core: Base abst

### Reading multiple text files

In [8]:
# creating one more text file

another_sample_text = {
    "data/text_files/example2.txt":"""Trace and evaluate your language model applications and intelligent agents to help you move from prototype to production.\n 🦜🕸️ LangGraph \n\n Build stateful, multi-actor applications with LLMs. Integrates smoothly with LangChain, but can be used without it. LangGraph powers production-grade agents, trusted by Linkedin, Uber, Klarna, GitLab, and many more."""
}

for filepath, content in another_sample_text.items():
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)

print("Second file created succesfully!")



Second file created succesfully!


In [16]:
from langchain_community.document_loaders import DirectoryLoader

dirLoader = DirectoryLoader(
    path="data/text_files",
    glob="**/*.txt",
    loader_cls= TextLoader,
    loader_kwargs={'encoding':'utf-8'},
    show_progress=True
)

dir_documents = dirLoader.load()

print(f"Loaded {len(dir_documents)} documents!")

for i, doc in enumerate(dir_documents):
    print(f"\n\nDocument = {i+1}")
    print(f"Source = {doc.metadata['source']}")
    print(f"Length = {len(doc.page_content)}")


100%|██████████| 2/2 [00:00<00:00, 1282.86it/s]

Loaded 2 documents!


Document = 1
Source = data\text_files\example1.txt
Length = 764


Document = 2
Source = data\text_files\example2.txt
Length = 355





In [1]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#The model we are using always splits our text into embedding vector of size "384" 

text = "hello there, how you doinn?"

#for single text, embed_query is used
embeding_vector = embeddings.embed_query(text)
print(f"Embedding vector : {embeding_vector}")
print(f"Embedding vector size :  = {len(embeding_vector)}")


# SAME SENTENCES WILL HAVE SAME EMBEDING VECTOR ALWAYS

Embedding vector : [-0.05682361125946045, 0.03533463552594185, 0.08081857860088348, 0.04205037280917168, -0.08291807770729065, -0.07396841794252396, 0.08484696596860886, -0.018906328827142715, -0.059868358075618744, -0.031491659581661224, -0.0022120289504528046, -0.012657633051276207, -0.013012447394430637, 0.0007849973044358194, 0.026180170476436615, -0.013836095109581947, 0.025441771373152733, -0.03665701299905777, -0.13914257287979126, 0.04135800525546074, -0.02109372429549694, 0.026307635009288788, 0.05402533337473869, 0.034550782293081284, -0.06981494277715683, -0.04672589525580406, 0.07478775084018707, 0.060515306890010834, 0.025080393999814987, -0.10002962499856949, 0.02656574919819832, 0.04612959548830986, -0.02085006795823574, 0.04229604825377464, -0.029349451884627342, 0.07964026927947998, -0.007708067540079355, -0.09500449150800705, 0.02303917519748211, -0.016895616427063942, 0.07246621698141098, -0.018565082922577858, -0.0019269379554316401, -0.052613407373428345, 0.0488562

In [4]:
text_document = [ 
    "Iron man is a superhero moveie, showing a billionare, philanthropist who is an engineer and creates suits that can do so many thihgs",
    "Virat kohli is an Indian cricketer. He is one of the greatest of all times as a batter in cricket.",
    "Sherlock holmes is a great thriller series",
    "Docotor strange is a movie where a doctor called stepher strange learns magic and turns into a superhero.",
    "Rohit sharma is a star indian cicketer holding many brilliant records."
]

In [12]:
import numpy as np
def cosine_similarity(vec1, vec2):
    dot_prod = np.dot(vec1, vec2)
    norm_a=np.linalg.norm(vec1)
    norm_b=np.linalg.norm(vec2)

    return dot_prod/(norm_a*norm_b)


embedded_texts = embeddings.embed_documents(text_document)
print(embedded_texts)
print(embedded_texts[0])
print(embedded_texts[1])

# Showcasing similarity between the texts

for i in range(len(embedded_texts)):
    for j in range(i+1, len(embedded_texts)):
        similarity=cosine_similarity(embedded_texts[i], embedded_texts[j])

        print(f"{text_document[i]}\n vs {text_document[j]}\n")
        print(f"similarity between them is : {similarity:.3f}\n\n")

[[-0.039031848311424255, 0.0018403552239760756, -0.10356047004461288, 0.06205875054001808, 0.020162083208560944, -0.015352127142250538, 0.06778129935264587, -0.02518213912844658, -0.0931377112865448, -0.010715652257204056, -0.04522791877388954, -0.020833928138017654, 0.05307387560606003, 0.012118593789637089, 0.009696688503026962, -0.008692027069628239, 0.058765191584825516, -0.06634217500686646, -0.03332270309329033, 0.06054060906171799, 0.08962420374155045, -0.02212936244904995, 0.008264311589300632, -0.07939302921295166, -0.11068131029605865, 0.026491321623325348, 0.05886729434132576, -0.01126445084810257, -0.03264167159795761, -0.0020641926676034927, 0.013928603380918503, 0.04904923215508461, -0.000843969639390707, 0.008721119724214077, -0.010472550988197327, 0.088685542345047, 0.0365804098546505, 0.03272195905447006, -0.04469393938779831, -0.012632341124117374, -0.0581812858581543, -0.05427450314164162, -0.003179932711645961, -0.07143504917621613, 0.027899781242012978, -0.02574765

### Cosine Similarity scores

###### Close to 1 : High similarity
###### Close to 0 : Low similarity
###### Negative : Inverse Relationship

In [13]:
science_info_text = [
    "The Earth's circumference is approximately 40,075 kilometers.",
    "Water is composed of two hydrogen atoms and one oxygen atom.",
    "The speed of light in a vacuum is about 299,792 kilometers per second.",
    "Photosynthesis is the process by which plants convert light energy into chemical energy.",
    "The human body is made up of approximately 37.2 trillion cells.",
    "DNA carries the genetic instructions for the development, functioning, growth, and reproduction of all known organisms.",
    "The universe is estimated to be around 13.8 billion years old.",
    "A black hole is a region of spacetime where gravity is so strong that nothing, no particles or even electromagnetic radiation such as light, can escape from it.",
    "The theory of relativity, developed by Albert Einstein, describes the laws of motion at high speeds.",
    "The periodic table is a tabular arrangement of the chemical elements, ordered by their atomic number."
]

In [26]:
from langchain_community.vectorstores import FAISS

# science_info_embedding = embeddings.embed_documents(science_info_text)
# science_info_embedding

# Here we can directly give the text and the embedding model, it will itself convert
faiss_db = FAISS.from_texts(science_info_text, embedding=embeddings)
faiss_db.save_local("faiss_index")
vector_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

print(f"{type(faiss_db)}\n")
print(faiss_db)


<class 'langchain_community.vectorstores.faiss.FAISS'>

<langchain_community.vectorstores.faiss.FAISS object at 0x000001C3C33C7B30>


In [27]:
query = "What is speed of light?"
ans = vector_db.similarity_search(query, k=2)


# Top 2 answers for the question
for ans_text in ans:
    print(f"{ans_text.page_content} \n")

The speed of light in a vacuum is about 299,792 kilometers per second. 

The theory of relativity, developed by Albert Einstein, describes the laws of motion at high speeds. 

