In [2]:
#Data ingestion
from langchain_core.documents import Document
sample_documents = [
    Document(
        page_content="""
        Artificial Intelligence (AI) is the simulation of human intelligence in machines.
        These systems are designed to think like humans and mimic their actions.
        AI can be categorized into narrow AI and general AI.
        """,
        metadata={"source": "AI Introduction", "page": 1, "topic": "AI"}
    ),
    Document(
        page_content="""
        Machine Learning is a subset of AI that enables systems to learn from data.
        Instead of being explicitly programmed, ML algorithms find patterns in data.
        Common types include supervised, unsupervised, and reinforcement learning.
        """,
        metadata={"source": "ML Basics", "page": 1, "topic": "ML"}
    ),
    Document(
        page_content="""
        Deep Learning is a subset of machine learning based on artificial neural networks.
        It uses multiple layers to progressively extract higher-level features from raw input.
        Deep learning has revolutionized computer vision, NLP, and speech recognition.
        """,
        metadata={"source": "Deep Learning", "page": 1, "topic": "DL"}
    ),
    Document(
        page_content="""
        Natural Language Processing (NLP) is a branch of AI that helps computers understand human language.
        It combines computational linguistics with machine learning and deep learning models.
        Applications include chatbots, translation, sentiment analysis, and text summarization.
        """,
        metadata={"source": "NLP Overview", "page": 1, "topic": "NLP"}
    )
]

print(sample_documents)

[Document(metadata={'source': 'AI Introduction', 'page': 1, 'topic': 'AI'}, page_content='\n        Artificial Intelligence (AI) is the simulation of human intelligence in machines.\n        These systems are designed to think like humans and mimic their actions.\n        AI can be categorized into narrow AI and general AI.\n        '), Document(metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='\n        Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.\n        '), Document(metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='\n        Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolu

In [3]:
# Text splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter=RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=[" "]

)

chunks=splitter.split_documents(sample_documents)

print(f"Number of chunks is {len(chunks)}")
print(chunks[0].page_content[:100])
print(chunks[0].metadata)

Number of chunks is 4
Artificial Intelligence (AI) is the simulation of human intelligence in machines.
        These syst
{'source': 'AI Introduction', 'page': 1, 'topic': 'AI'}


In [6]:
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
import os
os.environ['OPENAI_API_KEY']=os.getenv("OPENAI_API_KEY")

In [8]:
from langchain_openai import OpenAIEmbeddings
embedding=OpenAIEmbeddings(
    model="text-embedding-3-small"
)
embedding

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001E9DA94D160>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001E9DA94DA90>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [9]:
text="what is ML ?"
text_embedding=embedding.embed_query(text)
len(text_embedding)

1536

## Create FAISS Vectorstore

In [10]:
from langchain_community.vectorstores import FAISS
vectorstore=FAISS.from_documents(
    documents=chunks,
    embedding=embedding
)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x1e9da94f0e0>

In [15]:
print(f"Loaded vector store contains {vectorstore.index.ntotal} vectors")

Loaded vector store contains 4 vectors


In [11]:
# Save vectorstoreto local file
vectorstore.save_local("faiss_index")

In [13]:
# Load vectorstore from local file
db=vectorstore.load_local("faiss_index",embedding,allow_dangerous_deserialization=True)
db

<langchain_community.vectorstores.faiss.FAISS at 0x1e9dab39450>

In [14]:
print(f"Loaded vector store contains {db.index.ntotal} vectors")

Loaded vector store contains 4 vectors


In [16]:
## Similarity Search 
query="What is deep learning"

results=db.similarity_search(query,k=3)
print(results)

[Document(id='498a3779-a423-4e05-b83d-e41ae079e533', metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolutionized computer vision, NLP, and speech recognition.'), Document(id='c5db4639-7490-489e-a441-364fe99bdede', metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.'), Document(id='83d31e88-d0f1-48dc-a5ac-044a19c1f153', metadata={'source': 'NLP Overview', 'page': 1, 'topic': 'NLP'}, page_content='Natural Language Processing (NLP) is a branch of AI that helps computers understand human lang

In [18]:
### Search with metadata filtering
filter_dict={"topic":"ML"}
filtered_results=db.similarity_search(
    query,
    k=3,
    filter=filter_dict
)
print(filtered_results)

[Document(id='c5db4639-7490-489e-a441-364fe99bdede', metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.')]
