In [None]:
!pip install -U langchain-community  # Here we are installing langchain community version,as it is open source.

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

# Faiss-CPU is a tool used to quickly search for similar items in large sets of data. It helps in finding the closest matches (nearest neighbors) in high-dimensional data, like text embeddings. This is useful when you have a lot of data (like sentences or images) and need to find similar ones fast. Faiss-CPU works on regular computers without needing special hardware like GPUs, making it efficient for large-scale searching.

# The sentence-transformers library essentially acts as a wrapper around various pre-trained models (like BERT, RoBERTa, MiniLM, etc.) specifically designed for sentence-level embedding tasks.

In [None]:
!pip install sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [7]:
import os
#Importing OS library


import zipfile
#Importing zipfile library to open zip file


from langchain_community.document_loaders import DirectoryLoader, TextLoader
#These loaders help in reading and processing the text content from files and directories so that it can be fed into the Langchain system for various tasks like embedding, summarization, or retrieval.


from langchain_community.vectorstores import FAISS
#FAISS vector store: indexes the chunk embeddings so similar text can be searched quickly


from langchain_community.embeddings import HuggingFaceEmbeddings
#HuggingFaceEmbeddings loads a Hugging Face sentence-embedding model through LangChain.
#Embeddings are numerical vectors that represent text so that similar texts can be compared.



from langchain.text_splitter import RecursiveCharacterTextSplitter

RecursiveCharacterTextSplitter is used to split long text into smaller, manageable chunks for processing. Here’s why it's important:

Text Chunking: When working with large documents or texts (like books, articles, or long paragraphs), it becomes hard for models to process the entire text at once. RecursiveCharacterTextSplitter helps break the text into smaller pieces or chunks, which are easier for the model to handle.

Efficient Processing: By splitting the text, the language model can process each chunk separately without running into memory or token limits. This makes the system more efficient.

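Recursive Splitting: The "recursive" part means the splitter tries a list of separators in order (by default paragraph breaks, then line breaks, then spaces, and finally individual characters), falling back to a finer separator only when a piece is still larger than the chunk size. This keeps paragraphs, sentences, and words together where possible, although a very long unbroken passage can still be cut mid-sentence.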

In short, RecursiveCharacterTextSplitter helps divide large chunks of text into smaller parts, so the model can better understand and process them without any issues.

In [None]:
def extract_zip(zip_path, extract_to):
    """Unpack the archive at ``zip_path`` into ``extract_to`` once.

    Prints a status message for each outcome and returns ``None``:

    * the archive does not exist: reports the missing path and does nothing;
    * ``extract_to`` already exists: extraction is skipped;
    * otherwise: every member of the archive is extracted into ``extract_to``.
    """
    # A missing archive is reported first, regardless of the target folder.
    if not os.path.exists(zip_path):
        print(f"ZIP file not found:{zip_path}")
        return

    # An existing target folder is treated as "already extracted".
    if os.path.exists(extract_to):
        print("Extraction folder already exists, skipping extraction.")
        return

    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_to)
    print("Knowledge Base extracted successfully!")

The code above extracts the "Knowledge_Base.zip" file, unless the extraction folder already exists.

In [8]:
def find_md_folder(base_path):
    """Return the first directory under ``base_path`` that holds a ``.md`` file.

    Directories are visited in ``os.walk`` order, starting with ``base_path``
    itself. Returns ``None`` when no visited directory contains a file whose
    name ends with ``.md``.
    """
    return next(
        (
            folder
            for folder, _subdirs, names in os.walk(base_path)
            if any(name.endswith(".md") for name in names)
        ),
        None,
    )

In [9]:
def load_knowledge_base(
    folder_path,
    chunk_size=500,
    chunk_overlap=50,
    model_name="sentence-transformers/paraphrase-MiniLM-L6-v2",
):
    """Build a FAISS vector store from the Markdown files in ``folder_path``.

    Args:
        folder_path: Directory passed to ``DirectoryLoader``; files matching
            ``*.md`` are loaded with ``TextLoader``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to
            ``RecursiveCharacterTextSplitter``.
        model_name: Model identifier given to ``HuggingFaceEmbeddings``.

    Returns:
        The FAISS store built from the split documents, or ``None`` when no
        documents were loaded, splitting produced no chunks, or building the
        index raised an exception (the error is printed).

    Note:
        Only index creation is wrapped in ``try``/``except``; errors raised
        while loading files or constructing the embedding model propagate to
        the caller.
    """
    print(f"Loading documents from: {folder_path}")
    loader = DirectoryLoader(folder_path, glob="*.md", loader_cls=TextLoader)
    docs = loader.load()
    if not docs:
        print("No documents found in the knowledge base folder.")
        return None

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    split_docs = splitter.split_documents(docs)
    if not split_docs:
        print("Failed to split documents.")
        return None

    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    try:
        db = FAISS.from_documents(split_docs, embeddings)
    except Exception as e:
        print(f"Error creating FAISS index: {e}")
        return None

    print("FAISS index created successfully!")
    return db


Seamless LangChain Integration: HuggingFaceEmbeddings integrates smoothly with LangChain, automating the entire pipeline, making embedding, document loading, and retrieval easier without manual model handling.

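Performance: HuggingFaceEmbeddings is a thin wrapper around the Sentence-Transformers library, so embedding speed and quality are the same as when Sentence-Transformers is used directly. The benefit is convenience: the model is exposed through LangChain's standard embedding interface, so it can be passed straight to vector stores such as FAISS.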

In [10]:
# Upload your "Knowledge_Base.zip" to Colab before running this

zip_path = "Knowledge_Base.zip"  # The uploaded ZIP file
extract_to = "Knowledge_Base"    # The folder where it will be extracted

# Always define `db`, so later cells can test it even when no markdown folder
# is found (otherwise `db` would be undefined and raise NameError).
db = None

# Step 1: Extract the zip
extract_zip(zip_path, extract_to)

# Step 2: Find the folder containing .md files
md_folder = find_md_folder(extract_to)

# Step 3: Load documents and create FAISS database
if md_folder:
    print(f"Markdown files found in: {md_folder}")
    db = load_knowledge_base(md_folder)
else:
    print("No markdown (.md) files found in the extracted directory.")


Knowledge Base extracted successfully!
Markdown files found in: Knowledge_Base
Loading documents from: Knowledge_Base


  embeddings = HuggingFaceEmbeddings(model_name=model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index created successfully!


# Vector Database Structure (conceptual view)

{
   
    'embeddings': [<embedding_1>, <embedding_2>, ...],  # List of embeddings (vectors) corresponding to each document chunk
    'page_content': [<content_1>, <content_2>, ...],    # List of document chunk contents (text)
    'metadata': [<metadata_1>, <metadata_2>, ...],      # List of metadata (like file name, source, etc.)
}


In [11]:
# `db` is expected to come from the knowledge-base cell. Look it up defensively so
# this cell prints a clear message instead of raising NameError (cell never run)
# or silently doing nothing (knowledge base failed to load).
vector_db = globals().get("db")

if vector_db is None:
    print("Knowledge base is not loaded. Run the knowledge base cell first.")
else:
    # Ask the user for their query
    user_query = input("Please enter your query: ").strip()

    if not user_query:
        print("Please enter a non-empty query.")
    else:
        results = vector_db.similarity_search(user_query, k=1)  # Only 1 result
        if results:
            # Print the top result
            print("\nTop Result:")
            # page_content holds the text of the retrieved document chunk.
            print(results[0].page_content)
        else:
            print("No results found for your query.")

Please enter your query: What is diet plan?

Top Result:
# How to Track Your Meals and Calories on Fitlytic?

To track your meals:
1. Go to the "Nutrition" section in the app.
2. Select "Add Meal."
3. Enter your food items, quantity, and meal type (breakfast, lunch, dinner, snacks).
4. Fitlytic automatically calculates calories and macronutrients.
5. Save the meal entry to track daily intake.

Monitoring nutrition helps you stay aligned with your fitness goals.
