## Import and install libraries

In [None]:
!pip install langchain chromadb sentence_transformers -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m96.5 MB/s[0m eta [36m0:00:

In [None]:
!pip install langchain-community -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0m

In [None]:
from langchain.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WebBaseLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.llms.together import Together
import pandas as pd
import matplotlib.pyplot as plt
import os
from pprint import pprint
from google.colab import drive
import pandas as pd



In [None]:
# Mount Google Drive at /content/drive; the Chroma DB is persisted under
# /content/drive/MyDrive further down in this notebook.
drive.mount('/content/drive')

Mounted at /content/drive


# Source of Data:
**Human Rights Library - University of MINNESOTA**

[https://hrlibrary.umn.edu/index.html](https://hrlibrary.umn.edu/index.html)

In [None]:
# Read the list of pages to ingest; the CSV is expected to have a "URL" column.
data_df = pd.read_csv('human_rights_links-2.csv')

# Clean the URL column before handing it to the loader:
#   - missing cells would otherwise reach the loader as NaN floats,
#   - blank / whitespace-only cells are not loadable links,
#   - duplicate links would be fetched and embedded more than once.
web_paths = (
    data_df["URL"]
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda urls: urls != ""]
    .drop_duplicates()
    .to_list()
)

🔹 Model Overview
Full name: BAAI/bge-base-en-v1.5

Publisher: Beijing Academy of Artificial Intelligence (BAAI)

Type: Sentence embedding model (for semantic search, retrieval, clustering, reranking)

Language: English

Architecture: Transformer-based (similar to BERT but optimized for embeddings)

Dimension: 768 (each sentence/vector is 768-dimensional)

Size: ~109M parameters (Base model size, smaller & faster than Large versions)

License: MIT (open & free to use)

## Define the ingestion method

In [None]:
def ingestion(
    web_paths: list[str],
    chunk_size: int = 700,
    chunk_overlap: int = 100,
    model_name: str = "BAAI/bge-base-en-v1.5",
    persist_directory: str = "/content/drive/MyDrive/chroma_db"
) -> "Chroma | None":
    """
    Load web pages, split them into chunks, embed the chunks and persist a Chroma DB.

    Args:
        web_paths (list[str]): Non-empty list of URLs (strings) to load.
        chunk_size (int, optional): Chunk size passed to the text splitter. Defaults to 700.
        chunk_overlap (int, optional): Overlap between consecutive chunks. Defaults to 100.
        model_name (str, optional): Hugging Face embedding model. Defaults to 'BAAI/bge-base-en-v1.5'.
        persist_directory (str, optional): Directory where the Chroma DB is stored.

    Returns:
        Chroma | None: The populated Chroma store, or None when loading, splitting
        or storing failed (the error is printed instead of being raised).

    Raises:
        ValueError: If `web_paths` is not a non-empty list of strings.
    """
    # Validate up front so that a bad argument is reported as such instead of
    # surfacing later as an opaque loader error.
    if (
        not isinstance(web_paths, list)
        or not web_paths
        or not all(isinstance(path, str) for path in web_paths)
    ):
        raise ValueError("the `web_paths` must be a list of strings (links) and not empty")

    # Load documents
    try:
        loader = WebBaseLoader(web_paths)
        loaded_docs: list[Document] = loader.load()
    except Exception as e:
        print(f"There is an error in loading from web_paths: {e}")
        return None

    # Split documents into chunks.
    # RecursiveCharacterTextSplitter is a character-based (not embedding-based)
    # splitter: it first tries large separators (like paragraphs), then smaller
    # ones, and finally single characters, until each chunk fits `chunk_size`.
    # This reduces the chance of cutting text in the middle of a sentence or word.
    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        all_splits: list[Document] = text_splitter.split_documents(loaded_docs)
    except Exception as e:
        print(f"There is an error in splitting documents: {e}")
        return None

    # Embeddings + Store
    try:
        hfe = HuggingFaceEmbeddings(model_name=model_name)
        db: Chroma = Chroma.from_documents(
            all_splits, hfe, persist_directory=persist_directory
        )
        # NOTE(review): recent Chroma versions appear to persist automatically when
        # `persist_directory` is set -- confirm whether this explicit call is still
        # required for the installed version.
        db.persist()  # ✅ Save to disk
        print(f"✅ Chroma DB persisted at {persist_directory}")
    except Exception as e:
        print(f"There is an error in storing: {e}")
        return None

    return db


## Ingest the data

In [None]:
# Build and persist the vector store from every URL in `web_paths`, using the
# default chunking, model and persist directory of `ingestion`.
# `ingestion` returns None (after printing the error) if loading, splitting or
# storing fails, so `chromadb` may be None here.
chromadb = ingestion(web_paths)

  hfe = HuggingFaceEmbeddings(model_name=model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Re-open the Chroma store from Google Drive without re-ingesting.
# The directory and the embedding model name are the same values that
# `ingestion` uses as defaults, so queries are embedded with the model
# that produced the stored vectors.
persist_directory = "/content/drive/MyDrive/chroma_db"
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

db = Chroma(embedding_function=embedding, persist_directory=persist_directory)

  embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  db = Chroma(


## Check ingested objects

In [None]:
# Embed the question explicitly and search the store with that vector.
# `similarity_search` expects the raw query text, so a pre-computed embedding
# has to go through `similarity_search_by_vector` instead.
query = "what is Definition of the term refugee"
embedded_query = embedding.embed_query(query)
docs = db.similarity_search_by_vector(embedded_query, k=3)
print(docs[0].page_content)

GENERAL PROVISIONS
Article 1. - Definition of the term "refugee"
A. For the purposes of the present Convention, the term "refugee,,
shall apply to any person who:
(1) Has been considered a refugee under the Arrangements of
12 May 1926 and 30 June 1928 or under the Conventions of 28 October
1933 and 10 February 1938, the Protocol of 14 September 1939 or
the Constitution of the International Refugee Organization;
Decisions of non-eligibility taken by the International Refugee
Organization during the period of its activities shall not prevent
the status of refugee being accorded to persons who fulfil the
conditions of paragraph 2 of this section;


## Create a Retriever with custom Parameters


In [None]:
# Retriever over the persisted store using LangChain's
# "similarity_score_threshold" search type: at most k=4 chunks are requested and
# a score threshold of 0.60 is applied to them.
retriever = db.as_retriever(
    search_kwargs=dict(k=4, score_threshold=0.60),
    search_type="similarity_score_threshold",
)


## Test the data and the retriever

In [None]:
# Negative check: an off-topic question should not pass the retriever's score
# threshold, so an empty list is expected.
# `invoke` replaces the deprecated `get_relevant_documents`.
query = "How is the weather today"
docs = retriever.invoke(query)
print(docs)



[]


In [None]:
# Positive check: an on-topic question should return matching chunks from the
# ingested human-rights documents.
# `invoke` replaces the deprecated `get_relevant_documents`.
query = "what is Definition of the term refugee"
docs = retriever.invoke(query)
print(docs)

[Document(metadata={'source': 'http://hrlibrary.umn.edu/instree/v1crs.htm', 'title': 'Convention relating to the Status of Refugees, 189 U.N.T.S. 150, entered into force April 22, 1954.', 'language': 'No language found.'}, page_content='GENERAL PROVISIONS\nArticle 1. - Definition of the term "refugee"\nA. For the purposes of the present Convention, the term "refugee,,\nshall apply to any person who:\n(1) Has been considered a refugee under the Arrangements of\n12 May 1926 and 30 June 1928 or under the Conventions of 28 October\n1933 and 10 February 1938, the Protocol of 14 September 1939 or\nthe Constitution of the International Refugee Organization;\nDecisions of non-eligibility taken by the International Refugee\nOrganization during the period of its activities shall not prevent\nthe status of refugee being accorded to persons who fulfil the\nconditions of paragraph 2 of this section;'), Document(metadata={'language': 'No language found.', 'title': 'Protocol Additional to the Geneva 