In [None]:
! pip install --upgrade pymilvus sentence-transformers huggingface-hub langchain_community langchain-text-splitters pypdf tqdm

Install the `milvus_lite` extra so that pymilvus can store the vectors in a local database file (Milvus Lite).

In [None]:
!pip install pymilvus[milvus_lite]

In [2]:
import os
from getpass import getpass

# Never hardcode a real token in the notebook: reuse HF_TOKEN when the environment
# already provides it, otherwise ask for it interactively without echoing the input.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token (hf_...): ")

Prepare the data

In [3]:
%%bash

# Fetch the AI Act PDF only when no local copy is present
[ -f "The-AI-Act.pdf" ] || wget -q https://artificialintelligenceact.eu/wp-content/uploads/2021/08/The-AI-Act.pdf

We use the PyPDFLoader from LangChain to extract the text from the PDF, and then split the text into smaller chunks. We set the chunk size to 1000 and the overlap to 200, which means each chunk contains at most about 1000 characters and consecutive chunks share up to 200 characters.

In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Point a loader at the downloaded PDF and read it
loader = PyPDFLoader("The-AI-Act.pdf")
docs = loader.load()

# Show how many page documents were loaded
page_count = len(docs)
print(page_count)

108


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter for the PDF knowledge source: chunk size and overlap are both in characters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Break the loaded pages into chunks
chunks = text_splitter.split_documents(docs)

# Keep only the raw text of every chunk
text_lines = [doc_chunk.page_content for doc_chunk in chunks]

# Print the number of chunks, then the number of extracted texts (one per line)
print(len(chunks), len(text_lines), sep="\n")

424
424


Let's view the content of the second chunk (index 1)

In [13]:
# Index 1 selects the second chunk (list indices start at 0)
chunks[1].page_content

'EN 1  EN \nEXPLANATORY MEMORANDUM \n1. CONTEXT OF THE PROPOSAL \n1.1. Reasons for and objectives of the proposal \nThis explanatory memorandum accompanies the proposal for a Regulation laying down \nharmonised rules on artificial intelligence (Artificial Intelligence Act). Artificial Intelligence \n(AI) is a fast evolving family of technologies that can bring a wide array of economic and \nsocietal benefits across the entire s pectrum of industries and social activities. By improving \nprediction, optimising operations and resource allocation, and personalising service delivery, \nthe use of artificial intelligence can support socially and environmentally beneficial outcomes \nand pro vide key competitive advantages to companies and the European economy. Such \naction is especially needed in high-impact sectors, including climate change, environment and \nhealth, the public sector, finance, mobility, home affairs and agriculture. However, t he same'

#### Prepare Embedding Model
We use the BGE embedding model (`BAAI/bge-small-en-v1.5`) as an example.

In [6]:
from sentence_transformers import SentenceTransformer

# Hugging Face model id of the BGE embedding model used for all embeddings
embedding_model_name = "BAAI/bge-small-en-v1.5"
embedding_model = SentenceTransformer(embedding_model_name)
def embed_text(text):
    """Embed a single string with `embedding_model`.

    The text is encoded as a batch of one with `normalize_embeddings=True`,
    and the single resulting vector is returned as a plain Python list.
    """
    vectors = embedding_model.encode([text], normalize_embeddings=True)
    return vectors[0].tolist()

# Embed the first chunk as a sanity check, then show the vector length and its first five values
text_embedding = embed_text(text_lines[0])
print(len(text_embedding), text_embedding[:5], sep="\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

384
[-0.10604584217071533, -0.05663284659385681, 0.006358962506055832, -0.031974997371435165, -0.025199413299560547]


Load data into Milvus vector database

[Milvus](https://milvus.io/) is a popular open-source vector database that powers AI applications with highly performant and scalable vector similarity search.

In [16]:
from pymilvus import MilvusClient

# A local file URI makes pymilvus use Milvus Lite automatically
milvus_client = MilvusClient(uri="./hf_milvus_demo.db")

collection_name = "rag_collection"

# Start from a clean slate: drop the collection if an earlier run left one behind
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

# The vector dimension follows the size of the sample embedding computed earlier
embedding_dim = len(text_embedding)
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # inner product
    consistency_level="Strong",  # strong consistency level
)

from tqdm import tqdm

# Build one record per chunk: the list index serves as the id, stored together
# with the chunk's embedding and its raw text
data = [
    {"id": idx, "vector": embed_text(chunk_text), "text": chunk_text}
    for idx, chunk_text in enumerate(tqdm(text_lines, desc="Creating embeddings"))
]

# Insert all records at once; the cell displays how many rows were inserted
insert_res = milvus_client.insert(collection_name=collection_name, data=data)
insert_res["insert_count"]

Creating embeddings: 100%|██████████| 424/424 [00:04<00:00, 90.47it/s]


424

#### Build RAG
Retrieve data for a query

In [18]:
question = "What is the legal basis for the proposal?"

# Embed the question once and reuse the vector for the search
question_embedding = embed_text(question)

# `data` carries the query vector(s). The former `query_records=` argument is not a
# MilvusClient.search parameter; it only duplicated `data` and re-embedded the question.
search_result = milvus_client.search(
    collection_name=collection_name,
    data=[question_embedding],
    search_params={"metric_type": "IP", "params": {"nprobe": 10}},  # inner product, as used when creating the collection
    output_fields=["text"],  # return the stored text of each hit
    limit=3,  # return the top 3 results
)

import json

# One query was sent, so its hits are the first (and only) entry of the result
top_hits = search_result[0]

# Pair every retrieved passage with its score and pretty-print the pairs
retrieved_text_with_distances = [
    (hit["entity"]["text"], hit["distance"]) for hit in top_hits
]
print(json.dumps(retrieved_text_with_distances, indent=2))


[
  [
    "EN 6  EN \n2. LEGAL BASIS, SUBSIDIARITY AND PROPORTIONALITY \n2.1. Legal basis \nThe legal basis for the proposal is in the first place Article 114 of the Treaty on the \nFunctioning of the European Union (TFEU), which provides for the adoption of measures to \nensure the establishment and functioning of the internal market.  \nThis proposal constitutes a core part of the EU digital single market strategy. The primary \nobjective of this proposal is to ensure the proper functioning of the internal market by setting \nharmonised rules in particular on the development, placing on the Union market and the use \nof products and services making use of AI technologies or provided as stand -alone AI \nsystems. Some Member States are already considering national rules to ensure that AI is safe \nand is developed and used in compliance with fundamental rights obligations. This will likely \nlead to two main problems: i) a fragmentation of the internal market on essential elements",
 

Use an LLM to generate the RAG response

In [51]:
# Join the retrieved passages (without their scores) into one newline-separated string
context = "\n".join(passage for passage, _score in retrieved_text_with_distances)

# Prompt template for the language model; {context} and {question} are filled in later
Prompt = """
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

from huggingface_hub import InferenceClient
import os

# Fill the template with the retrieved context and the user's question
prompt = Prompt.format(context=context, question=question)

# The client authenticates with the HF_TOKEN environment variable
client = InferenceClient(
    provider="auto",  # automatically selects the provider
    api_key=os.environ["HF_TOKEN"],
)

# Send the prompt as a single user message and print the model's reply
messages = [{"role": "user", "content": prompt}]
answer = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3-0324",
    messages=messages,
)

print(answer.choices[0].message.content)


The legal basis for the proposal is **Article 114 of the Treaty on the Functioning of the European Union (TFEU)**, which provides for the adoption of measures to ensure the establishment and functioning of the internal market. This aligns with the proposal's objective of setting harmonized rules for AI systems to prevent market fragmentation and ensure compliance with fundamental rights. 

The proposal is also framed as a core part of the EU's digital single market strategy, emphasizing a balanced, risk-based regulatory approach tailored to address AI-related risks without unduly hindering innovation or trade.
