In [1]:
# Mount Google Drive at /content/drive; the PDF directory used by later cells
# lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Check whether PyTorch can see a CUDA GPU and, if so, report which one.
import torch

cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    # Index 0 is the first visible CUDA device.
    print("Device:", torch.cuda.get_device_name(0))


CUDA available: True
Device: Tesla T4


In [None]:
# Log in to the Hugging Face Hub so the access token required to load the
# model later in the notebook is available.
from huggingface_hub import login
login()

In [2]:
# All required package installations for the code (so far).
# Model loading stack (transformers, accelerate, bitsandbytes, sentencepiece):
!pip install transformers accelerate bitsandbytes sentencepiece
# PDF parsing and country-name lookup (transformers is listed a second time here):
!pip install PyPDF2 pycountry transformers
# PyMuPDF provides the `fitz` module imported by the PDF-reading cells:
!pip install PyMuPDF
# Embedding model wrapper, text splitter and vector store:
!pip install sentence-transformers langchain chromadb
# Remove the bitsandbytes build installed above and reinstall it, preferring a
# prebuilt wheel and adding NVIDIA's package index as an extra source.
!pip uninstall -y bitsandbytes
!pip install bitsandbytes --prefer-binary --extra-index-url=https://pypi.nvidia.com

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2

## Section Splitting Functions

In [3]:
# Function using Regular Expressions to break apart extracted texts from PDFs

import re

# Structural heading at the start of a line, e.g. "CHAPTER 1" or "PART II";
# the whole heading line is captured in the named group `heading`.
HEADING_REGEX = re.compile(
    r"(?:^|\n)(?P<heading>(?:PART|CHAPTER|TITLE|SECTION|ANNEX)\s+[\w\dIVXLCDM\-\.]+[^\n]*)",
    re.IGNORECASE
)

# "Article <n>" (1-3 digits) with nothing but whitespace after it on its line.
ARTICLE_HEADING_REGEX = re.compile(
    r"(?:(?:^|\n)\s*)(?P<article>Article\s+(?P<number>\d{1,3}))\s*(?:\n|$)",
    flags=re.IGNORECASE
)

# Structural levels, outermost first. Setting one level resets every level
# listed after it.
_HIERARCHY_LEVELS = ("PART", "CHAPTER", "TITLE", "SECTION", "ANNEX")


def split_into_section(text):
    """Split extracted law text into a preamble and one section per article.

    Structural headings (PART/CHAPTER/TITLE/SECTION/ANNEX) do not produce
    sections themselves; they only update the hierarchy that prefixes the
    heading of every following article.

    Args:
        text: Full document text.

    Returns:
        A list of dicts with the keys ``"heading"`` (hierarchy and article
        joined by ``" > "``, or ``"PREAMBLE"``), ``"text"`` (stripped body)
        and ``"sub_heading"`` (``[article heading]``, or ``[]`` for the
        preamble). Returns an empty list when no heading is found.
    """
    # Collect both kinds of heading and order them by position in the text.
    matches = [
        {"type": "structure", "match": m, "start": m.start(), "end": m.end()}
        for m in HEADING_REGEX.finditer(text)
    ]
    matches.extend(
        {
            "type": "article",
            "match": m,
            "start": m.start(),
            "end": m.end(),
            "article_number": int(m.group("number")),
        }
        for m in ARTICLE_HEADING_REGEX.finditer(text)
    )
    matches.sort(key=lambda item: item["start"])

    sections = []

    # 1. Preamble: everything before the first heading of either kind.
    if matches and matches[0]["start"] > 0:
        preamble = text[:matches[0]["start"]].strip()
        if preamble:
            sections.append({
                "heading": "PREAMBLE",
                "text": preamble,
                "sub_heading": []
            })

    # 2. Walk the headings in document order.
    hierarchy = dict.fromkeys(_HIERARCHY_LEVELS)
    for index, current in enumerate(matches):
        if current["type"] == "structure":
            heading_text = current["match"].group("heading").strip()
            heading_upper = heading_text.upper()
            for position, level in enumerate(_HIERARCHY_LEVELS):
                if heading_upper.startswith(level):
                    hierarchy[level] = heading_text
                    # Clear lower hierarchy
                    for lower_level in _HIERARCHY_LEVELS[position + 1:]:
                        hierarchy[lower_level] = None
                    break
            continue

        # Article: its body runs until the next structural heading or the
        # article numbered exactly one higher. Other "Article N" matches in
        # between do not end the body.
        expected_next = current["article_number"] + 1
        end = len(text)
        for later in matches[index + 1:]:
            if later["type"] == "structure" or later["article_number"] == expected_next:
                end = later["start"]
                break

        heading_text = current["match"].group("article").strip()
        # Join hierarchy and article in one pass so that an article with no
        # enclosing structural heading does not get a dangling "> " prefix.
        heading_parts = [value for value in hierarchy.values() if value]
        heading_parts.append(heading_text)

        sections.append({
            "heading": " > ".join(heading_parts),
            "text": text[current["end"]:end].strip(),
            "sub_heading": [heading_text]
        })

    return sections


In [4]:
import os
import pycountry

# Directory on the mounted Drive where the English PDFs are stored
pdf_dir = "/content/drive/MyDrive/DSSI/Q A Project/SUCCESSFUL PDFs"

# Manual override mapping for edge cases or special names:
# country name -> two-letter ISO code for every country we have.
country_to_iso = {
    "Albania": "AL",
    "Argentina": "AR",
    "Armenia": "AM",
    "Australia": "AU",
    "Austria": "AT",
    "Azerbaijan": "AZ",
    "Belgium": "BE",
    "Bulgaria": "BG",
    "Bosnia And Herzegovina": "BA",
    "Belarus": "BY",
    "Bolivia": "BO",
    "Brazil": "BR",
    "Canada": "CA",
    "Switzerland": "CH",
    "Chile": "CL",
    "China": "CN",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Czech Republic": "CZ",
    "Germany": "DE",
    "Denmark": "DK",
    "Dominican Republic": "DO",
    "Ecuador": "EC",
    "Egypt": "EG",
    "Spain": "ES",
    "Estonia": "EE",
    "Finland": "FI",
    "France": "FR",
    "Great Britain (uk)": "GB",
    "Georgia": "GE",
    "Croatia": "HR",
    "Hungary": "HU",
    "Ireland": "IE",
    "Iceland": "IS",
    "Israel": "IL",
    "Italy": "IT",
    "Hashemite Kingdom Of Jordan": "JO",
    "Japan": "JP",
    "Kenya": "KE",
    "Kyrgyzstan": "KG",
    "South Korea": "KR",
    "Lithuania": "LT",
    "Latvia": "LV",
    "Moldova": "MD",
    "Mexico": "MX",
    "Macedonia": "MK",
    "Montenegro": "ME",
    "Nicaragua": "NI",
    "Netherlands": "NL",
    "Norway": "NO",
    "New Zealand (aotearoa)": "NZ",
    "Oman": "OM",
    "Panama": "PA",
    "Peru": "PE",
    "Poland": "PL",
    "Portugal": "PT",
    "Paraguay": "PY",
    "Romania": "RO",
    "Russia": "RU",
    "Singapore": "SG",
    "Serbia": "RS",
    "Slovakia": "SK",
    "Slovenia": "SI",
    "Sweden": "SE",
    "Trinidad And Tobago": "TT",
    "Tunisia": "TN",
    "Turkey": "TR",
    "Tanzania, United Republic Of": "TZ",
    "Ukraine": "UA",
    "Uruguay": "UY",
    "United States": "US",
    "Uzbekistan": "UZ",
    "Saint Vincent And The Grenadines": "VC",
    "Vietnam": "VN",
    "South Africa": "ZA"
}

# Case-insensitive view of the manual mapping. Looking up
# `country_input.title()` directly can never hit keys such as
# "Great Britain (uk)", because title() turns "(uk)" into "(Uk)".
_iso_by_folded_name = {name.casefold(): code for name, code in country_to_iso.items()}

# Ask user for input
country_input = input("🌍 What country are you asking for? ").strip()

# Try the manual mapping first: whatever country the user typed is converted
# to its two-letter ISO code, ignoring letter case.
iso_code = _iso_by_folded_name.get(country_input.casefold())

# If not found, try pycountry fallback
if not iso_code:
    try:
        iso_code = pycountry.countries.search_fuzzy(country_input)[0].alpha_2
    except LookupError as exc:
        # Only "no match" is translated into ValueError; anything else
        # (including KeyboardInterrupt) propagates unchanged.
        raise ValueError(f"❌ Could not find ISO code for country: {country_input}") from exc
    print(f"🔁 Using fallback ISO code from pycountry: {iso_code}")

# Find matching PDF(s) of chosen country: file name starts with the ISO code
# and ends with ".pdf", both compared case-insensitively. Sorted so the
# processing order does not depend on directory listing order.
iso_prefix = iso_code.lower()
matching_pdfs = sorted(
    f for f in os.listdir(pdf_dir)
    if f.lower().startswith(iso_prefix) and f.lower().endswith(".pdf")
)

if not matching_pdfs:
    raise FileNotFoundError(f"❌ No PDFs found for {iso_code.upper()} in: {pdf_dir}")

print(f"🔍 Looking for PDFs with ISO code: {iso_code.upper()}")
print(f"📄 Found {len(matching_pdfs)} PDF(s): {matching_pdfs}")


🌍 What country are you asking for? Albania
🔍 Looking for PDFs with ISO code: AL
📄 Found 1 PDF(s): ['al031en.pdf']


In [5]:
import fitz
import os

# Loop through all PDFs that match the ISO code and preview how each one splits
for pdf_name in matching_pdfs:
    pdf_path = os.path.join(pdf_dir, pdf_name)
    print(f"\n📄 Reading: {pdf_name}")

    # Extract raw text page by page; the context manager closes the document
    # once the text has been read instead of leaving it open.
    with fitz.open(pdf_path) as doc:
        full_text = "\n".join(page.get_text() for page in doc)

    # Apply section-based splitting
    sections = split_into_section(full_text)

    # Preview first 3 sections
    print(f"\n✅ Split into {len(sections)} sections.")
    for i, sec in enumerate(sections[:3]):
        print(f"\n• Section {i+1}: {sec['heading']}\nPreview: {sec['text'][:200]}...\n")



📄 Reading: al031en.pdf

✅ Split into 42 sections.

• Section 1: PREAMBLE
Preview: LAW NO. 8880 DATED APRIL 15, 2002, ON PLANT BREEDER’S RIGHT 
Based on Articles 78, 81, point 1 and Article 83, point 1 of the Constitution, upon the 
proposal of the Council of Ministers, 
THE PEOPLE’...


• Section 2: CHAPTER 1 > Article 1
Preview: The goal 
The purpose of this Law is to protect the rights of the persons who breed, discover and 
develop new varieties of plants....


• Section 3: CHAPTER 1 > Article 2
Preview: Subjects for Law application 
The provisions of this Law are applicable to legal and physical persons, Albanian citizens or 
foreigners, and legal and physical persons who are citizens of: 
a)
contrac...



In [7]:
import os
import torch
import json
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb

# --- Setup ---
# Folder on the mounted Drive that holds the source PDFs.
pdf_dir = "/content/drive/MyDrive/DSSI/Q A Project/SUCCESSFUL PDFs"

# Vector store: fetch the "seedlaws" collection, creating it on first use.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="seedlaws")

# ✅ BAAI/bge-m3 embedding model (must match retrieval later); runs on the GPU
# when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    embedding_device = "cuda"
else:
    embedding_device = "cpu"
embedding_model = SentenceTransformer("BAAI/bge-m3", device=embedding_device)

def get_query_embedding(query):
    """Embed one query with the shared embedding model.

    The query is wrapped in a one-element list before encoding and the
    encoder is asked for normalized embeddings; the encoded result is
    returned after conversion with ``.tolist()``.
    """
    encoded_batch = embedding_model.encode([query], normalize_embeddings=True)
    return encoded_batch.tolist()

# --- Loop through PDFs ---
# The splitter configuration and the output folder are the same for every PDF,
# so both are prepared once before the loop.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=450,
    separators=["\n\n", "\n", ".", " ", ""]
)
os.makedirs("/content/chunked_json", exist_ok=True)

for pdf_filename in matching_pdfs:
    print(f"\n📄 Reading: {pdf_filename}")
    pdf_path = os.path.join(pdf_dir, pdf_filename)
    # splitext drops the extension whatever its letter case, so a file named
    # "X.PDF" yields "X.json" and "X_<n>" ids rather than keeping ".PDF".
    pdf_stem = os.path.splitext(pdf_filename)[0]

    # --- PDF Extraction ---
    # The context manager closes the document once its text has been read.
    with fitz.open(pdf_path) as doc:
        full_text = "\n".join(page.get_text() for page in doc)

    # --- Section Splitting ---
    sections = split_into_section(full_text)
    print(f"✅ Split into {len(sections)} sections.")
    for i, sec in enumerate(sections[:3]):
        print(f"\n🔹 Section {i+1}: {sec['heading']}\nPreview: {sec['text'][:200]}...")

    # --- Chunking ---
    # Every chunk keeps the heading of the section it came from.
    all_chunks = [
        {
            "text": chunk,
            "heading": section["heading"],
            "source_file": pdf_filename
        }
        for section in sections
        for chunk in splitter.split_text(section["text"])
    ]

    # --- Save chunks to JSON ---
    json_path = f"/content/chunked_json/{pdf_stem}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, indent=2, ensure_ascii=False)
    print(f"💾 Saved {len(all_chunks)} chunks → {json_path}")

    # Nothing to embed (e.g. no article headings were detected).
    # NOTE(review): Chroma is expected to reject an empty add() - confirm.
    if not all_chunks:
        print(f"⚠️ No chunks produced for {pdf_filename}; skipping embedding.")
        continue

    # --- Embedding + Chroma Storage ---
    texts = [c["text"] for c in all_chunks]
    metadatas = [{"heading": c["heading"], "source_file": c["source_file"]} for c in all_chunks]
    ids = [f"{pdf_stem}_{i}" for i in range(len(all_chunks))]

    # ✅ Embed using same model you'll use for querying
    embeddings = embedding_model.encode(texts, show_progress_bar=True, normalize_embeddings=True)

    collection.add(
        documents=texts,
        metadatas=metadatas,
        ids=ids,
        embeddings=embeddings.tolist()
    )

    print(f"✅ Embedded and stored {len(texts)} chunks from {pdf_filename}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]


📄 Reading: al031en.pdf
✅ Split into 42 sections.

🔹 Section 1: PREAMBLE
Preview: LAW NO. 8880 DATED APRIL 15, 2002, ON PLANT BREEDER’S RIGHT 
Based on Articles 78, 81, point 1 and Article 83, point 1 of the Constitution, upon the 
proposal of the Council of Ministers, 
THE PEOPLE’...

🔹 Section 2: CHAPTER 1 > Article 1
Preview: The goal 
The purpose of this Law is to protect the rights of the persons who breed, discover and 
develop new varieties of plants....

🔹 Section 3: CHAPTER 1 > Article 2
Preview: Subjects for Law application 
The provisions of this Law are applicable to legal and physical persons, Albanian citizens or 
foreigners, and legal and physical persons who are citizens of: 
a)
contrac...
💾 Saved 58 chunks → /content/chunked_json/al031en.json


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


✅ Embedded and stored 58 chunks from al031en.pdf


In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Load the Gemma 2B tokenizer and model from the Hugging Face Hub.
model_name = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are requested in bfloat16 with automatic device placement.
# NOTE(review): the GPU reported earlier in this notebook is a Tesla T4;
# confirm that bfloat16 is the intended dtype for that card.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)




tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [10]:
# Try and make the model run faster with torch.compile.
# The assignment is self-referential, so re-running this cell would wrap an
# already-compiled model again. torch.compile keeps the original module under
# `_orig_mod`; checking for that attribute makes the cell safe to re-run.
if not hasattr(model, "_orig_mod"):
    model = torch.compile(model)

In [11]:
# Keywords to help the model answer questions
BURDEN_KEYWORDS = [
    "application", "submit", "fee", "document", "registration", "deadline", "within",
    "penalty", "fine", "cancelled", "court", "unauthorized", "legal", "maintain",
    "verification", "requirement", "procedure", "compliance", "refused", "rejected"
]
# Score added per distinct keyword found in a chunk.
BURDEN_BONUS_WEIGHT = 0.1


# Function that uses the given keywords to help with chunk retrieval
def rerank_chunks(results, query_embedding, query_text):
    """Re-rank retrieved chunks by cosine similarity plus a keyword bonus.

    Args:
        results: Query result mapping whose ``"documents"``, ``"embeddings"``
            and ``"metadatas"`` entries each hold one list per query; only
            the first query's lists are used.
        query_embedding: Embedding of the query (flat or shaped ``(1, dim)``).
        query_text: Unused; kept so the signature stays unchanged.

    Returns:
        A list of ``(final_score, base_score, stripped_document, metadata)``
        tuples sorted by ``final_score`` in descending order. ``final_score``
        is the cosine similarity plus ``BURDEN_BONUS_WEIGHT`` for every entry
        of ``BURDEN_KEYWORDS`` that occurs (case-insensitively, as a
        substring) in the document.

    Raises:
        ValueError: If ``results`` carries no embeddings.
    """
    # Local import: numpy is not imported at file level.
    import numpy as np

    documents = results["documents"][0]
    metadatas = results["metadatas"][0]
    if not documents:
        return []

    raw_embeddings = results.get("embeddings")
    if raw_embeddings is None or raw_embeddings[0] is None:
        raise ValueError(
            "results contains no embeddings; they must be included in the query result"
        )

    chunk_matrix = np.asarray(raw_embeddings[0], dtype=float)
    query_vec = np.asarray(query_embedding, dtype=float).reshape(-1)

    # Cosine similarity of the query against every chunk. A zero-length
    # vector gets similarity 0 instead of dividing by zero.
    denominators = np.linalg.norm(chunk_matrix, axis=1) * np.linalg.norm(query_vec)
    denominators[denominators == 0] = 1.0
    base_scores = (chunk_matrix @ query_vec) / denominators

    keywords = [kw.lower() for kw in BURDEN_KEYWORDS]

    reranked = []
    for score, doc, meta in zip(base_scores.tolist(), documents, metadatas):
        doc_lower = doc.lower()  # lowercase once per document, not once per keyword
        bonus = sum(1 for kw in keywords if kw in doc_lower) * BURDEN_BONUS_WEIGHT
        reranked.append((score + bonus, score, doc.strip(), meta))

    reranked.sort(key=lambda item: item[0], reverse=True)
    return reranked


In [34]:
from sentence_transformers.util import cos_sim

## New Prompt (with examples to help)
def build_prompt(context, question):
    """Assemble the full text prompt for the language model.

    The prompt consists of, in order: a role statement, the task statement,
    three fixed few-shot Q/A examples, the retrieved ``context``, the user's
    ``question``, a list of answering instructions and a trailing
    ``Answer:`` cue.
    """
    # Few-shot examples (can be randomized or rotated if you want)
    few_shot_examples = """
Example 1:
Q: What is NSI?
A: NSI stands for the National Seed and Seedlings Institute. It operates under the Ministry of Agriculture and Food and is responsible for maintaining registers, evaluating varieties, and processing breeder’s rights.

Example 2:
Q: What are the penalties for violating the breeder’s right?
A: According to the law, violations such as unauthorized use of protected varieties may lead to administrative fines or legal claims. These penalties are enforced by the State Control Inspectorate or through court action.

Example 3:
Q: What challenges might a small-scale farmer face when complying with this law?
A: While the text does not explicitly mention small-scale farmers, challenges may include the cost of applications, complexity of the legal requirements, and the administrative burden of maintaining protected varieties.

---
Now, answer the following question using the context below.
"""

    instruction_lines = (
        "- Use only the information found in the context to construct your answer.",
        "- If the answer is not explicitly stated, analyze the relevant sections to infer a reasoned legal conclusion.",
        "- Consider implications, responsibilities, penalties, and procedural requirements.",
        "- If relevant, describe how these may affect individuals such as farmers, breeders, or applicants.",
        '- If the context lacks any basis for an informed answer, say: "Sorry, I couldn\'t find relevant information in the documents."',
    )

    # One entry per output line; "" entries are the blank separator lines.
    prompt_lines = [
        "You are a legal analyst specializing in interpreting laws and official documents.",
        "",
        "Your task is to answer the user's question using the information in the legal text below.",
        "",
        few_shot_examples,
        "",
        "Context:",
        f"{context}",
        "",
        "Question:",
        f"{question}",
        "",
        "Instructions:",
        *instruction_lines,
        "",
        "Answer:",
    ]
    return "\n".join(prompt_lines)


def ask_with_gemma(question, top_k=5):
    """Retrieve relevant chunks for ``question`` and print the model's answer.

    Steps: embed the question, query the collection for ``top_k * 3``
    candidates, print a preview of the first ``top_k``, re-rank all
    candidates by cosine similarity to the question, build the prompt from
    the best ``top_k`` chunks and print the generated answer.

    Args:
        question: The user's question.
        top_k: Number of chunks previewed and used as context.

    Returns:
        None. All output is printed.
    """
    embedding = embedding_model.encode([question], normalize_embeddings=True)[0]
    results = collection.query(query_embeddings=[embedding.tolist()], n_results=top_k * 3)

    if not results.get("documents") or not results["documents"][0]:
        print("⚠️ No documents found.")
        return

    retrieved_chunks = results["documents"][0]
    metadatas = results["metadatas"][0]

    print(f"\n🔍 Top {top_k} Retrieved Chunks:")
    for rank, (chunk, meta) in enumerate(zip(retrieved_chunks[:top_k], metadatas), start=1):
        # Flatten newlines for every preview, not only for truncated ones.
        flat_chunk = chunk.replace("\n", " ")
        preview = flat_chunk[:200] + "..." if len(flat_chunk) > 200 else flat_chunk
        heading = meta.get("heading", "N/A")
        print(f"\n🔹 Rank {rank} | Heading: {heading}\nPreview: {preview}")

    # 🔁 Rerank with cosine similarity and attach metadata
    chunk_embeddings = embedding_model.encode(retrieved_chunks, normalize_embeddings=True)
    # One batched similarity call; plain floats make a well-defined sort key.
    similarities = cos_sim(embedding, chunk_embeddings)[0].tolist()
    reranked = sorted(
        zip(similarities, retrieved_chunks, metadatas),
        key=lambda item: item[0],
        reverse=True
    )

    # 🧠 Use top reranked for context (honours top_k instead of a fixed 5)
    context = "\n\n".join(chunk for _, chunk, _ in reranked[:top_k])
    prompt = build_prompt(context, question)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=600, do_sample=True, temperature=0.7) # max_new_tokens was 400

    # Decode only the tokens generated after the prompt. Slicing the decoded
    # text by len(prompt) goes wrong whenever decoding does not reproduce the
    # prompt character for character.
    prompt_token_count = inputs["input_ids"].shape[1]
    response = tokenizer.decode(output[0][prompt_token_count:], skip_special_tokens=True).strip()

    print(f"\n🧠 Gemma's Answer:\n{response}")



In [35]:
# Ask the user for a question and pass it to ask_with_gemma, which prints the
# retrieved chunks and the model's answer.
question = input("❓ How may I assist you :  ")
ask_with_gemma(question)

❓ How may I assist you :  What is NSI?

🔍 Top 5 Retrieved Chunks:

🔹 Rank 1 | Heading: CHAPTER 1 > Article 3
Preview: 4.  “NSI” is the National Seed and Seedlings Institute in the Ministry of Agriculture and  Food.    5.  “SCVA” is the State Commission for Variety Attestation that operates within the NSI,  and examin...

🔹 Rank 2 | Heading: CHAPTER 1 > Article 3
Preview: C(Extr.)/21/4  Annexe II / Annex II / Anlage II / Anexo II  page 2 / Seite 2 / página 2      3.  “Variety” means a plant grouping within a botanical taxon of the lowest known rank,  which is given whe...

🔹 Rank 3 | Heading: CHAPTER VII > Article 29
Preview: Verification of the variety maintenance 
 
During the protection period, the NSI shall verify whether the variety and its parent 
components are properly maintained.

🔹 Rank 4 | Heading: CHAPTER  IV > Article 18
Preview: Preservation of files    1.  The NSI maintains a register for applications and another register for the breeder’s right,  i.e. the applicants that