<a href="https://colab.research.google.com/github/stellarIV/Riggiti-RAG/blob/main/rag_redo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Data Scraping

## Install Dependancies

In [1]:
# Install dependencies
!pip install -q PyMuPDF
!pip install -q spacy
!python -m spacy download en_core_web_sm
!pip install chromadb sentence-transformers requests
!pip install -U google-genai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting chromadb
  Downloading chromadb-1.0.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any

## Upload PDF to ingest

In [2]:
import fitz  # PyMuPDF
import json
from google.colab import files
import json
import chromadb
import openai
import re
from sentence_transformers import SentenceTransformer
import spacy
import numpy as np
from collections import Counter


In [63]:


def upload_and_process_pdf():
    print("📤 Please upload a PDF file...")
    uploaded = files.upload()

    for filename in uploaded.keys():
        if filename.endswith('.pdf'):
            # Open the PDF file
            doc = fitz.open(filename)
            scraped_data = []

            # Extract text from each page
            for i, page in enumerate(doc):
                text = page.get_text()
                scraped_data.append({"page": i + 1, "text": text.strip()})

            # Save to JSONL format
            with open("scraped_data.jsonl", "w", encoding="utf-8") as f:
                for item in scraped_data:
                    json.dump(item, f, ensure_ascii=False)
                    f.write("\n")

            print(f"✅ Extracted {len(scraped_data)} pages and saved to scraped_data.jsonl")

        else:
            print("❌ Please upload a valid PDF file.")

# Run the function
upload_and_process_pdf()


📤 Please upload a PDF file...


KeyboardInterrupt: 

## Text Analyzer

In [41]:
def is_amharic(text):
    # Detect presence of Amharic script characters
    return bool(re.search(r'[\u1200-\u137F]', text))

def analyze_jsonl_script_based(file_path, text_key='text', max_lines=1000):
    lang_counter = Counter()

    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= max_lines:
                break
            try:
                data = json.loads(line)
                text = data.get(text_key, "") if isinstance(data, dict) else str(data)
                if not text.strip():
                    continue
                lang = 'am' if is_amharic(text) else 'en'
                lang_counter[lang] += 1
            except Exception:
                continue

    total = sum(lang_counter.values())
    print(f"\n📊 Script-Based Language Detection (first {min(i+1, max_lines)} lines):")
    for lang, count in lang_counter.items():
        percent = 100 * count / total
        print(f"- {lang}: {count} lines ({percent:.2f}%)")

    return lang_counter
file_path = "/content/scraped_data.jsonl"
p=analyze_jsonl_script_based(file_path)
print(p)


📊 Script-Based Language Detection (first 71 lines):
- am: 71 lines (100.00%)
Counter({'am': 71})


## Clean Ingested Data

In [43]:


def remove_common_headers(text):
    # Define common patterns to strip out from the top of the page
    header_patterns = [
        r'^\s*(Incomplete advance copy)\s*',  # Header alone
        r'^\s*\d+\s+Grade 9\s+Incomplete advance copy',  # e.g., '15 Grade 9 Incomplete advance copy'
        r'^\s*Grade 9.*',  # Any other Grade 9 variation
        r'^\s*የፋይናንስ ደህንነት አገልግሎት',  # Amharic: "Finance Security Service"
        r'^\s*አመታዊ መጽሐፍ',  # Amharic: "Annual Book"
    ]

    lines = text.split('\n')
    cleaned_lines = []

    for i, line in enumerate(lines):
        # Only check first 3 lines of the page (where headers typically occur)
        if i < 3:
            if any(re.match(pat, line, flags=re.IGNORECASE) for pat in header_patterns):
                continue  # Skip header line
        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines).strip()


def clean_text(text):
    # Remove header
    text = remove_common_headers(text)

    # Preserve '.\n' but clean up messy whitespace
    text = text.replace('.\n', '[DOT_NEWLINE]')
    text = text.replace('።\n', '[DOT_NEWLINE]')
    text = re.sub(r'[\t\r\f\v]+', ' ', text)
    text = re.sub(r'\s*\n\s*', ' ', text)
    text = text.replace('[DOT_NEWLINE]', '.\n')
    text = re.sub(r'[\[\]{}<>“”"\'()_/\\=+@#%*~`|^•●]+', ' ', text)  # generic symbol cleaner
    text = re.sub(r'\s+', ' ', text)  # collapse multiple spaces
    # Remove standalone Amharic letters
    tokens = text.split()
    filtered_tokens = [
        word for word in tokens
        if not (len(word) == 1 and re.fullmatch(r'[\u1200-\u137F]', word))
    ]
    text = ' '.join(filtered_tokens)

    return text.strip()

def extract_amharic_text(jsonl_path='scraped_data.jsonl', output_path='amharic_text.jsonl'):
    amharic_pattern = re.compile(r'[\u1200-\u137F0-9\s\.\,\:\;\-\–\(\)\[\]\{\}\'\"!@#\$%\^&\*\+=\?\/\\]+')

    with open(jsonl_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            entry = json.loads(line)
            cleaned = clean_text(entry['text'])
            matches = amharic_pattern.findall(cleaned)
            result_text = ''.join(matches).strip()
            if result_text:
                json.dump({"page": entry["page"], "text": result_text}, outfile, ensure_ascii=False)
                outfile.write('\n')

def extract_english_text(jsonl_path='scraped_data.jsonl', output_path='english_text.jsonl'):
    english_pattern = re.compile(r'[A-Za-z0-9\s\.\,\:\;\-\–\(\)\[\]\{\}\'\"!@#\$%\^&\*\+=\?\/\\]+')

    with open(jsonl_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            entry = json.loads(line)
            cleaned = clean_text(entry['text'])
            matches = english_pattern.findall(cleaned)
            result_text = ''.join(matches).strip()
            if result_text:
                json.dump({"page": entry["page"], "text": result_text}, outfile, ensure_ascii=False)
                outfile.write('\n')


# Language detector

In [46]:
def process_jsonl_based_on_language(file_path='scraped_data.jsonl', text_key='text'):
    lang_counts = analyze_jsonl_script_based(file_path, text_key=text_key)
    am = lang_counts.get('am', 0)
    en = lang_counts.get('en', 0)

    print("\n🧠 Language dominance analysis:")
    print(f"- Amharic lines: {am}")
    print(f"- English lines: {en}")

    if am > en:
        extract_amharic_text(file_path)
        return "am"
    else:
        extract_english_text(file_path)
        return "en"


In [48]:
lang_type=process_jsonl_based_on_language()


📊 Script-Based Language Detection (first 71 lines):
- am: 71 lines (100.00%)

🧠 Language dominance analysis:
- Amharic lines: 71
- English lines: 0


In [5]:
# Call the functions to extract and save filtered text
extract_amharic_text()   # creates amharic_text.jsonl


In [None]:
extract_english_text()   # creates english_text.jsonl


# Chunk our Data

### For English

In [49]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Sentence splitter using spaCy
def split_into_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

# Paragraph splitter using double line breaks (manually preserved)
def split_into_paragraphs(text):
    return [p.strip() for p in text.strip().split('\n') if p.strip()]

### For Amharic

In [50]:
def split_into_sentences_amharic(text):
    sentences = []
    current_sentence = ""
    for char in text:
        current_sentence += char
        if char in {'።', '!', '?'}:
            sentence = current_sentence.strip()
            if sentence:
                sentences.append(sentence)
            current_sentence = ""
    # Add any remaining text as a sentence
    if current_sentence.strip():
        sentences.append(current_sentence.strip())
    return [sent for sent in sentences if sent]


In [52]:


# Clean text by removing all '\n' characters, tabs, and excess spaces
def clean_text_linebreaks(text):
    # Remove all \n characters
    text = text.replace('\n', ' ')
    # Remove any tabs and excess spaces
    text = re.sub(r'\t+', ' ', text)
    text = re.sub(r' +', ' ', text)   # Collapse multiple spaces into one
    return text.strip()

# Main chunking function
def chunk_jsonl_text(
    filepath='english_text.jsonl',
    output_path='chunked_text.jsonl',
    chunk_by='sentence',  # 'sentence', 'paragraph', 'page'
    group_size=1,
    max_characters=3000
):
    with open(filepath, 'r', encoding='utf-8') as infile:
        data = [json.loads(line) for line in infile]

    chunks = []
    buffer = ""
    unit_count = 0
    current_pages = []

    for entry in data:
        text = entry['text']
        page = entry['page']

        # Clean all newlines from the text
        text = clean_text_linebreaks(text)

        # Split text by chosen method
        if chunk_by == 'sentence':
          if lang_type== "am":
            units = split_into_sentences_amharic(text)
          else:
            units = split_into_sentences(text)
        elif chunk_by == 'paragraph':
            units = split_into_paragraphs(text)
        elif chunk_by == 'page':
            units = [text]
        else:
            raise ValueError("chunk_by must be 'sentence', 'paragraph', or 'page'")

        # Build chunks
        for unit in units:
            proposed = buffer + (" " if buffer else "") + unit
            if len(proposed) > max_characters or unit_count >= group_size:
                if buffer:
                    chunks.append({
                        "pages": current_pages,
                        "text": buffer.strip()
                    })
                buffer = unit
                unit_count = 1
                current_pages = [page]
            else:
                buffer = proposed
                unit_count += 1
                if page not in current_pages:
                    current_pages.append(page)

    # Add last chunk
    if buffer.strip():
        chunks.append({
            "pages": current_pages,
            "text": buffer.strip()
        })

    # Write to output
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for chunk in chunks:
            json.dump(chunk, outfile, ensure_ascii=False)
            outfile.write('\n')

    print(f"✅ Chunked into {len(chunks)} items and saved to {output_path}")


In [53]:
chunk_jsonl_text(
    filepath='amharic_text.jsonl',
    output_path='chunked_by_sentences2.jsonl',
    chunk_by='sentence',
    group_size=3,
    max_characters=5000
)

✅ Chunked into 128 items and saved to chunked_by_sentences2.jsonl


Hurray!! data collection and chunking finished

## Embedding Our Data

## For FAISS

In [None]:
def embed_text_chunks(
    input_jsonl='chunked_text.jsonl',
    output_npy='embeddings.npy',
    output_metadata='metadata.jsonl',
    model_name='paraphrase-MiniLM-L6-v2',
    batch_size=32,
    show_preview=True
):
    print(f"🔄 Loading model: {model_name}")
    model = SentenceTransformer(model_name)

    texts = []
    metadata = []

    # Load text chunks
    with open(input_jsonl, 'r', encoding='utf-8') as f:
        for line in f:
            entry = json.loads(line)
            texts.append(entry['text'])
            metadata.append({
                "pages": entry.get("pages", []),
                "text_preview": entry["text"][:100] + "..."  # For quick inspection
            })

    print(f"📄 Loaded {len(texts)} text chunks")

    # Compute embeddings
    print("⚙️ Computing embeddings...")
    embeddings = model.encode(texts, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True)

    # Save embeddings as .npy
    np.save(output_npy, embeddings)
    print(f"💾 Saved embeddings to {output_npy}")

    # Save metadata
    with open(output_metadata, 'w', encoding='utf-8') as meta_out:
        for item in metadata:
            json.dump(item, meta_out, ensure_ascii=False)
            meta_out.write('\n')
    print(f"📎 Metadata saved to {output_metadata}")

    # Optional preview
    if show_preview:
        print("\n🧾 Example preview:")
        print("Text:", texts[0][:200])
        print("Embedding shape:", embeddings[0].shape)

    return embeddings


In [None]:
embed_text_chunks(
    input_jsonl='chunked_by_sentences.jsonl',
    output_npy='paraphrase_MiniLM_embeddings.npy',
    output_metadata='paraphrase_MiniLM_metadata.jsonl',
    model_name='paraphrase-MiniLM-L6-v2'  # Try changing this to other models!
)


🔄 Loading model: paraphrase-MiniLM-L6-v2
📄 Loaded 231 text chunks
⚙️ Computing embeddings...


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

💾 Saved embeddings to paraphrase_MiniLM_embeddings.npy
📎 Metadata saved to paraphrase_MiniLM_metadata.jsonl

🧾 Example preview:
Text: Author: Ann Fullick Adviser: Alemu Asfaw Evaluators: Solomon Belayneh Getachew Bogale Silas Araya Federal Democratic Republic of Ethiopia Ministry of Education Biology Student Textbook Grade 9
Embedding shape: (384,)


array([[-0.36088732,  0.62437767, -0.31661388, ..., -0.34875688,
         0.30137637,  0.29754072],
       [-0.43010095,  0.17494476, -0.41270575, ..., -0.2968747 ,
         0.20853299,  0.06345865],
       [-0.20878713, -0.02464597, -0.02219432, ..., -0.24649675,
         0.04722616,  0.10193737],
       ...,
       [-0.09510569, -0.16385382,  0.01368616, ..., -0.08052471,
         0.21493797,  0.04344206],
       [-0.39261466,  0.05208082, -0.02295829, ..., -0.31241482,
         0.2776665 ,  0.06188554],
       [-0.33360326,  0.07018267,  0.12846112, ..., -0.37540123,
         0.2615606 ,  0.17897749]], dtype=float32)

In [None]:
# Test the function with different models
embed_chunks(input_jsonl='chunked_by_sentences.jsonl', output_jsonl='chunked_embeddings_paraphrase.jsonl', model_name='paraphrase-MiniLM-L6-v2')
embed_chunks(input_jsonl='chunked_by_sentences.jsonl', output_jsonl='chunked_embeddings_all_miniLM.jsonl', model_name='all-MiniLM-L6-v2')
embed_chunks(input_jsonl='chunked_by_sentences.jsonl', output_jsonl='chunked_embeddings_distilbert.jsonl', model_name='distilbert-base-nli-stsb-mean-tokens')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: 'chunked_text.jsonl'

## For Chroma

### With out embedding declared

In [9]:
import json
import uuid
import chromadb

# Initialize Chroma client and collection
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="my_collection")

# Step 1: Read the JSONL file and load the data
with open("chunked_by_sentences_amharic.jsonl", "r") as file:
    documents = []
    ids = []
    for idx, line in enumerate(file):
        data = json.loads(line)
        text = data["text"]
        # Make the ID unique using page number + line index
        document_id = f"id{data['pages']}_{idx}"
        documents.append(text)
        ids.append(document_id)

# Step 2: Add documents to Chroma collection
collection.upsert(
    documents=documents,
    ids=ids
)

# Step 3: Query the collection
query_text = "አቶ ኪሩቤል ጨምረው እንደገለጹት ሶፍትዌሩ ከወንጀሎቹ ጋር በተያያዘ ስንት መሰረታዊ ጉዳዮችን ያከናውናል።"
results = collection.query(
    query_texts=[query_text],
    n_results=2
)

# Print results
print(results)


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 72.5MiB/s]


{'ids': [['id[64]_118', 'id[44]_78']], 'embeddings': None, 'documents': [['ይህ ትብብር ደግሞ አገራት በሚያቀርቡት ጥያቄ ወይም ጉዳዩ በእጃቸው እንዲገባ ወዲያውኑ መፈፀም እንደሚገባ ተቀምጧል። ትብብር ለማድረግም እጅግ ፈጣንና ውጤታማ መንገድ እንዲጠቀሙ የሚያበረታታ ነው። ለዓለም አቀፍ ትብብር መሳካት የአገራት የሚመለከታቸው ተቋማት የሁለትዮሽ እንዲሁም ባለብዙ ዘርፍ የጋራ ትብብር ሊኖራቸው እንደሚገባ ይመክራል።', 'ይህ ደግሞ የተለያዩ አካትን ይመለከታል። ከነዚህም መካከል ለትርፍ ያልተቋቋሙ ድርጅቶች ተጠቃሽ ናቸው። የዚህን ወንጀል ተግባር ለመከላከልና ለትርፍ ያልተቋቋሙ ድርጅቶች ለወንጀሉ ያላቸውን ተጋላጭነት መቀነስ ያስፈልጋል. እነዚሁ ድርጅቶች የሚቋቋሙት ለትርፍ ዓላማ ሳይሆን የተለያዩ ማሕበራዊ እና ሰብአዊ አገልግሎቶችን ለማሕበረሰቡ ለመስጠት ነው።']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None]], 'distances': [[0.3437552750110626, 0.35913342237472534]]}


# Open Router Integration

## Setup chroma

### Function for chroma setup

In [54]:
from sentence_transformers import SentenceTransformer
import chromadb
import json

def index_jsonl_to_chroma(
    collection_name: str,
    model_name: str,
    jsonl_file_path: str,
    text_key: str = "text",
    page_key: str = "pages"
):
    """
    Indexes a JSONL file into a ChromaDB collection using SentenceTransformers embeddings.

    Args:
        collection_name (str): Name of the Chroma collection to create.
        model_name (str): SentenceTransformer model to use.
        jsonl_file_path (str): Path to the JSONL file.
        text_key (str): Key used in each JSON object to extract text.
        page_key (str): Key used to extract page number (optional).

    Returns:
        chromadb.Collection: The created and populated Chroma collection.
    """
    print(f"🔄 Loading model: {model_name}")
    model = SentenceTransformer(model_name)

    chroma_client = chromadb.Client()

    # Try deleting existing collection
    try:
        chroma_client.delete_collection(collection_name)
        print(f"🧹 Old collection '{collection_name}' deleted.")
    except:
        pass

    # Create a new collection
    collection = chroma_client.create_collection(name=collection_name)

    documents, ids, metadatas, embeddings = [], [], [], []

    print(f"📂 Reading file: {jsonl_file_path}")
    try:
        with open(jsonl_file_path, "r", encoding="utf-8") as file:
            for i, line in enumerate(file):
                data = json.loads(line)
                text = data.get(text_key, "")
                if not text.strip():
                    continue
                page_num = data.get(page_key, [0])[0] if isinstance(data.get(page_key), list) else 0
                doc_id = f"id{page_num}_{i}"

                documents.append(text)
                ids.append(doc_id)
                metadatas.append({"page": page_num, "line_number": i})
                embeddings.append(model.encode(text).tolist())

    except Exception as e:
        print("❌ Error reading file:", e)
        return None

    print(f"✅ Loaded {len(documents)} documents.")
    if documents:
        print(f"📈 Sample embedding (first 5 values): {embeddings[0][:5]}")

    collection.add(
        documents=documents,
        ids=ids,
        metadatas=metadatas,
        embeddings=embeddings
    )

    print(f"✅ Documents added to collection '{collection_name}'. Total: {collection.count()}")

    return collection


In [55]:
collection = index_jsonl_to_chroma(
    collection_name="my_collection2",
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    jsonl_file_path="chunked_by_sentences_amharic.jsonl"
)


🔄 Loading model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
🧹 Old collection 'my_collection2' deleted.
📂 Reading file: chunked_by_sentences_amharic.jsonl
✅ Loaded 128 documents.
📈 Sample embedding (first 5 values): [-0.04631396010518074, 0.06661059707403183, -0.014639226719737053, -0.059827499091625214, 0.06187387928366661]
✅ Documents added to collection 'my_collection2'. Total: 128


test chroma init code

In [12]:
import json
import chromadb
from sentence_transformers import SentenceTransformer

# Load the sentence transformer model
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

# Initialize Chroma client
chroma_client = chromadb.Client()

# Delete the old collection (if it exists) to ensure clean state
try:
    chroma_client.delete_collection("my_collection2")
    print("Old collection deleted.")
except:
    pass  # Ignore if the collection doesn't exist

# Create a new collection (no embedding_function)
collection = chroma_client.create_collection(name="my_collection2")

# Step 1: Load and process JSONL data
documents = []
ids = []
metadatas = []
embeddings = []

try:
    with open("chunked_by_sentences_amharic.jsonl", "r") as file:
        for i, line in enumerate(file):
            data = json.loads(line)
            text = data.get("text", "")
            if not text:
                continue
            page_num = data.get("pages", [0])[0]
            doc_id = f"id{page_num}_{i}"

            documents.append(text)
            ids.append(doc_id)
            metadatas.append({"page": page_num, "line_number": i})
            embeddings.append(model.encode(text).tolist())
except Exception as e:
    print("Error reading file:", e)

# Sanity check
print(f"Loaded {len(documents)} documents.")
print(f"Sample embedding (first 5 values): {embeddings[0][:5]}")

# Step 2: Add to Chroma collection
collection.add(
    documents=documents,
    ids=ids,
    metadatas=metadatas,
    embeddings=embeddings
)

print("Documents added to collection.")
print(f"Total in collection: {collection.count()}")

# Step 3: Query using manual embedding
query_text = "አቶ ኪሩቤል ጨምረው እንደገለጹት ሶፍትዌሩ ከወንጀሎቹ ጋር በተያያዘ ስንት መሰረታዊ ጉዳዮችን ያከናውናል"
query_embedding = model.encode(query_text).tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2
)

# Step 4: Print the results
print("\n--- Query Results ---")
for i, doc in enumerate(results["documents"][0]):
    print(f"\nResult {i+1}:")
    print("Document ID:", results["ids"][0][i])
    print("Metadata:", results["metadatas"][0][i])
    print("Text:", doc)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Old collection deleted.
Loaded 128 documents.
Sample embedding (first 5 values): [-0.04631396010518074, 0.06661059707403183, -0.014639226719737053, -0.059827499091625214, 0.06187387928366661]
Documents added to collection.
Total in collection: 128

--- Query Results ---

Result 1:
Document ID: id14_15
Metadata: {'line_number': 15, 'page': 14}
Text: የፋይናንስ ደህንነት አገልግሎት አመታዊ መጽሐፍ 11 የወንጀል ትንተና ሶፍትዌር ልማትና አገልግሎቱ አቶ ኪሩቤል ጨምረው እንደገለጹት ሶፍትዌሩ ከወንጀሎቹ ጋር በተያያዘ 14 አስራ አራት መሰረታዊ ጉዳዮችን ያከናውናል። ከእነዚህ ውስጥ ዋና ዋናዎቹ ከተለያዩ አካላት መረጃዎችን ማሰባሰብ ያስችላል።

Result 2:
Document ID: id66_123
Metadata: {'line_number': 123, 'page': 66}
Text: እነዚህ አካላት በዚህ ሊንክ ይግቡ፣ በዚህ ተመዝገብ እና ከዚሁ ጋር የተያየዙ ልዩ ልዩ አማላይ መልዕክቶችን ይልካሉ። በሚያደርጉት የቅስቀሳ ተግባር ከህብረተሰቡ በርካታ ገንዘቦችን በተለያዩ የክፍያ አማራጮች እየሰበሰቡ እንደሆነ ታውቋል። በዚህም ተግባር ላይ ወደ አምስት በሚሆኑት በዲጂታል የገንዘብ መክፈያ ሥርዓት ጨምሮ & , , , 2 እና ፊያስ 777 ላይ በተደረገ ጥናትም ለሚያሠሩት ሥራ ኃላፊነት የሚወስድ አካል የሌላቸው፤ በተለይ ከላይ ያሉት ደግሞ ሀሰተኛ እና የመረጃ መዝባሪዎች ያለ ግለሰብ ፈቃድ የግል መረጃ ስርቆት ወይም ማጭበርበር ላይ የተጠመዱ መሆናቸው ታውቋል. እነዚሁ አካላት በተለይም ማህ

# Define a Function to generate Answer

In [34]:
from sentence_transformers import SentenceTransformer
import google.generativeai as genai

def generate_answer(api_key: str, embedder_model_name: str, query_text: str, collection: any, n_results: int = 2, model_name: str = "gemini-pro"):
    """
    Embeds a query, searches a vector database, and generates an answer using a specified Gemini model.

    Args:
        api_key: Your Google AI Studio API key.
        embedder_model_name: Name of the Sentence Transformer model to use for embedding.
        query_text: The query text in Amharic.
        collection: Your vector database collection object (must have a 'query' method).
        n_results: The number of search results to retrieve from the vector database.
        model_name: The name of the Gemini model to use for generation. Defaults to "gemini-pro".
    """
    try:
        # 1. Embed query
        embedder = SentenceTransformer(embedder_model_name)
        query_embedding = embedder.encode(query_text).tolist()

        # 2. Search vector DB
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results
        )
        retrieved_chunks = results["documents"][0]
        context = "\n\n".join(retrieved_chunks)

        # 3. Generate with Gemini
        genai.configure(api_key=api_key)
        gemini_model = genai.GenerativeModel(model_name)

        prompt = f"""
        እርስዎ አጋዥ የ AI ረዳት ነዎት። ጥያቄውን ለመመለስ የቀረበውን ጽሑፍ ብቻ ይጠቀሙ።

        ጽሑፍ:
        {context}

        ጥያቄ:
        {query_text}

        መልሱን ግልጽ እና አጭር በሆነ መንገድ በጽሑፉ ላይ ብቻ በመመስረት ይመልሱ።
        """

        response = gemini_model.generate_content(prompt)

        # 4. Show the final answer
        print("\n--- የመጨረሻ መልስ ---\n")
        print(response.text)

    except Exception as e:
        print(f"An error occurred: {e}")
        print("\n--- Available Models (Attempting to List) ---\n")
        try:
            models = genai.list_models()
            for model in models:
                print(f"Model: {model.name}")
                for method in model.supported_generation_methods:
                    print(f"  - Supports: {method}")
        except Exception as list_error:
            print(f"Error listing models: {list_error}")
            print("Could not retrieve the list of available models at this time.")

# --- Example Usage ---
# Assuming you have your API key and a populated 'collection' object
your_api_key = "YOUR_GOOGLE_AI_STUDIO_API_KEY"
your_collection = ... # Your vector database collection





In [56]:
collection = chroma_client.get_or_create_collection(name="my_collection2")
generate_answer(
    api_key="AIzaSyCkxdvHxyntztJs8gXtwhOftmlwM6UuFTo",
    embedder_model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
    query_text="አቶ ኪሩቤል ጨምረው እንደገለጹት ሶፍትዌሩ ከወንጀሎቹ ጋር በተያያዘ ስንት መሰረታዊ ጉዳዮችን ያከናውናል?",
    collection=collection,
    n_results=2,
    model_name="models/gemini-2.0-flash-lite-preview" # You can change this to other available models
)


--- የመጨረሻ መልስ ---

ሶፍትዌሩ ከወንጀሎቹ ጋር በተያያዘ 14 (አስራ አራት) መሰረታዊ ጉዳዮችን ያከናውናል።



# **Complete Rag_pipeline**

In [65]:
import time

def Rag_pipeline(
    temp_jsonl_path: str = "/content/scraped_data.jsonl",
    chunked_jsonl_path: str = "/content/chunked_text.jsonl",
    chunk_by: str = "sentence",
    group_size: int = 3,
    max_characters: int = 5000,
    collection_name: str = "my_collection3",
    model_name: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    query_text: str = "ሶፍትዌሩ ከወንጀሎቹ ጋር ምን ግንኙነት አለው?",
    api_key: str = "",
    n_results: int = 2,
    gemini_model_name: str = "models/gemini-2.0-flash-lite-preview"
):
    """
    Full RAG pipeline: from PDF to answer generation using embedding + retrieval + Gemini model.
    """

    # 1. Upload and process PDF -> temp_jsonl_path
    upload_and_process_pdf()
    time.sleep(3)

    # 2. Detect dominant language -> optionally call extract function
    process_jsonl_based_on_language(temp_jsonl_path)
    time.sleep(3)

    # 3. Chunk the .jsonl file
    chunk_jsonl_text(
        filepath=temp_jsonl_path,
        output_path=chunked_jsonl_path,
        chunk_by=chunk_by,
        group_size=group_size,
        max_characters=max_characters
    )
    time.sleep(3)

    # 4. Embed & index to Chroma
    collection = index_jsonl_to_chroma(
        collection_name=collection_name,
        model_name=model_name,
        jsonl_file_path=chunked_jsonl_path
    )
    time.sleep(3)

    # 5. Generate answer using Gemini
    generate_answer(
        api_key=api_key,
        embedder_model_name=model_name,
        query_text=query_text,
        collection=collection,
        n_results=n_results,
        model_name=gemini_model_name
    )


In [67]:
Rag_pipeline(

    api_key="AIzaSyCkxdvHxyntztJs8gXtwhOftmlwM6UuFTo",
    query_text="አቶ ኪሩቤል ጨምረው እንደገለጹት ሶፍትዌሩ ከወንጀሎቹ ጋር በተያያዘ ስንት መሰረታዊ ጉዳዮችን ያከናውናል?"
)


📤 Please upload a PDF file...


Saving 2014.pdf to 2014.pdf
✅ Extracted 71 pages and saved to scraped_data.jsonl

📊 Script-Based Language Detection (first 71 lines):
- am: 71 lines (100.00%)

🧠 Language dominance analysis:
- Amharic lines: 71
- English lines: 0
✅ Chunked into 182 items and saved to /content/chunked_text.jsonl
🔄 Loading model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
🧹 Old collection 'my_collection3' deleted.
📂 Reading file: /content/chunked_text.jsonl
✅ Loaded 182 documents.
📈 Sample embedding (first 5 values): [-0.038933925330638885, 0.08463428914546967, -0.014276583679020405, -0.0418577566742897, 0.022652726620435715]
✅ Documents added to collection 'my_collection3'. Total: 182

--- የመጨረሻ መልስ ---

ሶፍትዌሩ ከወንጀሎቹ ጋር በተያያዘ 14 መሰረታዊ ጉዳዮችን ያከናውናል።

