In [12]:
import sys
import time
from google import genai
from google.genai import types
from IPython.display import Markdown
from IPython.display import display
from dotenv import load_dotenv
import os
import glob
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry
from langcodes import Language
from collections import defaultdict
from tqdm import tqdm
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import re
from collections import defaultdict
from typing import List, Tuple
import fasttext
import re
from typing import Any, Dict, List, Tuple

In [2]:
def get_language_name(lang_code):
    """Look up the full English name of the language identified by ``lang_code``."""
    return Language.make(language=lang_code).language_name()

def import_google_api():
    """
    Create a Gemini API client using the GOOGLE_API_KEY environment variable.

    Variables from a local ``.env`` file are loaded first (``load_dotenv``).
    As a quick sanity check, the name of every model that advertises the
    ``embedContent`` action is printed.

    Returns:
        The configured ``genai.Client`` instance.
    """
    load_dotenv()
    api_key = os.getenv("GOOGLE_API_KEY")

    client = genai.Client(api_key=api_key)

    # supported_actions is optional on the SDK's model type; treat a missing
    # list as "no actions" instead of failing on `in None`.
    for model in client.models.list():
        if "embedContent" in (model.supported_actions or ()):
            print(model.name)

    return client

In [3]:
def embedding_function(client):
    """
    Build a ChromaDB embedding function that delegates to the Gemini API.

    The returned object exposes a ``document_mode`` flag: while it is true,
    texts are embedded with the ``retrieval_document`` task type, otherwise
    with ``retrieval_query``.
    """

    def _is_transient(exc):
        # Only rate-limit (429) and service-unavailable (503) API errors are retried.
        return isinstance(exc, genai.errors.APIError) and exc.code in {429, 503}

    class GeminiEmbeddingFunction(EmbeddingFunction):
        # Class-level default; callers flip it per instance before embedding.
        document_mode = True

        def __init__(self, client):
            self.client = client
            self._retry = retry.Retry(predicate=_is_transient)

        def __call__(self, input: Documents) -> Embeddings:
            if self.document_mode:
                task = "retrieval_document"
            else:
                task = "retrieval_query"

            embed_with_retry = self._retry(self.client.models.embed_content)
            response = embed_with_retry(
                model="models/text-embedding-004",
                contents=input,
                config=types.EmbedContentConfig(task_type=task),
            )
            return [embedding.values for embedding in response.embeddings]

    return GeminiEmbeddingFunction(client)

In [None]:
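# Minimal local Document type with 'page_content' and 'metadata' attributes,
# mirroring the shape used by libraries like LangChain, LlamaIndex, etc.
# NOTE: this definition shadows the Document imported from langchain_core.documents above.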
class Document:
    """Plain container pairing a text body with a metadata dictionary."""

    def __init__(self, page_content: str, metadata: dict = None):
        # A fresh dict is created per instance when none is supplied;
        # a caller-provided dict is stored as-is (not copied).
        if metadata is None:
            metadata = {}
        self.page_content = page_content
        self.metadata = metadata

# NOTE: doc_stats and all_chunks from the original chunking code are not defined
# in this notebook. filename_base is derived per file, and google_drive_path is
# an optional argument of parse_markdown_for_metadata below.

def parse_markdown_for_metadata(directory: str, google_drive_path: str = None) -> List[Document]:
    """
    Read every markdown file under ``directory`` (recursively) into one Document each.

    No chunking is performed: the whole file becomes the document's
    ``page_content``, and the chunk-related metadata fields are filled with
    fixed values describing a single, complete document.

    Args:
        directory: Root folder searched recursively for ``*.md`` files.
        google_drive_path: Optional value stored as ``source_path`` on every
            document. When falsy, each file's own local path is stored instead.
            NOTE(review): the same value is applied to all files — confirm this
            is intended rather than a per-file base path.

    Returns:
        One Document per readable file, in sorted path order. Files that cannot
        be read or decoded as UTF-8 are reported and skipped. An empty list is
        returned when no markdown files are found.
    """
    # glob returns paths in filesystem-dependent order; sorting makes the
    # document order (and any positional IDs derived from it) reproducible.
    pattern = os.path.join(directory, '**/*.md')
    markdown_files = sorted(glob.glob(pattern, recursive=True))
    if not markdown_files:
        print("No markdown files found")
        return []

    print(f"Processing {len(markdown_files)} markdown files...")

    all_documents = []

    for filepath in tqdm(markdown_files, desc="Processing documents"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
        except (OSError, UnicodeError) as e:
            # Best effort: an unreadable or badly encoded file is skipped, not fatal.
            print(f"\nWarning: Could not read file {filepath}: {e}")
            continue

        doc = Document(
            page_content=markdown_text,
            metadata={
                # Bare file name, without directories.
                "source": os.path.basename(filepath),
                "source_path": google_drive_path or filepath,
                # No header-based splitting happens here, so there is no header.
                "header": "",
                # The entire file is a single document.
                "chunk_index": 0,
                "total_chunks": 1,
                "is_complete_doc": True,
            },
        )
        all_documents.append(doc)

    print(f"\nSuccessfully processed {len(all_documents)} files into documents.")
    return all_documents

In [5]:
def create_collection(chroma_client, gemini_embedding_function, documents_list):
    """
    Create the ChromaDB collection if needed and add the documents it is missing.

    Document IDs are positional (``hrstud-bot_doc_<index>``). An empty
    collection receives every document. A non-empty collection is checked for
    the expected IDs and only the missing ones are added, so an ingestion that
    was interrupted part-way can be completed by calling this again.
    NOTE(review): resuming assumes ``documents_list`` arrives in the same order
    as on the interrupted run, because IDs are derived from list position.

    Args:
        chroma_client: ChromaDB client used to get or create the collection.
        gemini_embedding_function: Embedding function attached to the
            collection; its ``document_mode`` flag is set to True here.
        documents_list: Objects exposing ``page_content`` and ``metadata``.

    Returns:
        None. Progress and the final document count are printed.
    """
    DB_NAME = "hrstud-bot"
    # Number of documents sent per add()/get() call (value kept from the original code).
    BATCH_SIZE = 100

    embed_fn = gemini_embedding_function
    embed_fn.document_mode = True

    db = chroma_client.get_or_create_collection(
        name=DB_NAME,
        metadata={"model": "models/text-embedding-004", "dimension": 768},
        embedding_function=embed_fn
    )

    documents = [doc.page_content for doc in documents_list]
    metadatas = [doc.metadata for doc in documents_list]
    ids = [f"{DB_NAME}_doc_{i}" for i in range(len(documents))]

    if db.count() == 0:
        pending = list(range(len(ids)))
    else:
        # Look up which of the expected IDs are already stored. include=[] asks
        # for IDs only, and the lookup is batched to keep each request small.
        existing_ids = set()
        for start in range(0, len(ids), BATCH_SIZE):
            found = db.get(ids=ids[start:start + BATCH_SIZE], include=[])
            existing_ids.update(found["ids"])
        pending = [i for i, doc_id in enumerate(ids) if doc_id not in existing_ids]

        if not pending:
            print(f"Collection '{DB_NAME}' already has {db.count()} documents.")
            return

    print(f"Adding {len(pending)} documents to ChromaDB collection: {DB_NAME}")

    for start in tqdm(range(0, len(pending), BATCH_SIZE), desc="Adding documents", unit="batch"):
        batch = pending[start:start + BATCH_SIZE]
        db.add(
            documents=[documents[i] for i in batch],
            metadatas=[metadatas[i] for i in batch],
            ids=[ids[i] for i in batch]
        )
        # Rate limiting for API stability
        time.sleep(0.2)

    print(f"\nCollection '{DB_NAME}' now contains {db.count()} documents.")

In [6]:
def persistent_client(embed_fn, persist_dir="./output", db_name="hrstud-bot"):
    """
    Open an existing collection from an on-disk ChromaDB store.

    Args:
        embed_fn: Embedding function to attach to the opened collection.
        persist_dir: Directory of the persistent ChromaDB store.
        db_name: Name of the collection to open. It is fetched with
            ``get_collection`` (not get-or-create), so it must already exist.

    Returns:
        Tuple ``(embed_fn, collection)``: the embedding function that was
        passed in, unchanged, and the opened collection.
    """
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_collection(db_name, embedding_function=embed_fn)

    print(f"Connected to collection: {collection.name}")
    print(f"Documents: {collection.count()}")
    print(f"Metadata: {collection.metadata}")
    return embed_fn, collection

In [None]:
# NOTE: Placeholder for helper function (used in the original snippet)
def _no_answer_response():
    """Standard no-answer response."""
    return ("Ispričavamo se, ali ne mogu pronaći relevantan odgovor u bazi znanja. "
            "Molimo kontaktirajte odgovarajuću službu za dodatne informacije.")

# Load the fastText model from ./fasttext/lid.176.ftz when this cell runs
# (presumably the lid.176 language-identification model, judging by the file name — confirm).
# NOTE(review): LID_MODEL is not referenced by any code visible in this notebook.
LID_MODEL = fasttext.load_model('./fasttext/lid.176.ftz') 

def get_article_hr(user_query, embed_fn, collection, client, user_language, n_results=3):
    """
    Answer a question in Croatian using passages retrieved from the collection.

    The embedding function is switched to query mode for the lookup, the
    closest ``n_results`` documents are formatted into a context section, and
    Gemini (``gemini-2.0-flash``) is asked to answer from that context only.

    Args:
        user_query: The user's question. Newlines are replaced by spaces
            before it is placed in the prompt.
        embed_fn: Embedding function object exposing a ``document_mode`` flag.
            The flag is set to False for the query and restored afterwards.
        collection: ChromaDB collection to search.
        client: Gemini client used for text generation.
        user_language: Language code; only printed for debugging. The prompt
            always asks for an answer in Croatian.
        n_results: Number of documents to retrieve as context.

    Returns:
        The model's answer with surrounding whitespace removed, or the
        standard no-answer message when nothing was retrieved or the model
        returned no text.
    """
    # Print language (kept for debugging consistency)
    print(user_language.upper())

    # Embed the question in query mode, then put the flag back so later
    # document embedding is not silently done with the query task type.
    previous_mode = getattr(embed_fn, "document_mode", True)
    embed_fn.document_mode = False
    try:
        # Result is a dict of parallel nested lists, one inner list per query text.
        result = collection.query(query_texts=[user_query], n_results=n_results)
    finally:
        embed_fn.document_mode = previous_mode

    # Only one query text was sent, so take the first (only) inner list.
    all_passages = result["documents"][0]
    all_metadatas = result["metadatas"][0]

    query_oneline = user_query.replace("\n", " ")
    print(query_oneline)

    # Nothing retrieved: do not ask the model to answer from an empty context.
    if not all_passages:
        return _no_answer_response()

    # 1. CONSTRUCT THE CONTEXT
    total_passages = len(all_passages)
    context_list = []
    for position, (passage, metadata) in enumerate(zip(all_passages, all_metadatas), start=1):
        # A stored document may have no metadata at all.
        source_name = (metadata or {}).get("source", "Nepoznat izvor")
        context_list.append(
            f"--- Izvor: {source_name} (Dio {position} od {total_passages}) ---\n{passage.strip()}"
        )

    # Join all context passages into a single string
    context = "\n\n".join(context_list)

    # 2. CONSTRUCT THE PROMPT
    prompt = f"""
    Ti si ljubazan, precizan i informativan chatbot **Fakulteta Hrvatskih studija**. Tvoja je glavna zadaća odgovarati na pitanja studenata, potencijalnih studenata i osoblja o fakultetu, uključujući informacije o studijima, nastavi, smjerovima, prijavama, i općenitim informacijama o školi.

    **KRITIČNA PRAVILA:**
    1.  Koristi ISKLJUČIVO informacije iz dostavljene dokumentacije.
    2.  Odgovaraj na **Hrvatskom jeziku**.
    3.  Budi koncizan ali potpun — navedi sve relevantne detalje iz konteksta.
    4.  Ako dokumentacija ne sadrži odgovor, jasno i ljubazno reci da ne možeš pronaći odgovor u bazi znanja i uputi na kontaktiranje odgovarajuće službe.
    5.  **Ne smiješ koristiti fraze poput "Naravno, mogu vam pomoći!" ili "Evo nekoliko informacija o...". Odmah započni s relevantnim odgovorom.**

    **FORMATIRANJE ODGOVORA:**
    * Sve odgovore započni s **Izvorni link je [LINK](url)**, nakon čega slijedi prazan red.
    * Nemoj navoditi izvorni link dokumenta samo URL. npr nemoj navoditi: Izvorni link: ./markdown/fhs.hr_predmet_opsv.md
    * Koristi podebljani tekst za ključne pojmove (npr. **Upisi**, **Filozofija**, **Pročelnik**).
    * Koristi popise (liste) za nabrajanje informacija (studiji, uvjeti, rokovi).
    * Odgovori trebaju biti profesionalni i službeni, ali s ljubaznim tonom.

    **DOSTUPNA DOKUMENTACIJA (Kontekst):**
    {context}

    **KORISNIČKO PITANJE:** {query_oneline}

    **ODGOVOR:**
    """

    # 3. Call the model
    answer = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
        config={
            "max_output_tokens": 2048,
            "temperature": 0.2,
            "top_p": 0.9
        }
    )

    # The response text can be missing (None); fall back to the no-answer
    # message rather than failing on .strip().
    answer_text = (answer.text or "").strip()
    return answer_text or _no_answer_response()



In [8]:
# USAGE EXAMPLE: load the markdown files and build the vector store

markdown_folder = "./markdown"

# STEP 1: Load one Document per markdown file (no chunking is performed)
md_documents = parse_markdown_for_metadata(markdown_folder)

# STEP 2: Create the collection and add documents
# (a fully populated collection is left untouched)
client = import_google_api()
gemini_embedding_function = embedding_function(client)
chroma_persistent_client = chromadb.PersistentClient(path="./output")
create_collection(chroma_persistent_client, gemini_embedding_function, md_documents)

Processing 1095 markdown files...


Processing documents: 100%|██████████| 1095/1095 [00:00<00:00, 12975.52it/s]



Successfully processed 1095 files into documents.
models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp
models/gemini-embedding-001
Collection 'hrstud-bot' already has 1095 documents.


In [9]:
# STEP 3: Query the system (run for each query)
# The API client and embedding function are created again here and attached
# to the collection already persisted on disk (it must exist at this point).
client = import_google_api()
gemini_embedding_function = embedding_function(client)
embed_fn, collection = persistent_client(gemini_embedding_function)

user_query = "Tko je Sandro Skansi?"  # Example query
response = get_article_hr(
    user_query=user_query,
    embed_fn=embed_fn,
    collection=collection,
    client=client,
    user_language="HR"
)
display(Markdown(response))

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp
models/gemini-embedding-001
Connected to collection: hrstud-bot
Documents: 1095
Metadata: {'model': 'models/text-embedding-004', 'dimension': 768}
HR
Tko je Sandro Skansi?


Izvorni link je [https://www.fhs.hr/djelatnik/sandro.skansi](https://www.fhs.hr/djelatnik/sandro.skansi)

**Sandro Skansi** je izvanredni profesor. Diplomirao je 2009. godine **Filozofiju** i **Kroatologiju** na Hrvatskim studijima, a doktorirao je 2013. godine na Filozofskom fakultetu u Zagrebu disertacijom iz logike. Glavni mu je jezik Python.

Član je Hrvatskog filozofskog drustva, Hrvatskog logičkog udruženja i Association for the Advancement of Artificial Intelligence. Izradio je ili nadzirao izradu za produkcijske sustave umjetne inteligencije za Erste Banku, Iskon, Eurostat, A1 Telekom i HOK osiguranje.

Na Fakultetu Hrvatskih studija nositelj je sljedećih kolegija:
**Prijediplomski:**
*   [Društveni mediji i neformalna logika](https://www.fhs.hr/predmet/dmnl)
*   [Logika 2](https://www.fhs.hr/predmet/log2_a)
*   [Socijalna filozofija](https://www.fhs.hr/predmet/socfil_b)
*   [Uvod u umjetnu inteligenciju](https://www.fhs.hr/predmet/uuui)
*   [Završni rad](https://www.fhs.hr/predmet/zavrad)

**Diplomski:**
*   [Filozofija politike](https://www.fhs.hr/predmet/filpol)
*   [Suvremena filozofija](https://www.fhs.hr/predmet/suvfil_a)
*   [Umjetna inteligencija i razumijevanje prirodnoga jezika](https://www.fhs.hr/predmet/uirpj)
*   [Diplomski rad](https://www.fhs.hr/predmet/diprad_f)

**Doktorski:**
*   [Politička i pravna filozofija](https://www.fhs.hr/predmet/ppf_c)
*   [Estetika](https://www.fhs.hr/predmet/est_a)

Njegovi znanstveni interesi su:
*   Zaključivanje
*   Duboko učenje
*   Povijest logike i kibernetike u istočnom bloku
*   Filozofija uma

In [10]:
# ADVANCED: Test multiple queries
# Reuses embed_fn, collection and client created in the STEP 3 cell.
test_queries = [
    "Koje predmete predaje Mato Škerbić?",
    "Koje predmete predaje Sandro Skansi?",
    "Tko predaje Opća povijest srednjeg vijeka?"
]

# Print a banner per query, then render the model's answer as Markdown.
for query in test_queries:
    print(f"\n{'#'*60}")
    print(f"QUERY: {query}")
    print(f"{'#'*60}")
    response = get_article_hr(
        user_query=query,
        embed_fn=embed_fn,
        collection=collection,
        client=client,
        user_language="HR"
    )
    display(Markdown(response))
    print("\n")


############################################################
QUERY: Koje predmete predaje Mato Škerbić?
############################################################
HR
Koje predmete predaje Mato Škerbić?


Izvorni link je [https://www.fhs.hr/predmet/filodg](https://www.fhs.hr/predmet/filodg)

**Matija Mato Škerbić** je nositelj i izvođač (seminar) predmeta **Filozofija odgoja**.

Izvorni link je [https://www.fhs.hr/predmet/nek](https://www.fhs.hr/predmet/nek)

**Matija Mato Škerbić** je nositelj i izvođač (seminar) predmeta **Nova etička kultura**.

Izvorni link je [https://www.fhs.hr/predmet/fis](https://www.fhs.hr/predmet/fis)

**Matija Mato Škerbić** je nositelj i izvođač (seminar) predmeta **Filozofija igre i športa**.




############################################################
QUERY: Koje predmete predaje Sandro Skansi?
############################################################
HR
Koje predmete predaje Sandro Skansi?


Izvorni link je [https://www.fhs.hr/predmet/socfil_b](https://www.fhs.hr/predmet/socfil_b)

**Sandro Skansi** je nositelj kolegija **Socijalna filozofija**. Predavanja iz kolegija Socijalna filozofija počinju u četvrtak 9.10.

Izvorni link je [https://www.fhs.hr/predmet/suvfil_a](https://www.fhs.hr/predmet/suvfil_a)

**Sandro Skansi** je nositelj i izvođač seminara iz kolegija **Suvremena filozofija**.

Izvorni link je [https://www.fhs.hr/predmet/ppf_c](https://www.fhs.hr/predmet/ppf_c)

**Sandro Skansi** je nositelj kolegija **Politička i pravna filozofija**.




############################################################
QUERY: Tko predaje Opća povijest srednjeg vijeka?
############################################################
HR
Tko predaje Opća povijest srednjeg vijeka?


Izvorni link je [https://www.fhs.hr/predmet/opsv](https://www.fhs.hr/predmet/opsv)

**Opća povijest srednjega vijeka** se izvodi, a nositelj je izv. prof. dr. sc. **Marko Jerković**.



