# Build and upload index

In [2]:
from evidence_seeker.retrieval import build_index, RetrievalConfig
# Retrieval configuration shared by all following cells: which embedding
# model/backend to use, where the source documents live, and where the
# index is persisted locally.
config = RetrievalConfig(
    ###### MODEL CONFIGURATION ##############
    ### Local model (via Huggingface API) ###
    embed_backend_type="huggingface",
    embed_model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",   
    ### Huggingface inference API ###########
    # (alternative to the local model above; uncomment this group and
    #  comment out the two lines above to switch)
    # embed_backend_type="huggingface_inference_api",
    # embed_model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    # embed_base_url="https://router.huggingface.co/hf-inference/models/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    # api_key_name="hf_debatelab_inference_provider",
    # bill_to="DebateLabKIT",
    ####### END MODEL CONFIGURATION #########
    # Name of the environment variable that holds the HuggingFace hub token
    # (read further below via os.environ).
    hub_key_name="hf_evse_data",
    document_input_dir="../TMP/APUZ/corpus",
    index_persist_path="../TMP/APUZ/storage",
    # Uncomment the following line to pass a hub path to `build_index`
    # (as `upload_hub_path`). It is left commented out here; the upload is
    # done explicitly at the end of the notebook.
    # NOTE(review): presumably `build_index` skips the upload when this is unset - confirm.
    #index_hub_path = "DebateLabKIT/apuz-index-es",
)

In [7]:
from loguru import logger
import pathlib
import yaml
from llama_index.core import SimpleDirectoryReader
from pprint import pprint
from typing import Dict
import os
#import chardet 

# Maps a PDF file name (the "file" entry of its YAML sidecar) to the metadata
# that is attached to every document loaded from that PDF.
metadata_dict = {}
# Fall back to the directory of the first explicitly listed input file when
# no input directory is configured.
metadata_dir = config.document_input_dir or os.path.dirname(config.document_input_files[0])
# Load and parse all YAML files below metadata_dir (recursively).
for filepath in pathlib.Path(metadata_dir).rglob("*.yaml"):
    # rglob also descends into sub-directories, so the PDF has to be looked
    # up next to its YAML file, not in the top-level directory.
    pdf_file = filepath.with_suffix(".pdf")
    if not pdf_file.is_file():
        logger.warning(f"PDF file {pdf_file} does not exist, skipping metadata file {filepath}.")
        continue

    # Only keep the file open while parsing.
    with open(filepath, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty file (and may return a list or a
    # scalar), so make sure we got a mapping before testing for keys.
    if not isinstance(data, dict) or not all(
        key in data for key in ("file", "author", "url", "title")
    ):
        logger.warning(f"Invalid metadata in {filepath}.")
        continue

    metadata_dict[data["file"]] = {
        "author": data["author"],
        "title": data["title"],
        "url": data["url"],
        "year": data.get("year", None),  # optional field
        "month": data.get("month", None),  # optional field
        # added by hand (not correctly in the yaml files/bibtex file)
        "journal": "Aus Politik und Zeitgeschichte (APuZ)",
    }
        

def document_file_metadata(filename: str) -> Dict:
    """Return the metadata registered for ``filename``.

    Only the final path component of ``filename`` is used as the key into
    ``metadata_dict``. When the lookup yields nothing (or an empty entry),
    a warning is logged and the empty result is returned.
    """
    basename = pathlib.Path(filename).name
    meta = metadata_dict.get(basename, {})
    if meta:
        return meta
    logger.warning(f"No metadata found for file: {filename}")
    return meta

# List all PDF files below metadata_dir (the YAML metadata files must not be
# indexed). The suffix check is case-insensitive and the result is sorted so
# that the document order does not depend on the filesystem's listing order.
pdf_files = sorted(
    str(p)
    for p in pathlib.Path(metadata_dir).rglob("*")
    if p.is_file() and p.suffix.lower() == ".pdf"
)

# Sanity check: load the PDFs once with the metadata callback attached.
# `docs` is only used for the count below; `build_index` is later given
# `pdf_files` and `document_file_metadata` directly.
reader = SimpleDirectoryReader(
    input_files=pdf_files,
    file_metadata=document_file_metadata,
)
docs = reader.load_data()
print(f"Loaded {len(docs)} documents with metadata from {metadata_dir}")
# For debugging, inspect the attached metadata with e.g. `pprint(docs[0].metadata)`.


Loaded 907 documents with metadata from ../TMP/APUZ/corpus


In [8]:
# TODO: Move this token lookup into the package.
import os
import dotenv
dotenv.load_dotenv()
# Resolve the API token for the embedding backend from the environment
# variable whose name is given by `config.api_key_name`.
if config.api_key_name is not None and config.api_key_name in os.environ:
    api_token = os.environ[config.api_key_name]
else:
    # No key name configured, or the variable is not set: continue without
    # a token and tell the user how to provide one.
    api_token = None
    print(
        f"Warning: Check wether you need an API token for your embedding backend ('{config.embed_backend_type}'). "
        "If you need one, specify the key name under 'api_key_name' in the config and "
        "set the API token with the specified key name as environment variable in .env file"
    )
# Alternatively, you can set the API token directly:
# api_token = "your_api_token_here"



In [9]:
import huggingface_hub
import os
import dotenv
dotenv.load_dotenv()

# TODO: Move this token lookup into the package.

# Resolve the HuggingFace hub token from the environment variable whose name
# is given by `config.hub_key_name`.
if config.hub_key_name is None or config.hub_key_name not in os.environ:
    # No key name configured, or the variable is not set: continue without
    # a token and tell the user how to provide one.
    hub_token = None
    print(
        f"Warning: Check wether you need a token to upload the index on HuggingFace hub. "
        f"If you need one, specify the key name under 'hub_key_name' in the config and "
        f"set the token with the specified key name as environment variable in .env file"
    )
else:
    hub_token = os.environ[config.hub_key_name]
# NOTE: do not re-read the token unconditionally after this point;
# `os.environ.get(None)` raises a TypeError when no key name is configured.
# Alternatively, you can set the HuggingFace hub token directly:
# hub_token = "your_huggingface_hub_token_here"

In [10]:
# Build the index from the PDFs collected above and persist it locally.
# The recorded run of this cell took roughly an hour (see the log timestamps below).
build_index(
    # --- input documents ---
    # The explicit file list is passed instead of `config.document_input_dir`
    # so that only the PDFs (not the YAML metadata files) are read.
    document_input_files=pdf_files,
    document_file_metadata=document_file_metadata,
    # --- embedding model / backend ---
    embed_backend_type=config.embed_backend_type,
    embed_model_name=config.embed_model_name,
    embed_base_url=config.embed_base_url,
    api_token=api_token,
    bill_to=config.bill_to,
    # --- output ---
    index_persist_path=config.index_persist_path,
    upload_hub_path=config.index_hub_path,
    hub_token=hub_token,
)

[32m2025-06-04 14:36:35.084[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m437[0m - [34m[1mReading documents from ['../TMP/APUZ/corpus/ehrhardt_vernetzt_2024.pdf', '../TMP/APUZ/corpus/jager_hohere_2024.pdf', '../TMP/APUZ/corpus/stephan_europa_2024.pdf', '../TMP/APUZ/corpus/ulrike_flucht_2024.pdf', '../TMP/APUZ/corpus/joachim_am_2024.pdf', '../TMP/APUZ/corpus/juhasz_mehr_2024.pdf', '../TMP/APUZ/corpus/haas_von_2023.pdf', '../TMP/APUZ/corpus/becker_antisemitische_2024.pdf', '../TMP/APUZ/corpus/hoppe_schatten_2024.pdf', '../TMP/APUZ/corpus/wrohlich_erwerbsbeteiligung_2024.pdf', '../TMP/APUZ/corpus/gorg_comeback_2024.pdf', '../TMP/APUZ/corpus/rainald_alltagliche_2024.pdf', '../TMP/APUZ/corpus/geier_blackout_2023.pdf', '../TMP/APUZ/corpus/andreas_humanitare_2024.pdf', '../TMP/APUZ/corpus/rieger-ladich_neustart_2024.pdf', '../TMP/APUZ/corpus/svetlana_zwischen_2024.pdf', '../TMP/APUZ/corpus/jobst_uber_2024.pdf', '../TMP/APUZ/corpus/jana_mol

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1702 [00:00<?, ?it/s]

[32m2025-06-04 15:34:29.671[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m464[0m - [34m[1mPersisting index to /home/basti/Nextcloud/Documents/mindmaps/mind/projects/kideku/code/evidence-seeker/TMP/APUZ/storage/index[0m


In [11]:
# Display the local folder the index was persisted to; this is the folder
# that is uploaded in the next cell.
config.index_persist_path

'../TMP/APUZ/storage'

In [12]:
# explicitly upload the index to the HuggingFace hub 
# (can also be done via the `build_index` function)
from evidence_seeker.retrieval.base import INDEX_PATH_IN_REPO

# Use a lower-case instance name: binding the client to `HfApi` would give the
# instance the same name as the class `huggingface_hub.HfApi`.
hf_api = huggingface_hub.HfApi(token=hub_token)
# Upload the whole persist folder to the dataset repository. As the last
# expression of the cell, the returned commit info is displayed.
hf_api.upload_folder(
    repo_id="DebateLabKIT/apuz-index-es",
    folder_path=config.index_persist_path,
    # NOTE(review): `path_in_repo` is left at its default; the persist folder
    # already contains an `index` sub-folder (see the build log above) - confirm
    # this matches INDEX_PATH_IN_REPO.
    #path_in_repo=INDEX_PATH_IN_REPO,
    repo_type="dataset",
)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

default__vector_store.json:   0%|          | 0.00/650M [00:00<?, ?B/s]

docstore.json:   0%|          | 0.00/208M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/DebateLabKIT/apuz-index-es/commit/799dee54e009b532410c4765368ee1bcbdade16b', commit_message='Upload folder using huggingface_hub', commit_description='', oid='799dee54e009b532410c4765368ee1bcbdade16b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/DebateLabKIT/apuz-index-es', endpoint='https://huggingface.co', repo_type='dataset', repo_id='DebateLabKIT/apuz-index-es'), pr_revision=None, pr_num=None)