In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
from qdrant_client import QdrantClient

if os.getenv("IS_DOCKER"):
    qdrant_host = "localhost"
else:
    qdrant_host = os.getenv("QDRANT_HOST")
    
qdrant_url = f"http://{qdrant_host}:{os.getenv('QDRANT_PORT')}"
client = QdrantClient(url=qdrant_url)

In [2]:
collection_name = os.getenv("QDRANT_COLLECTION")
limit = client.count(collection_name=collection_name, exact=True).count

In [3]:
offset = None

scroll_results, _ = client.scroll(
    collection_name=collection_name,
    limit=limit,
    offset=offset,
    with_payload=True,
    with_vectors=True,
)

print(f"ðŸ¤– Generating synthetic queries for {len(scroll_results)} docs...")

ðŸ¤– Generating synthetic queries for 117 docs...


In [4]:
import os # Ensure os is imported
import json
from langchain_community.chat_models import ChatDeepInfra
from langchain_core.prompts import ChatPromptTemplate

def call_local_llm(template_str, variables, model="openai/gpt-oss-20b"):
    # Ensure API Key is set
    if not os.getenv("DEEPINFRA_API_TOKEN"):
        raise ValueError("Please set your DEEPINFRA_API_TOKEN environment variable.")

    llm = ChatDeepInfra(
        name=model, 
        temperature=0, 
        deepinfra_api_token=os.getenv("DEEPINFRA_API_TOKEN")
    )
    
    # Create template from the raw string
    prompt = ChatPromptTemplate.from_template(template_str)
    
    # Pipe: Prompt -> LLM
    chain = prompt | llm
    
    # Invoke with the dictionary of variables
    response = chain.invoke(variables)
    
    # Fix: Extract the text string from the AIMessage object
    return response.content

def generate_query_agent(doc_text):
    # CHANGE: Explicit instructions for Indonesian context
    prompt_template = """
    You are an Indonesian user of a beauty clinic application (like ERHA or Dermies).
    Read the following document snippet:
    "{doc_text}"
    
    Task: Write a natural search query in **Indonesian (Bahasa Indonesia)** that you would type to find this exact information.
    
    Guidelines:
    - If the text mentions a price (Rp ...), generate a query like "Berapa harga..." or "Biaya treatment...".
    - If the text mentions a location, generate a query like "Lokasi klinik..." or "Dimana cabang...".
    - Use natural, casual Indonesian if appropriate, but keep it clear.
    
    Constraint: Output ONLY the query string in Indonesian. Do NOT use English. No quotes.
    """
    
    # Pass the template AND the variable dict
    return call_local_llm(prompt_template, {"doc_text": doc_text}).strip()

def relevance_judge_agent(query, doc_text):
    # CHANGE: Contextualized the judge to understand Indonesian relevance
    prompt_template = """
    You are a relevance judge for an Indonesian search engine.
    
    Query (Indonesian): "{query}"
    Document Snippet: "{doc_text}"
    
    Task: Does the document contain the answer to the query or is it highly relevant?
    Output: Return strictly valid JSON like this: {json_format}
    """

    example_json = '{"is_relevant": 1}'
    
    response_text = call_local_llm(
        prompt_template, 
        {
            "query": query, 
            "doc_text": doc_text,
            "json_format": example_json
        }
    )
    
    try:
        # Find the first { and last }
        start = response_text.find('{')
        end = response_text.rfind('}') + 1
        if start == -1 or end == 0: return 0
        
        json_str = response_text[start:end]
        data = json.loads(json_str)
        return data.get('is_relevant', 0)
    except Exception as e:
        return 0

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import pandas as pd
import os
from langchain_community.embeddings import DeepInfraEmbeddings

def build_dataset_with_ollama(client, collection_name, scroll_results):
    ollama_embedding = DeepInfraEmbeddings(
        model_id=os.getenv("DEEPINFRA_EMBEDDING_MODEL"),
        embed_instruction="",
        query_instruction=""
    )
    
    dataset = []
    total = len(scroll_results)
    
    print(f"ðŸš€ Starting generation pipeline on {total} documents...")

    for i, point in enumerate(scroll_results):
        print(f"   Processing {i+1}/{total}...", end="\r")
        
        doc_text = point.payload.get('full_text', '')
        # Skip empty/short docs to save time and token costs
        if len(doc_text) < 50: continue 

        # --- AGENT 1: GENERATOR (INDONESIAN MODIFICATION) ---
        
        doc_type = point.payload.get('type', 'Text')
        
        # We clarify the context for the LLM
        if doc_type == "Table":
            context_hint = "This text is a pricelist table (Daftar Harga)."
        else:
            context_hint = "This text is a description of a treatment or product."
        
        # MODIFIED PROMPT: Explicitly requests Bahasa Indonesia
        prompt_gen = """
        You are an Indonesian user of a beauty clinic application (like ERHA or Dermies).
        Context: {context_hint}
        Document Snippet: "{doc_text}"
        
        Task: Write a natural search query in **Bahasa Indonesia** that you would type to find this exact information.
        
        Guidelines:
        - If it lists a price, ask like: "Berapa harga [treatment]?" or "Biaya [treatment]...".
        - If it describes a skin problem, ask like: "Solusi [jerawat/flek]..." or "Cara mengatasi...".
        - If it mentions a location, ask like: "Lokasi klinik di..." or "Cabang terdekat...".
        - Use casual but clear Indonesian.
        
        Constraint: Output ONLY the query string in Indonesian. Do NOT use quotes.
        """
        
        # Safe call with variables
        synthetic_query = call_local_llm(
            prompt_gen, 
            {"context_hint": context_hint, "doc_text": doc_text[:1200]} # Truncate to avoid context limit
        ).strip()
        
        # Validation: If query is too short or accidentally English
        if len(synthetic_query) < 5: continue
            
        # --- RETRIEVAL (Mining Hard Negatives) ---
        q_vector = ollama_embedding.embed_query(synthetic_query)
        
        search_result = client.query_points(
            collection_name=collection_name,
            query=q_vector,
            limit=4 # 1 Positive + 3 Potential Negatives
        )
        
        # Collect candidates: The retrieved hits + the original source doc
        candidates = {}
        
        # 1. Add Original Source (The "Gold" Positive)
        candidates[point.id] = point
        
        # 2. Add Retrieved Hits (Potential Hard Negatives)
        for hit in search_result.points:
            candidates[hit.id] = hit
            
        # --- AGENT 2: JUDGE (Ensures Quality) ---
        for doc_id, candidate_point in candidates.items():
            cand_text = candidate_point.payload.get('full_text', '')
            
            is_relevant = relevance_judge_agent(synthetic_query, cand_text)
            
            dataset.append({
                "query": synthetic_query,
                "doc_id": doc_id,
                "full_text": cand_text,
                "h1": candidate_point.payload.get('h1', ''), 
                "h2": candidate_point.payload.get('h2', ''),
                "qdrant_score": candidate_point.score if hasattr(candidate_point, 'score') else 0.0, 
                "label": is_relevant
            })
            
    print(f"\nâœ… Finished! Generated {len(dataset)} training examples.")
    return pd.DataFrame(dataset)

# Usage
df_train = build_dataset_with_ollama(client, collection_name, scroll_results)
df_train.head(15)

ðŸš€ Starting generation pipeline on 117 documents...
   Processing 117/117...
âœ… Finished! Generated 481 training examples.


Unnamed: 0,query,doc_id,full_text,h1,h2,qdrant_score,label
0,Harga treatment Dermias Max di klinik kecantikan,00a745c3-1b98-4fa3-ae10-848963a7f80d,DERMIAS MAX\n\nRefresh Bright Oxy\n\nTreatment...,661627558-Katalog-Dermies-Max-by-Erha-Hiress.pdf,,0.56365,1
1,Harga treatment Dermias Max di klinik kecantikan,29cc2d5f-cfb2-4a17-a8b0-dac9bab6dd1d,DERMIAS MAX\n\nPerfect Glow Treatment by IPL\n...,661627558-Katalog-Dermies-Max-by-Erha-Hiress.pdf,,0.587803,1
2,Harga treatment Dermias Max di klinik kecantikan,1eae878a-608e-4754-ad8f-135f8af0f5d6,DERMIAS MAX\n\nÂ© 20-30 Menit\n\nBotox Treatmen...,661627558-Katalog-Dermies-Max-by-Erha-Hiress.pdf,,0.570249,1
3,Harga treatment Dermias Max di klinik kecantikan,f5506f93-97e5-4f5f-ad57-1b54fdee045a,Apa Keunggulannya?\n\n|\n\nHempaskan rambut ha...,661627558-Katalog-Dermies-Max-by-Erha-Hiress.pdf,,0.563202,1
4,Harga perawatan mata panda di ERHA Ultimate Br...,023dbd85-970b-41a8-81ae-4dfe821b9bcf,PROSEDUR\n\nCleansing - Treatment - Post treat...,724126666-ERHA-Ultimate-Pricelist-24.pdf,,0.604614,1
5,Harga perawatan mata panda di ERHA Ultimate Br...,4c16b569-f0b9-4691-a32e-75dabc417bbe,MANFAAT\n\n2) 60 menit\n\n@ Mulai dari Rp6.500...,724126666-ERHA-Ultimate-Pricelist-24.pdf,,0.556525,0
6,Harga perawatan mata panda di ERHA Ultimate Br...,7cbda854-3258-4475-8fa4-caeff7a749d5,ERHA ULTIMATE\n\nACNE CENTER\n\nPROGRAMS\n\nAc...,724126666-ERHA-Ultimate-Pricelist-24.pdf,,0.539494,0
7,Harga perawatan mata panda di ERHA Ultimate Br...,e22ab03c-5c80-42ad-9951-303b43843740,DERMIAS MAx\n\nDark Circle Rp 115.000 Correcti...,661627558-Katalog-Dermies-Max-by-Erha-Hiress.pdf,,0.538494,0
8,Solusi untuk bekas jerawat dan jerawat,04bb7b75-0cbd-4838-9a02-74feaef97699,Table of\n\nCONTENTS\n\nTable of Content One S...,724126666-ERHA-Ultimate-Pricelist-24.pdf,,0.604908,1
9,Solusi untuk bekas jerawat dan jerawat,7cbda854-3258-4475-8fa4-caeff7a749d5,ERHA ULTIMATE\n\nACNE CENTER\n\nPROGRAMS\n\nAc...,724126666-ERHA-Ultimate-Pricelist-24.pdf,,0.605298,1


In [6]:
df_train.to_csv("../data/csv/rerank_train.csv", index=False)