In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
from qdrant_client import QdrantClient

# Resolve the Qdrant endpoint from environment variables (loaded by
# load_dotenv() above) and open a client against it.
# Any non-empty IS_DOCKER value -- including "0" or "false" -- selects
# "localhost"; otherwise the host is read from QDRANT_HOST.
# NOTE(review): inside a container the Qdrant host is often a service name
# rather than localhost -- confirm this branch is not inverted.
qdrant_host = "localhost" if os.getenv("IS_DOCKER") else os.getenv("QDRANT_HOST")

# An unset QDRANT_HOST / QDRANT_PORT is rendered as the text "None" in the URL.
qdrant_url = "http://{}:{}".format(qdrant_host, os.getenv("QDRANT_PORT"))
client = QdrantClient(url=qdrant_url)

In [2]:
collection_name = os.getenv("QDRANT_COLLECTION")
if not collection_name:
    # Fail early with a clear message instead of an opaque client/HTTP error.
    raise ValueError("Please set your QDRANT_COLLECTION environment variable.")

# Scroll through the whole collection page by page. Each scroll call returns
# at most `limit` points together with the offset of the next page, so a
# single call would silently drop every point after the first 100.
offset = None
search_result = []
while True:
    page, offset = client.scroll(
        collection_name=collection_name,
        limit=100,
        offset=offset,
        with_payload=True,
        with_vectors=True,
    )
    search_result.extend(page)
    if offset is None:  # the client reports no further page
        break

In [3]:
import pandas as pd

def process_qdrant_results(results, text_keys=('h1', 'h2', 'h3', 'h4', 'h5')):
    """Flatten Qdrant points into plain dicts that can be fed to a DataFrame.

    Args:
        results: Iterable of point-like objects exposing ``id`` and
            ``payload`` attributes (``payload`` is a dict or ``None``).
        text_keys: Payload keys, in priority order, that are joined into
            ``full_text`` and also copied into columns of the same name.

    Returns:
        A list with one dict per point containing ``id``, ``full_text`` and
        one entry per key in ``text_keys`` (``''`` when the key is absent).
    """
    processed_list = []

    for point in results:
        # A point without a stored payload has payload=None; treat it as empty
        # instead of failing on the `.get` calls below.
        payload = point.payload or {}

        # Join only headings that carry text. Skipping None avoids the literal
        # word "None" in the text, and skipping blanks avoids double spaces.
        text_parts = []
        for key in text_keys:
            value = payload.get(key)
            if value is None:
                continue
            text = str(value).strip()
            if text:
                text_parts.append(text)

        record = {'id': point.id, 'full_text': " ".join(text_parts)}
        # Per-heading columns keep the raw payload value, defaulting to ''.
        for key in text_keys:
            record[key] = payload.get(key, '')
        processed_list.append(record)

    return processed_list

# Convert the scrolled Qdrant points into plain records; `clean_data` is the
# list of dicts that later cells turn into the labeling input frame.
clean_data = process_qdrant_results(search_result)
# Preview the first ten records as a table (last expression -> rich display).
pd.DataFrame(clean_data).head(10)

Unnamed: 0,id,full_text,h1,h2,h3,h4,h5
0,01afb7a4-3919-49eb-9399-4c841cbcbf38,OPERATIONAL HOUR,OPERATIONAL HOUR,,,,
1,02aa9b27-f4d7-4321-be72-b7222ecb0883,OPERATIONAL HOUR,OPERATIONAL HOUR,,,,
2,0835f371-926d-4779-920b-1d8696b34ec6,Treatments,Treatments,,,,
3,08ce9c10-d605-4ece-a147-ba4eefe6b584,Treatments Bagaimana jika ingin melakukan pemb...,Treatments,Bagaimana jika ingin melakukan pembatalan book...,FAQ,,
4,198725d9-115a-4f43-bdcd-5379b9df9cc9,OPERATIONAL HOUR,OPERATIONAL HOUR,,,,
5,2fc26abe-1314-4a1f-9d01-38ec4fad73ae,SEE OUR RESULT YOUR FEEDBACK & REVIEWS ARE IMP...,,SEE OUR RESULT,YOUR FEEDBACK & REVIEWS ARE IMPORTANT TO US!,,SKIN LIFTING
6,322afcaa-63f5-4784-8ca0-496207aa0f55,SEE OUR RESULT YOUR FEEDBACK & REVIEWS ARE IMP...,,SEE OUR RESULT,YOUR FEEDBACK & REVIEWS ARE IMPORTANT TO US!,Tan Michelle ![icon-verified](https://emdeecli...,
7,34a93331-e4d6-4114-ba53-a3b1f16d9b57,OPERATIONAL HOUR,OPERATIONAL HOUR,,,,
8,34af740f-5d2b-465c-84bb-57438713cc22,WHY EMDEE? MISI OUR JOURNEY,WHY EMDEE?,,MISI,OUR JOURNEY,
9,37ec5272-5f85-42f4-b718-95234426f429,OPERATIONAL HOUR,OPERATIONAL HOUR,,,,


In [4]:
import os
import json
import pandas as pd
from langchain_community.chat_models import ChatDeepInfra
from langchain_core.prompts import ChatPromptTemplate

# 1. Modified Wrapper: Accepts 'variables' dictionary
def call_local_llm(template_str, variables, model="openai/gpt-oss-20b"):
    """Fill a prompt template and return the DeepInfra chat model's reply text.

    Args:
        template_str: Prompt template using ``{placeholder}`` fields; literal
            braces must be doubled (``{{`` / ``}}``).
        variables: Mapping from placeholder name to the value to substitute.
        model: DeepInfra model identifier to run the prompt against.

    Returns:
        The ``content`` of the chat model's response message.

    Raises:
        ValueError: If ``DEEPINFRA_API_TOKEN`` is not set in the environment.
    """
    api_token = os.getenv("DEEPINFRA_API_TOKEN")
    if not api_token:
        raise ValueError("Please set your DEEPINFRA_API_TOKEN environment variable.")

    llm = ChatDeepInfra(
        # `model` is the alias of ChatDeepInfra's `model_name` field. Passing
        # the identifier as `name=` only labels the runnable, which leaves the
        # class-default model in use and ignores the requested one.
        model=model,
        temperature=0,
        deepinfra_api_token=api_token,
    )

    # Create template from the raw string
    prompt = ChatPromptTemplate.from_template(template_str)

    # Pipe: Prompt -> LLM
    chain = prompt | llm

    # Invoke with the dictionary of variables
    response = chain.invoke(variables)

    # The chain yields a message object; callers only want its text.
    return response.content

def generate_query_agent(doc_text):
    """Ask the LLM for a search query a user might type to find ``doc_text``.

    Args:
        doc_text: Document snippet substituted into the prompt.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    # Plain (non-f) string on purpose: the {doc_text} placeholder is left
    # intact so the prompt template fills it in at call time.
    template = """
    You are a user of a beauty clinic app. Read the following document snippet:
    "{doc_text}"
    
    Task: Write a specific search query (in Indonesian or English) that you would type to find this exact information.
    Constraint: Output ONLY the query string. No quotes, no markdown.
    """

    raw_query = call_local_llm(template, {"doc_text": doc_text})
    return raw_query.strip()

def _coerce_binary_label(value):
    """Map a parsed ``is_relevant`` value onto the integers 0 or 1."""
    if isinstance(value, str):
        # A non-empty string such as "0" is truthy, so compare its text.
        return 1 if value.strip().lower() in ("1", "true", "yes") else 0
    if isinstance(value, (bool, int, float)):
        return 1 if value else 0
    return 0


def _parse_relevance_label(response_text):
    """Extract the 0/1 relevance label from the judge's raw reply."""
    if not isinstance(response_text, str):
        print(f"JSON Parse Error: reply is not text | Raw: {response_text}")
        return 0

    # Find the first { and last } to strip markdown like ```json ... ```
    start = response_text.find('{')
    end = response_text.rfind('}') + 1
    if start == -1 or end <= start:
        return 0

    try:
        data = json.loads(response_text[start:end])
    except ValueError as e:  # json.JSONDecodeError subclasses ValueError
        print(f"JSON Parse Error: {e} | Raw: {response_text}")
        return 0  # Fallback

    return _coerce_binary_label(data.get('is_relevant', 0))


def relevance_judge_agent(query, doc_text):
    """Ask the LLM whether ``doc_text`` answers ``query``.

    Args:
        query: Search query being judged.
        doc_text: Candidate document text.

    Returns:
        ``1`` if the model marks the document relevant, otherwise ``0``.
        The value is always a plain ``int``; JSON ``true`` or a quoted
        ``"1"`` are normalised, and unparsable replies fall back to ``0``.
    """
    # NOTE:
    # 1. We use {query} and {doc_text} for variables.
    # 2. We use DOUBLE curly braces {{...}} for the JSON example so LangChain ignores it.
    prompt_template = """
    You are a relevance judge.
    Query: "{query}"
    Document: "{doc_text}"
    
    Task: Does the document contain the answer to the query?
    Output: Return strictly valid JSON: {{"is_relevant": 1}} or {{"is_relevant": 0}}
    """

    response_text = call_local_llm(prompt_template, {"query": query, "doc_text": doc_text})
    return _parse_relevance_label(response_text)

def create_labeled_dataset(df_clean, samples_per_doc=3, random_state=None):
    """Build a (query, document, label) dataset with an LLM generator and judge.

    For every row with non-blank ``full_text`` a synthetic query is generated.
    The row itself plus up to ``samples_per_doc`` randomly drawn other rows are
    then judged against that query, and one output record is emitted per
    judged candidate.

    Args:
        df_clean: DataFrame with at least ``id`` and ``full_text`` columns;
            ``h1`` / ``h2`` are copied through when present.
        samples_per_doc: Maximum number of other rows sampled per query.
        random_state: Optional integer seed. When given, the sampling is
            reproducible (a distinct seed is derived per source row); when
            ``None`` the sampling is unseeded.

    Returns:
        DataFrame with columns ``query``, ``doc_id``, ``full_text``, ``h1``,
        ``h2`` and ``label`` (the columns are present even when no record
        was produced).
    """
    output_columns = ['query', 'doc_id', 'full_text', 'h1', 'h2', 'label']
    labeled_data = []

    print(f"🚀 Starting generation on {len(df_clean)} documents...")

    # Count rows positionally: the frame's index labels need not be integers.
    for position, (_, row) in enumerate(df_clean.iterrows(), start=1):
        print("Document:", position)
        source_doc_text = row['full_text']

        # A blank or missing text gives the generator nothing to work from.
        if not isinstance(source_doc_text, str) or not source_doc_text.strip():
            print("Skipping document with empty text.")
            continue

        # --- PHASE A: GENERATOR ---
        print("Generating synthetic query...")
        synthetic_query = generate_query_agent(source_doc_text)

        # Validation: Check if query is empty or just whitespace
        if not synthetic_query or len(synthetic_query.strip()) < 3:
            continue

        # --- PHASE B: SELECT CANDIDATES ---
        candidates = [row]  # the source document itself

        other_docs = df_clean[df_clean['id'] != row['id']]
        if not other_docs.empty:
            n_samples = max(0, min(len(other_docs), samples_per_doc))
            # Offset the seed per row so a fixed seed does not pick the same
            # positions for every document.
            seed = None if random_state is None else random_state + position
            sampled = other_docs.sample(n=n_samples, random_state=seed)
            candidates.extend(other_row for _, other_row in sampled.iterrows())

        print("Total candidates:", len(candidates))

        # --- PHASE C: JUDGE ---
        for i, candidate in enumerate(candidates):
            print("Candidate:", i + 1)
            label = relevance_judge_agent(synthetic_query, candidate['full_text'])

            labeled_data.append({
                'query': synthetic_query,
                'doc_id': candidate['id'],
                'full_text': candidate['full_text'],
                'h1': candidate.get('h1', ''),
                'h2': candidate.get('h2', ''),
                'label': label
            })

    # Fixing the columns keeps the schema stable when nothing was labeled.
    return pd.DataFrame(labeled_data, columns=output_columns)

In [5]:
# Build the labeling input frame from the processed Qdrant records and run
# the generate-then-judge pipeline over it. Each document triggers one
# query-generation call plus one judge call per candidate.
df_input = pd.DataFrame(clean_data)
df_labeled = create_labeled_dataset(df_input)

# Show the first ten labeled (query, document, label) rows.
print("\n--- Final Labeled Dataset for XGBoost ---")
df_labeled.head(10)

ðŸš€ Starting generation on 57 documents...
Document: 1
Generating synthetic query...
Total candidates: 4
Candidate: 1
Candidate: 2
Candidate: 3
Candidate: 4
Document: 2
Generating synthetic query...
Total candidates: 4
Candidate: 1
Candidate: 2
Candidate: 3
Candidate: 4
Document: 3
Generating synthetic query...
Total candidates: 4
Candidate: 1
Candidate: 2
Candidate: 3
Candidate: 4
Document: 4
Generating synthetic query...
Total candidates: 4
Candidate: 1
Candidate: 2
Candidate: 3
Candidate: 4
Document: 5
Generating synthetic query...
Total candidates: 4
Candidate: 1
Candidate: 2
Candidate: 3
Candidate: 4
Document: 6
Generating synthetic query...
Total candidates: 4
Candidate: 1
Candidate: 2
Candidate: 3
Candidate: 4
Document: 7
Generating synthetic query...
Total candidates: 4
Candidate: 1
Candidate: 2
Candidate: 3
Candidate: 4
Document: 8
Generating synthetic query...
Total candidates: 4
Candidate: 1
Candidate: 2
Candidate: 3
Candidate: 4
Document: 9
Generating synthetic query...
To

Unnamed: 0,query,doc_id,full_text,h1,h2,label
0,jam operasional klinik kecantikan,01afb7a4-3919-49eb-9399-4c841cbcbf38,OPERATIONAL HOUR,OPERATIONAL HOUR,,1
1,jam operasional klinik kecantikan,0835f371-926d-4779-920b-1d8696b34ec6,Treatments,Treatments,,0
2,jam operasional klinik kecantikan,7716056d-758c-472b-94f5-de0a65408e36,OPERATIONAL HOUR,OPERATIONAL HOUR,,1
3,jam operasional klinik kecantikan,8abc43f4-cbcd-444a-9f97-38c8073453f2,Solusi terbaik mengatasi permasalahan kulit wa...,Solusi terbaik mengatasi permasalahan kulit wa...,,0
4,jam operasional klinik kecantikan,02aa9b27-f4d7-4321-be72-b7222ecb0883,OPERATIONAL HOUR,OPERATIONAL HOUR,,1
5,jam operasional klinik kecantikan,7079a3ee-4560-4f53-9741-f1d39affd8de,Treatments,Treatments,,0
6,jam operasional klinik kecantikan,34a93331-e4d6-4114-ba53-a3b1f16d9b57,OPERATIONAL HOUR,OPERATIONAL HOUR,,1
7,jam operasional klinik kecantikan,08ce9c10-d605-4ece-a147-ba4eefe6b584,Treatments Bagaimana jika ingin melakukan pemb...,Treatments,Bagaimana jika ingin melakukan pembatalan book...,0
8,perawatan kecantikan,0835f371-926d-4779-920b-1d8696b34ec6,Treatments,Treatments,,0
9,perawatan kecantikan,430299ef-b7d3-449e-881c-6e7450352010,OPERATIONAL HOUR,OPERATIONAL HOUR,,0


In [6]:
# Persist the labeled dataset. The target folder is created first because
# to_csv does not create missing parent directories.
labeled_csv_path = "../data/csv/labeled_data.csv"
os.makedirs(os.path.dirname(labeled_csv_path), exist_ok=True)
df_labeled.to_csv(labeled_csv_path, index=False)