In [1]:
# 🔧 Setup & Import
import sys
import os

# Make `<cwd>/../src` importable so the `schema_linking` package resolves.
# The membership check keeps the cell idempotent: re-running it does not
# append the same directory to sys.path a second time.
SRC_DIR = os.path.join(os.path.abspath(os.path.join(os.getcwd(), "..")), "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from schema_linking.config.config import TOP_K_COLUMNS
from schema_linking.schema.extract_schema import extract_active_column_texts
from schema_linking.keywords.keyword_extractor import llm_keyword_extraction
from schema_linking.retrieval.retriever import build_or_load_index, retrieve_top_k
from schema_linking.llm.llm_linker import llm_table_linking, llm_column_linking, llm_filter_columns_by_keywords
from schema_linking.utils.printer import (
    print_header,
    print_keywords,
    pretty_print_list,
    print_summary
)
from schema_linking.utils.schema_utils import (
    build_table_description_map,
    build_column_description_map
)

In [2]:
# ✍️ Query input: the natural-language question used by every later step
question = "Qual è l'indirizzo e la località delle banche con codice CAB 12345?"

print_header("Input")
print("Question: {}".format(question))


Input
Question: Qual è l'indirizzo e la località delle banche con codice CAB 12345?


In [7]:
# 🔍 Step 1: keyword extraction
# The header is printed first so any output emitted by the LLM call appears
# underneath it.
print_header("Step 1: Extracting keywords using LLM...")

# `keywords` is reused by Step 4 (column filtering) and by the final summary.
keywords = llm_keyword_extraction(question)

print_keywords(keywords)


Step 1: Extracting keywords using LLM...





Keywords:
1. banche
2. indirizzo
3. località
4. codice CAB
5. 12345


In [8]:
# 🧱 Step 2: load the column texts that describe the schema
print_header("Step 2: Loading schema...")

# `column_texts` feeds the FAISS index built in Step 3.
column_texts = extract_active_column_texts()

print("Loaded {} columns".format(len(column_texts)))


Step 2: Loading schema...
Loading active fields from cache: c:\Workspace\NL2SQL\data\active_fields.parquet
Loaded 3908 columns


In [9]:
# 🧠 Step 3: FAISS pruning — narrow the schema down to TOP_K_COLUMNS candidates
print_header(f"Step 3: FAISS column pruning (top {TOP_K_COLUMNS}) using keywords...")

# NOTE(review): the header above says "using keywords", but the retrieval query
# passed below is the raw `question`; `keywords` is not used in this cell.
# Confirm which of the two is intended.
column_index, column_map = build_or_load_index(column_texts)
pruned_columns = retrieve_top_k(
    column_index,
    column_map,
    question,
    TOP_K_COLUMNS,
)

if not pruned_columns:
    print("No columns found")
else:
    pretty_print_list("Pruned Columns (FAISS)", pruned_columns)


Step 3: FAISS column pruning (top 50) using keywords...
Loading FAISS index from disk → c:\Workspace\NL2SQL\data\faiss_columns.bin

Pruned Columns (FAISS):
1. ba_bank.BACABCODE: Codice CAB (tabella: BA - Banche)
2. ba_bank.BALOCALITY: Localit� banca (tabella: BA - Banche)
3. ba_bank.BABANKID: Codice Banca (tabella: BA - Banche)
4. ba_bank.BACODSTA: Codice stato (ISO) (tabella: BA - Banche)
5. ba_bank.BACABDESCRI: Descrizione CAB (tabella: BA - Banche)
6. ba_bank.BACODEST: Codice BIC Swift (tabella: BA - Banche)
7. ba_caupar.CCOPEBAN: Tipo operazione bancaria (distinta) (tabella: CG - Causali partite scadenze)
8. ba_bank.BAABICODE: Codice ABI (tabella: BA - Banche)
9. ba_bank.BACAP: CAP Banca (tabella: BA - Banche)
10. ba_bank.BAFLBANC: Tipo banca Nazionale/Estera (tabella: BA - Banche)
11. ba_bank.BADESCRI: Descrizione (tabella: BA - Banche)
12. ba_city.CTCAP: Codice Avviamento Postale (tabella: BA - Localit�)
13. ba_bank.BAADDRESS: Indirizzo banca (tabella: BA - Banche)
14. ba_caucon

In [10]:
# 🧠 Step 4: let the LLM filter the FAISS candidates against the keywords
print_header("Step 4: LLM filtering on pruned columns...")

llm_filtered_columns = llm_filter_columns_by_keywords(
    question,
    pruned_columns,
    keywords,
)

if not llm_filtered_columns:
    print("No columns passed the LLM filtering step")
else:
    pretty_print_list("Filtered Columns (LLM)", llm_filtered_columns)


Step 4: LLM filtering on pruned columns...





Filtered Columns (LLM):
1. ba_bank.BALOCALITY
2. ba_bank.BACABCODE
3. ba_bank.BACAP
4. ba_bank.BAADDRESS


In [11]:
# 🧮 Step 5: Table Linking via LLM
print_header("Step 5: LLM-based table linking (using filtered columns)...")
table_desc_map = build_table_description_map(pruned_columns)

# Candidate tables are the "table" part of each "table.column" name kept by
# Step 4. dict.fromkeys de-duplicates while preserving first-seen order; a
# plain set would iterate in a hash-dependent order that can change between
# kernel runs, which would make the prompt sent to the LLM non-reproducible.
# `or []` tolerates Step 4 having produced None instead of a list.
candidate_tables = list(
    dict.fromkeys(col.split('.', 1)[0] for col in (llm_filtered_columns or []))
)
tables_with_desc = [
    f"{table}: {table_desc_map.get(table, 'nessuna descrizione')}"
    for table in candidate_tables
]

# Skip the LLM round-trip entirely when there is no candidate to choose from.
linked_tables = llm_table_linking(question, tables_with_desc) if tables_with_desc else []

if linked_tables:
    pretty_print_list("Linked Tables (LLM)", linked_tables)
else:
    print("No tables linked by LLM")


Step 5: LLM-based table linking (using filtered columns)...





Linked Tables (LLM):
1. ba_bank


In [12]:
# 🔗 Step 6: Final Column Linking
print_header("Step 6: LLM-based final column linking...")

# Keep only the filtered columns whose table was selected in Step 5.
# The `or []` fallbacks tolerate an earlier step having produced None, which
# would otherwise raise TypeError on iteration / membership testing.
selected_tables = linked_tables or []
final_columns = [
    col
    for col in (llm_filtered_columns or [])
    if col.split('.', 1)[0] in selected_tables
]

column_full_map = build_column_description_map(pruned_columns)
# Columns with no entry in the description map are dropped here without notice.
final_column_descriptions = [column_full_map[c] for c in final_columns if c in column_full_map]

# Only call the LLM when there is at least one candidate column to link.
linked_columns = (
    llm_column_linking(question, final_column_descriptions)
    if final_column_descriptions
    else []
)

if linked_columns:
    print_summary(question, keywords, linked_tables, linked_columns)
else:
    print("No columns linked by LLM")


Step 6: LLM-based final column linking...





Final Result
Question: Qual è l'indirizzo e la località delle banche con codice CAB 12345?

Keywords:
1. banche
2. indirizzo
3. località
4. codice CAB
5. 12345

Linked Tables (LLM):
1. ba_bank

Linked Columns (LLM):
1. ba_bank.BAADDRESS
2. ba_bank.BALOCALITY
3. ba_bank.BACABCODE
