In [1]:
# 🔧 Setup & Import
import sys
import os

sys.path.append(
    os.path.join(
        os.path.abspath(os.path.join(os.getcwd(), "..")) , "src"
    )
)

from schema_linking.schema.extract_schema import extract_column_texts
from schema_linking.keywords.keyword_extractor import llama_keyword_extraction
from schema_linking.retrieval.retriever import build_or_load_index, retrieve_top_k
from schema_linking.llm.llama_linker import llama_table_linking, llama_column_linking, llama_filter_columns_by_keywords
from schema_linking.utils.printer import (
    print_header,
    print_keywords,
    pretty_print_list,
    print_summary
)
from schema_linking.config.config import TOP_K_COLUMNS

In [2]:
# ✍️ Input della Query
print_header("Input")
question = "Quali attività IVA risultano ancora valide nel 2024?"
print(f"Question: {question}")


Input
Question: Quali attività IVA risultano ancora valide nel 2024?


In [5]:
# 🔍 Step 1: Extracting keywords using LLM...
print_header("Step 1: Extracting keywords using LLM...")
keywords = llama_keyword_extraction(question)
print_keywords(keywords)
keyword_string = " ".join(keywords)


Step 1: Extracting keywords using LLM...





Keywords:
1. IVA
2. attività
3. 2024
4. validità


In [6]:
# 🧱 Step 2: Schema extraction
print_header("Step 2: Loading schema...")
column_texts = extract_column_texts()
print(f"Loaded {len(column_texts)} columns")


Step 2: Loading schema...
Loaded 47388 columns


In [7]:
# 🧠 Step 3: FAISS Pruning
print_header(f"Step 3: FAISS column pruning (top {TOP_K_COLUMNS}) using keywords...")
column_index, column_map = build_or_load_index(column_texts)
pruned_columns = retrieve_top_k(column_index, column_map, keyword_string, TOP_K_COLUMNS)

if pruned_columns:
    pretty_print_list("Pruned Columns (FAISS)", pruned_columns)
else:
    print("No columns found")


Step 3: FAISS column pruning (top 50) using keywords...
Loading FAISS index from disk → c:\Workspace\NL2SQL\data\faiss_columns.bin

Pruned Columns (FAISS):
1. ba_rifatt.RACODIVI: Riferimento attivit� IVA ISTAT
2. ba_tmpregiva.SRTOTIND: Totale IVA indetraibile
3. ba_tmpregiva.SRPROIND: Progressivo IVA indetraibile
4. ba_diceme_2020_m.DECOMPLA: Plafond - Dichiarazione annuale IVA presentata
5. ba_rifatt.RACODIVA: Codice attivit� IVA azienda
6. ba_tmpliqiva2.LIIVAIVA: Credito IVA su IVA
7. ei_tmpcastiva.CIIMPON2: Imponibile IVA in valuta di conto
8. ba_tmpcastiva.CIIMPON2: Imponibile IVA in valuta di conto
9. ba_mycomp2.SCCONGRM: Conto IVA vendite per giroconto regime del margine
10. ba_diceme_m.DECOMPLA: Plafond - Dichiarazione annuale IVA presentata
11. ba_tmpregiva.SRATTIVA: Codice attivit� IVA
12. ba_diceme_2017_m.DECOMPLA: Plafond - Dichiarazione annuale IVA presentata
13. PNTIVA.AICODTAP: Codice trascodifica IVA
14. ba_annint.CICODIVA: Codice IVA intestatario
15. ba_regiva.RICODATT

In [8]:
# 🧠 Step 4: LLM filtering on pruned columns
print_header("Step 4: LLM filtering on pruned columns...")
llm_filtered_columns = llama_filter_columns_by_keywords(keyword_string, pruned_columns, keywords)

if llm_filtered_columns:
    pretty_print_list("Filtered Columns (LLM)", llm_filtered_columns)
else:
    print("No columns passed the LLM filtering step")


Step 4: LLM filtering on pruned columns...





Filtered Columns (LLM):
1. - ba_tmpregiva.SRPROIND: Progressivo IVA indetraibile
2. - ba_tmpregiva.SRATTIVA: Codice attività IVA
3. - ba_tmpregiva.SRCODREM: Codice Regime IVA
4. - PNTIVA.CCESIDIF: Gestione IVA ad esigibilità differita
5. - PNTIVA.TIPIVAESIG: Valori IVA
6. - ba_diceme_2020_m.DECOMPLA: Plafond - Dichiarazione annuale IVA presentata
7. - ba_tmpregiva.SRIVAIND: IVA indetraibile
8. - ei_reportvat.VPIDEB12: IVA dovuta
9. - ei_reportvat.VPIDEB12_C: IVA dovuta
10. - ba_ivareg.RICODIVA: Codice IVA
11. - ba_ivareg.RICODATT: Codice attività IVA
12. - ei_prinot_m.EINAZFIS: Codice nazione per identificazione fiscale ai fini IVA (cliente/fornitore)


In [9]:
# 🧮 Step 5: Table Linking via LLM
print_header("Step 5: LLM-based table linking (using filtered columns)...")
candidate_tables = list(set([col.split('.')[0] for col in llm_filtered_columns]))

linked_tables = llama_table_linking(question, candidate_tables)

if linked_tables:
    pretty_print_list("Linked Tables (LLM)", linked_tables)
else:
    print("No tables linked by LLM")


Step 5: LLM-based table linking (using filtered columns)...





Linked Tables (LLM):
1. ba_diceme_2020_m
2. ba_ivareg
3. ba_tmpregiva


In [10]:
# 🔗 Step 6: Final Column Linking
print_header("Step 6: LLM-based final column linking...")
final_columns = [col for col in llm_filtered_columns if col.split('.')[0] in linked_tables]
linked_columns = llama_column_linking(keyword_string, final_columns)

if linked_columns:
    print_summary(question, keywords, linked_tables, linked_columns)
else:
    print("No columns linked by LLM")


Step 6: LLM-based final column linking...





Final Result
Question: Quali attività IVA risultano ancora valide nel 2024?

Keywords:
1. IVA
2. attività
3. 2024
4. validità

Linked Tables (LLM):
1. ba_diceme_2020_m
2. ba_ivareg
3. ba_tmpregiva

Linked Columns (LLM):
1. - iva_attivita_2024
2. - validita
