In [None]:
!pip install sql-metadata pandas


In [10]:
!pip install sqlglot


Collecting sqlglot
  Downloading sqlglot-27.2.0-py3-none-any.whl.metadata (20 kB)
Downloading sqlglot-27.2.0-py3-none-any.whl (487 kB)
Installing collected packages: sqlglot
Successfully installed sqlglot-27.2.0


In [13]:
import pandas as pd
import sqlglot
from sqlglot import parse_one
import json

# Literal strings that clean_table_name() strips from the start (prefix) and
# end (suffix) of every table name found in the SQL. Customize these if needed.
# NOTE(review): "postfix" looks like a placeholder value — confirm the real suffix.
TABLE_PREFIX = "google_ads_"
TABLE_SUFFIX = "postfix"

def clean_table_name(name, prefix=None, suffix=None):
    """Strip a known prefix and suffix from a table name.

    Args:
        name: Table name as it appears in the SQL.
        prefix: Leading text to remove when present. Defaults to the
            module-level ``TABLE_PREFIX``.
        suffix: Trailing text to remove when present. Defaults to the
            module-level ``TABLE_SUFFIX``.

    Returns:
        ``name`` without the prefix and/or suffix. Names that carry neither
        are returned unchanged. The prefix is removed before the suffix is
        checked.
    """
    if prefix is None:
        prefix = TABLE_PREFIX
    if suffix is None:
        suffix = TABLE_SUFFIX

    if prefix and name.startswith(prefix):
        name = name[len(prefix):]
    # An empty suffix must be skipped explicitly: every string ends with "",
    # and name[:-0] is name[:0], which would wipe the whole name.
    if suffix and name.endswith(suffix):
        name = name[: len(name) - len(suffix)]
    return name

def extract_tables_and_columns(sql):
    """Parse a SQL string and list the table and column names it references.

    The SQL is parsed with sqlglot using the ``bigquery`` dialect. Table names
    are passed through ``clean_table_name`` before being collected.

    Args:
        sql: SQL text to analyse.

    Returns:
        A ``(tables, columns)`` tuple of de-duplicated, alphabetically sorted
        lists of names. Both lists are empty when the SQL cannot be processed;
        in that case a ``RuntimeWarning`` describing the failure is emitted
        instead of raising.
    """
    import warnings  # local import: only used on the failure path

    try:
        expression = parse_one(sql, read='bigquery')
        tables = {
            clean_table_name(table.name)
            for table in expression.find_all(sqlglot.exp.Table)
        }
        columns = {col.name for col in expression.find_all(sqlglot.exp.Column)}
        # Sets of strings have no stable iteration order across interpreter
        # runs; sorting keeps the generated dataset reproducible.
        return sorted(tables), sorted(columns)
    except Exception as exc:
        # Stay best-effort (one bad query must not abort the whole export),
        # but surface the failure instead of hiding it.
        warnings.warn(
            f"Could not extract tables/columns ({exc!r}) from SQL: {str(sql)[:80]!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return [], []

# Input and output locations.
# NOTE: CSV_PATH is a machine-specific absolute path — adjust it when running elsewhere.
CSV_PATH = "/home/prakhar/luke-dev/txt2sql_methods/RaTsql/notebooks/sql_ground_truth.csv"
OUTPUT_PATH = "formatted_data.json"


def _cell_text(value):
    """Return a CSV cell as stripped text, mapping missing values (NaN/None) to ""."""
    if pd.isna(value):
        return ""
    return str(value).strip()


# Load CSV and normalize headers. sep=None with the python engine lets pandas
# detect the delimiter; header names are stripped of surrounding whitespace so
# the column lookups below match.
df = pd.read_csv(CSV_PATH, encoding='utf-8-sig', sep=None, engine='python')
df.columns = df.columns.str.strip()

output = []

for _, row in df.iterrows():
    query = _cell_text(row.get('Questions', ''))
    sql = _cell_text(row.get('Ground truth sql', ''))

    # Skip rows missing either the question or the SQL. Missing cells are
    # detected via pd.isna, so a blank question no longer leaks through as
    # the literal string "nan".
    if not query or not sql:
        continue

    tables, columns = extract_tables_and_columns(sql)

    output.append({
        "query": query,
        "sql": sql,
        "tables": tables,
        "columns": columns
    })

# Save to JSON
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)