In [0]:
import json
import yaml
import pandas as pd
from pathlib import Path
from typing import Dict, List, Any
from datetime import datetime

# COMMAND ----------
# MAGIC %md
# MAGIC ## TML Metadata Extractor for Table/Column Mapping

# COMMAND ----------

# --- Configuration ---
# Catalog and schema shared by the input volume path and both output tables.
CATALOG = "ds_training_1"
SCHEMA = "thoughtspot_inventory_ak"
# Path segment appended after /Volumes/<catalog>/<schema>/ to locate the TML files.
TML_VOLUME = "lvdash_files_ak/liveboard"

# Directory that extract_tml_metadata() lists for .tml/.yaml/.json files.
TML_INPUT_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/{TML_VOLUME}/"
# Three-part names (catalog.schema.table) of the two output tables.
MAPPING_TABLE = f"{CATALOG}.{SCHEMA}.tml_dbx_metadata_mapping"
FAILURE_LOG_TABLE = f"{CATALOG}.{SCHEMA}.tml_dbx_mapping_failures"

# COMMAND ----------
# MAGIC %md
# MAGIC ## Setup Functions

# COMMAND ----------

def setup_failure_log_table():
    """Recreate the failure log table as an empty Delta table.

    ``FAILURE_LOG_TABLE`` is split on ``.`` into catalog, schema and table
    name. The schema is created when missing, any previous table is dropped,
    and a new table is created with the columns ``tml_file``, ``error_type``,
    ``error_message`` and ``failure_timestamp``.

    Raises:
        Whatever ``spark.sql`` raises for the CREATE SCHEMA / CREATE TABLE
        statements. A failing DROP is reported but not raised.
    """
    catalog, schema, table_name = FAILURE_LOG_TABLE.split('.')[:3]
    qualified_name = f"`{catalog}`.`{schema}`.`{table_name}`"

    # Ensure schema exists
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}`")

    # Drop existing table. This stays best-effort because the CREATE OR REPLACE
    # below resets the table anyway, but a failure is reported instead of
    # being swallowed silently.
    try:
        spark.sql(f"DROP TABLE IF EXISTS {qualified_name}")
        print(f"Dropped existing failure log table: {table_name}")
    except Exception as drop_error:
        print(f"WARNING: Could not drop failure log table {table_name}: {drop_error}")

    # Create new failure log table
    create_sql = f"""
        CREATE OR REPLACE TABLE {qualified_name} (
            tml_file STRING,
            error_type STRING,
            error_message STRING,
            failure_timestamp TIMESTAMP
        ) USING DELTA
    """

    spark.sql(create_sql)
    print(f"Created failure log table: {table_name}")

def setup_mapping_table():
    """Recreate the metadata mapping table as an empty Delta table.

    ``MAPPING_TABLE`` is split on ``.`` into catalog, schema and table name.
    The schema is created when missing, any previous table is dropped, and a
    new table is created. Columns ending in ``_ToBeFilled`` and the
    ``common_*`` columns are placeholders for user input.

    Raises:
        Whatever ``spark.sql`` raises for the CREATE SCHEMA / CREATE TABLE
        statements. A failing DROP is reported but not raised.
    """
    catalog, schema, table_name = MAPPING_TABLE.split('.')[:3]
    qualified_name = f"`{catalog}`.`{schema}`.`{table_name}`"

    # Ensure schema exists
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}`")

    # Drop existing table. This stays best-effort because the CREATE OR REPLACE
    # below resets the table anyway, but a failure is reported instead of
    # being swallowed silently.
    try:
        spark.sql(f"DROP TABLE IF EXISTS {qualified_name}")
        print(f"Dropped existing table: {table_name}")
    except Exception as drop_error:
        print(f"WARNING: Could not drop table {table_name}: {drop_error}")

    # Create new table with mapping structure
    # Note: Columns ending with _ToBeFilled are for user input
    create_sql = f"""
        CREATE OR REPLACE TABLE {qualified_name} (
            tml_file STRING,
            visualization_id STRING,
            visualization_name STRING,
            chart_type STRING,
            tml_table_name STRING,
            tml_table_id STRING,
            tml_columns_used ARRAY<STRING>,
            databricks_table_name_ToBeFilled STRING COMMENT 'For unique datasets per viz',
            databricks_column_mapping_ToBeFilled STRING COMMENT 'For unique datasets per viz - JSON format',
            common_dataset_name STRING COMMENT 'Shared dataset name for reuse across visualizations',
            common_sql_query STRING COMMENT 'Common SQL query for the shared dataset',
            common_column_mapping STRING COMMENT 'JSON mapping of common columns for shared dataset',
            search_query STRING,
            notes STRING,
            extraction_timestamp TIMESTAMP
        ) USING DELTA
    """

    spark.sql(create_sql)
    print(f"Created mapping table: {table_name}")

def parse_tml_file(file_path):
    """Load a TML file and return the parsed document.

    The file is read through ``dbutils.fs.head`` with a byte limit of
    10 * 1024 * 1024. The text is parsed as YAML first; if that raises
    ``yaml.YAMLError`` the same text is parsed as JSON instead, and any JSON
    error propagates to the caller.
    """
    byte_limit = 10 * 1024 * 1024
    raw_text = dbutils.fs.head(file_path, byte_limit)
    try:
        parsed = yaml.safe_load(raw_text)
    except yaml.YAMLError:
        # Not valid YAML (e.g. tab-indented JSON), so fall back to the JSON parser.
        parsed = json.loads(raw_text)
    return parsed

# COMMAND ----------
# MAGIC %md
# MAGIC ## Metadata Extraction Functions

# COMMAND ----------

def extract_columns_from_answer(answer: Dict) -> List[str]:
    """Collect the column names referenced by a visualization answer.

    Names are gathered from two places, in this order:

    1. the ``name`` of every entry in ``answer['answer_columns']``;
    2. every id in ``answer['table']['ordered_column_ids']`` that was not
       already collected in step 1.

    Keys that are missing or explicitly null (a bare ``key:`` in YAML parses
    to ``None``) are treated as empty instead of raising.

    Args:
        answer: The ``answer`` mapping of one visualization.

    Returns:
        Column names in first-seen order; empty or ``None`` names are skipped.
    """
    columns = []

    # From answer_columns (``or []`` covers a key that is present but null)
    for col in answer.get('answer_columns') or []:
        col_name = col.get('name')
        if col_name:
            columns.append(col_name)

    # From table ordered columns, skipping anything answer_columns already gave us
    table_cols = (answer.get('table') or {}).get('ordered_column_ids') or []
    columns.extend([c for c in table_cols if c and c not in columns])

    return columns

def extract_table_info(answer: Dict) -> tuple:
    """Return ``(name, id)`` for the first entry of the answer's ``tables`` list.

    A missing ``name`` or ``id`` key falls back to an empty string, and
    ``('', '')`` is returned when the answer lists no tables.
    """
    table_entries = answer.get('tables', [])
    if not table_entries:
        return ('', '')

    primary = table_entries[0]
    return (primary.get('name', ''), primary.get('id', ''))

def clean_field_name(field_name: str) -> str:
    """Strip a leading aggregate marker from a field name.

    Two kinds of prefix are recognised, case-insensitively and only at the
    very start of the name:

    * function style: ``sum(``, ``count(``, ``avg(``, ``min(``, ``max(``.
      The prefix and the parenthesis that closes the call are removed.
    * word style: ``Total `` and ``Unique Number of ``. Only the prefix is
      removed.

    A trailing ``)`` is dropped only for the function style, so a name such
    as ``Revenue (USD)`` keeps its own closing parenthesis.

    Args:
        field_name: Raw column name; ``None`` or ``""`` yields ``""``.

    Returns:
        The cleaned name with surrounding whitespace removed.
    """
    if not field_name:
        return ""
    import re

    call_prefix = re.match(r'(sum|count|avg|min|max)\(', field_name, flags=re.IGNORECASE)
    if call_prefix:
        inner = field_name[call_prefix.end():].rstrip()
        # Remove only the parenthesis that closes the aggregate call.
        if inner.endswith(')'):
            inner = inner[:-1]
        return inner.strip()

    cleaned = re.sub(r'^(Total |Unique Number of )', '', field_name, flags=re.IGNORECASE)
    return cleaned.strip()

def extract_base_columns(columns: List[str]) -> List[str]:
    """Return the cleaned, de-duplicated column names in first-seen order.

    Every name is passed through ``clean_field_name``; names that clean to an
    empty string are dropped, and repeated results are kept only once.
    """
    cleaned_names = (clean_field_name(raw_name) for raw_name in columns)
    # dict.fromkeys keeps the first occurrence of each key in insertion order.
    return list(dict.fromkeys(name for name in cleaned_names if name))

# COMMAND ----------
# MAGIC %md
# MAGIC ## Main Extraction Logic

# COMMAND ----------

def _failure_record(tml_file: str, error_type: str, error_message: str) -> Dict[str, Any]:
    """Build one failure-log row stamped with the current local time."""
    return {
        'tml_file': tml_file,
        'error_type': error_type,
        'error_message': error_message,
        'failure_timestamp': datetime.now(),
    }


def _build_viz_record(filename: str, viz: Dict) -> Dict[str, Any]:
    """Build one mapping-table row from a single liveboard visualization entry."""
    answer = viz.get('answer', {})
    chart = answer.get('chart', {})

    # Without an explicit chart type, fall back to TABLE_MODE when the answer
    # is displayed as a table, otherwise UNKNOWN.
    display_mode = answer.get('display_mode', '')
    chart_type = chart.get('type', 'TABLE_MODE' if display_mode == 'TABLE_MODE' else 'UNKNOWN')

    table_name, table_id = extract_table_info(answer)
    base_columns = extract_base_columns(extract_columns_from_answer(answer))

    return {
        'tml_file': filename,
        'visualization_id': viz.get('id', 'unknown'),
        'visualization_name': answer.get('name', 'Unnamed'),
        'chart_type': chart_type,
        'tml_table_name': table_name,
        'tml_table_id': table_id,
        'tml_columns_used': base_columns,
        'databricks_table_name_ToBeFilled': '',
        'databricks_column_mapping_ToBeFilled': '{}',
        'common_dataset_name': None,  # NULL by default - fill only for shared datasets
        'common_sql_query': None,  # NULL by default - fill only for shared datasets
        'common_column_mapping': None,  # NULL by default - fill only for shared datasets
        'search_query': answer.get('search_query', ''),
        'notes': f"Extracted {len(base_columns)} unique columns",
        'extraction_timestamp': datetime.now(),
    }


def extract_tml_metadata():
    """Extract per-visualization metadata from every TML file and persist it.

    Steps:
        1. Recreate the mapping and failure log tables.
        2. List ``TML_INPUT_PATH`` for ``.tml`` / ``.yaml`` / ``.json`` files;
           return early if listing fails or nothing matches.
        3. Parse each file and build one mapping row per visualization under
           the ``liveboard`` root key. Problems are collected as failure rows
           (``PARSE_ERROR``, ``INVALID_STRUCTURE``, ``NO_VISUALIZATIONS``,
           ``VISUALIZATION_ERROR``, ``PROCESSING_ERROR``) and never abort the
           run.
        4. Overwrite ``MAPPING_TABLE`` with the rows and ``FAILURE_LOG_TABLE``
           with the failures; each write is skipped when its list is empty.

    Returns:
        None. Progress and usage hints are printed.
    """
    print("--- Setting up mapping and failure log tables ---")
    setup_mapping_table()
    setup_failure_log_table()

    # Get TML files
    try:
        tml_files = [f.path for f in dbutils.fs.ls(TML_INPUT_PATH)
                     if f.path.endswith(('.tml', '.yaml', '.json'))]
    except Exception as e:
        print(f"ERROR: Cannot list files in '{TML_INPUT_PATH}'. Error: {e}")
        return

    if not tml_files:
        print(f"No TML files found in {TML_INPUT_PATH}")
        return

    print(f"\nFound {len(tml_files)} TML files to process.")

    metadata_records = []
    failure_records = []

    for tml_file_path in tml_files:
        filename = Path(tml_file_path).name

        try:
            print(f"\n--- Processing: {filename} ---")

            # Try to parse TML file
            try:
                tml_data = parse_tml_file(tml_file_path)
            except Exception as parse_error:
                print(f"  ERROR: Failed to parse TML file - {parse_error}")
                failure_records.append(
                    _failure_record(filename, 'PARSE_ERROR', str(parse_error)[:1000])
                )
                continue

            # An empty file parses to None and a scalar/list root has no keys;
            # both are structural problems, not processing crashes.
            liveboard = tml_data.get('liveboard') if isinstance(tml_data, dict) else None
            if not liveboard:
                print(f"  WARNING: No 'liveboard' key found in {filename}")
                failure_records.append(
                    _failure_record(filename, 'INVALID_STRUCTURE',
                                    "Missing 'liveboard' root key in TML file")
                )
                continue

            visualizations = liveboard.get('visualizations', [])

            if not visualizations:
                print(f"  WARNING: No visualizations found in {filename}")
                failure_records.append(
                    _failure_record(filename, 'NO_VISUALIZATIONS',
                                    "No visualizations found in liveboard")
                )
                continue

            print(f"  Found {len(visualizations)} visualizations")

            for viz in visualizations:
                # Resolve the id up front so the error handler never has to
                # touch a malformed (non-dict) entry again.
                viz_id = viz.get('id', 'unknown') if isinstance(viz, dict) else 'unknown'
                try:
                    record = _build_viz_record(filename, viz)
                except Exception as viz_error:
                    print(f"  ERROR processing visualization '{viz_id}': {viz_error}")
                    failure_records.append(
                        _failure_record(filename, 'VISUALIZATION_ERROR',
                                        f"Viz ID: {viz_id} - {str(viz_error)[:900]}")
                    )
                    continue

                metadata_records.append(record)
                print(
                    f"  - {record['visualization_name']} ({record['chart_type']}): "
                    f"{len(record['tml_columns_used'])} columns from table '{record['tml_table_name']}'"
                )

        except Exception as e:
            print(f"  ERROR processing {filename}: {e}")
            import traceback
            traceback.print_exc()
            failure_records.append(
                _failure_record(filename, 'PROCESSING_ERROR', str(e)[:1000])
            )

    # Save metadata records to table
    if metadata_records:
        print(f"\n--- Saving {len(metadata_records)} metadata records ---")
        df = pd.DataFrame(metadata_records)
        df['extraction_timestamp'] = pd.to_datetime(df['extraction_timestamp'])

        # Explicit schema: the common_* columns are all None and the column
        # list is an array, so the types are declared rather than inferred.
        from pyspark.sql.types import StructType, StructField, StringType, ArrayType, TimestampType

        schema = StructType([
            StructField("tml_file", StringType(), True),
            StructField("visualization_id", StringType(), True),
            StructField("visualization_name", StringType(), True),
            StructField("chart_type", StringType(), True),
            StructField("tml_table_name", StringType(), True),
            StructField("tml_table_id", StringType(), True),
            StructField("tml_columns_used", ArrayType(StringType()), True),
            StructField("databricks_table_name_ToBeFilled", StringType(), True),
            StructField("databricks_column_mapping_ToBeFilled", StringType(), True),
            StructField("common_dataset_name", StringType(), True),
            StructField("common_sql_query", StringType(), True),
            StructField("common_column_mapping", StringType(), True),
            StructField("search_query", StringType(), True),
            StructField("notes", StringType(), True),
            StructField("extraction_timestamp", TimestampType(), True)
        ])

        # Convert to Spark DataFrame with explicit schema
        spark_df = spark.createDataFrame(df, schema=schema)
        spark_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(MAPPING_TABLE)

        print(f"Successfully saved metadata to {MAPPING_TABLE}")
        print("\n=== MAPPING OPTIONS ===")
        print("\nOPTION 1: Unique dataset per visualization")
        print("  - Fill 'databricks_table_name_ToBeFilled' with your Databricks table")
        print("  - Fill 'databricks_column_mapping_ToBeFilled' with JSON column mapping")
        print("  - Leave common_* columns as NULL")
        print("\nOPTION 2: Shared dataset across multiple visualizations")
        print("  - Fill 'common_dataset_name' with a shared dataset identifier (e.g., 'ds_trips_shared')")
        print("  - Fill 'common_sql_query' with the complete SQL query")
        print("  - Fill 'common_column_mapping' with JSON column mapping for the shared dataset")
        print("  - You can leave databricks_table_name_ToBeFilled empty for shared datasets")
        print("\nFor column mappings, use JSON string format:")
        print('Example: \'{"Order Date": "order_date", "Customer Name": "customer_name"}\'')
    else:
        print("\nNo metadata records extracted.")

    # Save failure records to table
    if failure_records:
        print(f"\n--- Saving {len(failure_records)} failure records ---")
        fail_df = pd.DataFrame(failure_records)
        fail_df['failure_timestamp'] = pd.to_datetime(fail_df['failure_timestamp'])

        spark_fail_df = spark.createDataFrame(fail_df)
        spark_fail_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(FAILURE_LOG_TABLE)

        print(f"Failed TML files logged to {FAILURE_LOG_TABLE}")
    else:
        print("\nNo failures encountered - all TML files processed successfully!")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Execute Extraction

# COMMAND ----------

# Run the full extraction. The function recreates both output tables before
# it processes any TML file.
extract_tml_metadata()

# COMMAND ----------
# MAGIC %md
# MAGIC ## View Results

# COMMAND ----------

# Header plus a reminder of the two mapping approaches.
for banner_line in (
    "\n--- TML Metadata Mapping Table ---",
    "Choose one mapping approach per visualization:",
    "1. Unique dataset: Fill databricks_table_name_ToBeFilled + databricks_column_mapping_ToBeFilled",
    "2. Shared dataset: Fill common_dataset_name + common_sql_query + common_column_mapping\n",
):
    print(banner_line)

# Show the mapping table sorted by file and visualization name.
try:
    df = spark.table(MAPPING_TABLE)
    display(df.orderBy("tml_file", "visualization_name"))
except Exception as mapping_error:
    print(f"Could not display table. Error: {mapping_error}")

# Show the failure log, newest first, or confirm that it is empty.
print("\n--- Failed TML Files ---")
try:
    fail_df = spark.table(FAILURE_LOG_TABLE)
    fail_count = fail_df.count()
    if fail_count <= 0:
        print("No failures - all TML files processed successfully!")
    else:
        print(f"Found {fail_count} failed TML files or visualizations:")
        display(fail_df.orderBy("failure_timestamp", ascending=False))
except Exception as failure_log_error:
    print(f"Could not display failure log. Error: {failure_log_error}")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Summary Statistics

# COMMAND ----------

# Per-file summary of the mapping table: number of visualizations, number of
# distinct TML tables, and the summed length of the tml_columns_used arrays.
print("\n--- Extraction Summary ---")

try:
    summary_query = f"""
    SELECT 
        tml_file,
        COUNT(*) as num_visualizations,
        COUNT(DISTINCT tml_table_name) as num_unique_tables,
        SUM(SIZE(tml_columns_used)) as total_columns_used
    FROM {MAPPING_TABLE}
    GROUP BY tml_file
    ORDER BY tml_file
    """
    display(spark.sql(summary_query))
except Exception as e:
    # Reported rather than raised, so the remaining notebook cells still run.
    print(f"Could not display summary. Error: {e}")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Update Mapping Examples

# COMMAND ----------

# Print copy-paste SQL templates for filling in the mapping table. Doubled
# braces ({{ }}) are f-string escapes that render as literal JSON braces.
print(f"""
--- How to Update the Mapping Table ---

OPTION 1: Unique dataset per visualization
------------------------------------------
UPDATE {MAPPING_TABLE}
SET 
  databricks_table_name_ToBeFilled = 'my_catalog.my_schema.orders_table',
  databricks_column_mapping_ToBeFilled = '{{"Order Date": "order_date", "Customer Name": "customer_name", "Total Revenue": "total_revenue"}}'
WHERE visualization_id = 'your_viz_id_here'


OPTION 2: Shared dataset across multiple visualizations
-------------------------------------------------------
UPDATE {MAPPING_TABLE}
SET 
  common_dataset_name = 'ds_trips_shared',
  common_sql_query = 'SELECT pickup_zip, dropoff_zip, fare_amount, trip_distance FROM samples.nyctaxi.trips WHERE trip_distance > 0',
  common_column_mapping = '{{"pickup_zip": "pickup_zip", "dropoff_zip": "dropoff_zip", "fare_amount": "fare_amount"}}'
WHERE tml_file = 'NYC_Dashboard.tml' 
  AND visualization_id IN ('viz_1', 'viz_2', 'viz_3')


Query to see what needs mapping:
---------------------------------
SELECT 
  tml_file,
  visualization_name,
  tml_table_name,
  tml_columns_used,
  databricks_table_name_ToBeFilled,
  common_dataset_name
FROM {MAPPING_TABLE}
WHERE (databricks_table_name_ToBeFilled = '' OR databricks_table_name_ToBeFilled IS NULL)
  AND common_dataset_name IS NULL
ORDER BY tml_file, visualization_name


Query to identify potential shared datasets:
--------------------------------------------
SELECT 
  tml_table_name,
  COUNT(*) as num_visualizations,
  COLLECT_SET(visualization_name) as viz_names
FROM {MAPPING_TABLE}
GROUP BY tml_table_name
HAVING COUNT(*) > 1
ORDER BY num_visualizations DESC

Note: All *_column_mapping fields should be JSON strings mapping TML column names to Databricks column names.
""")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Next Steps
# MAGIC
# MAGIC 1. Review the metadata mapping table above
# MAGIC 2. Decide which visualizations should share datasets (query provided above can help identify candidates)
# MAGIC 3. For shared datasets:
# MAGIC    - Set `common_dataset_name` (e.g., 'ds_trips_shared')
# MAGIC    - Set `common_sql_query` (full SQL query)
# MAGIC    - Set `common_column_mapping` (JSON string)
# MAGIC 4. For unique datasets:
# MAGIC    - Set `databricks_table_name_ToBeFilled` (catalog.schema.table)
# MAGIC    - Set `databricks_column_mapping_ToBeFilled` (JSON string)
# MAGIC 5. Check the failure log table for any TML files that couldn't be processed
# MAGIC 6. Use this mapping table in your conversion script

In [0]:
import json
import yaml
import pandas as pd
from pathlib import Path
from typing import Dict, List, Any
from datetime import datetime

# COMMAND ----------
# MAGIC %md
# MAGIC ## TML Metadata Extractor for Table/Column Mapping

# COMMAND ----------

# --- Configuration ---
# Catalog and schema shared by the input volume path and both output tables.
CATALOG = "ds_training_1"
SCHEMA = "thoughtspot_inventory_ak"
# Path segment appended after /Volumes/<catalog>/<schema>/ to locate the TML files.
TML_VOLUME = "lvdash_files_ak/liveboard"

# Directory that extract_tml_metadata() lists for .tml/.yaml/.json files.
TML_INPUT_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/{TML_VOLUME}/"
# Three-part names (catalog.schema.table) of the two output tables.
MAPPING_TABLE = f"{CATALOG}.{SCHEMA}.tml_dbx_metadata_mapping"
FAILURE_LOG_TABLE = f"{CATALOG}.{SCHEMA}.tml_dbx_mapping_failures"

# COMMAND ----------
# MAGIC %md
# MAGIC ## Setup Functions

# COMMAND ----------

def setup_failure_log_table():
    """Recreate the failure log table as an empty Delta table.

    ``FAILURE_LOG_TABLE`` is split on ``.`` into catalog, schema and table
    name. The schema is created when missing, any previous table is dropped,
    and a new table is created with the columns ``tml_file``, ``error_type``,
    ``error_message`` and ``failure_timestamp``.

    Raises:
        Whatever ``spark.sql`` raises for the CREATE SCHEMA / CREATE TABLE
        statements. A failing DROP is reported but not raised.
    """
    catalog, schema, table_name = FAILURE_LOG_TABLE.split('.')[:3]
    qualified_name = f"`{catalog}`.`{schema}`.`{table_name}`"

    # Ensure schema exists
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}`")

    # Drop existing table. This stays best-effort because the CREATE OR REPLACE
    # below resets the table anyway, but a failure is reported instead of
    # being swallowed silently.
    try:
        spark.sql(f"DROP TABLE IF EXISTS {qualified_name}")
        print(f"Dropped existing failure log table: {table_name}")
    except Exception as drop_error:
        print(f"WARNING: Could not drop failure log table {table_name}: {drop_error}")

    # Create new failure log table
    create_sql = f"""
        CREATE OR REPLACE TABLE {qualified_name} (
            tml_file STRING,
            error_type STRING,
            error_message STRING,
            failure_timestamp TIMESTAMP
        ) USING DELTA
    """

    spark.sql(create_sql)
    print(f"Created failure log table: {table_name}")

def setup_mapping_table():
    """Recreate the metadata mapping table as an empty Delta table.

    ``MAPPING_TABLE`` is split on ``.`` into catalog, schema and table name.
    The schema is created when missing, any previous table is dropped, and a
    new table is created. Columns ending in ``_ToBeFilled`` and the
    ``common_*`` columns are placeholders for user input.

    Raises:
        Whatever ``spark.sql`` raises for the CREATE SCHEMA / CREATE TABLE
        statements. A failing DROP is reported but not raised.
    """
    catalog, schema, table_name = MAPPING_TABLE.split('.')[:3]
    qualified_name = f"`{catalog}`.`{schema}`.`{table_name}`"

    # Ensure schema exists
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}`")

    # Drop existing table. This stays best-effort because the CREATE OR REPLACE
    # below resets the table anyway, but a failure is reported instead of
    # being swallowed silently.
    try:
        spark.sql(f"DROP TABLE IF EXISTS {qualified_name}")
        print(f"Dropped existing table: {table_name}")
    except Exception as drop_error:
        print(f"WARNING: Could not drop table {table_name}: {drop_error}")

    # Create new table with mapping structure
    # Note: Columns ending with _ToBeFilled are for user input
    # The COMMENT literal on the last column must be closed with a quote,
    # otherwise the whole statement fails to parse.
    create_sql = f"""
        CREATE OR REPLACE TABLE {qualified_name} (
            tml_file STRING,
            visualization_id STRING,
            visualization_name STRING,
            chart_type STRING,
            tml_table_name STRING,
            tml_table_id STRING,
            tml_columns_used ARRAY<STRING>,
            databricks_table_name_ToBeFilled STRING COMMENT 'For unique datasets per viz',
            databricks_column_mapping_ToBeFilled STRING COMMENT 'For unique datasets per viz - JSON format',
            common_dataset_name STRING COMMENT 'Shared dataset name for reuse across visualizations',
            common_sql_query STRING COMMENT 'Common SQL query for the shared dataset',
            common_column_mapping STRING COMMENT 'JSON mapping of common columns for shared dataset',
            search_query STRING,
            notes STRING,
            extraction_timestamp TIMESTAMP,
            filter_column_mapping_ToBeFilled STRING COMMENT 'JSON mapping TML filter names to RAW cols'
        ) USING DELTA
    """

    spark.sql(create_sql)
    print(f"Created mapping table: {table_name}")

def parse_tml_file(file_path):
    """Load a TML file and return the parsed document.

    The file is read through ``dbutils.fs.head`` with a byte limit of
    10 * 1024 * 1024. The text is parsed as YAML first; if that raises
    ``yaml.YAMLError`` the same text is parsed as JSON instead, and any JSON
    error propagates to the caller.
    """
    byte_limit = 10 * 1024 * 1024
    raw_text = dbutils.fs.head(file_path, byte_limit)
    try:
        parsed = yaml.safe_load(raw_text)
    except yaml.YAMLError:
        # Not valid YAML (e.g. tab-indented JSON), so fall back to the JSON parser.
        parsed = json.loads(raw_text)
    return parsed

# COMMAND ----------
# MAGIC %md
# MAGIC ## Metadata Extraction Functions

# COMMAND ----------

def extract_columns_from_answer(answer: Dict) -> List[str]:
    """Collect the column names referenced by a visualization answer.

    Names are gathered from two places, in this order:

    1. the ``name`` of every entry in ``answer['answer_columns']``;
    2. every id in ``answer['table']['ordered_column_ids']`` that was not
       already collected in step 1.

    Keys that are missing or explicitly null (a bare ``key:`` in YAML parses
    to ``None``) are treated as empty instead of raising.

    Args:
        answer: The ``answer`` mapping of one visualization.

    Returns:
        Column names in first-seen order; empty or ``None`` names are skipped.
    """
    columns = []

    # From answer_columns (``or []`` covers a key that is present but null)
    for col in answer.get('answer_columns') or []:
        col_name = col.get('name')
        if col_name:
            columns.append(col_name)

    # From table ordered columns, skipping anything answer_columns already gave us
    table_cols = (answer.get('table') or {}).get('ordered_column_ids') or []
    columns.extend([c for c in table_cols if c and c not in columns])

    return columns

def extract_table_info(answer: Dict) -> tuple:
    """Return ``(name, id)`` for the first entry of the answer's ``tables`` list.

    A missing ``name`` or ``id`` key falls back to an empty string, and
    ``('', '')`` is returned when the answer lists no tables.
    """
    table_entries = answer.get('tables', [])
    if not table_entries:
        return ('', '')

    primary = table_entries[0]
    return (primary.get('name', ''), primary.get('id', ''))

def clean_field_name(field_name: str) -> str:
    """Strip a leading aggregate marker from a field name.

    Two kinds of prefix are recognised, case-insensitively and only at the
    very start of the name:

    * function style: ``sum(``, ``count(``, ``avg(``, ``min(``, ``max(``.
      The prefix and the parenthesis that closes the call are removed.
    * word style: ``Total `` and ``Unique Number of ``. Only the prefix is
      removed.

    A trailing ``)`` is dropped only for the function style, so a name such
    as ``Revenue (USD)`` keeps its own closing parenthesis.

    Args:
        field_name: Raw column name; ``None`` or ``""`` yields ``""``.

    Returns:
        The cleaned name with surrounding whitespace removed.
    """
    if not field_name:
        return ""
    import re

    call_prefix = re.match(r'(sum|count|avg|min|max)\(', field_name, flags=re.IGNORECASE)
    if call_prefix:
        inner = field_name[call_prefix.end():].rstrip()
        # Remove only the parenthesis that closes the aggregate call.
        if inner.endswith(')'):
            inner = inner[:-1]
        return inner.strip()

    cleaned = re.sub(r'^(Total |Unique Number of )', '', field_name, flags=re.IGNORECASE)
    return cleaned.strip()

def extract_base_columns(columns: List[str]) -> List[str]:
    """Return the cleaned, de-duplicated column names in first-seen order.

    Every name is passed through ``clean_field_name``; names that clean to an
    empty string are dropped, and repeated results are kept only once.
    """
    cleaned_names = (clean_field_name(raw_name) for raw_name in columns)
    # dict.fromkeys keeps the first occurrence of each key in insertion order.
    return list(dict.fromkeys(name for name in cleaned_names if name))

# COMMAND ----------
# MAGIC %md
# MAGIC ## Main Extraction Logic

# COMMAND ----------

def _failure_record(tml_file: str, error_type: str, error_message: str) -> Dict[str, Any]:
    """Build one failure-log row stamped with the current local time."""
    return {
        'tml_file': tml_file,
        'error_type': error_type,
        'error_message': error_message,
        'failure_timestamp': datetime.now(),
    }


def _build_viz_record(filename: str, viz: Dict) -> Dict[str, Any]:
    """Build one mapping-table row from a single liveboard visualization entry."""
    answer = viz.get('answer', {})
    chart = answer.get('chart', {})

    # Without an explicit chart type, fall back to TABLE_MODE when the answer
    # is displayed as a table, otherwise UNKNOWN.
    display_mode = answer.get('display_mode', '')
    chart_type = chart.get('type', 'TABLE_MODE' if display_mode == 'TABLE_MODE' else 'UNKNOWN')

    table_name, table_id = extract_table_info(answer)
    base_columns = extract_base_columns(extract_columns_from_answer(answer))

    # Key order matches the column order of the mapping table DDL and of the
    # Spark schema built in extract_tml_metadata().
    return {
        'tml_file': filename,
        'visualization_id': viz.get('id', 'unknown'),
        'visualization_name': answer.get('name', 'Unnamed'),
        'chart_type': chart_type,
        'tml_table_name': table_name,
        'tml_table_id': table_id,
        'tml_columns_used': base_columns,
        'databricks_table_name_ToBeFilled': '',
        'databricks_column_mapping_ToBeFilled': '{}',
        'common_dataset_name': None,  # NULL by default - fill only for shared datasets
        'common_sql_query': None,  # NULL by default - fill only for shared datasets
        'common_column_mapping': None,  # NULL by default - fill only for shared datasets
        'search_query': answer.get('search_query', ''),
        'notes': f"Extracted {len(base_columns)} unique columns",
        'extraction_timestamp': datetime.now(),
        # Placeholder declared by the mapping table DDL; without it the
        # overwriteSchema write below would drop the column from the table.
        'filter_column_mapping_ToBeFilled': '{}',
    }


def extract_tml_metadata():
    """Extract per-visualization metadata from every TML file and persist it.

    Steps:
        1. Recreate the mapping and failure log tables.
        2. List ``TML_INPUT_PATH`` for ``.tml`` / ``.yaml`` / ``.json`` files;
           return early if listing fails or nothing matches.
        3. Parse each file and build one mapping row per visualization under
           the ``liveboard`` root key. Problems are collected as failure rows
           (``PARSE_ERROR``, ``INVALID_STRUCTURE``, ``NO_VISUALIZATIONS``,
           ``VISUALIZATION_ERROR``, ``PROCESSING_ERROR``) and never abort the
           run.
        4. Overwrite ``MAPPING_TABLE`` with the rows and ``FAILURE_LOG_TABLE``
           with the failures; each write is skipped when its list is empty.

    Returns:
        None. Progress and usage hints are printed.
    """
    print("--- Setting up mapping and failure log tables ---")
    setup_mapping_table()
    setup_failure_log_table()

    # Get TML files
    try:
        tml_files = [f.path for f in dbutils.fs.ls(TML_INPUT_PATH)
                     if f.path.endswith(('.tml', '.yaml', '.json'))]
    except Exception as e:
        print(f"ERROR: Cannot list files in '{TML_INPUT_PATH}'. Error: {e}")
        return

    if not tml_files:
        print(f"No TML files found in {TML_INPUT_PATH}")
        return

    print(f"\nFound {len(tml_files)} TML files to process.")

    metadata_records = []
    failure_records = []

    for tml_file_path in tml_files:
        filename = Path(tml_file_path).name

        try:
            print(f"\n--- Processing: {filename} ---")

            # Try to parse TML file
            try:
                tml_data = parse_tml_file(tml_file_path)
            except Exception as parse_error:
                print(f"  ERROR: Failed to parse TML file - {parse_error}")
                failure_records.append(
                    _failure_record(filename, 'PARSE_ERROR', str(parse_error)[:1000])
                )
                continue

            # An empty file parses to None and a scalar/list root has no keys;
            # both are structural problems, not processing crashes.
            liveboard = tml_data.get('liveboard') if isinstance(tml_data, dict) else None
            if not liveboard:
                print(f"  WARNING: No 'liveboard' key found in {filename}")
                failure_records.append(
                    _failure_record(filename, 'INVALID_STRUCTURE',
                                    "Missing 'liveboard' root key in TML file")
                )
                continue

            visualizations = liveboard.get('visualizations', [])

            if not visualizations:
                print(f"  WARNING: No visualizations found in {filename}")
                failure_records.append(
                    _failure_record(filename, 'NO_VISUALIZATIONS',
                                    "No visualizations found in liveboard")
                )
                continue

            print(f"  Found {len(visualizations)} visualizations")

            for viz in visualizations:
                # Resolve the id up front so the error handler never has to
                # touch a malformed (non-dict) entry again.
                viz_id = viz.get('id', 'unknown') if isinstance(viz, dict) else 'unknown'
                try:
                    record = _build_viz_record(filename, viz)
                except Exception as viz_error:
                    print(f"  ERROR processing visualization '{viz_id}': {viz_error}")
                    failure_records.append(
                        _failure_record(filename, 'VISUALIZATION_ERROR',
                                        f"Viz ID: {viz_id} - {str(viz_error)[:900]}")
                    )
                    continue

                metadata_records.append(record)
                print(
                    f"  - {record['visualization_name']} ({record['chart_type']}): "
                    f"{len(record['tml_columns_used'])} columns from table '{record['tml_table_name']}'"
                )

        except Exception as e:
            print(f"  ERROR processing {filename}: {e}")
            import traceback
            traceback.print_exc()
            failure_records.append(
                _failure_record(filename, 'PROCESSING_ERROR', str(e)[:1000])
            )

    # Save metadata records to table
    if metadata_records:
        print(f"\n--- Saving {len(metadata_records)} metadata records ---")
        df = pd.DataFrame(metadata_records)
        df['extraction_timestamp'] = pd.to_datetime(df['extraction_timestamp'])

        # Explicit schema: the common_* columns are all None and the column
        # list is an array, so the types are declared rather than inferred.
        from pyspark.sql.types import StructType, StructField, StringType, ArrayType, TimestampType

        schema = StructType([
            StructField("tml_file", StringType(), True),
            StructField("visualization_id", StringType(), True),
            StructField("visualization_name", StringType(), True),
            StructField("chart_type", StringType(), True),
            StructField("tml_table_name", StringType(), True),
            StructField("tml_table_id", StringType(), True),
            StructField("tml_columns_used", ArrayType(StringType()), True),
            StructField("databricks_table_name_ToBeFilled", StringType(), True),
            StructField("databricks_column_mapping_ToBeFilled", StringType(), True),
            StructField("common_dataset_name", StringType(), True),
            StructField("common_sql_query", StringType(), True),
            StructField("common_column_mapping", StringType(), True),
            StructField("search_query", StringType(), True),
            StructField("notes", StringType(), True),
            StructField("extraction_timestamp", TimestampType(), True),
            StructField("filter_column_mapping_ToBeFilled", StringType(), True)
        ])

        # Convert to Spark DataFrame with explicit schema
        spark_df = spark.createDataFrame(df, schema=schema)
        spark_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(MAPPING_TABLE)

        print(f"Successfully saved metadata to {MAPPING_TABLE}")
        print("\n=== MAPPING OPTIONS ===")
        print("\nOPTION 1: Unique dataset per visualization")
        print("  - Fill 'databricks_table_name_ToBeFilled' with your Databricks table")
        print("  - Fill 'databricks_column_mapping_ToBeFilled' with JSON column mapping")
        print("  - Leave common_* columns as NULL")
        print("\nOPTION 2: Shared dataset across multiple visualizations")
        print("  - Fill 'common_dataset_name' with a shared dataset identifier (e.g., 'ds_trips_shared')")
        print("  - Fill 'common_sql_query' with the complete SQL query")
        print("  - Fill 'common_column_mapping' with JSON column mapping for the shared dataset")
        print("  - You can leave databricks_table_name_ToBeFilled empty for shared datasets")
        print("\nFor column mappings, use JSON string format:")
        print('Example: \'{"Order Date": "order_date", "Customer Name": "customer_name"}\'')
    else:
        print("\nNo metadata records extracted.")

    # Save failure records to table
    if failure_records:
        print(f"\n--- Saving {len(failure_records)} failure records ---")
        fail_df = pd.DataFrame(failure_records)
        fail_df['failure_timestamp'] = pd.to_datetime(fail_df['failure_timestamp'])

        spark_fail_df = spark.createDataFrame(fail_df)
        spark_fail_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(FAILURE_LOG_TABLE)

        print(f"Failed TML files logged to {FAILURE_LOG_TABLE}")
    else:
        print("\nNo failures encountered - all TML files processed successfully!")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Execute Extraction

# COMMAND ----------

# Run the extraction as soon as this cell executes.
# NOTE(review): the function tail visible just above this cell writes
# MAPPING_TABLE with mode("overwrite") and, when failures were recorded,
# FAILURE_LOG_TABLE as well. Presumably that tail belongs to this function;
# confirm before re-running against a mapping table that already holds
# hand-entered mappings, because they would be replaced.
extract_tml_metadata()

# COMMAND ----------
# MAGIC %md
# MAGIC ## View Results

# COMMAND ----------

# Banner plus a reminder of the two mapping approaches, emitted in one call.
print(
    "\n--- TML Metadata Mapping Table ---",
    "Choose one mapping approach per visualization:",
    "1. Unique dataset: Fill databricks_table_name_ToBeFilled + databricks_column_mapping_ToBeFilled",
    "2. Shared dataset: Fill common_dataset_name + common_sql_query + common_column_mapping\n",
    sep="\n",
)

# Show every mapping row, ordered by file and then by visualization name.
# Any error is reported instead of raised so the rest of the cell still runs.
try:
    df = spark.table(MAPPING_TABLE)
    display(df.orderBy("tml_file", "visualization_name"))
except Exception as err:
    print(f"Could not display table. Error: {err}")

# Show the failure log (newest first), or a success note when it is empty.
print("\n--- Failed TML Files ---")
try:
    fail_df = spark.table(FAILURE_LOG_TABLE)
    fail_count = fail_df.count()
    if fail_count <= 0:
        print("No failures - all TML files processed successfully!")
    else:
        print(f"Found {fail_count} failed TML files or visualizations:")
        display(fail_df.orderBy("failure_timestamp", ascending=False))
except Exception as err:
    print(f"Could not display failure log. Error: {err}")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Summary Statistics

# COMMAND ----------

print("\n--- Extraction Summary ---")

# Per-file rollup of the mapping table: number of visualizations, number of
# distinct TML tables, and total count of referenced columns.
try:
    # SIZE() of a NULL array is -1 under Spark's default (non-ANSI) settings
    # and NULL under ANSI mode, so a NULL tml_columns_used would either
    # subtract from the total or turn it into NULL. The column is nullable in
    # the table schema, so NULL arrays are counted as 0 explicitly.
    summary_query = f"""
    SELECT 
        tml_file,
        COUNT(*) as num_visualizations,
        COUNT(DISTINCT tml_table_name) as num_unique_tables,
        SUM(CASE WHEN tml_columns_used IS NULL THEN 0 ELSE SIZE(tml_columns_used) END) as total_columns_used
    FROM {MAPPING_TABLE}
    GROUP BY tml_file
    ORDER BY tml_file
    """
    display(spark.sql(summary_query))
except Exception as e:
    # Best-effort reporting cell: surface the error text and keep going.
    print(f"Could not display summary. Error: {e}")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Update Mapping Examples

# COMMAND ----------

# Print copy-paste SQL templates for filling in the mapping table.
# Literal JSON braces are doubled ({{ }}) because this is an f-string; only
# {MAPPING_TABLE} is interpolated.
# Both UPDATE templates filter on tml_file as well as visualization_id, matching
# the per-visualization UPDATE cells later in this notebook, so a copied
# statement cannot modify rows that belong to a different TML file.
print(f"""
--- How to Update the Mapping Table ---

OPTION 1: Unique dataset per visualization
------------------------------------------
UPDATE {MAPPING_TABLE}
SET 
  databricks_table_name_ToBeFilled = 'my_catalog.my_schema.orders_table',
  databricks_column_mapping_ToBeFilled = '{{"Order Date": "order_date", "Customer Name": "customer_name", "Total Revenue": "total_revenue"}}'
WHERE tml_file = 'your_tml_file_here.tml'
  AND visualization_id = 'your_viz_id_here'


OPTION 2: Shared dataset across multiple visualizations
-------------------------------------------------------
UPDATE {MAPPING_TABLE}
SET 
  common_dataset_name = 'ds_trips_shared',
  common_sql_query = 'SELECT pickup_zip, dropoff_zip, fare_amount, trip_distance FROM samples.nyctaxi.trips WHERE trip_distance > 0',
  common_column_mapping = '{{"pickup_zip": "pickup_zip", "dropoff_zip": "dropoff_zip", "fare_amount": "fare_amount"}}'
WHERE tml_file = 'NYC_Dashboard.tml' 
  AND visualization_id IN ('viz_1', 'viz_2', 'viz_3')


Query to see what needs mapping:
---------------------------------
SELECT 
  tml_file,
  visualization_name,
  tml_table_name,
  tml_columns_used,
  databricks_table_name_ToBeFilled,
  common_dataset_name
FROM {MAPPING_TABLE}
WHERE (databricks_table_name_ToBeFilled = '' OR databricks_table_name_ToBeFilled IS NULL)
  AND common_dataset_name IS NULL
ORDER BY tml_file, visualization_name


Query to identify potential shared datasets:
--------------------------------------------
SELECT 
  tml_table_name,
  COUNT(*) as num_visualizations,
  COLLECT_SET(visualization_name) as viz_names
FROM {MAPPING_TABLE}
GROUP BY tml_table_name
HAVING COUNT(*) > 1
ORDER BY num_visualizations DESC

Note: All *_column_mapping fields should be JSON strings mapping TML column names to Databricks column names.
Note: Always filter on tml_file together with visualization_id so an UPDATE cannot change rows that belong to another TML file.
""")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Next Steps
# MAGIC
# MAGIC 1. Review the metadata mapping table above
# MAGIC 2. Decide which visualizations should share datasets (query provided above can help identify candidates)
# MAGIC 3. For shared datasets:
# MAGIC    - Set `common_dataset_name` (e.g., 'ds_trips_shared')
# MAGIC    - Set `common_sql_query` (full SQL query)
# MAGIC    - Set `common_column_mapping` (JSON string)
# MAGIC 4. For unique datasets:
# MAGIC    - Set `databricks_table_name_ToBeFilled` (catalog.schema.table)
# MAGIC    - Set `databricks_column_mapping_ToBeFilled` (JSON string)
# MAGIC 5. Check the failure log table for any TML files that couldn't be processed
# MAGIC 6. Use this mapping table in your conversion script

In [0]:
%sql
-- Inspect every mapping row that belongs to the PaaS Tracking Card liveboard.
SELECT *
FROM ds_training_1.thoughtspot_inventory_ak.tml_dbx_metadata_mapping
WHERE tml_file = 'PaaS Tracking Card.liveboard.tml'

In [0]:
%sql
-- Point Viz_1 and Viz_2 of the PaaS Tracking Card liveboard at the shared
-- dataset 'ds_paas_summary_by_os' and clear their per-visualization mapping.
-- The tml_file predicate restricts the update to this liveboard; without it,
-- rows of any other TML file that reuse these visualization_id values would
-- be rewritten as well.
UPDATE ds_training_1.thoughtspot_inventory_ak.tml_dbx_metadata_mapping
SET
  common_dataset_name = 'ds_paas_summary_by_os',
  common_sql_query = 'SELECT Month_Date, os_platform, Viewed_PaaS_Tracking_Card, Clicked_Expand FROM ds_training_1.thoughtspot_inventory_ak.paas_tracking_card_summary',
  common_column_mapping = '{
    "Month(Date)": "Month_Date",
    "Os Platform": "os_platform",
    "Viewed PaaS Tracking Card": "Viewed_PaaS_Tracking_Card",
    "Clicked Expand": "Clicked_Expand"
  }',
  databricks_table_name_ToBeFilled = NULL,
  databricks_column_mapping_ToBeFilled = NULL
WHERE
  tml_file = 'PaaS Tracking Card.liveboard.tml'
  AND visualization_id IN ('Viz_1', 'Viz_2');

In [0]:
%sql
-- Point Viz_3..Viz_6 of the PaaS Tracking Card liveboard at the shared dataset
-- 'ds_paas_summary_monthly' and clear their per-visualization mapping.
-- The tml_file predicate restricts the update to this liveboard; without it,
-- rows of any other TML file that reuse these visualization_id values would
-- be rewritten as well.
-- NOTE(review): the keys "Order Confirmed" and "Shipped" resolve to the *_Pill
-- columns, while the selected Shipped column has no key of its own and the
-- Viz_6 cell in this notebook maps "Shipped" to Shipped. Confirm which
-- target is intended before relying on this shared mapping.
UPDATE ds_training_1.thoughtspot_inventory_ak.tml_dbx_metadata_mapping
SET
  common_dataset_name = 'ds_paas_summary_monthly',
  common_sql_query = 'SELECT Month_Date, Clicked_Order_Confirmation, Clicked_Order_Processing, Clicked_Track_Delivery, Clicked_Complete_Setup, Delivered, Onboarded, Support_Cases, Order_Confirmed_Pill, Processing_Pill, Shipped_Pill, Delivered_Pill, Confirmed, Processed, Shipped FROM ds_training_1.thoughtspot_inventory_ak.paas_tracking_card_monthly_summary',
  common_column_mapping = '{
    "Month(Date)": "Month_Date",
    "Clicked Order Confirmation": "Clicked_Order_Confirmation",
    "Clicked Order Processing": "Clicked_Order_Processing",
    "Clicked Track Delivery": "Clicked_Track_Delivery",
    "Clicked Complete Setup": "Clicked_Complete_Setup",
    "Order Confirmation": "Clicked_Order_Confirmation", 
    "Processing": "Clicked_Order_Processing", 
    "Track Delivery": "Clicked_Track_Delivery", 
    "Complete Setup": "Clicked_Complete_Setup", 
    "Delivered": "Delivered",
    "Onboarded": "Onboarded",
    "Support Cases": "Support_Cases",
    "Order Confirmed - Pill": "Order_Confirmed_Pill",
    "Processing - Pill": "Processing_Pill",
    "Shipped - Pill": "Shipped_Pill",
    "Delivered - Pill": "Delivered_Pill",
    "Order Confirmed": "Order_Confirmed_Pill",
    "Shipped": "Shipped_Pill",
    "Confirmed": "Confirmed",
    "Processed": "Processed"
  }',
  databricks_table_name_ToBeFilled = NULL,
  databricks_column_mapping_ToBeFilled = NULL
WHERE
  tml_file = 'PaaS Tracking Card.liveboard.tml'
  AND visualization_id IN ('Viz_3', 'Viz_4', 'Viz_5', 'Viz_6');

In [0]:
%sql
-- Map Viz_1 of the PaaS Tracking Card liveboard to its own table
-- (unique-dataset approach). All three common_* columns are cleared, not only
-- common_dataset_name, so that no shared-dataset query or column mapping
-- previously written to the row is left behind.
UPDATE
  ds_training_1.thoughtspot_inventory_ak.tml_dbx_metadata_mapping
SET
  databricks_table_name_ToBeFilled = 'ds_training_1.thoughtspot_inventory_ak.paas_tracking_card_summary',
  databricks_column_mapping_ToBeFilled = '{"Date":"Month_Date","Os Platform":"os_platform","Viewed PaaS Tracking Card":"Viewed_PaaS_Tracking_Card"}',
  common_dataset_name = NULL,
  common_sql_query = NULL,
  common_column_mapping = NULL
WHERE
  visualization_id = 'Viz_1'
  AND tml_file = 'PaaS Tracking Card.liveboard.tml'

In [0]:
%sql
-- Map Viz_2 of the PaaS Tracking Card liveboard to its own table
-- (unique-dataset approach). All three common_* columns are cleared, not only
-- common_dataset_name, so that no shared-dataset query or column mapping
-- previously written to the row is left behind.
UPDATE
  ds_training_1.thoughtspot_inventory_ak.tml_dbx_metadata_mapping
SET
  databricks_table_name_ToBeFilled = 'ds_training_1.thoughtspot_inventory_ak.paas_tracking_card_summary',
  databricks_column_mapping_ToBeFilled = '{"Date":"Month_Date","Os Platform":"os_platform","Clicked Expand":"Clicked_Expand"}',
  common_dataset_name = NULL,
  common_sql_query = NULL,
  common_column_mapping = NULL
WHERE
  visualization_id = 'Viz_2'
  AND tml_file = 'PaaS Tracking Card.liveboard.tml'

In [0]:
%sql
-- Map Viz_3 of the PaaS Tracking Card liveboard to its own table
-- (unique-dataset approach). All three common_* columns are cleared, not only
-- common_dataset_name, so that no shared-dataset query or column mapping
-- previously written to the row is left behind.
UPDATE
  ds_training_1.thoughtspot_inventory_ak.tml_dbx_metadata_mapping
SET
  databricks_table_name_ToBeFilled = 'ds_training_1.thoughtspot_inventory_ak.paas_tracking_card_monthly_summary',
  databricks_column_mapping_ToBeFilled = '{"Date":"Month_Date","Clicked Complete Setup":"Clicked_Complete_Setup","Clicked Order Confirmation":"Clicked_Order_Confirmation","Clicked Order Processing":"Clicked_Order_Processing","Clicked Track Delivery":"Clicked_Track_Delivery"}',
  common_dataset_name = NULL,
  common_sql_query = NULL,
  common_column_mapping = NULL
WHERE
  visualization_id = 'Viz_3'
  AND tml_file = 'PaaS Tracking Card.liveboard.tml'

In [0]:
%sql
-- Map Viz_4 of the PaaS Tracking Card liveboard to its own table
-- (unique-dataset approach). All three common_* columns are cleared, not only
-- common_dataset_name, so that no shared-dataset query or column mapping
-- previously written to the row is left behind.
UPDATE
  ds_training_1.thoughtspot_inventory_ak.tml_dbx_metadata_mapping
SET
  databricks_table_name_ToBeFilled = 'ds_training_1.thoughtspot_inventory_ak.paas_tracking_card_monthly_summary',
  databricks_column_mapping_ToBeFilled = '{"Date":"Month_Date","Delivered":"Delivered","Onboarded":"Onboarded","Support Cases":"Support_Cases"}',
  common_dataset_name = NULL,
  common_sql_query = NULL,
  common_column_mapping = NULL
WHERE
  visualization_id = 'Viz_4'
  AND tml_file = 'PaaS Tracking Card.liveboard.tml'

In [0]:
%sql
-- Map Viz_5 of the PaaS Tracking Card liveboard to its own table
-- (unique-dataset approach). All three common_* columns are cleared, not only
-- common_dataset_name, so that no shared-dataset query or column mapping
-- previously written to the row is left behind.
UPDATE
  ds_training_1.thoughtspot_inventory_ak.tml_dbx_metadata_mapping
SET
  databricks_table_name_ToBeFilled = 'ds_training_1.thoughtspot_inventory_ak.paas_tracking_card_monthly_summary',
  databricks_column_mapping_ToBeFilled = '{"Date":"Month_Date","Delivered - Pill":"Delivered_Pill","Order Confirmed - Pill":"Order_Confirmed_Pill","Processing - Pill":"Processing_Pill","Shipped - Pill":"Shipped_Pill"}',
  common_dataset_name = NULL,
  common_sql_query = NULL,
  common_column_mapping = NULL
WHERE
  visualization_id = 'Viz_5'
  AND tml_file = 'PaaS Tracking Card.liveboard.tml'

In [0]:
%sql
-- Map Viz_6 of the PaaS Tracking Card liveboard to its own table
-- (unique-dataset approach). All three common_* columns are cleared, not only
-- common_dataset_name, so that no shared-dataset query or column mapping
-- previously written to the row is left behind.
UPDATE
  ds_training_1.thoughtspot_inventory_ak.tml_dbx_metadata_mapping
SET
  databricks_table_name_ToBeFilled = 'ds_training_1.thoughtspot_inventory_ak.paas_tracking_card_monthly_summary',
  databricks_column_mapping_ToBeFilled = '{"Confirmed":"Confirmed","Delivered":"Delivered","Date":"Month_Date","Processed":"Processed","Shipped":"Shipped"}',
  common_dataset_name = NULL,
  common_sql_query = NULL,
  common_column_mapping = NULL
WHERE
  visualization_id = 'Viz_6'
  AND tml_file = 'PaaS Tracking Card.liveboard.tml'