In [12]:
import json
import os

def transform_data(original_data):
    """Convert raw table records into the columns/rows output layout.

    Args:
        original_data: Iterable of table mappings. Each one is read for
            ``"column_header"``, ``"data"`` and ``"id"`` (accessed by key) and
            for ``"title"`` and ``"url"`` (accessed with ``.get``).

    Returns:
        A list with one dict per input table, holding ``"columns"``,
        ``"rows"``, ``"tableId"``, ``"documentTitle"`` and ``"documentUrl"``.
        A missing title or URL becomes an empty string.
    """
    def _convert(table):
        # Only the first element of each header entry becomes the column text.
        header_cells = [{"text": entry[0]} for entry in table["column_header"]]

        # Every value of every data row is wrapped as {"text": value}.
        body_rows = [
            {"cells": [{"text": value} for value in record]}
            for record in table["data"]
        ]

        return {
            "columns": header_cells,
            "rows": body_rows,
            "tableId": table["id"],
            "documentTitle": table.get("title", ""),  # real title when present
            "documentUrl": table.get("url", ""),  # real URL when present
        }

    return [_convert(table) for table in original_data]

def main(input_file, output_file):
    """Read a JSON Lines file, transform its records and write JSON Lines back.

    Each non-blank line of ``input_file`` is parsed as one JSON value, the
    collected records are passed to ``transform_data``, and every transformed
    item is written to ``output_file`` as one JSON document per line.

    Args:
        input_file: Path of the JSON Lines file to read.
        output_file: Path of the JSON Lines file to write; it is opened in
            ``'w'`` mode, so existing content is replaced.

    Returns:
        None. Failures while reading or writing are reported with ``print``
        and end the call early instead of propagating.
    """
    original_data = []

    # Read the input records. The encoding is pinned to UTF-8 so decoding does
    # not depend on the platform's default locale.
    try:
        with open(input_file, 'r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if not line:
                    # A blank line carries no record; skipping it keeps a stray
                    # newline from aborting the whole import.
                    continue
                original_data.append(json.loads(line))
    except Exception as e:
        print(f"Error reading input file: {e}")
        return

    # Transform the records.
    transformed_result = transform_data(original_data)

    # Write the transformed records as JSON Lines, one object per line.
    try:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for item in transformed_result:
                outfile.write(json.dumps(item) + '\n')
    except Exception as e:
        print(f"Error writing output file: {e}")
        return

    print(f"Output written to {output_file}")

if __name__ == "__main__":
    # Source JSON Lines file and destination of the transformed JSON Lines output.
    input_file_path = "/home/sara.ferrari/data/AITQA/tables/AITQA_tables.jsonl"
    output_file_path = "/home/sara.ferrari/data/AITQA/tables/tables.jsonl"

    # Report a missing input up front; otherwise run the conversion.
    if not os.path.exists(input_file_path):
        print(f"Input file {input_file_path} does not exist.")
    else:
        main(input_file_path, output_file_path)


Output written to /home/sara.ferrari/data/AITQA/tables/tables.jsonl


In [18]:
import json
import csv

# Funzione per processare una singola tabella
def process_table(table_data, output_dir="/home/sara.ferrari/data/AITQA/tables_csv"):
    """Write one table record to ``<output_dir>/<title>.csv``.

    The cells are placed into a rectangular grid sized by the largest
    ``row_idx`` and ``col_idx`` found; positions without a cell stay as empty
    strings. Every ``"|"`` is removed from a cell's text and surrounding
    whitespace is stripped before it is stored.

    Args:
        table_data: Mapping with a ``"title"`` (used as the CSV file name) and
            ``"cells"``, a sequence of mappings that each provide ``"row_idx"``,
            ``"col_idx"`` and ``"text"``.
        output_dir: Directory that receives the CSV file. It is created when
            missing. Defaults to the location this notebook has always used.

    Returns:
        None. The path of the written file is printed.
    """
    # Local import: this cell's own import list only brings in json and csv.
    import os

    title = table_data["title"]
    cells = table_data["cells"]

    # Grid size comes from the highest indices; a table without cells yields
    # an empty grid instead of making max() fail on an empty sequence.
    if cells:
        n_rows = max(cell["row_idx"] for cell in cells) + 1
        n_cols = max(cell["col_idx"] for cell in cells) + 1
    else:
        n_rows = n_cols = 0

    table = [["" for _ in range(n_cols)] for _ in range(n_rows)]

    # Fill the grid, dropping the "|" delimiter from the cell text.
    for cell in cells:
        table[cell["row_idx"]][cell["col_idx"]] = cell["text"].replace("|", "").strip()

    # NOTE(review): title is used verbatim as the file name; a title containing
    # a path separator would point outside output_dir - confirm titles are safe.
    os.makedirs(output_dir, exist_ok=True)
    csv_file_name = os.path.join(output_dir, f"{title}.csv")
    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as file:
        csv.writer(file).writerows(table)

    print(f"File CSV creato: {csv_file_name}")

# Funzione principale per leggere il file JSONL e processare ciascuna tabella
def convert_jsonl_to_csv(jsonl_file_path):
    """Read a JSON Lines file and hand each record to ``process_table``.

    Args:
        jsonl_file_path: Path of a UTF-8 JSON Lines file holding one JSON
            document per line.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(jsonl_file_path, mode="r", encoding="utf-8") as file:
        for line in file:
            record = line.strip()
            if not record:
                # Blank or whitespace-only lines hold no table; json.loads("")
                # would raise, so they are skipped.
                continue
            process_table(json.loads(record))

# Run the conversion
jsonl_file_path = "/home/sara.ferrari/data/AITQA/tables/transformed_tables4.jsonl"   # Replace with the path to your JSONL file
convert_jsonl_to_csv(jsonl_file_path)

File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-0.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-1.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-2.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-3.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-4.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-5.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-6.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-7.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-8.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-9.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-10.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-11.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-12.csv
File CSV creato: /home/sara.ferrari/data/AITQA/tables_csv/tab-13.csv
File CSV creato: /home/sara.ferrari/data/AIT