In [5]:
import pandas as pd
import os
import glob


def list_csv_files(path):
    # 使用glob模块查找所有.csv文件
    csv_files_path = glob.glob(os.path.join(path, "*.csv"))
    return csv_files_path


def parse_table(df: pd.DataFrame, database_name: str, table_name: str):
    df["description"] = df["column_description"].fillna("") + df[
        "value_description"
    ].fillna("")
    df["database_name"] = database_name
    df["table_name"] = table_name

    df_subset = df[
        ["database_name", "table_name", "original_column_name", "description"]
    ]
    df_subset["original_column_name"] = df_subset["original_column_name"].str.strip()
    dict_list = df_subset.to_dict("records")

    return dict_list


def parse_bird(base_path: str, output_path: str):
    database_names = [
        name
        for name in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, name))
    ]
    output = []
    for database_name in database_names:
        database_path = os.path.join(base_path, database_name)
        description_dir_path = os.path.join(database_path, "database_description")
        csv_files_path = list_csv_files(description_dir_path)
        for csv_file_path in csv_files_path:
            print(csv_file_path)
            df = pd.read_csv(csv_file_path)
            table_name = os.path.splitext(os.path.basename(csv_file_path))[0]
            output += parse_table(df, database_name, table_name)
    df = pd.DataFrame(output)

    df.to_csv(output_path, index=False)


dataset = [
    (
        os.path.abspath("../bird_bench/train/train_databases"),
        "./public_dataset/rag/bird_train.csv",
    ),
    (
        os.path.abspath("../bird_bench/dev/dev_databases"),
        "./public_dataset/rag/bird_dev.csv",
    ),
]
for input_path, output_path in dataset:
    parse_bird(input_path, output_path)

/home/data2/luzhan/projects/bird_bench/train/train_databases/beer_factory/database_description/rootbeerreview.csv
/home/data2/luzhan/projects/bird_bench/train/train_databases/beer_factory/database_description/customers.csv
/home/data2/luzhan/projects/bird_bench/train/train_databases/beer_factory/database_description/rootbeer.csv
/home/data2/luzhan/projects/bird_bench/train/train_databases/beer_factory/database_description/transaction.csv
/home/data2/luzhan/projects/bird_bench/train/train_databases/beer_factory/database_description/rootbeerbrand.csv
/home/data2/luzhan/projects/bird_bench/train/train_databases/beer_factory/database_description/geolocation.csv
/home/data2/luzhan/projects/bird_bench/train/train_databases/beer_factory/database_description/location.csv
/home/data2/luzhan/projects/bird_bench/train/train_databases/world/database_description/Country.csv
/home/data2/luzhan/projects/bird_bench/train/train_databases/world/database_description/City.csv
/home/data2/luzhan/projects/b