In [4]:
# Load ./out/training_data.parquet into df
import pandas as pd
import altair as alt
import json
# Enable Altair's VegaFusion data transformer for every chart built below.
alt.data_transformers.enable("vegafusion")

# Read the training examples and add a numeric rank for each complexity label
# (simple=0 ... extra complex=3) in the column `chart_complexity_ordinal`.
df = pd.read_parquet('./out/training_data.parquet').assign(
    chart_complexity_ordinal=lambda frame: frame['chart_complexity'].map(
        {'simple': 0, 'medium': 1, 'complex': 2, 'extra complex': 3}
    )
)

# Vega label expression: looks up the displayed label (the ordinal code 0-3)
# in an inline table and returns the matching human-readable complexity name.
complexity_labelExpr = """
        {
            0: 'Simple',
            1: 'Medium',
            2: 'Complex',
            3: 'Extra Complex'
        }[datum.label]
        """

# The same expression is reused for axis labels and for legend entries.
chart_complexity_format = alt.Axis(labelExpr=complexity_labelExpr)
chart_complexity_legend = alt.Legend(labelExpr=complexity_labelExpr)

# Vega label expression: looks up the displayed label (the raw chart_type key)
# in an inline table and returns its title-cased display name.
legend_labelExpr = """
    {
        "scatterplot": "Scatterplot",
        "barchart": "Barchart",
        "stacked_bar": "Stacked Bar",
        "grouped_bar": "Grouped Bar",
        "normalized_bar": "Normalized Bar",
        "circular": "Circular",
        "table": "Table",
        "line": "Line",
        "area": "Area",
        "grouped_line": "Grouped Line",
        "grouped_area": "Grouped Area",
        "grouped_scatter": "Grouped Scatter",
        "heatmap": "Heatmap",
        "histogram": "Histogram",
        "dot": "Dot",
        "grouped_dot": "Grouped Dot"
    }[datum.label]
    """

# Despite the variable name, the expression is used for axes as well as legends.
chart_type_format = alt.Axis(labelExpr=legend_labelExpr)
chart_type_legend = alt.Legend(labelExpr=legend_labelExpr)

In [5]:
# Histogram of df['spec_key_count'] (at most 100 bins), with bars coloured by
# complexity level and the complexity codes relabelled in the legend.
histogram = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("spec_key_count:Q", bin=alt.Bin(maxbins=100)),
        y=alt.Y("count():Q"),
        color=alt.Color("chart_complexity_ordinal:N", legend=chart_complexity_legend),
        tooltip=["count():Q", "spec_key_count:Q"],
    )
    .properties(width=800, height=400)
    .configure_axis(labelFontSize=12, titleFontSize=14)
    .configure_title(fontSize=16)
)

histogram.display()

In [6]:
# Count the training examples for every (complexity, chart type) combination.
grouped = df.groupby(['chart_complexity_ordinal', 'chart_type']).size().reset_index(name='count')

# Top-to-bottom order of the chart types on the y axis.
sort_order = [
    "table", "scatterplot", "barchart", "stacked_bar", "grouped_bar",
    "normalized_bar", "circular", "line", "area", "grouped_line",
    "grouped_area", "grouped_scatter", "heatmap", "histogram",
    "dot", "grouped_dot"
]

# Cells whose count exceeds this value are annotated in white, all others in
# black. NOTE(review): hand-picked for the current data, presumably to keep
# the numbers legible on the darkest cells - revisit if the dataset changes.
WHITE_TEXT_MIN_COUNT = 150000

# Both layers are derived from one base chart so that they always agree on
# field types, sort order and axis label formatting.
heatmap_base = alt.Chart(grouped).encode(
    y=alt.Y(
        'chart_type:N',
        title='Chart Type',
        axis=chart_type_format,
        sort=sort_order,
    ),
    x=alt.X(
        'chart_complexity_ordinal:O',
        title='Chart Complexity',
        axis=chart_complexity_format,
    ),
)

# Layer 1: coloured cells, darker blue for higher counts.
heatmap = heatmap_base.mark_rect().encode(
    color=alt.Color('count:Q', scale=alt.Scale(scheme='blues')),
    tooltip=['chart_type:N', 'chart_complexity_ordinal:O', 'count:Q'],
)

# Layer 2: the count printed in each cell with thousands separators.
textlayer = heatmap_base.mark_text().encode(
    text=alt.Text('count:Q', format=','),
    color=alt.condition(
        alt.datum.count > WHITE_TEXT_MIN_COUNT,
        alt.value('white'),  # count above the threshold: white text
        alt.value('black'),  # otherwise: black text
    ),
)

# Stack the layers, size the figure and keep the x-axis labels horizontal.
heatmap = (heatmap + textlayer).properties(
    width=400,
    height=400
).configure_axisX(
    labelAngle=0,
)

heatmap.save('./out/figures/chart_complexity.png')
heatmap.display()

In [7]:
# Per-dataset example counts by complexity level (numbers for the overview table).

# Long form: one row per (complexity, dataset_schema) pair with its count.
grouped = df.groupby(['chart_complexity_ordinal', 'dataset_schema']).size().reset_index(name='count')

# Wide form: one row per dataset, one column per complexity level, with the
# columns in ascending complexity order.
counts_wide = grouped.pivot(
    index='dataset_schema',
    columns='chart_complexity_ordinal',
    values='count',
).sort_index(axis=1)

# Prefix the column labels with 'complexity_' and list the datasets with the
# most complexity_0 examples first.
pivoted = counts_wide.set_axis(
    [f'complexity_{col}' for col in counts_wide.columns],
    axis=1,
).sort_values(by='complexity_0', ascending=False)

# Display the result
pivoted

Unnamed: 0_level_0,complexity_0,complexity_1,complexity_2,complexity_3
dataset_schema,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hubmap_2025-05-05,321069,514813,147680,41695
MetabolomicsWorkbench,3375,3994,10950,681
4DN,2661,1800,10225,525
MoTrPAC,2075,1525,7000,275
SenNet,725,650,3347,125


In [8]:
# Count the non-empty entities and non-empty fields of every dataset in the
# catalogue (these numbers also go into the overview table).

# Only parsing needs the file handle, so it is closed before any processing.
# JSON is read explicitly as UTF-8 instead of the platform default encoding.
with open('./datasets/output_catalogue.json', encoding='utf-8') as f:
    schema_list = json.load(f)

dataset_entity_counts = {}
dataset_field_counts = {}
for schema in schema_list:
    schema_name = schema["udi:name"]
    # An entity is a resource with at least one row; empty resources are
    # ignored together with all of their fields.
    populated_resources = [
        resource for resource in schema["resources"]
        if resource["udi:row_count"] != 0
    ]
    dataset_entity_counts[schema_name] = len(populated_resources)
    # Fields with a cardinality of 0 are not counted.
    dataset_field_counts[schema_name] = sum(
        1
        for resource in populated_resources
        for field in resource["schema"]["fields"]
        if field["udi:cardinality"] != 0
    )

# Both dicts are filled in the same loop, so their keys and key order match.
df_counts = pd.DataFrame(
    {
        'entity_count': list(dataset_entity_counts.values()),
        'field_count': list(dataset_field_counts.values()),
    },
    index=list(dataset_entity_counts),
)

# Fixed presentation order; datasets not listed here are placed after the
# listed ones.
custom_order = [
    "hubmap_2025-05-05",
    "MetabolomicsWorkbench",
    "4DN",
    "MoTrPAC",
    "SenNet"
]
order_rank = {name: position for position, name in enumerate(custom_order)}
# A stable sort keeps unlisted datasets (which all share the same rank) in
# their catalogue order instead of an unspecified one.
df_counts = (
    df_counts
    .assign(custom_order=[order_rank.get(name, len(custom_order)) for name in df_counts.index])
    .sort_values(by='custom_order', kind='stable')
    .drop(columns=['custom_order'])
)

# NOTE(review): index=False drops the dataset names, so the rows of the written
# file can only be identified by the order above; the file is tab-separated
# despite its .csv extension. Confirm both are intended.
df_counts.to_csv('./out/figures/dataset_entity_field_counts.csv', index=False, sep='\t')
df_counts

Unnamed: 0,entity_count,field_count
hubmap_2025-05-05,3,320
MetabolomicsWorkbench,22,99
4DN,20,101
MoTrPAC,14,68
SenNet,6,35
