In [1]:
import pandas as pd

# Step 1: Define standard fields with requirement levels and justifications
#
# NOTE(review): the level markers below look like mis-decoded emoji; they are
# kept byte-for-byte because they are runtime strings — confirm against the
# original notebook before normalising them.
_CORE = "‚úÖ Core"
_RECOMMENDED = "üü° Recommended"
_OPTIONAL = "üü§ Optional"

# Checklist grouped by standard. Each entry is
# (field name, requirement level, justification).
_fields_by_standard = {
    "FAIR": [
        ("dc:title", _CORE, "Essential for discovery (DCMI)"),
        ("dc:description", _CORE, "Helps users interpret dataset meaning"),
        ("dc:creator", _CORE, "Attribution required by FAIR"),
        ("dc:license", _CORE, "Required for reuse clarity"),
        ("dc:publisher", _RECOMMENDED, "Common in DCAT and citation systems"),
        ("dc:subject", _RECOMMENDED, "Improves search and categorization"),
        ("dc:issued", _RECOMMENDED, "Standard publishing metadata"),
        ("dc:language", _OPTIONAL, "Relevant for multilingual datasets"),
        ("dcat:landingPage", _CORE, "Required by DCAT for linking dataset access"),
    ],
    "PROV-O": [
        ("prov:Entity", _CORE, "Describes the dataset or model as an object"),
        ("prov:Activity", _CORE, "Connects actions to data objects"),
        ("prov:Agent", _CORE, "Identifies responsible party"),
        ("prov:wasGeneratedBy", _CORE, "Links output to activity"),
        ("prov:used", _CORE, "Links input to activity"),
        ("prov:wasAssociatedWith", _RECOMMENDED, "Useful for complex systems"),
        ("prov:startedAtTime", _CORE, "Supports reproducibility"),
        ("prov:endedAtTime", _CORE, "Supports reproducibility"),
    ],
    "FAIR4ML": [
        ("fair4ml:trainedOn", _CORE, "Links model to dataset"),
        ("fair4ml:trainingStartTime", _CORE, "Temporal traceability"),
        ("fair4ml:trainingEndTime", _CORE, "Temporal traceability"),
        ("fair4ml:modelType", _CORE, "Essential for reuse and understanding"),
        ("fair4ml:targetVariable", _CORE, "Key for supervised learning"),
        ("fair4ml:trainingScriptVersion", _CORE, "Links model to source code"),
        ("fair4ml:runEnvironment", _RECOMMENDED, "Helpful for reproduction"),
    ],
    "MLSEA": [
        ("mlsea:accuracy", _CORE, "Primary performance metric"),
        ("mlsea:f1_score", _CORE, "Widely used for imbalance"),
        ("mlsea:roc_auc", _RECOMMENDED, "For probabilistic classifiers"),
        ("mlsea:precision", _RECOMMENDED, "Used in evaluation"),
        ("mlsea:recall", _RECOMMENDED, "Used in evaluation"),
    ],
    # NOTE(review): these entries carry an "mls:" prefix while being filed
    # under "Croissant" — confirm the intended standard label.
    "Croissant": [
        ("mls:modelName", _CORE, "Essential identifier"),
        ("mls:learningAlgorithm", _CORE, "Defines the method used"),
        ("mls:hyperparameters", _RECOMMENDED, "Enhances reproducibility"),
        ("mls:hasInput", _CORE, "Links to input dataset"),
        ("mls:hasOutput", _RECOMMENDED, "Expected result structure"),
    ],
    "Internal": [
        ("session_metadata.username", _CORE, "Links action to a person"),
        ("session_metadata.role", _RECOMMENDED, "Improves team attribution"),
        ("git_metadata.commit_hash", _CORE, "Traceability to exact code version"),
        ("git_metadata.branch", _RECOMMENDED, "Clarifies versioning"),
        ("justification.why_model", _RECOMMENDED, "Supports explainability"),
        ("justification.why_dataset", _RECOMMENDED, "Supports explainability"),
    ],
}

# Flat list of (standard, field, requirement level, justification) tuples,
# in the order the standards and their entries are written above.
validated_fields = [
    (standard, field_name, level, reason)
    for standard, entries in _fields_by_standard.items()
    for field_name, level, reason in entries
]

# Assign weights
weight_lookup = {_CORE: 1.0, _RECOMMENDED: 0.5, _OPTIONAL: 0.25}

# Create DataFrame (built column by column; one row per checklist entry)
validated_df = pd.DataFrame({
    "Standard": [entry[0] for entry in validated_fields],
    "Field": [entry[1] for entry in validated_fields],
    "Requirement": [entry[2] for entry in validated_fields],
    # A level missing from weight_lookup contributes a weight of 0.
    "Weight": [weight_lookup.get(entry[2], 0) for entry in validated_fields],
    "Justification": [entry[3] for entry in validated_fields],
})


In [3]:
import json

# Load structured metadata from your run
# JSON is read explicitly as UTF-8 so decoding does not depend on the
# platform's locale default encoding.
with open(
    "../MODEL_PROVENANCE/RandomForest_Iris_v20250616_154241/structured_metadata.json",
    "r",
    encoding="utf-8",
) as f:
    structured_metadata = json.load(f)

# Flatten all fields
#
# Naming rules applied to every top-level section whose value is a dict:
#   * key -> non-dict value : recorded as "key"
#   * key -> dict value     : recorded as "key.subkey" for each subkey
#                             (the bare "key" itself is NOT recorded)
# The top-level section name never appears in the recorded names, top-level
# values that are not dicts are skipped, and nesting below the second level
# is not descended into.
# NOTE(review): a checklist field whose metadata value is itself a dict will
# therefore only be visible through its "key.subkey" names — confirm this is
# the intended matching rule for the metadata schema.
flat_fields = set()
for fields in structured_metadata.values():
    if not isinstance(fields, dict):
        continue
    for key, value in fields.items():
        if isinstance(value, dict):
            flat_fields.update(f"{key}.{subkey}" for subkey in value)
        else:
            flat_fields.add(key)


In [4]:
# Compare and score
#
# Inputs from earlier cells:
#   validated_df - checklist frame; the columns read here are "Standard",
#                  "Field", "Requirement", "Weight" and "Justification".
#   flat_fields  - collection of captured field names; a checklist field
#                  counts as captured only on an exact name match.
#
# NOTE(review): the status markers and the banner prefix look like
# mis-decoded emoji; they are kept byte-for-byte because they are runtime
# strings — confirm against the original notebook before normalising them.

# Vectorised membership test: True where the checklist field was captured.
is_captured = validated_df["Field"].isin(flat_fields)

# One row per checklist field: captured fields score their full weight,
# missing fields score 0.
comparison_df = pd.DataFrame({
    "Standard": validated_df["Standard"],
    "Field": validated_df["Field"],
    "Requirement": validated_df["Requirement"],
    "Captured?": is_captured.map({True: "‚úÖ", False: "‚ùå"}),
    "Score": validated_df["Weight"].where(is_captured, 0.0),
    "Max Score": validated_df["Weight"],
    "Justification": validated_df["Justification"],
}).reset_index(drop=True)

# Totals as plain Python floats so they print as e.g. "15.0" / "32.25".
total_possible = float(comparison_df["Max Score"].sum())
total_achieved = float(comparison_df["Score"].sum())

# Row-wise records, kept under this name for any later cell that reads it.
comparison = comparison_df.to_dict("records")

# Summary
print("üîé Metadata Coverage Summary")
print(f"Score: {total_achieved} / {total_possible}")
if total_possible > 0:
    print(f"Coverage: {round((total_achieved / total_possible) * 100, 2)}%")
else:
    # Nothing to score against: avoid dividing by zero.
    print("Coverage: n/a (total possible score is 0)")
comparison_df.head()  # Display first few rows


üîé Metadata Coverage Summary
Score: 15.0 / 32.25
Coverage: 46.51%


Unnamed: 0,Standard,Field,Requirement,Captured?,Score,Max Score,Justification
0,FAIR,dc:title,‚úÖ Core,‚úÖ,1.0,1.0,Essential for discovery (DCMI)
1,FAIR,dc:description,‚úÖ Core,‚úÖ,1.0,1.0,Helps users interpret dataset meaning
2,FAIR,dc:creator,‚úÖ Core,‚úÖ,1.0,1.0,Attribution required by FAIR
3,FAIR,dc:license,‚úÖ Core,‚úÖ,1.0,1.0,Required for reuse clarity
4,FAIR,dc:publisher,üü° Recommended,‚ùå,0.0,0.5,Common in DCAT and citation systems
