<a href="https://colab.research.google.com/github/samer-glitch/Federated-Governance-and-Provenance-Scoring-for-Trustworthy-AI-A-Metadata-Ledger-Approach/blob/main/Provenance_Score_with_Multi_Dimensions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 1) Installs & Imports
# ──────────────────────────────────────────────────────────────────────────────
!pip install --quiet pandas numpy

import io, numpy as np, pandas as pd
from datetime import datetime
from IPython.display import display, HTML

# Colab upload helper
try:
    from google.colab import files
except ImportError:
    # google.colab is not importable here, so expose a stub that fails loudly
    # as soon as an upload is attempted.
    def upload_csv():
        """Always raise RuntimeError: the Colab upload widget is unavailable."""
        raise RuntimeError("This code is meant to run in Google Colab with files.upload().")
else:
    def upload_csv():
        """Run the Colab file-upload dialog and return the result of files.upload()."""
        return files.upload()

def display_df(df, title=""):
    """Show ``df`` in the notebook output, preceded by an optional heading.

    Args:
        df: Object handed to IPython's ``display`` (a DataFrame in this notebook).
        title: Heading text; when non-empty it is rendered as an ``<h3>``
            element before ``df``. The text is inserted into the HTML as-is.
    """
    outputs = [df]
    if title:
        outputs.insert(0, HTML(f"<h3>{title}</h3>"))
    for item in outputs:
        display(item)

# ──────────────────────────────────────────────────────────────────────────────
# 2) Upload, Clean & Shuffle
# ──────────────────────────────────────────────────────────────────────────────
print("⬆️ Upload your CSV (e.g. adult.csv)")
uploaded = upload_csv()
if not uploaded:
    # An empty result (e.g. a cancelled upload dialog) would otherwise surface
    # as a bare StopIteration from next(iter(...)) below.
    raise RuntimeError("No file was uploaded; re-run this cell and choose a CSV file.")

# Only the first uploaded file is read; any additional files are ignored.
raw = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))

# Normalise headers: trim whitespace, lower-case, and turn '-' into '_'.
raw.columns = raw.columns.str.strip().str.lower().str.replace('-', '_')

# '?' marks a missing value. Match it with optional surrounding whitespace so
# that cells such as ' ?' (left over from ", "-separated files) count as
# missing too, not only cells that are exactly '?'.
raw = raw.replace(r'^\s*\?\s*$', np.nan, regex=True)

# Fixed seed so the row shuffle is reproducible between runs.
RANDOM_STATE = 42
data = raw.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# ──────────────────────────────────────────────────────────────────────────────
# 3) Partition into Shards
# ──────────────────────────────────────────────────────────────────────────────
NUM_CLIENTS = int(input("Number of shards/clients: ").strip())
if NUM_CLIENTS < 1:
    raise ValueError("Number of shards/clients must be at least 1.")

# Draw the shard proportions from a seeded generator so that the partition is
# reproducible, like the row shuffle that uses the same RANDOM_STATE.
rng = np.random.default_rng(RANDOM_STATE)
fractions = rng.dirichlet([1.0] * NUM_CLIENTS)

# Cumulative proportions -> exclusive end row of each shard.
bounds = (fractions.cumsum() * len(data)).astype(int)
# Float rounding can leave the last bound short of len(data); pin it so the
# final shard absorbs any remaining rows.
bounds[-1] = len(data)
starts = np.concatenate(([0], bounds[:-1]))

# Shards are named C1..C{NUM_CLIENTS}. A very small proportion can yield a
# shard with zero rows.
client_data = {
    f"C{idx}": data.iloc[lo:hi].reset_index(drop=True)
    for idx, (lo, hi) in enumerate(zip(starts, bounds), start=1)
}

# ──────────────────────────────────────────────────────────────────────────────
# 4) Scoring Helpers (raw + bucketed)
# ──────────────────────────────────────────────────────────────────────────────
def score_completeness(df):
    """Measure how complete ``df`` is and bucket the result into 0-5.

    The raw value is the mean, across columns, of each column's share of
    non-missing cells. Buckets (upper edge inclusive): up to 0.90 -> 0,
    0.95 -> 1, 0.98 -> 2, 0.99 -> 3, 0.995 -> 4, 1.0 -> 5.

    Returns:
        Tuple ``(overall, score)``: raw completeness and its integer bucket.
    """
    filled_share = 1 - df.isna().mean()
    overall = filled_share.mean()

    edges = [0.0, 0.90, 0.95, 0.98, 0.99, 0.995, 1.0]
    grades = list(range(6))
    bucket = pd.cut([overall], bins=edges, labels=grades, include_lowest=True)
    return overall, int(bucket[0])

def score_duplication(df):
    """Measure the share of duplicated rows in ``df`` and bucket it into 0-5.

    The raw value is one minus the ratio of distinct rows to all rows; a lower
    share earns a higher score. Buckets (upper edge inclusive): up to
    0.001 -> 5, 0.005 -> 4, 0.01 -> 3, 0.02 -> 2, 0.05 -> 1, 1.0 -> 0.

    Returns:
        Tuple ``(dup_frac, score)``: raw duplicate share and its integer bucket.
    """
    total_rows = len(df)
    distinct_rows = len(df.drop_duplicates())
    dup_frac = 1 - distinct_rows / total_rows

    edges = [0.0, 0.001, 0.005, 0.01, 0.02, 0.05, 1.0]
    grades = list(range(5, -1, -1))
    bucket = pd.cut([dup_frac], bins=edges, labels=grades, include_lowest=True)
    return dup_frac, int(bucket[0])

def score_error_rate(df):
    """Measure the share of missing cells in ``df`` and bucket it into 0-5.

    The raw value is the number of missing cells divided by the total number
    of cells (rows x columns); a lower share earns a higher score. Buckets
    (upper edge inclusive): up to 0.001 -> 5, 0.005 -> 4, 0.01 -> 3,
    0.02 -> 2, 0.05 -> 1, 1.0 -> 0.

    Returns:
        Tuple ``(err_frac, score)``: raw missing-cell share and its integer bucket.
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    missing_cells = df.isna().sum().sum()
    err_frac = missing_cells / (n_rows * n_cols)

    edges = [0.0, 0.001, 0.005, 0.01, 0.02, 0.05, 1.0]
    grades = list(range(5, -1, -1))
    bucket = pd.cut([err_frac], bins=edges, labels=grades, include_lowest=True)
    return err_frac, int(bucket[0])

def score_consistency(df):
    """Measure per-column type consistency of ``df`` and bucket it into 0-5.

    For each column the non-missing values are checked against an expected
    Python type: ``int``/``float`` when pandas reports a numeric dtype, ``str``
    otherwise. The column's ratio is the share of values that pass; a column
    with no non-missing values counts as fully consistent. The raw value is
    the mean of the column ratios. Buckets (upper edge inclusive): up to
    0.70 -> 0, 0.88 -> 1, 0.93 -> 2, 0.95 -> 3, 0.98 -> 4, 1.0 -> 5.

    Returns:
        Tuple ``(overall, score)``: raw consistency and its integer bucket.
    """
    ratios = []
    for name in df:
        present = df[name].dropna()
        if present.empty:
            # Nothing to check in this column.
            ratios.append(1.0)
        else:
            if pd.api.types.is_numeric_dtype(present):
                expected = (int, float)
            else:
                expected = str
            matches = present.apply(lambda value: isinstance(value, expected))
            ratios.append(matches.mean())

    overall = float(np.mean(ratios))
    edges = [0.0, 0.70, 0.88, 0.93, 0.95, 0.98, 1.0]
    grades = list(range(6))
    bucket = pd.cut([overall], bins=edges, labels=grades, include_lowest=True)
    return overall, int(bucket[0])

def score_freshness(ex_date):
    """Score the age of an extraction date on a 0-5 scale (5 = most recent).

    Age is the number of whole days between ``ex_date`` and the current time,
    divided by 365. Older than 5 years -> 0, than 2 years -> 1, than 1 year -> 2,
    than half a year -> 3, than one month (1/12 year) -> 4, otherwise 5.

    Args:
        ex_date: Anything ``pd.Timestamp`` accepts (e.g. ``"YYYY-MM-DD"``).

    Returns:
        Integer score from 0 to 5.
    """
    age_years = (pd.Timestamp.now() - pd.Timestamp(ex_date)).days / 365.0
    # Thresholds from oldest to newest; the position in the tuple is the score.
    year_limits = (5, 2, 1, 0.5, 1/12)
    for score, limit in enumerate(year_limits):
        if age_years > limit:
            return score
    return 5

# ──────────────────────────────────────────────────────────────────────────────
# 5) Context & DataCard Inputs
# ──────────────────────────────────────────────────────────────────────────────
# Per-context weights W1..W6, applied to dim1..dim6 in that order when the
# provenance score is computed. Each row sums to 1.
ctx_weights = {
    "healthcare":  [0.25, 0.15, 0.10, 0.10, 0.30, 0.10],
    "finance":     [0.20, 0.25, 0.15, 0.10, 0.25, 0.05],
    "critical_ai": [0.30, 0.20, 0.15, 0.15, 0.10, 0.10],
    "general":     [0.15, 0.20, 0.20, 0.20, 0.15, 0.10],
}
# Accept any capitalisation and fail with a readable message rather than a
# bare KeyError when the context is not one of the known keys.
ctx = input("Context (healthcare/finance/critical_ai/general): ").strip().lower()
if ctx not in ctx_weights:
    raise ValueError(
        f"Unknown context {ctx!r}; expected one of: {', '.join(ctx_weights)}"
    )
W1, W2, W3, W4, W5, W6 = ctx_weights[ctx]


def _ask_scores(dim, fields):
    """Prompt for one integer rating per field of a dimension.

    Args:
        dim: Dimension prefix shown in the prompt (e.g. ``"Dim1"``).
        fields: Field names to ask about, in order.

    Returns:
        List of ints, one per field, each within 0..5.

    Raises:
        ValueError: If an answer is not an integer or lies outside 0..5.
    """
    scores = []
    for field in fields:
        value = int(input(f"{dim}.{field} (0–5): ").strip())
        # The prompt promises a 0–5 scale; anything else would silently skew
        # the weighted score, so reject it here.
        if not 0 <= value <= 5:
            raise ValueError(
                f"{dim}.{field} must be an integer from 0 to 5, got {value}"
            )
        scores.append(value)
    return scores


d1_fields = ["Source Reputation", "Data Controller", "Data Objective"]
d1 = _ask_scores("Dim1", d1_fields)

d3_fields = ["Data Dictionary", "Version Logs", "Collection Protocol", "Updates on Definitions"]
d3 = _ask_scores("Dim3", d3_fields)

d5_fields = ["Regulation Coverage", "Explicit Consent", "Geo-Restrictions",
             "Sensitivity Classification", "Audits & Certifications"]
d5 = _ask_scores("Dim5", d5_fields)

d6_fields = ["License Terms", "Ethical Reviews", "Redistribution", "User Agreements"]
d6 = _ask_scores("Dim6", d6_fields)

ex_date = pd.to_datetime(input("Extraction date (YYYY-MM-DD): ").strip())

# Decision thresholds per context as (accept, review): a score at or above the
# first value is accepted, at or above the second is sent to review.
decision_bounds = {
    "healthcare":  (2.9, 2.8),
    "finance":     (3, 2.5),
    "critical_ai": (3, 2.65),
    "general":     (2.5, 2.0),
}
accept_th, review_th = decision_bounds[ctx]

# ──────────────────────────────────────────────────────────────────────────────
# 6) Compute Hybrid Provenance Score & Decision
# ──────────────────────────────────────────────────────────────────────────────
# Dimensions 1, 3, 4, 5 and 6 come only from the DataCard answers and the
# extraction date, so they are the same for every shard: compute them once.
dim1 = np.mean(d1)
dim3 = np.mean(d3)
dim4 = score_freshness(ex_date)
dim5 = np.mean(d5)
dim6 = np.mean(d6)

records = []
for cid, df in client_data.items():
    if df.empty:
        # The quality scorers cannot handle an empty frame (division by zero /
        # NaN bucketing). With no data there is no evidence of quality, so the
        # shard gets the lowest data-quality value instead of crashing the run.
        dim2 = 0.0
    else:
        _, ch_score = score_completeness(df)
        _, dp_score = score_duplication(df)
        _, er_score = score_error_rate(df)
        _, cs_score = score_consistency(df)
        # Dimension 2 (data quality) is the plain mean of the four 0-5 buckets.
        dim2 = np.mean([ch_score, dp_score, er_score, cs_score])

    # Weighted sum of the six dimensions, rounded to 4 decimals. The decision
    # below uses this rounded value so that the reported score and the action
    # always agree (float noise such as 2.8999999 vs. a 2.9 threshold cannot
    # flip the outcome).
    pscore = round(float(W1*dim1 + W2*dim2 + W3*dim3 +
                         W4*dim4 + W5*dim5 + W6*dim6), 4)

    if pscore >= accept_th:
        action = "ACCEPT"
    elif pscore >= review_th:
        action = "REVIEW"
    else:
        action = "QUARANTINE"

    records.append({
        "client":  cid,
        "dim1":    dim1,
        "dim2":    dim2,
        "dim3":    dim3,
        "dim4":    dim4,
        "dim5":    dim5,
        "dim6":    dim6,
        "pscore":  pscore,
        "action":  action,
        "records": len(df),
    })

# One row per shard, indexed by the shard name.
provenance_df = pd.DataFrame(records).set_index("client")

# ──────────────────────────────────────────────────────────────────────────────
# 7) Display Results
# ──────────────────────────────────────────────────────────────────────────────
# Show the per-shard table first, then the same table with shards as columns.
for frame, heading in (
    (provenance_df,   "Hybrid Provenance Scores & Actions"),
    (provenance_df.T, "Transposed for Comparison"),
):
    display_df(frame, heading)


⬆️ Upload your CSV (e.g. adult.csv)


Saving adult.csv to adult (17).csv
Number of shards/clients: 20
Context (healthcare/finance/critical_ai/general): healthcare
Dim1.Source Reputation (0–5): 5
Dim1.Data Controller (0–5): 5
Dim1.Data Objective (0–5): 5
Dim3.Data Dictionary (0–5): 4
Dim3.Version Logs (0–5): 0
Dim3.Collection Protocol (0–5): 1
Dim3.Updates on Definitions (0–5): 0
Dim5.Regulation Coverage (0–5): 0
Dim5.Explicit Consent (0–5): 1
Dim5.Geo-Restrictions (0–5): 4
Dim5.Sensitivity Classification (0–5): 3
Dim5.Audits & Certifications (0–5): 3
Dim6.License Terms (0–5): 1
Dim6.Ethical Reviews (0–5): 0
Dim6.Redistribution (0–5): 4
Dim6.User Agreements (0–5): 2
Extraction date (YYYY-MM-DD): 1996-01-01


Unnamed: 0_level_0,dim1,dim2,dim3,dim4,dim5,dim6,pscore,action,records
client,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C1,5.0,4.25,1.25,0,2.2,1.75,2.8475,REVIEW,2532
C2,5.0,4.25,1.25,0,2.2,1.75,2.8475,REVIEW,1207
C3,5.0,4.25,1.25,0,2.2,1.75,2.8475,REVIEW,1124
C4,5.0,3.75,1.25,0,2.2,1.75,2.7725,QUARANTINE,413
C5,5.0,4.25,1.25,0,2.2,1.75,2.8475,REVIEW,145
C6,5.0,4.25,1.25,0,2.2,1.75,2.8475,REVIEW,3390
C7,5.0,4.25,1.25,0,2.2,1.75,2.8475,REVIEW,3438
C8,5.0,4.25,1.25,0,2.2,1.75,2.8475,REVIEW,1494
C9,5.0,3.75,1.25,0,2.2,1.75,2.7725,QUARANTINE,617
C10,5.0,4.25,1.25,0,2.2,1.75,2.8475,REVIEW,640


client,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20
dim1,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
dim2,4.25,4.25,4.25,3.75,4.25,4.25,4.25,4.25,3.75,4.25,4.25,4.25,4.25,4.25,4.25,4.75,4.25,4.25,4.25,3.75
dim3,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
dim4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dim5,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2
dim6,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75
pscore,2.8475,2.8475,2.8475,2.7725,2.8475,2.8475,2.8475,2.8475,2.7725,2.8475,2.8475,2.8475,2.8475,2.8475,2.8475,2.9225,2.8475,2.8475,2.8475,2.7725
action,REVIEW,REVIEW,REVIEW,QUARANTINE,REVIEW,REVIEW,REVIEW,REVIEW,QUARANTINE,REVIEW,REVIEW,REVIEW,REVIEW,REVIEW,REVIEW,ACCEPT,REVIEW,REVIEW,REVIEW,QUARANTINE
records,2532,1207,1124,413,145,3390,3438,1494,617,640,1929,3389,3250,2255,1531,131,262,3410,1390,14
