<a href="https://colab.research.google.com/github/samer-glitch/Federated-Governance-and-Provenance-Scoring-for-Trustworthy-AI-A-Metadata-Ledger-Approach/blob/main/Provenance_Score_Engine_and_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# 1. INSTALL & IMPORT
!pip install --quiet pandas openpyxl
import pandas as pd, numpy as np, io, os
from datetime import datetime
from google.colab import files
from IPython.display import display, HTML

# 2. UPLOAD CONFIG & CHARTER
def _pick_upload(uploaded_files, keyword):
    """Return the first uploaded file name that contains *keyword*.

    Matching is by substring, so names that carry a suffix such as
    'config (8).xlsx' are still found.

    Raises:
        FileNotFoundError: if no uploaded name contains *keyword*; the message
            lists the names that were actually uploaded.
    """
    for fn in uploaded_files:
        if keyword in fn:
            return fn
    raise FileNotFoundError(
        f"No uploaded file name contains '{keyword}'. "
        f"Uploaded files: {list(uploaded_files)}"
    )


print("Upload config.xlsx and charter.xlsx")
uploaded = files.upload()
# The 'Config' sheet is turned into a Field -> Value lookup series.
config_file = _pick_upload(uploaded, "config")
config = pd.read_excel(io.BytesIO(uploaded[config_file]), sheet_name='Config').set_index('Field')['Value']
# Charter is for admin reference only
charter_file = _pick_upload(uploaded, "charter")
charter = pd.read_excel(io.BytesIO(uploaded[charter_file]), sheet_name='Charter')

# 3. UPLOAD DATA
print("Upload raw.csv")
data_file = files.upload()
# Only the first uploaded payload is parsed, regardless of its file name.
first_payload = next(iter(data_file.values()))
data = pd.read_csv(io.BytesIO(first_payload))
# Cells holding a literal '?' are treated as missing values.
data.replace(to_replace='?', value=np.nan, inplace=True)
# Normalise headers: trim whitespace, lower-case, then hyphens -> underscores.
data.columns = data.columns.str.strip()
data.columns = data.columns.str.lower()
data.columns = data.columns.str.replace('-', '_')

# 4. AUTOMATIC QUALITY SCORING (DIM 2)
def score_completeness(df):
    """Return 1 minus the average share of missing cells in *df*.

    The missing share is computed per column first and then averaged
    across columns.
    """
    missing_share_per_column = df.isna().mean()
    return 1 - missing_share_per_column.mean()
def score_duplication(df):
    """Return the fraction of rows in *df* that duplicate another row.

    Computed as 1 minus (distinct rows / total rows), so 0 means no
    duplicated rows at all.
    """
    total_rows = len(df)
    distinct_rows = len(df.drop_duplicates())
    return 1 - distinct_rows / total_rows
def score_error_rate(df):
    """Return the share of all cells in *df* that are missing."""
    missing_cells = df.isna().sum().sum()
    total_cells = df.size
    return missing_cells / total_cells
def score_consistency(df):
    """Return the mean per-column share of values with the expected Python type.

    For each column, missing values are dropped first. A column with nothing
    left counts as 1. A numeric-dtype column is scored by the share of values
    that are ``int``/``float`` instances; any other column by the share of
    values that are ``str`` instances. The result is the mean of those
    per-column shares.
    """
    def _column_share(values):
        # Share of non-missing entries matching the type expected for the dtype.
        present = values.dropna()
        if present.empty:
            return 1
        if pd.api.types.is_numeric_dtype(present):
            expected = (int, float)
        else:
            expected = str
        return present.apply(lambda x: isinstance(x, expected)).mean()

    return np.mean([_column_share(df[col]) for col in df])
def to_score(val, bins, labels):
    """Map *val* to an integer score using the given bin edges and labels.

    *bins* are the interval edges handed to ``pd.cut`` and *labels* supplies
    one integer per interval. The lowest edge is included in the first
    interval (``include_lowest=True``).
    """
    binned = pd.cut([val], bins=bins, labels=labels, include_lowest=True)
    return int(binned[0])

# Higher completeness/consistency earns a higher score; higher duplication/
# error rate earns a lower one (labels run 5 -> 0 for those two).
comp_score = to_score(
    score_completeness(data),
    bins=[0, .90, .95, .98, .99, .995, 1],
    labels=[0, 1, 2, 3, 4, 5],
)
dup_score = to_score(
    score_duplication(data),
    bins=[0, .001, .005, .01, .02, .05, 1],
    labels=[5, 4, 3, 2, 1, 0],
)
err_score = to_score(
    score_error_rate(data),
    bins=[0, .001, .005, .01, .02, .05, 1],
    labels=[5, 4, 3, 2, 1, 0],
)
cons_score = to_score(
    score_consistency(data),
    bins=[0, .7, .88, .93, .95, .98, 1],
    labels=[0, 1, 2, 3, 4, 5],
)
# Dimension 2 is the plain average of the four sub-scores.
dim2_score = np.mean([comp_score, dup_score, err_score, cons_score])

# 5. MANUAL SCORING (ADMINISTRATOR: refer to charter.xlsx)
def prompt_scores(dim_name, fields):
    """Interactively collect one integer score in the range 0-5 per field.

    Each field is prompted as ``"<dim_name>.<field> (0–5): "``. An answer that
    is not an integer, or lies outside 0..5, is rejected with a short message
    and the same field is asked again, so one typo neither aborts the run nor
    lets an out-of-range value skew the dimension average.

    Args:
        dim_name: Label of the dimension, used as the prompt prefix.
        fields: Iterable of field names to score, prompted in order.

    Returns:
        A list of ints (each 0..5), one per field, in the order of *fields*.
    """
    scores = []
    for f in fields:
        while True:
            raw = input(f"{dim_name}.{f} (0–5): ")
            try:
                value = int(raw)
            except ValueError:
                print("Please enter a whole number between 0 and 5.")
                continue
            if 0 <= value <= 5:
                scores.append(value)
                break
            print("Score must be between 0 and 5.")
    return scores

# Fields the administrator scores by hand, grouped by dimension.
dim1_fields = [
    "Source Reputation",
    "Data Controller",
    "Data Objective",
]
dim3_fields = [
    "Data Dictionary",
    "Version Logs",
    "Collection Protocol",
    "Updates on Definitions",
]
dim5_fields = [
    "Regulation Coverage",
    "Explicit Consent",
    "Geo-Restrictions",
    "Sensitivity Classification",
    "Audits & Certifications",
]
dim6_fields = [
    "License Terms",
    "Ethical Reviews",
    "Redistribution",
    "User Agreements",
]

print("Manual scoring based on your project policy and charter.xlsx.")
# Each dimension score is the mean of the answers given for its fields;
# dimensions are prompted in the order 1, 3, 5, 6.
dim1_answers = prompt_scores("Dim1", dim1_fields)
dim1_score = np.mean(dim1_answers)
dim3_answers = prompt_scores("Dim3", dim3_fields)
dim3_score = np.mean(dim3_answers)
dim5_answers = prompt_scores("Dim5", dim5_fields)
dim5_score = np.mean(dim5_answers)
dim6_answers = prompt_scores("Dim6", dim6_fields)
dim6_score = np.mean(dim6_answers)

# 6. FRESHNESS (DIM 4)
ex_date = input("Extraction date (YYYY-MM-DD): ")
# Age in years, counting 365 days per year.
elapsed = pd.Timestamp.now() - pd.Timestamp(ex_date)
years = elapsed.days / 365.0
# One point is lost per whole elapsed year; the result is clamped to 0..5.
freshness_score = 5 - int(years)
freshness_score = min(5, freshness_score)
freshness_score = max(0, freshness_score)

# 7. PROVENANCE SCORE + DECISION
# Weights W1..W6 come from the config sheet, one per dimension in order.
W = [float(config[f'W{k}']) for k in range(1, 7)]
dims = [dim1_score, dim2_score, dim3_score, freshness_score, dim5_score, dim6_score]
# Weighted sum of the six dimension scores.
pscore = sum(weight * value for weight, value in zip(W, dims))
accept_th = float(config['Accept_Threshold'])
review_th = float(config['Review_Threshold'])
# NOTE(review): quarantine_th is loaded but not referenced by the decision
# below or anywhere else in the visible code - confirm whether it should be.
quarantine_th = float(config['Quarantine_Threshold'])
# At or above the accept threshold -> ACCEPT; otherwise at or above the
# review threshold -> REVIEW; everything else -> QUARANTINE.
action = (
    'ACCEPT' if pscore >= accept_th
    else 'REVIEW' if pscore >= review_th
    else 'QUARANTINE'
)
# Single-row summary table, labelled 'Result'.
result_row = {
    'dim1': dim1_score,
    'dim2': dim2_score,
    'dim3': dim3_score,
    'dim4': freshness_score,
    'dim5': dim5_score,
    'dim6': dim6_score,
    'pscore': round(pscore, 3),
    'action': action,
    'records': len(data),
}
provenance_df = pd.DataFrame([result_row], index=['Result'])
display(HTML("<h3>Hybrid Provenance Scores & Actions</h3>"))
display(provenance_df)

# 8. LEDGER UPDATE
import openpyxl
from openpyxl import Workbook, load_workbook

ledger_file = 'ledger.xlsx'
if not os.path.exists(ledger_file):
    # No ledger yet: start a new workbook and write the header row once.
    wb = Workbook()
    ws = wb.active
    ws.append(['Timestamp','Dim1','Dim2','Dim3','Dim4','Dim5','Dim6','Pscore','Action','Records'])
else:
    # Existing ledger: reopen it and keep appending to its active sheet.
    wb = load_workbook(ledger_file)
    ws = wb.active
# One ledger row = current local timestamp followed by the result row values.
stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
row = [stamp, *provenance_df.iloc[0]]
ws.append(row)
wb.save(ledger_file)
print("Ledger updated.")


Upload config.xlsx and charter.xlsx


Saving charter.xlsx to charter (3).xlsx
Saving config.xlsx to config (8).xlsx
Upload raw.csv


Saving diabetic_data.csv to diabetic_data (4).csv
Manual scoring based on your project policy and charter.xlsx.
Dim1.Source Reputation (0–5): 5
Dim1.Data Controller (0–5): 5
Dim1.Data Objective (0–5): 5
Dim3.Data Dictionary (0–5): 5
Dim3.Version Logs (0–5): 5
Dim3.Collection Protocol (0–5): 1
Dim3.Updates on Definitions (0–5): 1
Dim5.Regulation Coverage (0–5): 5
Dim5.Explicit Consent (0–5): 5
Dim5.Geo-Restrictions (0–5): 5
Dim5.Sensitivity Classification (0–5): 2
Dim5.Audits & Certifications (0–5): 5
Dim6.License Terms (0–5): 5
Dim6.Ethical Reviews (0–5): 5
Dim6.Redistribution (0–5): 5
Dim6.User Agreements (0–5): 5
Extraction date (YYYY-MM-DD): 2016-12-01


Unnamed: 0,dim1,dim2,dim3,dim4,dim5,dim6,pscore,action,records
Result,5.0,2.75,3.0,0,4.4,5.0,3.782,ACCEPT,101766


Ledger updated.
