In [3]:
import pandas as pd
import numpy as np

def check_submission_vs_template(template_path, submission_path):
    """Compare a submission CSV against a template CSV and print a report.

    Both files are read with ``pd.read_csv``. The function prints one section
    per check (shape, column names/order, dtypes, first-column IDs, NaNs in
    prediction columns, [0, 1] range of prediction columns, ``Unnamed``
    columns) and finishes with a numbered summary of every problem collected.

    The first template column is treated as the ID column; every other
    template column is treated as a prediction column.

    Structural problems in the submission (a prediction column that is
    missing or non-numeric, or a template with no prediction columns) are
    reported in the summary instead of aborting the run with an exception.

    Parameters
    ----------
    template_path : path accepted by ``pd.read_csv``
        Location of the reference template CSV.
    submission_path : path accepted by ``pd.read_csv``
        Location of the submission CSV to validate.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The loaded template and submission frames, in that order.
    """
    print("=== Loading files ===")
    template = pd.read_csv(template_path)
    sub = pd.read_csv(submission_path)

    print(f"Template path : {template_path}")
    print(f"Submission path: {submission_path}")
    print()

    # Human-readable problem descriptions, printed in the final summary.
    issues = []

    # 1) Basic shape
    print("=== Shape check ===")
    print(f"Template shape : {template.shape}")
    print(f"Submission shape: {sub.shape}")
    if template.shape != sub.shape:
        issues.append("Shape mismatch (rows or columns differ).")
    print()

    # 2) Column names and order
    print("=== Column names & order ===")
    print("Template columns :", template.columns.tolist())
    print("Submission columns:", sub.columns.tolist())
    if list(template.columns) != list(sub.columns):
        issues.append("Column names or order do not match template exactly.")
    print()

    # 3) Dtypes (only for columns that exist in both files)
    print("=== dtypes comparison ===")
    print("Template dtypes:")
    print(template.dtypes)
    print("\nSubmission dtypes:")
    print(sub.dtypes)
    for col in template.columns:
        if col in sub.columns:
            if template.dtypes[col] != sub.dtypes[col]:
                issues.append(f"dtype mismatch in column '{col}': template={template.dtypes[col]}, submission={sub.dtypes[col]}")
    print()

    # 4) ID / first-column checks
    id_col = template.columns[0]
    print("=== ID column check ===")
    print(f"ID column name (template): {id_col}")
    if id_col != sub.columns[0]:
        issues.append(f"First column name differs: template='{id_col}', submission='{sub.columns[0]}'")

    template_ids = template[id_col].values
    # Fall back to the submission's first column when the ID column was renamed.
    sub_ids = sub[id_col].values if id_col in sub.columns else sub.iloc[:, 0].values

    same_len = len(template_ids) == len(sub_ids)
    same_set = set(template_ids) == set(sub_ids)
    same_order = np.array_equal(template_ids, sub_ids)

    print(f"Same length : {same_len}")
    print(f"Same ID set : {same_set}")
    print(f"Same ID order: {same_order}")

    if not same_len:
        issues.append("Number of IDs differs from template.")
    if not same_set:
        issues.append("IDs in submission do not match template IDs.")
    if not same_order:
        issues.append("ID order differs from template (usually OK, but some scripts assume same order).")
    print()

    # 5) NaN check in prediction columns (all except first)
    print("=== NaN check in prediction columns ===")
    pred_cols = template.columns[1:]
    # Only index the submission with columns it really has; selecting a
    # missing label would raise KeyError and hide every later check.
    present_cols = [c for c in pred_cols if c in sub.columns]
    missing_cols = [c for c in pred_cols if c not in sub.columns]
    if missing_cols:
        print("Missing prediction columns:", missing_cols)
        issues.append(f"Submission is missing prediction columns: {missing_cols}")
    nan_counts = sub[present_cols].isna().sum()
    print(nan_counts)
    if nan_counts.sum() > 0:
        issues.append("NaN values found in prediction columns.")
    print()

    # 6) Value range check for prediction columns
    print("=== Range check for prediction columns (should be probabilities in [0, 1]) ===")
    if present_cols:
        desc = sub[present_cols].describe()
        print(desc)
    else:
        # DataFrame.describe() raises ValueError on a frame without columns.
        print("No prediction columns available to describe.")
    for col in present_cols:
        if not pd.api.types.is_numeric_dtype(sub[col]):
            # Comparing e.g. strings with floats would raise TypeError.
            issues.append(f"Column '{col}' is non-numeric (dtype={sub[col].dtype}); cannot check [0,1] range.")
            continue
        col_min = sub[col].min()
        col_max = sub[col].max()
        # Small tolerance so floating-point noise around 0 and 1 is not flagged.
        if col_min < 0 - 1e-6 or col_max > 1 + 1e-6:
            issues.append(f"Column '{col}' has values outside [0,1]: min={col_min}, max={col_max}")
    print()

    # 7) Extra hidden columns (e.g. index)
    print("=== Check for hidden/index-like columns ===")
    # Example: columns starting with 'Unnamed:'
    # str() keeps the check safe if a column label is not a string.
    unnamed_cols = [c for c in sub.columns if str(c).startswith("Unnamed")]
    if unnamed_cols:
        print("Submission has unnamed columns:", unnamed_cols)
        issues.append(f"Submission has extra unnamed columns: {unnamed_cols}")
    else:
        print("No unnamed columns found.")
    print()

    # Summary
    print("=== SUMMARY ===")
    if not issues:
        print("No obvious format issues detected. Submission looks consistent with the template.")
    else:
        print("Potential issues found:")
        for i, msg in enumerate(issues, 1):
            print(f"{i}. {msg}")

    return template, sub


In [4]:
# Reference file: the original template that was downloaded.
template_path = "onsite_test_submission.csv"
# File under test: swap in whichever submission is about to be uploaded.
submission_path = "submission/submission_resnet18_task1_1.csv"

# Run every format check and keep both loaded frames for further inspection.
template_df, sub_df = check_submission_vs_template(
    template_path=template_path,
    submission_path=submission_path,
)


=== Loading files ===
Template path : onsite_test_submission.csv
Submission path: submission/submission_resnet18_task1_1.csv

=== Shape check ===
Template shape : (250, 4)
Submission shape: (250, 4)

=== Column names & order ===
Template columns : ['id', 'D', 'G', 'A']
Submission columns: ['id', 'D', 'G', 'A']

=== dtypes comparison ===
Template dtypes:
id    object
D      int64
G      int64
A      int64
dtype: object

Submission dtypes:
id     object
D     float64
G     float64
A     float64
dtype: object

=== ID column check ===
ID column name (template): id
Same length : True
Same ID set : True
Same ID order: True

=== NaN check in prediction columns ===
D    0
G    0
A    0
dtype: int64

=== Range check for prediction columns (should be probabilities in [0, 1]) ===
                D             G           A
count  250.000000  2.500000e+02  250.000000
mean     0.498471  1.948710e-01    0.274850
std      0.430912  2.781468e-01    0.344993
min      0.000002  7.872292e-07    0.000045
