## Label

labelme C:/Users/Charlotte/Desktop/dissertation/US_new/High_quality_images

In [30]:
import os, json
import pandas as pd

json_dir = r"C:\Users\Charlotte\Desktop\dissertation\US_new\Labelled_image"
records = []

for f in os.listdir(json_dir):
    if f.endswith(".json"):
        path = os.path.join(json_dir, f)
        with open(path, encoding="utf-8") as jf:
            data = json.load(jf)
            for shape in data.get("shapes", []):
                pts = shape["points"]
                if len(pts) == 2:
                    (x1, y1), (x2, y2) = pts
                    dist = ((x1 - x2)**2 + (y1 - y2)**2)**0.5
                    records.append({
                        "Filename": f.replace(".json", ""),
                        "Label": shape.get("label", ""),
                        "x1": x1, "y1": y1,
                        "x2": x2, "y2": y2,
                        "Pixel_Distance": dist
                    })

df = pd.DataFrame(records)

out_path = r"C:\Users\Charlotte\Desktop\dissertation\US_new\annotation_points_summary.xlsx"
df.to_excel(out_path, index=False)
print(f"{len(df)} measurement point coordinates have been extracted and saved to: {out_path}")

1380 measurement point coordinates have been extracted and saved to: C:\Users\Charlotte\Desktop\dissertation\US_new\annotation_points_summary.xlsx


In [31]:
import pandas as pd
import re

df = pd.read_excel("annotation_points_summary.xlsx")

def extract_info(filename, label):
    fn = str(filename)
    lb = str(label).lower()

    # Patient: Abbey_001 → Abbey 001
    patient_match = re.search(r"Abbey[_\s]*\d{1,3}", fn, re.I)
    patient = patient_match.group(0).replace("_", " ").strip() if patient_match else None

    # Stage: 匹配 base 或 v3/v4/v5
    stage_match = re.search(r"(base|v\d+)", fn, re.I)
    # Stage: 修正版
    stage = None
    if "v1" in fn.lower():
        stage = "base"
    elif "v3" in fn.lower():
        stage = "3"
    elif "v4" in fn.lower():
        stage = "4"
    elif "v5" in fn.lower():
        stage = "5"
    else:
        # 兜底匹配 base
        if "base" in fn.lower():
            stage = "base"


    # Location
    location = "contralateral" if "contralateral" in fn.lower() else ("treat" if "treat" in fn.lower() else None)

    # Part from Label
    if "medial" in lb:
        part = "medial"
    elif "femoral" in lb:
        part = "femoral"
    elif "lateral" in lb:
        part = "lateral"
    else:
        part = None

    return patient, stage, location, part

df[["Patient", "Stage", "Location", "Part"]] = df.apply(lambda x: pd.Series(extract_info(x["Filename"], x["Label"])), axis=1)

grouped = df.groupby(["Patient", "Stage", "Location", "Part"])["Pixel_Distance"].mean().reset_index()

pivot = grouped.pivot_table(index="Patient", columns=["Part","Location","Stage"], values="Pixel_Distance")
pivot.columns = [f"US_{p}_{l}_{s}" for (p,l,s) in pivot.columns]
pivot.reset_index(inplace=True)

cols = ["Patient",
    "US_medial_treat_base","US_femoral_treat_base","US_lateral_treat_base",
    "US_medial_contralateral_base","US_femoral_contralateral_base","US_lateral_contralateral_base",
    "US_medial_treat_3","US_femoral_treat_3","US_lateral_treat_3",
    "US_medial_contralateral_3","US_femoral_contralateral_3","US_lateral_contralateral_3",
    "US_medial_treat_4","US_femoral_treat_4","US_lateral_treat_4",
    "US_medial_contralateral_4","US_femoral_contralateral_4","US_lateral_contralateral_4",
    "US_medial_treat_5","US_femoral_treat_5","US_lateral_treat_5",
    "US_medial_contralateral_5","US_femoral_contralateral_5","US_lateral_contralateral_5"
]

for c in cols:
    if c not in pivot.columns:
        pivot[c] = None
pivot = pivot[cols]

pivot.to_excel("annotation_points_summary_averaged_formatted.xlsx", index=False)
print("Book-format summary saved as annotation_points_summary_averaged_formatted.xlsx")

Book-format summary saved as annotation_points_summary_averaged_formatted.xlsx


In [33]:
import numpy as np
import pandas as pd

df_book = pd.read_excel('Book2.xlsx')
df_ann = pd.read_excel('annotation_points_summary_averaged_formatted.xlsx')

us_cols_book = [c for c in df_book.columns if c.startswith('US_')]
us_cols_ann = [c for c in df_ann.columns if c.startswith('US_')]
overlap = [c for c in us_cols_book if c in us_cols_ann]

merged = pd.merge(df_book[['Patient'] + overlap], df_ann[['Patient'] + overlap], on='Patient', how='inner', suffixes=('_book', '_ann'))

pairs_true = []
pairs_pred = []
for col in overlap:
    tb = f'{col}_book'
    ta = f'{col}_ann'
    sub = merged[[tb, ta]].dropna()
    if not sub.empty:
        pairs_true.append(sub[tb].values.astype(float))
        pairs_pred.append(sub[ta].values.astype(float))
if len(pairs_true) == 0:
    raise ValueError('No valid overlapping measurements to optimize on.')
y_true = np.concatenate(pairs_true)
x_pred = np.concatenate(pairs_pred)

ratios = np.linspace(0.001, 0.1, 200)
best_ratio = None
best_mae = float('inf')
for r in ratios:
    mae = np.mean(np.abs(y_true - x_pred * r))
    if mae < best_mae:
        best_mae = mae
        best_ratio = r

df_ann_scaled = df_ann.copy()
for col in overlap:
    if col in df_ann_scaled.columns:
        df_ann_scaled[col] = df_ann_scaled[col].astype(float) * best_ratio

merged_scaled = pd.merge(df_book[['Patient'] + overlap], df_ann_scaled[['Patient'] + overlap], on='Patient', how='inner', suffixes=('_book', '_ann'))

errs = []
rows = []
for col in overlap:
    tb = f'{col}_book'
    ta = f'{col}_ann'
    sub = merged_scaled[['Patient', tb, ta]].dropna()
    if not sub.empty:
        e = sub[ta].values.astype(float) - sub[tb].values.astype(float)
        errs.append(e)
        for i in range(len(sub)):
            rows.append((sub.iloc[i]['Patient'], col, float(sub.iloc[i][ta]), float(sub.iloc[i][tb]), float(e[i])))
if len(errs) == 0:
    raise ValueError('No valid pairs after scaling.')
errs_all = np.concatenate(errs)
mu = float(np.mean(errs_all))
sd = float(np.std(errs_all))
lower = mu - 2.0 * sd
upper = mu + 2.0 * sd

abnormal = []
for patient, col, pred_cm, book_cm, e in rows:
    if e < lower or e > upper:
        abnormal.append((patient, col, pred_cm, book_cm, e))

df_ann_scaled.to_excel('annotation_points_summary_averaged_formatted.xlsx', index=False)

print('Optimized ratio:', round(best_ratio, 6))
print('Pairs used:', len(errs_all))
print('Error mean:', round(mu, 6))
print('Error std:', round(sd, 6))
print('Lower bound (mean-2SD):', round(lower, 6))
print('Upper bound (mean+2SD):', round(upper, 6))
print('Abnormal count:', len(abnormal))
if len(abnormal) > 0:
    head = min(50, len(abnormal))
    print('Abnormal samples (first {}):'.format(head))
    for i in range(head):
        p, c, pv, bv, e = abnormal[i]
        print('Patient:', p, '| Measure:', c, '| Pred(cm):', round(pv, 4), '| Book(cm):', round(bv, 4), '| Error:', round(e, 4))


Optimized ratio: 0.003985
Pairs used: 402
Error mean: -0.008319
Error std: 0.055875
Lower bound (mean-2SD): -0.12007
Upper bound (mean+2SD): 0.103432
Abnormal count: 18
Abnormal samples (first 18):
Patient: Abbey 009 | Measure: US_femoral_treat_base | Pred(cm): 0.3799 | Book(cm): 0.25 | Error: 0.1299
Patient: Abbey 003 | Measure: US_medial_contralateral_base | Pred(cm): 0.0613 | Book(cm): 0.22 | Error: -0.1587
Patient: Abbey 011 | Measure: US_medial_contralateral_base | Pred(cm): 0.1516 | Book(cm): 0.04 | Error: 0.1116
Patient: Abbey 009 | Measure: US_femoral_contralateral_base | Pred(cm): 0.3419 | Book(cm): 0.21 | Error: 0.1319
Patient: Abbey 018 | Measure: US_femoral_contralateral_base | Pred(cm): 0.4293 | Book(cm): 0.19 | Error: 0.2393
Patient: Abbey 010 | Measure: US_medial_treat_3 | Pred(cm): 0.075 | Book(cm): 0.2 | Error: -0.125
Patient: Abbey 011 | Measure: US_femoral_treat_3 | Pred(cm): 0.2104 | Book(cm): 0.08 | Error: 0.1304
Patient: Abbey 009 | Measure: US_femoral_contralater

## re-Label

labelme C:/Users/Charlotte/Desktop/dissertation/US_new/High_quality_images

应该是Abbey 13 - Patient: Abbey 009 | Measure: US_femoral_treat_base （删掉）
Patient: Abbey 003 | Measure: US_medial_contralateral_base （修改）
Patient: Abbey 011 | Measure: US_medial_contralateral_base （修改）
Patient: Abbey 009 | Measure: US_femoral_contralateral_base（修改）
Patient: Abbey 018 | Measure: US_femoral_contralateral_base（修改）
Patient: Abbey 010 | Measure: US_medial_treat_3（修改）
Patient: Abbey 011 | Measure: US_femoral_treat_3（修改）
Patient: Abbey 009 | Measure: US_femoral_contralateral_3（修改）
Patient: Abbey 011 | Measure: US_lateral_contralateral_3（修改）
Patient: Abbey 009 | Measure: US_medial_treat_4（修改）
Patient: Abbey 012 | Measure: US_femoral_treat_4（修改）
Patient: Abbey 003 | Measure: US_lateral_contralateral_4（修改）
Patient: Abbey 007 | Measure: US_lateral_contralateral_4（修改）
Patient: Abbey 007 | Measure: US_femoral_treat_5（修改）
Patient: Abbey 007 | Measure: US_lateral_treat_5（修改）
Patient: Abbey 003 | Measure: US_lateral_contralateral_5（修改）
Patient: Abbey 005 | Measure: US_lateral_contralateral_5（修改）
Patient: Abbey 007 | Measure: US_lateral_contralateral_5

In [34]:
import os, json
import pandas as pd

json_dir = r"C:\Users\Charlotte\Desktop\dissertation\US_new\Labelled_image"
records = []

for f in os.listdir(json_dir):
    if f.endswith(".json"):
        path = os.path.join(json_dir, f)
        with open(path, encoding="utf-8") as jf:
            data = json.load(jf)
            for shape in data.get("shapes", []):
                pts = shape["points"]
                if len(pts) == 2:
                    (x1, y1), (x2, y2) = pts
                    dist = ((x1 - x2)**2 + (y1 - y2)**2)**0.5
                    records.append({
                        "Filename": f.replace(".json", ""),
                        "Label": shape.get("label", ""),
                        "x1": x1, "y1": y1,
                        "x2": x2, "y2": y2,
                        "Pixel_Distance": dist
                    })

df = pd.DataFrame(records)

out_path = r"C:\Users\Charlotte\Desktop\dissertation\US_new\annotation_points_summary2.xlsx"
df.to_excel(out_path, index=False)
print(f"{len(df)} measurement point coordinates have been extracted and saved to: {out_path}")

1372 measurement point coordinates have been extracted and saved to: C:\Users\Charlotte\Desktop\dissertation\US_new\annotation_points_summary2.xlsx


In [35]:
import pandas as pd
import re

df = pd.read_excel("annotation_points_summary2.xlsx")

def extract_info(filename, label):
    fn = str(filename)
    lb = str(label).lower()

    # Patient: Abbey_001 → Abbey 001
    patient_match = re.search(r"Abbey[_\s]*\d{1,3}", fn, re.I)
    patient = patient_match.group(0).replace("_", " ").strip() if patient_match else None

    # Stage: 匹配 base 或 v3/v4/v5
    stage_match = re.search(r"(base|v\d+)", fn, re.I)
    # Stage: 修正版
    stage = None
    if "v1" in fn.lower():
        stage = "base"
    elif "v3" in fn.lower():
        stage = "3"
    elif "v4" in fn.lower():
        stage = "4"
    elif "v5" in fn.lower():
        stage = "5"
    else:
        # 兜底匹配 base
        if "base" in fn.lower():
            stage = "base"


    # Location
    location = "contralateral" if "contralateral" in fn.lower() else ("treat" if "treat" in fn.lower() else None)

    # Part from Label
    if "medial" in lb:
        part = "medial"
    elif "femoral" in lb:
        part = "femoral"
    elif "lateral" in lb:
        part = "lateral"
    else:
        part = None

    return patient, stage, location, part

df[["Patient", "Stage", "Location", "Part"]] = df.apply(lambda x: pd.Series(extract_info(x["Filename"], x["Label"])), axis=1)

grouped = df.groupby(["Patient", "Stage", "Location", "Part"])["Pixel_Distance"].mean().reset_index()

pivot = grouped.pivot_table(index="Patient", columns=["Part","Location","Stage"], values="Pixel_Distance")
pivot.columns = [f"US_{p}_{l}_{s}" for (p,l,s) in pivot.columns]
pivot.reset_index(inplace=True)

cols = ["Patient",
    "US_medial_treat_base","US_femoral_treat_base","US_lateral_treat_base",
    "US_medial_contralateral_base","US_femoral_contralateral_base","US_lateral_contralateral_base",
    "US_medial_treat_3","US_femoral_treat_3","US_lateral_treat_3",
    "US_medial_contralateral_3","US_femoral_contralateral_3","US_lateral_contralateral_3",
    "US_medial_treat_4","US_femoral_treat_4","US_lateral_treat_4",
    "US_medial_contralateral_4","US_femoral_contralateral_4","US_lateral_contralateral_4",
    "US_medial_treat_5","US_femoral_treat_5","US_lateral_treat_5",
    "US_medial_contralateral_5","US_femoral_contralateral_5","US_lateral_contralateral_5"
]

for c in cols:
    if c not in pivot.columns:
        pivot[c] = None
pivot = pivot[cols]

pivot.to_excel("annotation_points_summary_averaged_formatted2.xlsx", index=False)
print("Book-format summary saved as annotation_points_summary_averaged_formatted2.xlsx")

Book-format summary saved as annotation_points_summary_averaged_formatted2.xlsx


In [36]:
import numpy as np
import pandas as pd

df_book = pd.read_excel('Book2.xlsx')
df_ann = pd.read_excel('annotation_points_summary_averaged_formatted2.xlsx')

us_cols_book = [c for c in df_book.columns if c.startswith('US_')]
us_cols_ann = [c for c in df_ann.columns if c.startswith('US_')]
overlap = [c for c in us_cols_book if c in us_cols_ann]

merged = pd.merge(df_book[['Patient'] + overlap], df_ann[['Patient'] + overlap], on='Patient', how='inner', suffixes=('_book', '_ann'))

pairs_true = []
pairs_pred = []
for col in overlap:
    tb = f'{col}_book'
    ta = f'{col}_ann'
    sub = merged[[tb, ta]].dropna()
    if not sub.empty:
        pairs_true.append(sub[tb].values.astype(float))
        pairs_pred.append(sub[ta].values.astype(float))
if len(pairs_true) == 0:
    raise ValueError('No valid overlapping measurements to optimize on.')
y_true = np.concatenate(pairs_true)
x_pred = np.concatenate(pairs_pred)

ratios = np.linspace(0.001, 0.1, 200)
best_ratio = None
best_mae = float('inf')
for r in ratios:
    mae = np.mean(np.abs(y_true - x_pred * r))
    if mae < best_mae:
        best_mae = mae
        best_ratio = r

df_ann_scaled = df_ann.copy()
for col in overlap:
    if col in df_ann_scaled.columns:
        df_ann_scaled[col] = df_ann_scaled[col].astype(float) * best_ratio

merged_scaled = pd.merge(df_book[['Patient'] + overlap], df_ann_scaled[['Patient'] + overlap], on='Patient', how='inner', suffixes=('_book', '_ann'))

errs = []
rows = []
for col in overlap:
    tb = f'{col}_book'
    ta = f'{col}_ann'
    sub = merged_scaled[['Patient', tb, ta]].dropna()
    if not sub.empty:
        e = sub[ta].values.astype(float) - sub[tb].values.astype(float)
        errs.append(e)
        for i in range(len(sub)):
            rows.append((sub.iloc[i]['Patient'], col, float(sub.iloc[i][ta]), float(sub.iloc[i][tb]), float(e[i])))
if len(errs) == 0:
    raise ValueError('No valid pairs after scaling.')
errs_all = np.concatenate(errs)
mu = float(np.mean(errs_all))
sd = float(np.std(errs_all))
lower = mu - 2.0 * sd
upper = mu + 2.0 * sd

abnormal = []
for patient, col, pred_cm, book_cm, e in rows:
    if e < lower or e > upper:
        abnormal.append((patient, col, pred_cm, book_cm, e))

df_ann_scaled.to_excel('annotation_points_summary_averaged_formatted2.xlsx', index=False)

print('Optimized ratio:', round(best_ratio, 6))
print('Pairs used:', len(errs_all))
print('Error mean:', round(mu, 6))
print('Error std:', round(sd, 6))
print('Lower bound (mean-2SD):', round(lower, 6))
print('Upper bound (mean+2SD):', round(upper, 6))
print('Abnormal count:', len(abnormal))
if len(abnormal) > 0:
    head = min(50, len(abnormal))
    print('Abnormal samples (first {}):'.format(head))
    for i in range(head):
        p, c, pv, bv, e = abnormal[i]
        print('Patient:', p, '| Measure:', c, '| Pred(cm):', round(pv, 4), '| Book(cm):', round(bv, 4), '| Error:', round(e, 4))


Optimized ratio: 0.003985
Pairs used: 399
Error mean: -0.009972
Error std: 0.050526
Lower bound (mean-2SD): -0.111023
Upper bound (mean+2SD): 0.09108
Abnormal count: 13
Abnormal samples (first 13):
Patient: Abbey 003 | Measure: US_femoral_contralateral_base | Pred(cm): 0.3161 | Book(cm): 0.21 | Error: 0.1061
Patient: Abbey 011 | Measure: US_medial_contralateral_3 | Pred(cm): 0.1318 | Book(cm): 0.04 | Error: 0.0918
Patient: Abbey 009 | Measure: US_medial_treat_4 | Pred(cm): 0.2825 | Book(cm): 0.18 | Error: 0.1025
Patient: Abbey 012 | Measure: US_femoral_treat_4 | Pred(cm): 0.3678 | Book(cm): 0.16 | Error: 0.2078
Patient: Abbey 003 | Measure: US_lateral_treat_4 | Pred(cm): 0.1159 | Book(cm): 0.23 | Error: -0.1141
Patient: Abbey 009 | Measure: US_lateral_treat_4 | Pred(cm): 0.2799 | Book(cm): 0.18 | Error: 0.0999
Patient: Abbey 007 | Measure: US_femoral_treat_5 | Pred(cm): 0.1351 | Book(cm): 0.28 | Error: -0.1449
Patient: Abbey 011 | Measure: US_femoral_treat_5 | Pred(cm): 0.1703 | Book(c

## re-re-Label

In [None]:
Patient: Abbey 003 | Measure: US_femoral_contralateral_base *
Patient: Abbey 011 | Measure: US_medial_contralateral_3 *
Patient: Abbey 009 | Measure: US_medial_treat_4 *
Patient: Abbey 012 | Measure: US_femoral_treat_4 *
Patient: Abbey 003 | Measure: US_lateral_treat_4 *
Patient: Abbey 009 | Measure: US_lateral_treat_4 *
Patient: Abbey 007 | Measure: US_femoral_treat_5 *
Patient: Abbey 011 | Measure: US_femoral_treat_5 * 
Patient: Abbey 007 | Measure: US_lateral_treat_5 *
Patient: Abbey 016 | Measure: US_medial_contralateral_5 *
Patient: Abbey 003 | Measure: US_lateral_contralateral_5 EG
Patient: Abbey 005 | Measure: US_lateral_contralateral_5 *
Patient: Abbey 012 | Measure: US_lateral_contralateral_5 *

In [48]:
import os, json
import pandas as pd

json_dir = r"C:\Users\Charlotte\Desktop\dissertation\US_new\Labelled_image"
records = []

for f in os.listdir(json_dir):
    if f.endswith(".json"):
        path = os.path.join(json_dir, f)
        with open(path, encoding="utf-8") as jf:
            data = json.load(jf)
            for shape in data.get("shapes", []):
                pts = shape["points"]
                if len(pts) == 2:
                    (x1, y1), (x2, y2) = pts
                    dist = ((x1 - x2)**2 + (y1 - y2)**2)**0.5
                    records.append({
                        "Filename": f.replace(".json", ""),
                        "Label": shape.get("label", ""),
                        "x1": x1, "y1": y1,
                        "x2": x2, "y2": y2,
                        "Pixel_Distance": dist
                    })

df = pd.DataFrame(records)

out_path = r"C:\Users\Charlotte\Desktop\dissertation\US_new\annotation_points_summary2.xlsx"
df.to_excel(out_path, index=False)
print(f"{len(df)} measurement point coordinates have been extracted and saved to: {out_path}")

1374 measurement point coordinates have been extracted and saved to: C:\Users\Charlotte\Desktop\dissertation\US_new\annotation_points_summary2.xlsx


In [49]:
import pandas as pd
import re

df = pd.read_excel("annotation_points_summary2.xlsx")

def extract_info(filename, label):
    fn = str(filename)
    lb = str(label).lower()

    # Patient: Abbey_001 → Abbey 001
    patient_match = re.search(r"Abbey[_\s]*\d{1,3}", fn, re.I)
    patient = patient_match.group(0).replace("_", " ").strip() if patient_match else None

    # Stage: 匹配 base 或 v3/v4/v5
    stage_match = re.search(r"(base|v\d+)", fn, re.I)
    # Stage: 修正版
    stage = None
    if "v1" in fn.lower():
        stage = "base"
    elif "v3" in fn.lower():
        stage = "3"
    elif "v4" in fn.lower():
        stage = "4"
    elif "v5" in fn.lower():
        stage = "5"
    else:
        # 兜底匹配 base
        if "base" in fn.lower():
            stage = "base"


    # Location
    location = "contralateral" if "contralateral" in fn.lower() else ("treat" if "treat" in fn.lower() else None)

    # Part from Label
    if "medial" in lb:
        part = "medial"
    elif "femoral" in lb:
        part = "femoral"
    elif "lateral" in lb:
        part = "lateral"
    else:
        part = None

    return patient, stage, location, part

df[["Patient", "Stage", "Location", "Part"]] = df.apply(lambda x: pd.Series(extract_info(x["Filename"], x["Label"])), axis=1)

grouped = df.groupby(["Patient", "Stage", "Location", "Part"])["Pixel_Distance"].mean().reset_index()

pivot = grouped.pivot_table(index="Patient", columns=["Part","Location","Stage"], values="Pixel_Distance")
pivot.columns = [f"US_{p}_{l}_{s}" for (p,l,s) in pivot.columns]
pivot.reset_index(inplace=True)

cols = ["Patient",
    "US_medial_treat_base","US_femoral_treat_base","US_lateral_treat_base",
    "US_medial_contralateral_base","US_femoral_contralateral_base","US_lateral_contralateral_base",
    "US_medial_treat_3","US_femoral_treat_3","US_lateral_treat_3",
    "US_medial_contralateral_3","US_femoral_contralateral_3","US_lateral_contralateral_3",
    "US_medial_treat_4","US_femoral_treat_4","US_lateral_treat_4",
    "US_medial_contralateral_4","US_femoral_contralateral_4","US_lateral_contralateral_4",
    "US_medial_treat_5","US_femoral_treat_5","US_lateral_treat_5",
    "US_medial_contralateral_5","US_femoral_contralateral_5","US_lateral_contralateral_5"
]

for c in cols:
    if c not in pivot.columns:
        pivot[c] = None
pivot = pivot[cols]

pivot.to_excel("annotation_points_summary_averaged_formatted2.xlsx", index=False)
print("Book-format summary saved as annotation_points_summary_averaged_formatted2.xlsx")

Book-format summary saved as annotation_points_summary_averaged_formatted2.xlsx


In [50]:
import numpy as np
import pandas as pd

df_book = pd.read_excel('Book2.xlsx')
df_ann = pd.read_excel('annotation_points_summary_averaged_formatted2.xlsx')

us_cols_book = [c for c in df_book.columns if c.startswith('US_')]
us_cols_ann = [c for c in df_ann.columns if c.startswith('US_')]
overlap = [c for c in us_cols_book if c in us_cols_ann]

merged = pd.merge(df_book[['Patient'] + overlap], df_ann[['Patient'] + overlap], on='Patient', how='inner', suffixes=('_book', '_ann'))

pairs_true = []
pairs_pred = []
for col in overlap:
    tb = f'{col}_book'
    ta = f'{col}_ann'
    sub = merged[[tb, ta]].dropna()
    if not sub.empty:
        pairs_true.append(sub[tb].values.astype(float))
        pairs_pred.append(sub[ta].values.astype(float))
if len(pairs_true) == 0:
    raise ValueError('No valid overlapping measurements to optimize on.')
y_true = np.concatenate(pairs_true)
x_pred = np.concatenate(pairs_pred)

ratios = np.linspace(0.001, 0.1, 200)
best_ratio = None
best_mae = float('inf')
for r in ratios:
    mae = np.mean(np.abs(y_true - x_pred * r))
    if mae < best_mae:
        best_mae = mae
        best_ratio = r

df_ann_scaled = df_ann.copy()
for col in overlap:
    if col in df_ann_scaled.columns:
        df_ann_scaled[col] = df_ann_scaled[col].astype(float) * best_ratio

merged_scaled = pd.merge(df_book[['Patient'] + overlap], df_ann_scaled[['Patient'] + overlap], on='Patient', how='inner', suffixes=('_book', '_ann'))

errs = []
rows = []
for col in overlap:
    tb = f'{col}_book'
    ta = f'{col}_ann'
    sub = merged_scaled[['Patient', tb, ta]].dropna()
    if not sub.empty:
        e = sub[ta].values.astype(float) - sub[tb].values.astype(float)
        errs.append(e)
        for i in range(len(sub)):
            rows.append((sub.iloc[i]['Patient'], col, float(sub.iloc[i][ta]), float(sub.iloc[i][tb]), float(e[i])))
if len(errs) == 0:
    raise ValueError('No valid pairs after scaling.')
errs_all = np.concatenate(errs)
mu = float(np.mean(errs_all))
sd = float(np.std(errs_all))
lower = mu - 2.0 * sd
upper = mu + 2.0 * sd

abnormal = []
for patient, col, pred_cm, book_cm, e in rows:
    if e < lower or e > upper:
        abnormal.append((patient, col, pred_cm, book_cm, e))

df_ann_scaled.to_excel('annotation_points_summary_averaged_formatted2.xlsx', index=False)

print('Optimized ratio:', round(best_ratio, 6))
print('Pairs used:', len(errs_all))
print('Error mean:', round(mu, 6))
print('Error std:', round(sd, 6))
print('Lower bound (mean-2SD):', round(lower, 6))
print('Upper bound (mean+2SD):', round(upper, 6))
print('Abnormal count:', len(abnormal))
if len(abnormal) > 0:
    head = min(50, len(abnormal))
    print('Abnormal samples (first {}):'.format(head))
    for i in range(head):
        p, c, pv, bv, e = abnormal[i]
        print('Patient:', p, '| Measure:', c, '| Pred(cm):', round(pv, 4), '| Book(cm):', round(bv, 4), '| Error:', round(e, 4))


Optimized ratio: 0.003985
Pairs used: 399
Error mean: -0.011181
Error std: 0.048017
Lower bound (mean-2SD): -0.107216
Upper bound (mean+2SD): 0.084854
Abnormal count: 12
Abnormal samples (first 12):
Patient: Abbey 005 | Measure: US_medial_contralateral_base | Pred(cm): 0.2478 | Book(cm): 0.16 | Error: 0.0878
Patient: Abbey 012 | Measure: US_medial_contralateral_base | Pred(cm): 0.1977 | Book(cm): 0.11 | Error: 0.0877
Patient: Abbey 009 | Measure: US_femoral_contralateral_base | Pred(cm): 0.3002 | Book(cm): 0.21 | Error: 0.0902
Patient: Abbey 007 | Measure: US_femoral_treat_3 | Pred(cm): 0.1713 | Book(cm): 0.28 | Error: -0.1087
Patient: Abbey 007 | Measure: US_medial_contralateral_3 | Pred(cm): 0.0802 | Book(cm): 0.19 | Error: -0.1098
Patient: Abbey 009 | Measure: US_lateral_contralateral_3 | Pred(cm): 0.2352 | Book(cm): 0.15 | Error: 0.0852
Patient: Abbey 012 | Measure: US_femoral_treat_4 | Pred(cm): 0.3498 | Book(cm): 0.16 | Error: 0.1898
Patient: Abbey 018 | Measure: US_femoral_treat

In [1]:
from pathlib import Path
import re


root = Path(r"C:\Users\Charlotte\Desktop\dissertation\US_new\High_quality_images")
def normalize_filename(name: str) -> str:
    """
    将:
        'Abbey 001_v1_contralateral_base (1).jpg'
    统一成:
        'Abbey_001_v1_contralateral_base_1.jpg'
    """
    stem, _, ext = name.rpartition(".")
    ext = ext.lower() if ext else "jpg"
    s = stem.strip()
    s = re.sub(r"\s*\((\d+)\)\s*$", r"_\1", s)
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"_+", "_", s)
    s = re.sub(r"([A-Za-z]+)_0*(\d{1,3})", lambda m: f"{m.group(1)}_{int(m.group(2)):03d}", s)

    return s + "." + ext


changed = []
for p in root.glob("*.*"):
    if p.suffix.lower() not in [".jpg", ".jpeg", ".png"]:
        continue
    new_name = normalize_filename(p.name)
    if p.name != new_name:
        new_path = p.with_name(new_name)
        if new_path.exists():
            print(f"⚠️ 目标已存在，跳过: {new_path.name}")
            continue
        p.rename(new_path)
        changed.append((p.name, new_name))

print(f"✅ 已重命名 {len(changed)} 个文件:")
for old, new in changed[:10]:
    print(f" - {old}  →  {new}")
if len(changed) > 10:
    print(f" ... 共 {len(changed)} 个文件已更新。")

✅ 已重命名 1179 个文件:
 - Abbey 001_v1_contralateral_base (1).jpg  →  Abbey_001_v1_contralateral_base_001.jpg
 - Abbey 001_v1_contralateral_base (4).jpg  →  Abbey_001_v1_contralateral_base_004.jpg
 - Abbey 001_v1_treat_base (6).jpg  →  Abbey_001_v1_treat_base_006.jpg
 - Abbey 001_v1_treat_base (7).jpg  →  Abbey_001_v1_treat_base_007.jpg
 - Abbey 001_v3_contralateral_3 (1).jpg  →  Abbey_001_v3_contralateral_003_1.jpg
 - Abbey 001_v3_contralateral_3 (4).jpg  →  Abbey_001_v3_contralateral_003_4.jpg
 - Abbey 001_v3_contralateral_3 (5).jpg  →  Abbey_001_v3_contralateral_003_5.jpg
 - Abbey 001_v3_treat_3 (11).jpg  →  Abbey_001_v3_treat_003_11.jpg
 - Abbey 001_v3_treat_3 (12).jpg  →  Abbey_001_v3_treat_003_12.jpg
 - Abbey 001_v3_treat_3 (7).jpg  →  Abbey_001_v3_treat_003_7.jpg
 ... 共 1179 个文件已更新。


In [3]:
from pathlib import Path
import re

# === 设置图像文件夹路径 ===
root = Path(r"C:\Users\Charlotte\Desktop\dissertation\US_new\High_quality_images")

# === 规则：将所有中间或末尾 "_00X" → "_X"，但保留病人编号 "Abbey_001" 不变 ===
def fix_all_numbers(name: str) -> str:
    """
    移除除病人编号外的多余前导零
    e.g. Abbey_001_v4_treat_004_7 -> Abbey_001_v4_treat_4_7
    """
    stem, _, ext = name.rpartition(".")
    ext = ext.lower()

    # 保留病人编号 (第一个 "_001")
    parts = stem.split("_")
    if len(parts) > 2:
        fixed_parts = [parts[0], parts[1]]  # 前两段: Abbey, 001
        for p in parts[2:]:
            fixed_parts.append(re.sub(r"^0+(\d+)$", r"\1", p))
        new_stem = "_".join(fixed_parts)
    else:
        new_stem = re.sub(r"_0+(\d+)", r"_\1", stem)

    return new_stem + "." + ext


# === 执行批量修改 ===
changed = []
for p in root.glob("*.*"):
    if p.suffix.lower() not in [".jpg", ".jpeg", ".png"]:
        continue
    new_name = fix_all_numbers(p.name)
    if new_name != p.name:
        new_path = p.with_name(new_name)
        if new_path.exists():
            print(f"⚠️ 目标已存在，跳过: {new_path.name}")
            continue
        p.rename(new_path)
        changed.append((p.name, new_name))

print(f"✅ 已修正 {len(changed)} 个文件名:")
for old, new in changed[:10]:
    print(f" - {old}  →  {new}")
if len(changed) > 10:
    print(f" ... 共 {len(changed)} 个文件已更新。")

✅ 已修正 831 个文件名:
 - Abbey_001_v3_contralateral_003_1.jpg  →  Abbey_001_v3_contralateral_3_1.jpg
 - Abbey_001_v3_contralateral_003_4.jpg  →  Abbey_001_v3_contralateral_3_4.jpg
 - Abbey_001_v3_contralateral_003_5.jpg  →  Abbey_001_v3_contralateral_3_5.jpg
 - Abbey_001_v3_treat_003_11.jpg  →  Abbey_001_v3_treat_3_11.jpg
 - Abbey_001_v3_treat_003_12.jpg  →  Abbey_001_v3_treat_3_12.jpg
 - Abbey_001_v3_treat_003_7.jpg  →  Abbey_001_v3_treat_3_7.jpg
 - Abbey_001_v3_treat_003_8.jpg  →  Abbey_001_v3_treat_3_8.jpg
 - Abbey_001_v4_contralateral_004_1.jpg  →  Abbey_001_v4_contralateral_4_1.jpg
 - Abbey_001_v4_contralateral_004_2.jpg  →  Abbey_001_v4_contralateral_4_2.jpg
 - Abbey_001_v4_contralateral_004_4.jpg  →  Abbey_001_v4_contralateral_4_4.jpg
 ... 共 831 个文件已更新。
