In [30]:
import pandas as pd

# Skull-stripping method folders to inspect, and the patient-number
# prefixes whose files we want to count in each folder's annotation table.
methods = ['v1', 'robust', 'synthstrip', 'hdctbet', 'ctbet', 'brainchop', 'dockerctbet']
prefixes = ("6046", "6084", "6096", "6246", "6315", "6342", "6499")

for method in methods:
    annotations = pd.read_csv(f"/Users/rushil/ichseg/{method}/annotations.csv")
    # Boolean mask: True where the filename begins with any listed prefix
    # (str.startswith accepts a tuple of alternatives).
    has_prefix = annotations['Filename'].str.startswith(prefixes)
    cnt = has_prefix.sum()
    print(f"{method}: {cnt} files match the prefixes")

v1: 47 files match the prefixes
robust: 47 files match the prefixes
synthstrip: 47 files match the prefixes
hdctbet: 47 files match the prefixes
ctbet: 22 files match the prefixes
brainchop: 47 files match the prefixes
dockerctbet: 47 files match the prefixes


In [31]:
import pandas as pd
# Load the header table. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which avoids the
# mixed-type warning this read otherwise appears to emit (presumably a
# DtypeWarning -- confirm against the full stderr output).
result = pd.read_csv("/Users/rushil/Downloads/header_data.csv", low_memory=False)

# Patient numbers to leave out of the comparison.
exclude = [6046, 6084, 6096, 6246, 6315, 6342, 6499]
# .copy() so the column assignment below writes to an independent frame
# rather than to a filtered slice of `result`.
out = result[~result['id_patient_short'].isin(exclude)].copy()

# First "<digits>-<digits>" token found in the series directory path.
# expand=False returns a Series (not a one-column DataFrame) for the single
# capture group.
out['patient_id'] = out['dir_series'].str.extract(r'(\d+-\d+)', expand=False)

# Number of header rows per patient_id, largest first. Rows whose patient_id
# could not be extracted (NaN) are dropped by groupby.
counts_df = (
    out
    .groupby('patient_id')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
counts_df



  result = pd.read_csv("/Users/rushil/Downloads/header_data.csv")


Unnamed: 0,patient_id,count
349,6356-154,19
336,6343-214,19
387,6394-143,18
393,6400-223,17
439,6446-280,17
...,...,...
305,6310-320,4
4,6005-230,4
287,6292-216,3
48,6050-296,3


In [32]:
import pandas as pd

# 1) Load the annotations table for the "robust" method.
df = pd.read_csv("/Users/rushil/ichseg/robust/annotations.csv")

# 2) Pull out the leading run of digits of each filename as an integer
#    (e.g. 6022 from "6022-240_...").
#    NOTE: astype(int) raises if any Filename does not start with a digit,
#    because the extract result is then missing for that row.
df["id_patient_short"] = (
    df["Filename"].str.extract(r"^(\d+)", expand=False)
      .astype(int)
)

# 3) Drop the unwanted patients. .copy() gives an independent frame so the
#    column assignment in step 4 does not write to a filtered slice of the
#    original (the pattern that triggers pandas' SettingWithCopyWarning).
exclude = [6046, 6084, 6096, 6246, 6315, 6342, 6499]
df = df[~df["id_patient_short"].isin(exclude)].copy()

# 4) Extract the leading "<digits>-<digits>" ID and count rows per ID.
df["patient_id"] = df["Filename"].str.extract(r"^(\d+-\d+)", expand=False)
counts = df["patient_id"].value_counts().reset_index()
# Name the columns explicitly; the names reset_index() produces after
# value_counts() differ between pandas versions.
counts.columns = ["patient_id", "count"]

counts


Unnamed: 0,patient_id,count
0,6356-154,19
1,6343-214,19
2,6394-143,18
3,6400-223,17
4,6446-280,17
...,...,...
494,6005-230,4
495,6310-320,4
496,6050-296,3
497,6292-216,3


In [33]:
import pandas as pd

# Inputs expected to exist already in the kernel:
#   counts_df -> columns ['patient_id', 'count'], built from header_data.csv
#   counts    -> columns ['patient_id', 'count'], built from annotations.csv

# Give each side its own count column name so the merge keeps both.
hdr = counts_df.rename(columns={'count': 'hdr_count'})
ann = counts.rename(columns={'count': 'ann_count'})

# Outer join keeps IDs present on only one side; their missing count
# becomes 0, after which both count columns are cast back to integers.
cmp = (
    hdr
    .merge(ann, on='patient_id', how='outer')
    .fillna(0)
    .astype({'hdr_count': int, 'ann_count': int})
)

# Keep only the IDs whose two counts disagree.
counts_differ = cmp['hdr_count'].ne(cmp['ann_count'])
mismatch = cmp.loc[counts_differ]

print("These patient_ids have different counts (header vs annotation):")
print(mismatch)


These patient_ids have different counts (header vs annotation):
    patient_id  hdr_count  ann_count
72    6109-317         13         12
77    6480-154         13         12
105   6193-324         12         11
183   6470-296         11         10
246   6142-308         10          9
318   6418-193          9          8
418   6257-335          8          7


In [34]:
import pandas as pd

# 1) Load both tables.
hdr = pd.read_csv("/Users/rushil/Downloads/header_data.csv", low_memory=False)
ann = pd.read_csv("/Users/rushil/ichseg/robust/annotations.csv")

# 2) Derive a comparable `patient_id` and `base_file` in each table.
hdr["patient_id"] = hdr["dir_series"].str.extract(r"(\d+-\d+)", expand=False)
# Last path component of file_nifti with ".nii.gz" swapped for ".png".
# Rows whose file_nifti does not match stay NaN; casting to str first would
# turn them into the bogus filename "nan.png".
hdr["base_file"] = (
    hdr["file_nifti"].str.extract(r"([^/]+)\.nii\.gz$", expand=False)
    + ".png"
)

ann["patient_id"] = ann["Filename"].str.extract(r"^(\d+-\d+)", expand=False)
ann["base_file"]  = ann["Filename"]

# 3) The 7 patient IDs whose header and annotation counts differ.
mismatch_ids = ["6109-317","6480-154","6193-324","6470-296",
                "6142-308","6418-193","6257-335"]

# 4) Compute "header minus annotation" filenames for each ID.
#    dropna() keeps missing names out of the sets so they are neither
#    reported as files nor break sorted() (NaN does not compare with str).
missing = {}
for pid in mismatch_ids:
    hdr_set = set(hdr.loc[hdr["patient_id"] == pid, "base_file"].dropna())
    ann_set = set(ann.loc[ann["patient_id"] == pid, "base_file"].dropna())
    missing[pid] = sorted(hdr_set - ann_set)

# 5) Display results.
for pid, files in missing.items():
    print(f"{pid}: {len(files)} missing → {files}")


6109-317: 1 missing → ['6109-317_20150302_0647_ct.png']
6480-154: 1 missing → ['6480-154_20170622_0937_ct.png']
6193-324: 1 missing → ['6193-324_20150924_1431_ct.png']
6470-296: 1 missing → ['6470-296_20170602_0607_ct.png']
6142-308: 1 missing → ['6142-308_20150610_0707_ct.png']
6418-193: 1 missing → ['6418-193_20161228_1248_ct.png']
6257-335: 1 missing → ['6257-335_20160118_1150_ct.png']


In [38]:
# Header filenames that have no counterpart in the annotations table;
# these rows are dropped from the header frame below.
missing_files_to_remove = [
    '6109-317_20150302_0647_ct.png',
    '6480-154_20170622_0937_ct.png',
    '6193-324_20150924_1431_ct.png',
    '6470-296_20170602_0607_ct.png',
    '6142-308_20150610_0707_ct.png',
    '6418-193_20161228_1248_ct.png',
    '6257-335_20160118_1150_ct.png'
]

# Columns shown when listing one patient's header rows, and the banner
# printed between report sections.
shown_cols = ["patient_id", "base_file", "dir_series"]
banner = "\n" + "="*50

# Section 1: header rows for each affected patient before anything is removed.
print("Before removal:")
for pid in mismatch_ids:
    print(f"\n{pid} - Header data:")
    print(hdr.loc[hdr["patient_id"] == pid, shown_cols])

# Keep every header row whose base_file is not in the removal list.
keep_row = ~hdr["base_file"].isin(missing_files_to_remove)
hdr_cleaned = hdr.loc[keep_row].copy()

# Section 2: the same listing on the cleaned frame, with a row count.
print(banner)
print("After removal:")
for pid in mismatch_ids:
    print(f"\n{pid} - Header data (cleaned):")
    remaining_rows = hdr_cleaned.loc[hdr_cleaned["patient_id"] == pid, shown_cols]
    print(remaining_rows)
    print(f"Count: {len(remaining_rows)}")

# Section 3: per-patient row counts in the cleaned header vs the annotations.
print(banner)
print("Verification - counts should now match:")
for pid in mismatch_ids:
    hdr_count = int((hdr_cleaned["patient_id"] == pid).sum())
    ann_count = int((ann["patient_id"] == pid).sum())
    print(f"{pid}: Header={hdr_count}, Annotations={ann_count}, Match={hdr_count == ann_count}")

# Total number of rows left in the cleaned header frame (cell output).
len(hdr_cleaned)

Before removal:

6109-317 - Header data:
     patient_id                      base_file  \
1097   6109-317  6109-317_20150202_1547_ct.png   
1098   6109-317  6109-317_20150202_2027_ct.png   
1099   6109-317  6109-317_20150203_0511_ct.png   
1100   6109-317  6109-317_20150204_0520_ct.png   
1101   6109-317  6109-317_20150204_1748_ct.png   
1102   6109-317  6109-317_20150205_1923_ct.png   
1103   6109-317  6109-317_20150206_0528_ct.png   
1104   6109-317  6109-317_20150207_0535_ct.png   
1105   6109-317  6109-317_20150208_0812_ct.png   
1106   6109-317  6109-317_20150208_1637_ct.png   
1107   6109-317  6109-317_20150209_1218_ct.png   
1108   6109-317  6109-317_20150210_0547_ct.png   
1109   6109-317  6109-317_20150302_0647_ct.png   

                                                                                                                   dir_series  
1097            /Users/johnmuschelli/Desktop/mistie_3/data/dicom/6109-317/6109-317_CT_20150202_1547/6109-317_CT_20150202_1547  
10

5109

In [40]:
# Write the cleaned header table to disk without the DataFrame index column.
cleaned_header_path = "/Users/rushil/ichseg/robust/header_data_cleaned.csv"
hdr_cleaned.to_csv(cleaned_header_path, index=False)