In [None]:
import pandas as pd
import os

# Load each study's de-identified master list, keyed by RTOG study number.
# Column names are lower-cased on load, and a 'cn_deid' column (where present)
# is renamed to 'cn_deidentified' so that every study exposes the case number
# under the same column name.
master_list_files = {
    '9910' : "9910-Master-deid-SF-8.12.21QA.xlsx",
    '9202' : "9202-Master-Deid-SF-Upload-8.12.2021.xlsx",
    '9408' : "9408-Master-Deid-SF-8.12.21.xlsx",
    '9413' : "9413-Master-De-id List_SF-8.12.2021.xlsx",
    '0126' : "0126 Master Export-De-id-SF-8.12.2021.xlsx",
}
sandy = {}
for study_number, workbook in master_list_files.items():
    frame = pd.read_excel(workbook).rename(columns=str.lower)
    sandy[study_number] = frame.rename(columns={'cn_deid' : 'cn_deidentified'})

# Per study, compare the case numbers listed in the master list ("sandy")
# against the case numbers that have a quilt feature file on disk.
quilt_dir_template = "/export/medical_ai/ucsf/ssl_rtog/moco/model_R50_b=256_lr=0.03_pg4plus_fulldata/features/RTOG-{}_quilts/"

stats_rows = []
stats_details = {}
for sn in ['9202', '9408', '9413', '0126', '9910']:
    # Drop NaN before building the set: a blank case number is not a case and
    # would otherwise be counted as one that is "missing" from the quilts.
    cn_sandy = set(sandy[sn]['cn_deidentified'].dropna().unique())
    # The text before the first "_" in each quilt file name is parsed as a
    # float so it can be set-differenced against the master-list values
    # (assumed to be numeric -- TODO confirm for every study).
    cn_filesystem_quilts = {
        float(f.split("_")[0]) for f in os.listdir(quilt_dir_template.format(sn))
    }
    stats_rows.append((sn,
                       len(cn_sandy),
                       len(cn_filesystem_quilts),
                       len(cn_sandy - cn_filesystem_quilts),
                      ))
    stats_details[sn] = {'cn_sandy' : cn_sandy,
                         'cn_filesystem_quilts' : cn_filesystem_quilts,
                        }

count_columns = ['cn_sandy',
                 'cn_filesystem_quilts',
                 'sandy - filesystem_quilts',
                ]
stats = pd.DataFrame(stats_rows, columns=['sn'] + count_columns)
# DataFrame.append was removed in pandas 2.0, so the totals row is added with
# pd.concat. Only the count columns are summed; summing 'sn' would concatenate
# the study-number strings into a meaningless label.
totals = pd.DataFrame([{'sn': 'total', **stats[count_columns].sum().to_dict()}])
stats = pd.concat([stats, totals], ignore_index=True)
display(stats)
# Walk through one concrete discrepancy in study 9202: list the case numbers
# that are in the master list but have no quilt, show the master-list rows for
# case 17, then check two specific files on disk.
missing_9202 = stats_details['9202']['cn_sandy'] - stats_details['9202']['cn_filesystem_quilts']
print("Example missing Quilts from 9202:")
print(set(missing_9202))
print()
print("Consider case 17:")
master_9202 = sandy['9202']
display(master_9202[master_9202['cn_deidentified'] == 17.0])
print()
print("It's image_ids are in the feature pickles, but its cn_deidentified is not:")
base_path = "/export/medical_ai/ucsf/ssl_rtog/moco/model_R50_b=256_lr=0.03_pg4plus_fulldata/features/"
# First path: a per-image feature pickle (99653 is presumably one of case 17's
# image ids -- verify against the table displayed above).
# Second path: the quilt pickle for case 17.
for relative_path in ("RTOG-9202_features/99653.pkl",
                      "RTOG-9202_quilts/17_quilt_feature.pkl"):
    full_path = base_path + relative_path
    print(full_path)
    print(os.path.exists(full_path))
print()

# Hypothesis: Sandy's list is in our features, but not in our quilts.
# For every case number that is in the master list but has no quilt, look up
# its image ids and record each per-image feature pickle that is absent on disk.
print("Hypothesis: Sandy's list is in our features, but not in our quilts")
feature_path = "/export/medical_ai/ucsf/ssl_rtog/moco/model_R50_b=256_lr=0.03_pg4plus_fulldata/features/RTOG-{}_features/{:0.0f}.pkl"
missing_features = []
for sn in ['9202', '9408', '9413', '0126', '9910']:
    master = sandy[sn]
    sandy_minus_quilts = set(stats_details[sn]['cn_sandy'] - stats_details[sn]['cn_filesystem_quilts'])
    for cn_deid in sandy_minus_quilts:
        # All image ids recorded for this case number in the master list.
        image_ids = master.loc[master['cn_deidentified'] == cn_deid, 'image id'].values
        candidate_paths = (feature_path.format(sn, iid) for iid in image_ids)
        missing_features.extend(fp for fp in candidate_paths if not os.path.exists(fp))
print("{} feature pickles are missing from {}".format(len(missing_features), feature_path))

# For every case number that is in the master list but was not found when
# listing the quilt directory, check directly whether its quilt pickle exists.
# Quilt files are named "<case number>_quilt_feature.pkl": this matches the
# RTOG-9202 example path used in this notebook and the "_" split applied to
# the quilt directory listing. A bare "<case number>.pkl" would never match.
feature_quilt_path = "/export/medical_ai/ucsf/ssl_rtog/moco/model_R50_b=256_lr=0.03_pg4plus_fulldata/features/RTOG-{}_quilts/{:0.0f}_quilt_feature.pkl"
missing_quilts = []
for sn in ['9202', '9408', '9413', '0126', '9910']:
    sandy_minus_quilts = set(stats_details[sn]['cn_sandy'] - stats_details[sn]['cn_filesystem_quilts'])
    for cn_deid in sandy_minus_quilts:
        # Format the *quilt* template here. The per-image feature template
        # points at the RTOG-<sn>_features directory, which is keyed by image
        # id rather than case number.
        fp = feature_quilt_path.format(sn, cn_deid)
        if not os.path.exists(fp):
            missing_quilts.append(fp)
print("{} feature quilts are missing from {}".format(len(missing_quilts), feature_quilt_path))

In [None]:
# Preview the 9202 master list with image ids rendered as feature-pickle file
# names ("<id>.pkl") and case numbers rendered as integer strings.
#
# The conversion is done on a copy. Mutating sandy["9202"] in place makes this
# cell fail when re-run (int("12345.pkl") raises ValueError) and breaks the
# export loop in the following cell, which applies the same int() conversion
# to these columns.
# Rows without a case number are dropped first because int(NaN) raises.
df = sandy["9202"].dropna(subset=["cn_deidentified"]).copy()
df["image id"] = df["image id"].apply(lambda x: str(int(x))+".pkl")
df["cn_deidentified"] = df["cn_deidentified"].apply(lambda x: str(int(x)))
df

In [None]:
# Export one cleaned master list per study to CSV.
# For each study: rename 'image id' to 'image_id', drop rows without a case
# number, render image ids as "<id>.pkl" and case numbers as integer strings,
# and keep only the last row for each image id.
for key in list(sandy.keys()):
    df = (
        sandy[key]
        .rename(columns={'image id': 'image_id'})
        .dropna(subset=["cn_deidentified"])
        # assign() returns a new frame, so no column is set on the filtered
        # result of dropna(); that avoids pandas' SettingWithCopyWarning.
        .assign(
            image_id=lambda d: d["image_id"].apply(lambda x: str(int(x))+".pkl"),
            cn_deidentified=lambda d: d["cn_deidentified"].apply(lambda x: str(int(x))),
        )
        .drop_duplicates(subset='image_id', keep="last")
    )
    df.to_csv("/export/home/rtog_dfs/master_lists/{}.csv".format(key))
# Display the last study processed as a spot check of the exported format.
df