In [None]:
import pandas as pd
import os

# CheXpert

In [None]:
# Directory prefix for every CheXpert file path used below, read from the
# BASE_DIR_CHEXPERT environment variable. os.environ[...] raises KeyError
# if the variable is not set, so the notebook stops here rather than later.
BASE_DIR_CHEXPERT = os.environ['BASE_DIR_CHEXPERT']

In [None]:
# Image-level CheXpert table. The "Path" column is renamed to lower-case
# "path" right after loading so later cells can refer to `data_df.path`.
data_df = pd.read_csv(
    f'{BASE_DIR_CHEXPERT}/chexpert_batch_1_valid_and_csv/train.csv'
).rename(columns={"Path": "path"})

# Demographic table that carries the race labels joined onto `data_df` later.
demo_df = pd.read_csv(
    f'{BASE_DIR_CHEXPERT}/chexpert_batch_1_valid_and_csv/chexpert_race_labels.csv'
)

In [None]:
# The patient identifier is taken as the third "/"-separated component of
# each image path (column 2 of the expanded split).
path_split = data_df.path.str.split("/", expand=True)
data_df["patient_id"] = path_split[2]

# Attach the demographic columns; merge() defaults to an inner join, so images
# whose patient_id has no row in demo_df are dropped.
demo_df = demo_df.rename(columns={'PATIENT': 'patient_id'})
data_df = data_df.merge(demo_df, on="patient_id")

# Collapse the free-text PRIMARY_RACE column into coarse labels by substring
# match. The pairs are applied in order, so when a value matches more than one
# keyword the LAST matching pair wins. Rows that match none of the keywords
# (including missing PRIMARY_RACE, via na=False) are left without a race label.
RACE_LABEL_BY_KEYWORD = (
    ("Black", "BLACK/AFRICAN AMERICAN"),
    ("White", "WHITE"),
    ("Asian", "ASIAN"),
    ("Other", "OTHER"),
)
for keyword, label in RACE_LABEL_BY_KEYWORD:
    mask = data_df.PRIMARY_RACE.str.contains(keyword, na=False)
    data_df.loc[mask, "race"] = label

In [None]:
# Persist the CheXpert table with the derived patient_id / race columns.
# NOTE(review): no `index=False` here, so the DataFrame index is written as an
# extra leading column (the MIMIC export at the end of the notebook omits it).
# Left unchanged in case downstream readers rely on that column — confirm.
chexpert_out_path = f"{BASE_DIR_CHEXPERT}/chexpert_batch_1_valid_and_csv/train_with_race_labels.csv"
data_df.to_csv(chexpert_out_path)

# MIMIC-CXR

In [None]:
# Directory prefix for every MIMIC file path read or written below, taken from
# the BASE_DIR_MIMIC environment variable. os.environ[...] raises KeyError if
# the variable is not set.
BASE_DIR_MIMIC = os.environ['BASE_DIR_MIMIC']

In [None]:
# Image-level metadata (merged on subject_id / study_id further down).
metadata_df = pd.read_csv(f'{BASE_DIR_MIMIC}/mimic-cxr-2.0.0-metadata.csv')

# Admissions and patients tables, each reduced to the FIRST row per subject_id
# so the later merges on subject_id cannot multiply image rows.
# NOTE: after this point `demographic_df` holds at most one admission per
# subject, so it cannot be used to compare values across a subject's admissions.
demographic_df = (
    pd.read_csv(f'{BASE_DIR_MIMIC}/admissions.csv')
    .drop_duplicates(subset='subject_id')
)
patients = (
    pd.read_csv(f'{BASE_DIR_MIMIC}/patients.csv')
    .drop_duplicates(subset='subject_id')
)

# Per-study label table (merged on subject_id + study_id further down).
chexpert_labels = pd.read_csv(f'{BASE_DIR_MIMIC}/mimic-cxr-2.0.0-chexpert.csv')

In [None]:
# Find patients who have inconsistent documented race information so that
# they can be removed from the cohort in a later cell.
# credit to github.com/robintibor
#
# The check must see EVERY admission of a subject. `demographic_df` has already
# been reduced to one row per subject_id when it was loaded, so deriving the
# (subject_id, race) pairs from it can never yield a subject with more than one
# race value: `inconsistent_race` would always be empty and the exclusion
# would silently do nothing. The raw admissions table is therefore re-read
# here, restricted to the two columns needed to keep the extra read cheap.
all_admissions_df = pd.read_csv(
    f'{BASE_DIR_MIMIC}/admissions.csv',
    usecols=['subject_id', 'race'],
)
ethnicity_df = all_admissions_df.loc[:, ['subject_id', 'race']].drop_duplicates()

# After de-duplicating (subject_id, race) pairs, any subject_id that still
# occurs more than once has at least two different race values on record.
v = ethnicity_df.subject_id.value_counts()
subject_id_more_than_once = v.index[v.gt(1)]

ambiguous_ethnicity_df = ethnicity_df[ethnicity_df.subject_id.isin(subject_id_more_than_once)]
inconsistent_race = ambiguous_ethnicity_df.subject_id.unique()

# Cell output: how often each combination of conflicting values occurs.
# Values are converted to str before sorting so that a missing race (NaN)
# next to a string value does not make sorted() raise a TypeError.
grouped = ambiguous_ethnicity_df.groupby('subject_id')
grouped.aggregate(lambda x: "_".join(sorted(map(str, x)))).race.value_counts()

In [None]:
# Join image metadata with the admissions, patients and label tables.
# merge() defaults to an inner join, so only rows with a match in every
# table survive.
merge_df = (
    metadata_df
    .merge(demographic_df, on='subject_id')
    .merge(patients, on='subject_id')
    .merge(chexpert_labels, on=['subject_id', 'study_id'])
)

# Drop every subject listed in `inconsistent_race`, then normalise column
# names. rename() ignores keys that are not present, so the "ethnicity" entry
# only has an effect when the admissions table uses that older column name.
consistent_subject = ~merge_df.subject_id.isin(inconsistent_race)
merge_df = merge_df.loc[consistent_subject].rename(
    columns={"ethnicity": "race", "anchor_age": "age"}
)

# Restrict the cohort to the two race groups and the three view positions
# listed below (exact string matches).
merge_df = merge_df.loc[merge_df.race.isin(['BLACK/AFRICAN AMERICAN','WHITE'])]
merge_df = merge_df.loc[merge_df.ViewPosition.isin(['AP','PA', 'LATERAL'])]

In [None]:
# Keep the first remaining row per subject, turn the two id columns into
# strings (needed for the string concatenation below), and replace every
# remaining missing value in any column with 0.
# (A former `.query("dicom_id in @dicom_ids")` restriction is disabled.)
data_df = (
    merge_df
    .drop_duplicates(subset=['subject_id'])
    .astype({"subject_id": str, "study_id": str})
    .fillna(0)
)

# Relative image path of the form
#   p<first two chars of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.jpg
# placed as the third column (position 2) of the frame.
image_path = (
    "p" + data_df.subject_id.str[0:2]
    + "/p" + data_df.subject_id
    + "/s" + data_df.study_id
    + "/" + data_df.dicom_id
    + ".jpg"
)
data_df.insert(2, "path", image_path)

In [None]:
# Write one image path per line, each prefixed with "files/", with neither a
# header row nor the index.
# NOTE(review): the destination is a hard-coded absolute path instead of being
# derived from BASE_DIR_MIMIC, so this cell only works on the machine it was
# written for — confirm the intended location before making it configurable.
jpg_list_path = "/projects/leelab3/sgadgil/mimic_cxr_jpg/new_test_set_jpgs.txt"
("files/" + data_df['path']).to_csv(jpg_list_path, header=None, index=None)

In [None]:
# Persist the final per-subject MIMIC table (index column not written).
mimic_out_path = f"{BASE_DIR_MIMIC}/metadata.csv"
data_df.to_csv(mimic_out_path, index=False)