In [48]:
import pandas as pd
import numpy as np

from src.config import METADATA_PATH, SPLIT_PATH, NEGIBOX_PATH, CHEXPERT_PATH

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
# Read the split table first, then the image metadata table, each from its
# configured CSV path.
df_split, df_metadata = (
    pd.read_csv(csv_path) for csv_path in (SPLIT_PATH, METADATA_PATH)
)

In [50]:
# Collapse the raw ViewPosition codes into three coarse view groups
# ('frontal', 'lateral', 'other') and store the result in a new 'view' column.
# Codes that are not keys of VIEW_MAP come out of Series.map as NaN.
VIEW_MAP = {
    **dict.fromkeys(['AP', 'PA'], 'frontal'),
    **dict.fromkeys(['LATERAL', 'LL'], 'lateral'),
    **dict.fromkeys(['LPO', 'RAO', 'RPO', 'LAO'], 'other'),
    # the codes below are overwritten in some instances by manual review
    **dict.fromkeys(
        [
            'AP AXIAL',
            'XTABLE LATERAL',
            'AP LLD',
            'PA LLD',
            'L5 S1',
            'SWIMMERS',
            'AP RLD',
            'PA RLD',
        ],
        'other',
    ),
}
df_metadata['view'] = df_metadata['ViewPosition'].map(VIEW_MAP)

In [51]:
# Read the two label tables from their configured CSV paths (nb first, then cx).
# NOTE(review): presumably these are the outputs of two different report
# labelers -- confirm against src.config.
nb, cx = (
    pd.read_csv(csv_path) for csv_path in (NEGIBOX_PATH, CHEXPERT_PATH)
)

In [52]:
# Share of frontal-view images that falls into each split value
# (missing split values are counted as their own category).
(
    df_metadata
    .loc[df_metadata['view'] == 'frontal']
    .merge(df_split, on='dicom_id')
    ['split']
    .value_counts(dropna=False, normalize=True)
)

train       0.977964
test        0.013985
validate    0.008051
Name: split, dtype: float64

In [53]:
# Display the nb label table: one row per (subject_id, study_id) with one
# column per finding. pandas truncates the rendering to the first and last rows.
nb

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227822,19999442,58708861,,,,,,,,,1.0,,,,,1.0
227823,19999733,57132437,,,,,,,,,1.0,,,,,
227824,19999987,55368167,1.0,-1.0,,,,,0.0,,,0.0,,,0.0,
227825,19999987,58621812,1.0,,,,,,,,,,,,,1.0


In [57]:
# Build one row per frontal image carrying its study-level 'Pleural Effusion'
# label and its split assignment, binarise the label, and count images per
# (split, binary label).
metadata_cols = ['dicom_id', 'subject_id', 'study_id', 'view']
nb_cols = ['Pleural Effusion', 'subject_id', 'study_id']
split_cols = ['dicom_id', 'split']

df = (
    df_metadata[metadata_cols]
    # attach the study-level label to every image of that study
    .merge(nb[nb_cols], on=['subject_id', 'study_id'])
    .drop(columns=['subject_id', 'study_id'])
    # attach the split assignment of each image
    .merge(df_split[split_cols], on='dicom_id')
)
# Take an explicit copy of the filtered rows: the column assignments below
# would otherwise write to a slice of the merged frame (SettingWithCopyWarning).
df = df[df.view == 'frontal'].copy()

# Missing labels are replaced by the sentinel 'na' so that they can be looked
# up in the mapper (NaN itself is not a usable dictionary key here).
df['Pleural Effusion'] = df['Pleural Effusion'].fillna('na')
# Only an explicit 1 counts as the positive class; -1, 0 and missing labels
# are all folded into class 0.
mapper = {
    -1   : 0,
    'na' : 0,
    0    : 0,
    1    : 1
}
df['Pleural Effusion_mapped'] = df['Pleural Effusion'].map(mapper)
# A value outside the mapper would silently become NaN and then vanish from
# the grouped counts below, so fail loudly instead.
assert df['Pleural Effusion_mapped'].notna().all(), \
    "unexpected 'Pleural Effusion' value not covered by mapper"

# Select the column before aggregating so that the cell's value is the
# per-group Series of image counts.
df.groupby(['split', 'Pleural Effusion_mapped'])['dicom_id'].count()

split     Pleural Effusion_mapped
test      0                            2327
          1                            1076
train     0                          182839
          1                           55123
validate  0                            1461
          1                             498
Name: dicom_id, dtype: int64

In [69]:
# Keep only the rows assigned to the training split, then list the distinct
# binary label values that occur in it.
train_data = df.loc[df['split'] == 'train']
train_data['Pleural Effusion_mapped'].unique()

array([0, 1], dtype=int64)

In [77]:
from src.data.loaders import create_data_folder_for_model

# Call the folder-creation helper once per label value, passing only the
# training images that carry that label. Previously the full list of training
# dicom_ids was passed on every iteration, so each label received every image.
# NOTE(review): presumably the helper places the given images under a folder
# named after `label` -- confirm against src.data.loaders.
for label in train_data['Pleural Effusion_mapped'].unique():
    label_ids = train_data.loc[
        train_data['Pleural Effusion_mapped'] == label, 'dicom_id'
    ]
    create_data_folder_for_model(list(label_ids), type='train', label=str(label))

{'02aa804e-bde0afdd-112c0b34-7bc16630-4e384014': 'C:/Users/itama/Google Drive/github/MAE-MIMIC-CXR/data/mimic-cxr/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg', '2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab': 'C:/Users/itama/Google Drive/github/MAE-MIMIC-CXR/data/mimic-cxr/files/p10/p10000032/s53189527/2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab.jpg', '68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714': 'C:/Users/itama/Google Drive/github/MAE-MIMIC-CXR/data/mimic-cxr/files/p10/p10000032/s53911762/68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714.jpg', 'fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818': 'C:/Users/itama/Google Drive/github/MAE-MIMIC-CXR/data/mimic-cxr/files/p10/p10000032/s53911762/fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818.jpg', 'ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c': 'C:/Users/itama/Google Drive/github/MAE-MIMIC-CXR/data/mimic-cxr/files/p10/p10000032/s56699142/ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c.jpg', '096052b7-d256dc40-453a102b-f