# ChestX-ray14 CSV

This notebook generates `chestxray14.csv` and `chestxray14_nf.csv` assuming the following are downloaded:

* [ChestX-ray14 Images and metadata](https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/36938765345)


In [1]:
from os.path import join

import pandas as pd

from common import read_toml
from common import AGE_INTERVAL
from common import CHESTXRAY14_PATHOLOGIES as PATHOLOGIES

# When True, the metadata table is reduced to one row per 'Patient ID'
# (the first one) before any filtering or label encoding takes place.
UNIQUE_STUDIES = False

### Listing files

In [2]:
# Dataset key: selects this dataset's directory entry in the config and
# prefixes the names of the CSV files written at the end of the notebook.
ds_name = 'chestxray14'
config = read_toml('config.toml')
# Directory the generated CSV files are written to.
metachest_dir = config['metachest_dir']
# Directory holding the downloaded metadata CSV (its contents are listed below).
base_dir = config[f'{ds_name}_dir']
!ls -hs1 {base_dir}

total 42G
8.6M Data_Entry_2017_v2020.csv
3.9M images
1.9G images_01.tar.gz
3.7G images_02.tar.gz
3.7G images_03.tar.gz
3.6G images_04.tar.gz
3.7G images_05.tar.gz
3.8G images_06.tar.gz
3.8G images_07.tar.gz
3.8G images_08.tar.gz
3.9G images_09.tar.gz
3.9G images_10.tar.gz
3.9G images_11.tar.gz
2.8G images_12.tar.gz


### Generating CSV

In [None]:
# Load the metadata table shipped with the dataset: one row per image, with
# its '|'-separated finding labels and the patient's age, gender and view.
data_csv_path = join(base_dir, 'Data_Entry_2017_v2020.csv')
data_df = pd.read_csv(data_csv_path)
data_df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168
...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,38,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,No Finding,0,30802,28,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,No Finding,0,30804,29,F,PA,2048,2500,0.168,0.168


In [4]:
if UNIQUE_STUDIES:
    # Keep each patient's first row exactly as it appears in the table.
    # `groupby('Patient ID').first()` is deliberately not used: it returns the
    # first *non-null* value of every column (so it can combine values coming
    # from different rows) and it moves 'Patient ID' from the columns into the
    # index. `drop_duplicates` keeps whole rows and leaves the columns intact.
    # A bare `data_df` expression inside this `if` would not be rendered by the
    # notebook, so nothing is displayed here.
    data_df = data_df.drop_duplicates(subset='Patient ID', keep='first')

Filtering, formatting and renaming columns:

In [None]:
# Short column names used for the rest of the notebook.
column_map = {
    'Image Index': 'name',
    'Finding Labels': 'labels',
    'Patient Age': 'age',
    'Patient Gender': 'sex',
    'View Position': 'view',
}
renamed_df = data_df.rename(columns=column_map)
print('Available views: ', renamed_df['view'].unique())

# Rows whose age lies inside AGE_INTERVAL (both ends included) are kept.
# 'no finding' rows are NOT dropped here; they are split off further down.
age_in_range = renamed_df['age'].between(*AGE_INTERVAL, inclusive='both')

new_data_df = (
    renamed_df
    .loc[age_in_range, ['name', 'age', 'sex', 'view', 'labels']]
    .reset_index(drop=True)
    .assign(
        # Drop the last four characters of the file name (the '.png' suffix).
        name=lambda frame: frame['name'].str[:-4],
        # Lower-case every categorical/text column.
        sex=lambda frame: frame['sex'].str.lower(),
        view=lambda frame: frame['view'].str.lower(),
        labels=lambda frame: frame['labels'].str.lower(),
    )
)
new_data_df

Available views:  ['PA' 'AP']


Unnamed: 0,name,age,sex,view,labels
0,00000001_000,57,m,pa,cardiomegaly
1,00000001_001,58,m,pa,cardiomegaly|emphysema
2,00000001_002,58,m,pa,cardiomegaly|effusion
3,00000002_000,80,m,pa,no finding
4,00000003_001,74,f,pa,hernia
...,...,...,...,...,...
109746,00030801_001,38,m,pa,mass|pneumonia
109747,00030802_000,28,m,pa,no finding
109748,00030803_000,42,f,pa,no finding
109749,00030804_000,29,f,pa,no finding


Build labels dataframe:

In [6]:
# Multi-hot encode the '|'-separated `labels` string: one 0/1 column per
# pathology in PATHOLOGIES.
labels_col = new_data_df['labels']
labels = {
    # regex=False matches the pathology name literally instead of compiling
    # it as a regular expression. This is still a substring test, so a name
    # that is contained in another label would match that label as well.
    pathology: labels_col.str.contains(pathology, regex=False).values.astype(int)
    for pathology in PATHOLOGIES
}
labels_df = pd.DataFrame(labels)

# Attach the indicator columns side by side (both frames carry a fresh
# 0..n-1 index, so rows line up), then drop the raw label string.
df = (
    pd.concat([new_data_df, labels_df], axis=1)
    .drop(columns=['labels'])
    .reset_index(drop=True)
)
# Tag every row with the dataset key; `ds_name` is the same key used to name
# the output files, so the column and the file names cannot drift apart.
df.insert(0, 'dataset', ds_name, allow_duplicates=True)
df

Unnamed: 0,dataset,name,age,sex,view,atelectasis,cardiomegaly,consolidation,edema,effusion,emphysema,fibrosis,hernia,infiltration,mass,nodule,pleural_thickening,pneumonia,pneumothorax
0,chestxray14,00000001_000,57,m,pa,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,chestxray14,00000001_001,58,m,pa,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,chestxray14,00000001_002,58,m,pa,0,1,0,0,1,0,0,0,0,0,0,0,0,0
3,chestxray14,00000002_000,80,m,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,chestxray14,00000003_001,74,f,pa,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109746,chestxray14,00030801_001,38,m,pa,0,0,0,0,0,0,0,0,0,1,0,0,1,0
109747,chestxray14,00030802_000,28,m,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
109748,chestxray14,00030803_000,42,f,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
109749,chestxray14,00030804_000,29,f,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Remove incorrectly labeled images:

In [None]:
# Names of the images to exclude because their labels are incorrect.
incorrect_images = [
    '00009683_005',
    '00001153_005',
]
# Flag the offending rows, keep everything else and renumber the index.
is_mislabeled = df['name'].isin(incorrect_images)
df = df.loc[~is_mislabeled].reset_index(drop=True)
df

Unnamed: 0,dataset,name,age,sex,view,atelectasis,cardiomegaly,consolidation,edema,effusion,emphysema,fibrosis,hernia,infiltration,mass,nodule,pleural_thickening,pneumonia,pneumothorax
0,chestxray14,00000001_000,57,m,pa,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,chestxray14,00000001_001,58,m,pa,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,chestxray14,00000001_002,58,m,pa,0,1,0,0,1,0,0,0,0,0,0,0,0,0
3,chestxray14,00000002_000,80,m,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,chestxray14,00000003_001,74,f,pa,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109744,chestxray14,00030801_001,38,m,pa,0,0,0,0,0,0,0,0,0,1,0,0,1,0
109745,chestxray14,00030802_000,28,m,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
109746,chestxray14,00030803_000,42,f,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
109747,chestxray14,00030804_000,29,f,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# True for rows where at least one pathology indicator is set.
has_finding = df[PATHOLOGIES].any(axis=1)
wf_df = df[has_finding]   # rows with findings
nf_df = df[~has_finding]  # rows with no findings

### Overview

In [9]:
# Row counts of the full table and of its two partitions.
n_all, n_with, n_without = len(df), len(wf_df), len(nf_df)
print(
    f'        Total: {n_all}\n'
    f'With findings: {n_with:6d}\n'
    f'  No findings: {n_without:6d}'
)
# Number of positive rows per pathology (displayed as the cell output).
df[PATHOLOGIES].sum()

        Total: 109749
With findings:  50651
  No findings:  59098


atelectasis           11335
cardiomegaly           2701
consolidation          4505
edema                  2269
effusion              13086
emphysema              2484
fibrosis               1650
hernia                  197
infiltration          19362
mass                   5682
nodule                 6238
pleural_thickening     3326
pneumonia              1381
pneumothorax           5220
dtype: int64

### Saving


With findings:

In [10]:
# Write the rows with findings to `<metachest_dir>/<ds_name>.csv`, without the
# index column, and show the resulting path as the cell output.
wf_filepath = join(metachest_dir, f'{ds_name}.csv')
wf_df.to_csv(wf_filepath, index=False)
wf_filepath

'/data/datasets/metachest/chestxray14.csv'

No findings and MTL partition:

In [None]:
def generate_mtl_nf_partition(nf_df, seed=0, mset=(0, 1, 2), out_dir=None, name=None):
    """Shuffle the no-finding rows, split them into three meta-sets and save them.

    The rows are shuffled, cut into consecutive mtrn / mval / mtst slices whose
    sizes follow the proportions of the reference counts hard-coded below,
    tagged with an ``mset`` column and written to ``<out_dir>/<name>_nf.csv``.
    A summary of the reference and resulting proportions is printed.

    Args:
        nf_df: DataFrame with the rows that have no findings. Only its first
            five columns (selected by position) are kept.
        seed: Random state passed to the shuffle, so the same seed always
            produces the same partition. Pass ``None`` for an unseeded shuffle.
        mset: Three values written to the ``mset`` column for the mtrn, mval
            and mtst slices, in that order.
        out_dir: Directory the CSV is written to. Defaults to the
            notebook-level ``metachest_dir``.
        name: Prefix of the CSV file name. Defaults to the notebook-level
            ``ds_name``.

    Returns:
        None. The result is the CSV file on disk plus the printed summary.
    """
    # NOTE(review): presumably the mtrn/mval/mtst sizes of a reference
    # partition whose proportions are reproduced here; confirm their origin.
    ref_mtrn = 380503
    ref_mval = 6793
    ref_mtst = 209198
    ref_total = ref_mtrn + ref_mval + ref_mtst
    pct_mtrn = ref_mtrn / ref_total
    pct_mval = ref_mval / ref_total
    pct_mtst = ref_mtst / ref_total

    # Fall back to the notebook globals only when no override is given.
    if out_dir is None:
        out_dir = metachest_dir
    if name is None:
        name = ds_name

    # Keep the first five columns and shuffle the rows. The seed is forwarded
    # so that re-running the notebook regenerates the same partition.
    shuffled_df = nf_df.iloc[:, :5].sample(frac=1, random_state=seed)

    # mtrn and mtst sizes are truncated to integers; mval absorbs the
    # remainder so the three slices always cover every row.
    n_total = shuffled_df.shape[0]
    n_mtrn = int(n_total * pct_mtrn)
    n_mtst = int(n_total * pct_mtst)
    n_mval = n_total - (n_mtrn + n_mtst)

    mtrn_df = shuffled_df.iloc[:n_mtrn].copy()
    mval_df = shuffled_df.iloc[n_mtrn:n_mtrn + n_mval].copy()
    mtst_df = shuffled_df.iloc[n_mtrn + n_mval:].copy()
    mtrn_df['mset'] = mset[0]
    mval_df['mset'] = mset[1]
    mtst_df['mset'] = mset[2]

    nf_mtl_df = pd.concat([mtrn_df, mval_df, mtst_df])
    nf_filepath = join(out_dir, f'{name}_nf.csv')
    nf_mtl_df.to_csv(nf_filepath, index=False)

    final_pct_mtrn = mtrn_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mval = mval_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mtst = mtst_df.shape[0] / nf_mtl_df.shape[0]

    print(f'Original: '
        f'mtrn={pct_mtrn*100:5.2f}% '
        f'mval={pct_mval*100:5.2f}% '
        f'mtst={pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={final_pct_mtrn*100:5.2f}% '
        f'mval={final_pct_mval*100:5.2f}% '
        f'mtst={final_pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={mtrn_df.shape[0]} '
        f'mval={mval_df.shape[0]} '
        f'mtst={mtst_df.shape[0]}'
    )

    print(f'Saved to {nf_filepath}')


# Partition the no-finding rows using the function's default arguments; the
# call writes `<ds_name>_nf.csv` and prints the split summary shown below.
generate_mtl_nf_partition(nf_df)


Original: mtrn=63.79% mval= 1.14% mtst=35.07%
     New: mtrn=63.79% mval= 1.14% mtst=35.07%
     New: mtrn=37698 mval=674 mtst=20726
Saved to /data/datasets/metachest/chestxray14_nf.csv
