# ChestX-ray14 CSV

This notebook generates `chestxray14.csv` (images with at least one finding) and `chestxray14_nf.csv` (images with no findings, partitioned for MTL), assuming the following has been downloaded:

* [ChestX-ray14 Images and metadata](https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/36938765345)


In [None]:
import sys
from os.path import join

import pandas as pd

sys.path.append('../')
from common import read_toml
from common import AGE_INTERVAL
from common import CHESTXRAY14_PATHOLOGIES as PATHOLOGIES

# When True, the metadata is reduced to one entry per 'Patient ID' before the CSVs are built.
UNIQUE_STUDIES = False

### Listing files

In [None]:
# Dataset key: selects the `<ds_name>_dir` config entry and names the output CSV files.
ds_name = 'chestxray14'
# Project settings are read from the TOML file one directory up.
config = read_toml('../config.toml')
# Directory the generated CSV files are written to.
metachest_dir = config['metachest_dir']
# Directory the dataset metadata CSV is read from.
base_dir = config[f'{ds_name}_dir']
# List the dataset directory as a quick check of what is present.
!ls -hs1 {base_dir}

### Generating CSV

In [None]:
# Metadata file shipped with the dataset; it is expected directly inside `base_dir`.
data_csv_path = join(base_dir, 'Data_Entry_2017_v2020.csv')
# Raw metadata table; later cells derive every other frame from it.
data_df = pd.read_csv(data_csv_path)
data_df

In [None]:
if UNIQUE_STUDIES:
    # Keep the first complete row of every patient.
    # `groupby(...).first()` is avoided on purpose: it picks the first non-null
    # value of each column independently (so it can mix values from different
    # rows) and moves 'Patient ID' into the index, which makes re-running this
    # cell fail with a KeyError.
    # The stable sort keeps the result ordered by patient, as the groupby did,
    # while preserving the original order of the rows within each patient.
    data_df = (
        data_df
        .sort_values('Patient ID', kind='stable')
        .drop_duplicates(subset='Patient ID', keep='first')
    )
# A bare expression is only displayed at the top level of a cell, so it is
# placed outside the `if` to actually show the resulting frame.
data_df

Filtering, formatting and renaming columns:

In [None]:
# Short column names used from here on, keyed by the original metadata headers.
column_names = {
    'Image Index': 'name',
    'Finding Labels': 'labels',
    'Patient Age': 'age',
    'Patient Gender': 'sex',
    'View Position': 'view',
}
new_data_df = data_df.rename(columns=column_names)
print('Available views: ', new_data_df.view.unique())

# Note: rows labelled 'No Finding' are intentionally kept at this stage.
new_data_df = (
    new_data_df
    .loc[:, ['name', 'age', 'sex', 'view', 'labels']]
    # Keep only patients whose age lies inside AGE_INTERVAL (both ends included).
    .loc[lambda frame: frame['age'].between(*AGE_INTERVAL, inclusive='both')]
    .reset_index(drop=True)
    .assign(
        # Drop the last four characters of the file name
        # (presumably the '.png' extension; confirm against the data).
        name=lambda frame: frame['name'].str[:-4],
        # Normalise the text columns to lower case.
        sex=lambda frame: frame['sex'].str.lower(),
        view=lambda frame: frame['view'].str.lower(),
        labels=lambda frame: frame['labels'].str.lower(),
    )
)
new_data_df

Build labels dataframe:

In [None]:
labels_col = new_data_df['labels']
# One 0/1 column per pathology: 1 when the pathology name occurs in the row's label string.
# NOTE(review): `str.contains` does a regex substring match, so a pathology name
# that is a substring of another label would also match; confirm PATHOLOGIES
# has no such overlaps.
labels = {
    pathology: labels_col.str.contains(pathology).values.astype(int)
    for pathology in PATHOLOGIES
}
labels_df = pd.DataFrame(labels)

# Attach the indicator columns side by side and drop the raw label string.
df = (
    pd.concat([new_data_df, labels_df], axis=1)
    .drop(columns=['labels'])
    .reset_index(drop=True)
)
# Tag every row with its source dataset as the first column.
df.insert(0, 'dataset', 'chestxray14', allow_duplicates=True)
df

Remove incorrectly labeled images:

In [None]:
# Image names (without extension) to exclude from the output.
incorrect_images = ['00009683_005', '00001153_005']
is_incorrect = df['name'].isin(incorrect_images)
df = df.loc[~is_incorrect].reset_index(drop=True)
df

In [None]:
# True for rows where at least one pathology column is set.
has_finding = df[PATHOLOGIES].any(axis=1)
wf_df = df[has_finding]    # with findings
nf_df = df[~has_finding]   # no findings

### Overview

In [None]:
# Row counts of the full table and of the two subsets.
overview = (
    f'        Total: {df.shape[0]}\n'
    f'With findings: {wf_df.shape[0]:6d}\n'
    f'  No findings: {nf_df.shape[0]:6d}'
)
print(overview)
# Number of positive rows per pathology.
df[PATHOLOGIES].sum()

### Saving


With findings:

In [None]:
# Output path: `<metachest_dir>/<ds_name>.csv`.
wf_filepath = join(metachest_dir, f'{ds_name}.csv')
# Write the rows with at least one finding; the DataFrame index is not saved.
wf_df.to_csv(wf_filepath, index=False)
wf_filepath

No findings and MTL partition:

In [None]:
def generate_mtl_nf_partition(nf_df, seed=0, mset=(0, 1, 2)):
    """Shuffle the no-finding rows, split them into three subsets and save them.

    The rows are divided into mtrn/mval/mtst subsets using the proportions of
    the reference sizes hard-coded below. Each subset is tagged in a new
    `mset` column and the result is written to
    `<metachest_dir>/<ds_name>_nf.csv` (both names are module-level variables
    of this notebook). A summary of the reference and resulting proportions
    is printed.

    Args:
        nf_df: DataFrame with the no-finding rows. Only its first five
            columns are kept in the output.
        seed: Random state used for the shuffle, so the same input always
            yields the same partition. Pass `None` for a non-deterministic
            shuffle.
        mset: Values stored in the `mset` column for the mtrn, mval and mtst
            subsets, in that order.

    Returns:
        None. The partition is written to disk and a summary is printed.
    """
    # Reference subset sizes; only their proportions are used.
    ref_mtrn = 380503
    ref_mval = 6793
    ref_mtst = 209198
    ref_total = ref_mtrn + ref_mval + ref_mtst
    pct_mtrn = ref_mtrn / ref_total
    pct_mval = ref_mval / ref_total
    pct_mtst = ref_mtst / ref_total

    # Keep only the first five columns (dataset, name, age, sex, view for the
    # frame built in this notebook), then shuffle the rows.
    nf_df = nf_df.iloc[:, :5]
    # `seed` was previously accepted but ignored; passing it here makes the
    # shuffle, and therefore the saved partition, reproducible.
    nf_df = nf_df.sample(frac=1, random_state=seed)

    n_total = nf_df.shape[0]
    n_mtrn = int(n_total * pct_mtrn)
    n_mtst = int(n_total * pct_mtst)
    # mval takes the remainder so the three sizes always add up to n_total.
    n_mval = n_total - (n_mtrn + n_mtst)

    mtrn_df = nf_df.iloc[:n_mtrn].copy()
    mval_df = nf_df.iloc[n_mtrn:n_mtrn + n_mval].copy()
    mtst_df = nf_df.iloc[n_mtrn + n_mval:].copy()
    mtrn_df['mset'] = mset[0]
    mval_df['mset'] = mset[1]
    mtst_df['mset'] = mset[2]

    nf_mtl_df = pd.concat([mtrn_df, mval_df, mtst_df])
    nf_filepath = join(metachest_dir, f'{ds_name}_nf.csv')
    nf_mtl_df.to_csv(nf_filepath, index=False)

    # Guard the summary against an empty input (all percentages are then 0).
    denom = max(nf_mtl_df.shape[0], 1)
    final_pct_mtrn = mtrn_df.shape[0] / denom
    final_pct_mval = mval_df.shape[0] / denom
    final_pct_mtst = mtst_df.shape[0] / denom

    print(f'Original: '
        f'mtrn={pct_mtrn*100:5.2f}% '
        f'mval={pct_mval*100:5.2f}% '
        f'mtst={pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={final_pct_mtrn*100:5.2f}% '
        f'mval={final_pct_mval*100:5.2f}% '
        f'mtst={final_pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={mtrn_df.shape[0]} '
        f'mval={mval_df.shape[0]} '
        f'mtst={mtst_df.shape[0]}'
    )

    print(f'Saved to {nf_filepath}')


generate_mtl_nf_partition(nf_df)
