# ChestX-ray14 CSV

This notebook generates `chestxray14.csv` assuming the following are downloaded:

* [ChestX-ray14 Images and metadata](https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/36938765345)


In [None]:
from os.path import join

import pandas as pd

from common import read_toml
from common import AGE_INTERVAL
from common import CHESTXRAY14_PATHOLOGIES as PATHOLOGIES

# When True, the metadata is collapsed to one row per 'Patient ID' before the CSV is built.
# NOTE(review): the flag says "studies" but the grouping key is the patient — confirm intent.
UNIQUE_STUDIES = False

### Listing files

In [None]:
# Load the project configuration and look up the ChestX-ray14 download directory.
config = read_toml('config.toml')
base_dir = config['chestxray14_dir']
# Shell listing of the directory (sizes, human-readable, one entry per line) as a sanity check.
!ls -hs1 {base_dir}

### Generating CSV

In [None]:
# Read the per-image metadata table that sits in the dataset directory.
data_csv_path = join(
    base_dir,
    'Data_Entry_2017_v2020.csv',
)
data_df = pd.read_csv(filepath_or_buffer=data_csv_path)
data_df

In [None]:
# Optionally collapse the metadata to a single row per patient.
# groupby(...).first() keeps the first non-null value of every column within each
# group and moves 'Patient ID' into the index.
if UNIQUE_STUDIES:
    data_df = data_df.groupby('Patient ID').first()

# Display at cell top level: a bare expression nested inside the `if` body is never
# rendered by the notebook, so the frame is shown here instead.
data_df

Filtering, formatting and renaming columns:

In [None]:
# Map the original metadata headers to the short column names used from here on.
column_names = {
    'Image Index': 'name',
    'Finding Labels': 'labels',
    'Patient Age': 'age',
    'Patient Gender': 'sex',
    'View Position': 'view',
}
new_data_df = data_df.rename(columns=column_names)
print('Available views: ', new_data_df['view'].unique())

new_data_df = (
    new_data_df[['name', 'age', 'sex', 'view', 'labels']]
    # drop images without any finding
    .loc[lambda frame: frame['labels'] != 'No Finding']
    # keep ages inside AGE_INTERVAL, both endpoints included
    .loc[lambda frame: frame['age'].between(*AGE_INTERVAL, inclusive='both')]
    .reset_index(drop=True)
    .assign(
        # strip the last four characters of the image file name
        name=lambda frame: frame['name'].str[:-4],
        # normalise the text columns to lower case
        sex=lambda frame: frame['sex'].str.lower(),
        view=lambda frame: frame['view'].str.lower(),
        labels=lambda frame: frame['labels'].str.lower(),
    )
)
new_data_df

Build labels dataframe:

In [None]:
# Build one 0/1 indicator column per pathology from the lower-cased 'labels' string.
# regex=False makes the pathology name a literal substring match; by default
# str.contains would interpret it as a regular expression.
# NOTE(review): matching is by substring and assumes PATHOLOGIES holds lower-case
# names as they appear in 'labels' — confirm against common.py.
labels_col = new_data_df['labels']
labels = {
    pathology: labels_col.str.contains(pathology, regex=False).values.astype(int)
    for pathology in PATHOLOGIES
}

# Reuse the metadata index so the column-wise concat below aligns row-for-row by
# construction rather than relying on both frames having a default RangeIndex.
labels_df = pd.DataFrame(labels, index=new_data_df.index)

# Join metadata with the indicators, keep only rows with at least one pathology
# of interest, and drop the raw label string.
df = (
    pd.concat([new_data_df, labels_df], axis=1)
    .loc[lambda frame: frame[PATHOLOGIES].any(axis=1)]
    .drop(columns=['labels'])
    .reset_index(drop=True)
)
# Tag every row with the source dataset as the first column.
df.insert(0, 'dataset', 'chestxray14', allow_duplicates=True)
df

Remove images incorrectly labeled:

In [None]:
# Image names (in the trimmed form stored in the 'name' column) to exclude.
incorrect_images = [
    '00009683_005',
    '00001153_005',
]
# Keep every row whose name is not in the exclusion list, then renumber the rows.
df = (
    df
    .loc[lambda frame: ~frame['name'].isin(incorrect_images)]
    .reset_index(drop=True)
)
df

Check that there are no normal examples (rows with no pathology flag set):

In [None]:
# Rows in which no pathology flag is set; an empty result means no normal examples remain.
df[~df[PATHOLOGIES].astype(bool).any(axis=1)]

### Saving CSV

In [None]:
# Write the table to <metachest_dir>/chestxray14.csv, omitting the row index.
metachest_dir = read_toml('config.toml')['metachest_dir']
path = join(metachest_dir, 'chestxray14.csv')
df.to_csv(path, index=False)
path

Overview:

In [None]:
# Number of rows in the final table, followed by the per-pathology positive counts.
print(f'Total: {len(df)}')
df[PATHOLOGIES].sum(axis=0)