# ChestX-ray14 CSV

This notebook generates `chestxray14.csv` assuming the following are downloaded:

* [ChestX-ray14 Images and metadata](https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/36938765345)


In [1]:
from os.path import join

import pandas as pd

from common import read_toml
from common import AGE_INTERVAL
from common import CHESTXRAY14_PATHOLOGIES as PATHOLOGIES

# When True, the metadata table is reduced to a single row per 'Patient ID'
# before the CSV is generated; when False every image row is kept.
UNIQUE_STUDIES = False

### Listing files

In [2]:
config = read_toml('config.toml')
base_dir = config['chestxray14_dir']
!ls -hs1 {base_dir}

total 42G
8.6M Data_Entry_2017_v2020.csv
3.9M images
1.9G images_01.tar.gz
3.7G images_02.tar.gz
3.7G images_03.tar.gz
3.6G images_04.tar.gz
3.7G images_05.tar.gz
3.8G images_06.tar.gz
3.8G images_07.tar.gz
3.8G images_08.tar.gz
3.9G images_09.tar.gz
3.9G images_10.tar.gz
3.9G images_11.tar.gz
2.8G images_12.tar.gz


### Generating CSV

In [3]:
# Load the per-image metadata table found in the download directory.
data_csv_path = join(base_dir, 'Data_Entry_2017_v2020.csv')
data_df = pd.read_csv(filepath_or_buffer=data_csv_path)
# Trailing expression: rendered by the notebook as a preview of the raw table.
data_df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168
...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,38,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,No Finding,0,30802,28,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,No Finding,0,30804,29,F,PA,2048,2500,0.168,0.168


In [4]:
# Optionally keep a single row per patient.
if UNIQUE_STUDIES:
    data_df = (
        data_df
        # Stable sort: rows end up ordered by patient while keeping their
        # original relative order inside each patient.
        .sort_values('Patient ID', kind='stable')
        # Keep each patient's first row intact. Unlike `groupby(...).first()`,
        # this never combines non-null values from different rows and leaves
        # 'Patient ID' as a regular column, so the schema is the same whether
        # or not the flag is set.
        .drop_duplicates(subset='Patient ID', keep='first')
    )
# Kept at top level: an expression inside the `if` body is not echoed by the
# notebook, so the preview would never be shown.
data_df

Filtering, formatting and renaming columns:

In [5]:
# Map the original metadata headers to the short column names used from here on.
new_data_df = data_df.rename(
    columns={
        'Image Index': 'name',
        'Finding Labels': 'labels',
        'Patient Age': 'age',
        'Patient Gender': 'sex',
        'View Position': 'view',
    }
)
# Report the view positions present before any row is filtered out.
print('Available views: ', new_data_df['view'].unique())

new_data_df = (
    new_data_df[['name', 'age', 'sex', 'view', 'labels']]
    # Discard rows whose label string is exactly 'No Finding'.
    .loc[lambda frame: frame['labels'] != 'No Finding']
    # Keep ages inside AGE_INTERVAL, both endpoints included.
    .loc[lambda frame: frame['age'].between(*AGE_INTERVAL, inclusive='both')]
    .reset_index(drop=True)
    .assign(
        # Drop the last four characters of the file name (e.g. '.png').
        name=lambda frame: frame['name'].str[:-4],
        # Lower-case the categorical text columns.
        sex=lambda frame: frame['sex'].str.lower(),
        view=lambda frame: frame['view'].str.lower(),
        labels=lambda frame: frame['labels'].str.lower(),
    )
)
new_data_df

Available views:  ['PA' 'AP']


Unnamed: 0,name,age,sex,view,labels
0,00000001_000,57,m,pa,cardiomegaly
1,00000001_001,58,m,pa,cardiomegaly|emphysema
2,00000001_002,58,m,pa,cardiomegaly|effusion
3,00000003_001,74,f,pa,hernia
4,00000003_002,75,f,pa,hernia
...,...,...,...,...,...
50648,00030786_006,61,f,ap,consolidation
50649,00030789_000,51,f,pa,infiltration
50650,00030793_000,57,f,pa,mass|nodule
50651,00030795_000,52,f,pa,pleural_thickening


Build labels dataframe:

In [6]:
# One-hot encode the '|'-separated findings of every image.
#
# Splitting on the separator matches whole label names. A `str.contains`
# search treats each pathology as a regular expression and also fires on
# partial matches inside a longer label.
labels_col = new_data_df['labels']
labels_df = (
    labels_col.str.get_dummies(sep='|')
    # Keep exactly the target pathologies, in the order given by PATHOLOGIES.
    # A pathology that never occurs becomes an all-zero column, and labels
    # outside PATHOLOGIES are dropped.
    .reindex(columns=list(PATHOLOGIES), fill_value=0)
)
# join: `get_dummies` keeps the index of `new_data_df`, so the column-wise
# concat lines rows up by index label instead of relying on both frames
# having a freshly reset RangeIndex.
df = pd.concat([new_data_df, labels_df], axis=1)
# Drop images that carry none of the target pathologies.
df = df[df[list(PATHOLOGIES)].any(axis=1)]
# The raw label string is now redundant with the indicator columns.
df = df.drop(columns=['labels'])
df = df.reset_index(drop=True)
# Tag every row with the dataset it came from, as the first column.
df.insert(0, 'dataset', 'chestxray14', allow_duplicates=True)
df

Unnamed: 0,dataset,name,age,sex,view,atelectasis,cardiomegaly,consolidation,edema,effusion,emphysema,fibrosis,hernia,infiltration,mass,nodule,pleural_thickening,pneumonia,pneumothorax
0,chestxray14,00000001_000,57,m,pa,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,chestxray14,00000001_001,58,m,pa,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,chestxray14,00000001_002,58,m,pa,0,1,0,0,1,0,0,0,0,0,0,0,0,0
3,chestxray14,00000003_001,74,f,pa,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,chestxray14,00000003_002,75,f,pa,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50648,chestxray14,00030786_006,61,f,ap,0,0,1,0,0,0,0,0,0,0,0,0,0,0
50649,chestxray14,00030789_000,51,f,pa,0,0,0,0,0,0,0,0,1,0,0,0,0,0
50650,chestxray14,00030793_000,57,f,pa,0,0,0,0,0,0,0,0,0,1,1,0,0,0
50651,chestxray14,00030795_000,52,f,pa,0,0,0,0,0,0,0,0,0,0,0,1,0,0


Remove images incorrectly labeled:

In [7]:
# Image names (without extension) to exclude from the final table.
incorrect_images = [
    '00009683_005',
    '00001153_005',
]
# Flag the listed rows, then keep everything else and renumber the index.
is_incorrect = df['name'].isin(incorrect_images)
df = df.loc[~is_incorrect].reset_index(drop=True)
df

Unnamed: 0,dataset,name,age,sex,view,atelectasis,cardiomegaly,consolidation,edema,effusion,emphysema,fibrosis,hernia,infiltration,mass,nodule,pleural_thickening,pneumonia,pneumothorax
0,chestxray14,00000001_000,57,m,pa,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,chestxray14,00000001_001,58,m,pa,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,chestxray14,00000001_002,58,m,pa,0,1,0,0,1,0,0,0,0,0,0,0,0,0
3,chestxray14,00000003_001,74,f,pa,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,chestxray14,00000003_002,75,f,pa,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50646,chestxray14,00030786_006,61,f,ap,0,0,1,0,0,0,0,0,0,0,0,0,0,0
50647,chestxray14,00030789_000,51,f,pa,0,0,0,0,0,0,0,0,1,0,0,0,0,0
50648,chestxray14,00030793_000,57,f,pa,0,0,0,0,0,0,0,0,0,1,1,0,0,0
50649,chestxray14,00030795_000,52,f,pa,0,0,0,0,0,0,0,0,0,0,0,1,0,0


Check that there are no normal examples:

In [8]:
# A row is "normal" when none of its pathology indicators is set.
# The displayed frame is expected to be empty.
has_any_pathology = df[PATHOLOGIES].astype(bool).any(axis=1)
df.loc[~has_any_pathology]

Unnamed: 0,dataset,name,age,sex,view,atelectasis,cardiomegaly,consolidation,edema,effusion,emphysema,fibrosis,hernia,infiltration,mass,nodule,pleural_thickening,pneumonia,pneumothorax


### Saving CSV

In [9]:
# Write the final table to a relative path, omitting the positional index
# from the file.
path = 'chestxray14.csv'
df.to_csv(path_or_buf=path, index=False)
# Echo the output location as the cell result.
path

'chestxray14.csv'

Overview:

In [10]:
# Number of rows in the final table.
print(f'Total: {df.shape[0]}')
# Column-wise sum of the pathology indicator columns (one total per pathology).
df[PATHOLOGIES].sum(axis=0)

Total: 50651


atelectasis           11335
cardiomegaly           2701
consolidation          4505
edema                  2269
effusion              13086
emphysema              2484
fibrosis               1650
hernia                  197
infiltration          19362
mass                   5682
nodule                 6238
pleural_thickening     3326
pneumonia              1381
pneumothorax           5220
dtype: int64