# CheXpert

This notebook generates `chexpert.csv` and `chexpert_nf.csv` assuming the following are downloaded:

* [`CheXpert-v1.0.zip`](https://stanfordmlgroup.github.io/competitions/chexpert/) (downloading with azcopy is recommended)
* [`train_cheXbert.csv`](https://stanfordmlgroup.github.io/competitions/chexpert/)


### Listing files

In [1]:
from os.path import join

import pandas as pd

from common import read_toml
from common import AGE_INTERVAL
from common import CHEX_PATHOLOGIES as PATHOLOGIES


# NOTE(review): this flag is not read by any cell visible in this notebook --
# confirm whether it is still needed.
UNIQUE_STUDIES = False

In [2]:
# Dataset key: selects the `<ds_name>_dir` entry in the config and prefixes
# the CSV files written further down.
ds_name = 'chexpert'
config = read_toml('config.toml')
# Output directory for the generated CSV files.
metachest_dir = config['metachest_dir']
# Directory holding the downloaded dataset files.
base_dir = config[f'{ds_name}_dir']
# List the dataset directory contents with sizes.
!ls -hs1 {base_dir}

total 439G
 28M azcopy
2.0M 'CHEXPERT DEMO.xlsx'
4.0K CheXpert-v1.0
439G CheXpert-v1.0.zip
4.0K download.sh
4.0K README.md
 23M train_cheXbert.csv
 29M train_visualCheXbert.csv


### Generating CSV

In [3]:
# Load the CheXbert label table from the dataset directory.
train_df_path = join(base_dir, 'train_cheXbert.csv')
train_df = pd.read_csv(train_df_path)
# Display the raw table as the cell result.
train_df

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,,,,,,,,,0.0,,,,1.0,1.0
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,,
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,1.0,,,-1.0,,,,,,1.0,,
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,,,,1.0,,,-1.0,,,,,,1.0,,
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,,,1.0,,,,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223409,CheXpert-v1.0/train/patient64537/study2/view1_...,Male,59,Frontal,AP,,,-1.0,,,,,-1.0,0.0,1.0,,,,
223410,CheXpert-v1.0/train/patient64537/study1/view1_...,Male,59,Frontal,AP,,,1.0,,,,0.0,-1.0,,1.0,,,,
223411,CheXpert-v1.0/train/patient64538/study1/view1_...,Female,0,Frontal,AP,,,,,-1.0,,,,,,,,,
223412,CheXpert-v1.0/train/patient64539/study1/view1_...,Female,0,Frontal,AP,,1.0,1.0,,,,-1.0,1.0,0.0,,,,,


Processing:

In [4]:
original_size = len(train_df)

# Rows explicitly labelled 'No Finding' are discarded up front.
not_no_finding = train_df['No Finding'] != 1.0

df = (
    train_df[not_no_finding]
    .drop(columns=['No Finding'])
    # metadata columns
    .rename(columns={
        'Path': 'name',
        'Age': 'age',
        'Sex': 'sex',
        'AP/PA': 'view'
    })
    # pathology columns whose source name differs by more than capitalization
    .rename(columns={
        'Pleural Effusion': 'Effusion',
        'Lung Opacity': 'Lung_opacity',
    })
    # remaining pathology columns: 'Edema' -> 'edema', ...
    .rename(columns={pathology.capitalize(): pathology for pathology in PATHOLOGIES})
    # drop records with missing metadata
    .dropna(subset=['age', 'sex', 'view'])
    # missing (NaN) and uncertain (-1.0) labels are both mapped to 0
    .fillna(0)
    .replace(-1.0, 0)
    .astype({pathology: int for pathology in PATHOLOGIES})
)

# keep only the output columns and the records inside the age interval
output_columns = ['name', 'age', 'sex', 'view'] + PATHOLOGIES
in_age_interval = df['age'].between(*AGE_INTERVAL, inclusive='both')
df = df.loc[in_age_interval, output_columns]

print('Unique views: ', df.view.unique())

df = df.assign(
    # strip the 14-character leading path segment and the 4-character extension
    name=df['name'].str[14:-4],
    # first letter of the sex label, lower-cased
    sex=df['sex'].str[:1].str.lower(),
    view=df['view'].str.lower(),
)
# frontal projections only
df = df[df['view'].isin(['ap', 'pa'])]

df.insert(0, 'dataset', 'chexpert', allow_duplicates=True)
df = df.reset_index(drop=True)

dropped_size = len(df)
print(f'train_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
df

Unique views:  ['AP' 'PA' 'LL' 'RL']
train_df size: original 223414, new 147795, dropped 75619


Unnamed: 0,dataset,name,age,sex,view,atelectasis,cardiomegaly,consolidation,edema,effusion,lung_opacity,pneumonia,pneumothorax
0,chexpert,train/patient00003/study1/view1_frontal,41,m,ap,0,0,0,1,0,0,0,0
1,chexpert,train/patient00005/study2/view1_frontal,33,m,ap,0,0,0,0,0,0,0,1
2,chexpert,train/patient00005/study2/view2_frontal,33,m,ap,0,0,0,0,0,0,0,1
3,chexpert,train/patient00007/study1/view1_frontal,69,m,ap,1,1,0,0,0,0,0,0
4,chexpert,train/patient00007/study2/view1_frontal,69,m,ap,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
147790,chexpert,train/patient64535/study1/view1_frontal,60,m,ap,0,0,0,0,0,1,0,0
147791,chexpert,train/patient64536/study2/view1_frontal,61,f,ap,0,0,0,1,1,0,0,0
147792,chexpert,train/patient64536/study1/view1_frontal,61,f,ap,1,0,0,1,0,0,0,0
147793,chexpert,train/patient64537/study2/view1_frontal,59,m,ap,0,0,0,0,1,0,0,0


In [5]:
# True for records with at least one positive pathology label.
has_pathology = df[PATHOLOGIES].any(axis=1)
wf_df = df[has_pathology]
nf_df = df[~has_pathology]

### Overview

In [6]:
# Record counts for the full table and for each of the two subsets.
overview = (
    f'        Total: {df.shape[0]}\n'
    f'With findings: {wf_df.shape[0]:6d}\n'
    f'  No findings: {nf_df.shape[0]:6d}'
)
print(overview)
# Number of positive records per pathology.
df[PATHOLOGIES].sum()

        Total: 147795
With findings: 130930
  No findings:  16865


atelectasis      25980
cardiomegaly     20391
consolidation    10340
edema            41247
effusion         66484
lung_opacity     77194
pneumonia         2986
pneumothorax     14977
dtype: int64

### Saving


With findings:

In [7]:
# Write the with-findings records to `<metachest_dir>/<ds_name>.csv`,
# omitting the DataFrame index.
wf_filepath = join(metachest_dir, f'{ds_name}.csv')
wf_df.to_csv(wf_filepath, index=False)
# Echo the output path as the cell result.
wf_filepath

'/data/datasets/metachest/chexpert.csv'

No findings and MTL partition:

In [None]:
def generate_mtl_nf_partition(nf_df, seed=0, mset=(0, 1, 2)):
    """Shuffle the no-finding records, split them three ways and save them as CSV.

    The records are split into `mtrn`, `mval` and `mtst` subsets whose
    proportions follow three hard-coded reference sizes. Each row is tagged
    with its subset code in a new `mset` column, and the result is written to
    `<metachest_dir>/<ds_name>_nf.csv` (both names are read from the notebook's
    global scope). A summary of reference vs. obtained proportions is printed.

    Args:
        nf_df: DataFrame of records; only its first five columns are kept.
        seed: random state used for the shuffle, so that the same input and
            seed always produce the same partition.
        mset: codes written to the `mset` column for the `mtrn`, `mval` and
            `mtst` subsets, in that order.

    Returns:
        None. The partition is written to disk.
    """
    # Reference subset sizes; only their relative proportions are used.
    # NOTE(review): presumably the mtrn/mval/mtst sizes of the with-findings
    # partition -- confirm the provenance of these numbers.
    ref_mtrn = 380503
    ref_mval = 6793
    ref_mtst = 209198
    ref_total = ref_mtrn + ref_mval + ref_mtst
    pct_mtrn = ref_mtrn / ref_total
    pct_mval = ref_mval / ref_total
    pct_mtst = ref_mtst / ref_total

    # Keep the first five columns, then shuffle all rows. `seed` is passed as
    # `random_state` so the shuffle is repeatable.
    nf_df = nf_df.iloc[:, :5].sample(frac=1, random_state=seed)

    n_total = nf_df.shape[0]
    n_mtrn = int(n_total * pct_mtrn)
    n_mtst = int(n_total * pct_mtst)
    # mval takes whatever is left, absorbing the rows lost to truncation above.
    n_mval = n_total - (n_mtrn + n_mtst)

    mtrn_df = nf_df.iloc[:n_mtrn].copy()
    mval_df = nf_df.iloc[n_mtrn:n_mtrn + n_mval].copy()
    mtst_df = nf_df.iloc[n_mtrn + n_mval:].copy()
    mtrn_df['mset'] = mset[0]
    mval_df['mset'] = mset[1]
    mtst_df['mset'] = mset[2]

    nf_mtl_df = pd.concat([mtrn_df, mval_df, mtst_df])
    nf_filepath = join(metachest_dir, f'{ds_name}_nf.csv')
    nf_mtl_df.to_csv(nf_filepath, index=False)

    final_pct_mtrn = mtrn_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mval = mval_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mtst = mtst_df.shape[0] / nf_mtl_df.shape[0]

    print(f'Original: '
        f'mtrn={pct_mtrn*100:5.2f}% '
        f'mval={pct_mval*100:5.2f}% '
        f'mtst={pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={final_pct_mtrn*100:5.2f}% '
        f'mval={final_pct_mval*100:5.2f}% '
        f'mtst={final_pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={mtrn_df.shape[0]} '
        f'mval={mval_df.shape[0]} '
        f'mtst={mtst_df.shape[0]}'
    )

    print(f'Saved to {nf_filepath}')


# Partition the no-finding records using the default seed and subset codes.
generate_mtl_nf_partition(nf_df)


Original: mtrn=63.79% mval= 1.14% mtst=35.07%
     New: mtrn=63.79% mval= 1.14% mtst=35.07%
     New: mtrn=10758 mval=193 mtst=5914
Saved to /data/datasets/metachest/chexpert_nf.csv
