# CheXpert

This notebook generates `chexpert.csv` and `chexpert_nf.csv` assuming the following are downloaded:

* [`CheXpert-v1.0.zip`](https://stanfordmlgroup.github.io/competitions/chexpert/) (downloading with azcopy is recommended)
* [`train_cheXbert.csv`](https://stanfordmlgroup.github.io/competitions/chexpert/)


### Listing files

In [None]:
from os.path import join

import pandas as pd

from common import read_toml
from common import AGE_INTERVAL
from common import CHEX_PATHOLOGIES as PATHOLOGIES


# Notebook-level flag, left at False. None of the cells reviewed here read it.
# NOTE(review): confirm whether it is still needed or can be removed.
UNIQUE_STUDIES = False

In [None]:
# Dataset key: selects the `<ds_name>_dir` entry of config.toml and is used
# as the stem of the CSV files written later in this notebook.
ds_name = 'chexpert'
config = read_toml('config.toml')
# Directory that receives the generated CSV files.
metachest_dir = config['metachest_dir']
# Directory the label file `train_cheXbert.csv` is read from.
base_dir = config[f'{ds_name}_dir']
# List the dataset directory as a quick check that the configured path exists.
!ls -hs1 {base_dir}

### Generating CSV

In [None]:
# Read the label table from the configured dataset directory.
train_csv_name = 'train_cheXbert.csv'
train_df_path = join(base_dir, train_csv_name)
train_df = pd.read_csv(train_df_path)
train_df

Processing:

In [None]:
original_size = len(train_df)

# Drop studies explicitly labelled "No Finding", then the label column itself.
df = train_df[train_df['No Finding'] != 1.0]
df = df.drop(columns=['No Finding'])

# Rename the metadata columns, and the two multi-word labels, so that the
# generic `capitalize()` mapping below can reach the names in PATHOLOGIES.
# NOTE(review): assumes PATHOLOGIES holds names such as 'effusion' and
# 'lung_opacity' -- confirm against `common.CHEX_PATHOLOGIES`.
df = df.rename(columns={
    'Path': 'name',
    'Age': 'age',
    'Sex': 'sex',
    'AP/PA': 'view',
    'Pleural Effusion': 'Effusion',
    'Lung Opacity': 'Lung_opacity',
})
# One rename call instead of one DataFrame copy per pathology.
df = df.rename(columns={pathology.capitalize(): pathology for pathology in PATHOLOGIES})

# Drop records with missing metadata.
df = df.dropna(subset=['age', 'sex', 'view'])

# Binarise the labels: missing values (NaN) and -1.0 entries both become 0.
# NOTE(review): -1.0 is presumably CheXpert's "uncertain" marker -- confirm.
df = df.fillna(0)
df = df.replace(-1.0, 0)
df[PATHOLOGIES] = df[PATHOLOGIES].astype(int)

# Keep only the metadata and pathology columns, and ages inside AGE_INTERVAL.
df = df[['name', 'age', 'sex', 'view'] + PATHOLOGIES]
df = df[df['age'].between(*AGE_INTERVAL, inclusive='both')]

# Show the raw view values before they are normalised and filtered.
print('Unique views: ', df['view'].unique())

# `assign` returns a new frame, so no column is set on a filtered slice
# (which would raise SettingWithCopyWarning).
df = df.assign(
    # Drops the first 14 and last 4 characters of each path.
    # NOTE(review): presumably the 'CheXpert-v1.0/' prefix and a 4-character
    # extension such as '.jpg' -- confirm against the `Path` column.
    name=df['name'].str[14:-4],
    # First letter of the sex value, lower-cased.
    sex=df['sex'].str[:1].str.lower(),
    view=df['view'].str.lower(),
)

# Keep AP/PA projections only; reset the index before inserting so the new
# column is added to a fresh frame rather than to a filtered slice.
df = df[df['view'].isin(['ap', 'pa'])].reset_index(drop=True)

# `ds_name` is 'chexpert' (set in the configuration cell); using it keeps the
# dataset tag consistent with the output file names.
df.insert(0, 'dataset', ds_name, allow_duplicates=True)

# NOTE: despite its name, `dropped_size` is the number of rows that remain.
dropped_size = len(df)
print(f'train_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
df

In [None]:
# A record has findings when at least one pathology column is non-zero.
has_finding = df[PATHOLOGIES].any(axis=1)
wf_df = df[has_finding]
nf_df = df[~has_finding]

### Overview

In [None]:
# Row counts of the full table and of its with/without-findings subsets.
n_all, n_wf, n_nf = df.shape[0], wf_df.shape[0], nf_df.shape[0]
summary = (
    f'        Total: {n_all}\n'
    f'With findings: {n_wf:6d}\n'
    f'  No findings: {n_nf:6d}'
)
print(summary)
# Number of positive records per pathology.
df[PATHOLOGIES].sum()

### Saving


With findings:

In [None]:
# Write the with-findings table to <metachest_dir>/<ds_name>.csv, without the
# index column, and show the path that was written.
wf_filename = f'{ds_name}.csv'
wf_filepath = join(metachest_dir, wf_filename)
wf_df.to_csv(wf_filepath, index=False)
wf_filepath

No findings and MTL partition:

In [None]:
def generate_mtl_nf_partition(nf_df, seed=0, mset=(0, 1, 2)):
    """Shuffle the no-finding records, split them into three meta-sets and save them.

    The records are split into meta-train, meta-validation and meta-test
    subsets whose proportions follow three hard-coded reference counts. The
    result is written to ``<metachest_dir>/<ds_name>_nf.csv`` and a summary
    of the proportions is printed.

    Args:
        nf_df: DataFrame of records without findings. Only its first five
            columns are kept (in this notebook: dataset, name, age, sex,
            view).
        seed: Random state used for shuffling, so that repeated runs produce
            the same partition.
        mset: Labels written to the ``mset`` column for the meta-train,
            meta-validation and meta-test subsets, in that order.

    Returns:
        None. The function reads the module-level names ``metachest_dir`` and
        ``ds_name`` and writes the CSV file as a side effect.
    """
    # Reference subset sizes; only their ratios are used below.
    # NOTE(review): the origin of these counts is not shown in this notebook.
    ref_mtrn = 380503
    ref_mval = 6793
    ref_mtst = 209198
    ref_total = ref_mtrn + ref_mval + ref_mtst
    pct_mtrn = ref_mtrn / ref_total
    pct_mval = ref_mval / ref_total
    pct_mtst = ref_mtst / ref_total

    # Keep the metadata columns only and shuffle reproducibly. The original
    # accepted `seed` but never passed it to `sample`.
    shuffled_df = nf_df.iloc[:, :5].sample(frac=1, random_state=seed)

    # Train and test sizes are truncated; validation absorbs the remainder so
    # that the three subsets always cover every row.
    n_rows = shuffled_df.shape[0]
    n_mtrn = int(n_rows * pct_mtrn)
    n_mtst = int(n_rows * pct_mtst)
    n_mval = n_rows - (n_mtrn + n_mtst)

    mtrn_df = shuffled_df.iloc[:n_mtrn].copy()
    mval_df = shuffled_df.iloc[n_mtrn:n_mtrn + n_mval].copy()
    mtst_df = shuffled_df.iloc[n_mtrn + n_mval:].copy()
    mtrn_df['mset'] = mset[0]
    mval_df['mset'] = mset[1]
    mtst_df['mset'] = mset[2]

    nf_mtl_df = pd.concat([mtrn_df, mval_df, mtst_df])
    nf_filepath = join(metachest_dir, f'{ds_name}_nf.csv')
    nf_mtl_df.to_csv(nf_filepath, index=False)

    final_pct_mtrn = mtrn_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mval = mval_df.shape[0] / nf_mtl_df.shape[0]
    final_pct_mtst = mtst_df.shape[0] / nf_mtl_df.shape[0]

    print(f'Original: '
        f'mtrn={pct_mtrn*100:5.2f}% '
        f'mval={pct_mval*100:5.2f}% '
        f'mtst={pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={final_pct_mtrn*100:5.2f}% '
        f'mval={final_pct_mval*100:5.2f}% '
        f'mtst={final_pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={mtrn_df.shape[0]} '
        f'mval={mval_df.shape[0]} '
        f'mtst={mtst_df.shape[0]}'
    )

    print(f'Saved to {nf_filepath}')


# Partition and save the no-finding records using the default `seed` and `mset` arguments.
generate_mtl_nf_partition(nf_df)
