# CheXpert

This notebook generates `chexpert.csv`, assuming the following files have already been downloaded:

* [`CheXpert-v1.0.zip`](https://stanfordmlgroup.github.io/competitions/chexpert/) (downloading it with azcopy is recommended)
* [`train_cheXbert.csv`](https://stanfordmlgroup.github.io/competitions/chexpert/)


### Listing files

In [None]:
from os.path import join

import pandas as pd

from common import read_toml
from common import AGE_INTERVAL
from common import CHEX_PATHOLOGIES as PATHOLOGIES


# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
UNIQUE_STUDIES = False

In [None]:
config = read_toml('config.toml')
base_dir = config['chexpert_dir']
!ls -hs1 {base_dir}

### Generating CSV

In [None]:
# Read `train_cheXbert.csv` from the directory configured as `chexpert_dir`.
train_df_path = join(base_dir, 'train_cheXbert.csv')
train_df = pd.read_csv(train_df_path)
# Display the raw table for a quick visual check before any processing.
train_df

Processing:

In [None]:
original_size = len(train_df)

# Drop studies flagged as "No Finding", then the flag column itself.
df = train_df[train_df['No Finding'] != 1.0]
df = df.drop(columns=['No Finding'])

# Rename the metadata columns, plus the two label columns whose source
# spelling does not match the `str.capitalize()` form handled just below.
df = df.rename(columns={
    'Path': 'name',
    'Age': 'age',
    'Sex': 'sex',
    'AP/PA': 'view',
    'Pleural Effusion': 'Effusion',
    'Lung Opacity': 'Lung_opacity',
})
# Map every capitalized label column onto its PATHOLOGIES identifier in a
# single pass instead of copying the frame once per pathology.
df = df.rename(columns={pathology.capitalize(): pathology for pathology in PATHOLOGIES})

# Drop records with missing metadata. The explicit copy makes `df` an
# independent frame, so the column assignment below cannot raise
# SettingWithCopyWarning.
df = df.dropna(subset=['age', 'sex', 'view']).copy()

# Binarize the labels: missing values and -1.0 both become 0. This is limited
# to the label columns so that metadata such as `age` is never rewritten.
# NOTE(review): -1.0 is presumably the "uncertain" label — confirm.
df[PATHOLOGIES] = df[PATHOLOGIES].fillna(0).replace(-1.0, 0).astype(int)

# Keep records with at least one pathology.
df = df[df[PATHOLOGIES].any(axis=1)]

# Keep only the output columns and the ages inside AGE_INTERVAL (bounds included).
df = df[['name', 'age', 'sex', 'view'] + PATHOLOGIES]
df = df[df['age'].between(*AGE_INTERVAL, inclusive='both')]

# Report the raw view values before they are normalized and filtered.
print('Unique views: ', df['view'].unique())

# `assign` returns a new frame, so the filtered slice above is never written to.
df = df.assign(
    # NOTE(review): [14:-4] presumably strips a 14-character directory prefix
    # (e.g. 'CheXpert-v1.0/') and a 4-character extension (e.g. '.jpg') —
    # confirm against the paths in the CSV.
    name=df['name'].str[14:-4],
    # Keep only the first letter of the sex value, lower-cased.
    sex=df['sex'].str[:1].str.lower(),
    view=df['view'].str.lower(),
)
# Keep only the 'ap' and 'pa' views and renumber the rows from zero.
df = df[df['view'].isin(['ap', 'pa'])].reset_index(drop=True)

# Tag every row with its source dataset as the first column.
df.insert(0, 'dataset', 'chexpert', allow_duplicates=True)

dropped_size = len(df)
print(f'train_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')
df

Check that there are no normal examples (rows without any pathology):

In [None]:
# Select the rows in which no pathology column is set; the displayed frame
# is expected to be empty.
df[~df[PATHOLOGIES].astype(bool).any(axis=1)]

### Saving CSV

In [None]:
# Write the processed table into the directory configured as `metachest_dir`.
# Reuse the `config` loaded earlier in the notebook instead of parsing
# config.toml a second time.
path = join(config['metachest_dir'], 'chexpert.csv')
df.to_csv(path, index=False)
# Show where the file was written.
path

Overview:

In [None]:
# Number of records kept, followed by the per-pathology positive counts.
print(f'Total: {len(df)}')
df[PATHOLOGIES].sum(axis=0)