# PadChest CSV

This notebook generates `padchest.csv` and `padchest_nf.csv` in `metachest_dir`, assuming the following dataset has been downloaded and `padchest_dir` in `../config.toml` points to it:

* [PadChest](https://bimcv.cipf.es/bimcv-projects/padchest/) dataset


In [None]:
import sys
from os.path import join

import pandas as pd

sys.path.append('../')
from common import read_toml
from common import AGE_INTERVAL
from common import PADCHEST_PATHOLOGIES as PATHOLOGIES

# When True, only one record per PatientID is kept before the labels are built.
UNIQUE_STUDIES = False

### Listing files

In [None]:
# Dataset key: selects the '<ds_name>_dir' config entry and names the output CSVs.
ds_name = 'padchest'
config = read_toml('../config.toml')
# Directory the generated CSV files are written to.
metachest_dir = config['metachest_dir']
# Directory the PadChest label file is read from.
base_dir = config[f'{ds_name}_dir']
# List the dataset directory, one entry per line with human-readable sizes.
!ls -hs1 {base_dir}

### Generating CSV

In [None]:
# Read the gzip-compressed PadChest label table; its first column becomes the index.
padchest_df_path = join(
    base_dir,
    'PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv.gz',
)
padchest_df = pd.read_csv(
    padchest_df_path,
    index_col=0,
    compression='gzip',
    low_memory=False,
)
padchest_df

Drop records with missing attributes:

In [None]:
# Attributes every record needs for the later view/sex/age/label processing.
required_cols = [
    'Projection',
    'PatientSex_DICOM',
    'PatientBirth',
    'StudyDate_DICOM',
    'Labels',
]

original_size = len(padchest_df)
padchest_df = padchest_df.dropna(subset=required_cols)
dropped_size = len(padchest_df)
n_dropped = original_size - dropped_size
print(f'padchest_df size: original {original_size}, new {dropped_size}, dropped {n_dropped}')

In [None]:
if UNIQUE_STUDIES:
    # Keep the first *whole row* per patient.
    # GroupBy.first() returns the first non-null value of each column, so a
    # resulting row could combine values from different records of the same
    # patient; it also turns PatientID into the index. Selecting rows avoids both.
    # Rows without a PatientID are dropped and the result is ordered by
    # PatientID, matching what the groupby-based version produced.
    padchest_df = (
        padchest_df
        .dropna(subset=['PatientID'])
        .sort_values('PatientID', kind='stable')
        .drop_duplicates(subset='PatientID', keep='first')
    )

Build labels df and join:

In [None]:
# Use a fresh positional index so rows line up with labels_df in the concat below.
padchest_df = padchest_df.reset_index(drop=True)

# Additional report terms that also count as a mention of the given pathology.
aliases = {
    'infiltration': {
        'infiltrates',
        'interstitial pattern',
        'ground glass pattern',
        'reticular interstitial pattern',
        'reticulonodular interstitial pattern',
        'alveolar pattern',
        'consolidation',
        'air bronchogram',
    },
    'pleural_thickening': {'pleural thickening'},
}


def _mentions_any(column, terms):
    """Return a boolean Series that is True where `column` contains any of `terms`."""
    found = column.str.contains(terms[0])
    for term in terms[1:]:
        found = found | column.str.contains(term)
    return found


labels_col = padchest_df['Labels']
# One 0/1 array per pathology: the pathology name itself or any of its aliases.
labels = {
    pathology: _mentions_any(
        labels_col,
        [pathology, *(aliases.get(pathology) or ())],
    ).values.astype(int)
    for pathology in PATHOLOGIES
}

labels_df = pd.DataFrame(labels)
# Column-wise join of the metadata with the binary label columns.
df = pd.concat([padchest_df, labels_df], axis=1)
df

Rename columns and filter:

In [None]:
original_size = len(df)

# rename cols
df = df.rename(columns={
    'ImageID': 'name',
    'Projection': 'view',
    'PatientSex_DICOM': 'sex',
    'PatientBirth': 'birth_date',
    'StudyDate_DICOM': 'study_date'
})

# keep views
print('Available views: ', df.view.unique())
# .copy(): the 'age' column is assigned next; writing to a boolean-filtered
# slice would raise SettingWithCopyWarning.
df = df[df['view'].isin(['AP', 'PA', 'AP_horizontal'])].copy()

# compute age in the study: study year (first 4 chars of the date) minus birth year
df['age'] = df['study_date'].astype(str).str[:4].astype(int) - df['birth_date'].astype(int)
df = df[df['age'].between(*AGE_INTERVAL, inclusive='both')]

# keep cols; .copy() again because the columns below are rewritten in place
df = df[['name', 'age', 'sex', 'view'] + PATHOLOGIES].copy()

# remove image extension (last 4 characters of the file name)
df['name'] = df['name'].str[:-4]
# fold AP_horizontal into ap, then normalise sex and view to lower case
df['view'] = df['view'].replace('AP_horizontal', 'ap')
df['sex'] = df['sex'].str.lower()
df['view'] = df['view'].str.lower()

df.insert(0, 'dataset', 'padchest', allow_duplicates=True)

dropped_size = len(df)
print(f'df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')

df

Removing records with wrong labels or corrupted images:

In [None]:
# Image names (without extension) excluded because of incorrect labels.
incorrect_labels = [
    '216840111366964012558082906712010102112808556_03-181-172',
]
# Image names (without extension) excluded because the image file is unusable.
corrupted = [
    # list taken from torchxrayvision:
    # https://github.com/mlmed/torchxrayvision/blob/7879060cbe0a8172d8f91ddab786ba707fbfa5ec/torchxrayvision/datasets.py#L746
    "216840111366964012819207061112010307142602253_04-014-084",
    "216840111366964012989926673512011074122523403_00-163-058",
    "216840111366964012959786098432011033083840143_00-176-115",
    "216840111366964012558082906712009327122220177_00-102-064",
    "216840111366964012339356563862009072111404053_00-043-192",
    "216840111366964013076187734852011291090445391_00-196-188",
    "216840111366964012373310883942009117084022290_00-064-025",
    "216840111366964012283393834152009033102258826_00-059-087",
    "216840111366964012373310883942009170084120009_00-097-074",
    "216840111366964012819207061112010315104455352_04-024-184",
    "216840111366964012819207061112010306085429121_04-020-102",
    # truncated
    '216840111366964013590140476722013058110301622_02-056-111',
    '216840111366964013590140476722013043111952381_02-065-198',
    '216840111366964013829543166512013353113303615_02-092-190',
    '216840111366964012373310883942009180082307973_00-097-011',
    # corrupted nf (presumably found among the no-finding records)
    '216840111366964012989926673512011083134050913_00-168-009',
    '216840111366964012989926673512011151082430686_00-157-045',
    '216840111366964012487858717522009280135853083_00-075-001',
    '216840111366964013962490064942014134093945580_01-178-104',
    '216840111366964012373310883942009152114636712_00-102-045',
    '216840111366964013686042548532013208193054515_02-026-007',
]
# Drop every listed image by name and renumber the remaining rows from 0.
wrong = incorrect_labels + corrupted
df = df[~df['name'].isin(wrong)]
df = df.reset_index(drop=True)
df

In [None]:
# True for rows where at least one pathology column is non-zero.
has_finding = df[PATHOLOGIES].any(axis=1)
wf_df = df[has_finding]    # with findings
nf_df = df[~has_finding]   # no findings

### Overview

In [None]:
# Record counts for the full table and for each of the two subsets.
n_all, n_wf, n_nf = len(df), len(wf_df), len(nf_df)
print(
    f'        Total: {n_all}\n'
    f'With findings: {n_wf:5d}\n'
    f'  No findings: {n_nf:5d}'
)
# Number of positive records per pathology.
df[PATHOLOGIES].sum()

### Saving


With findings:

In [None]:
# Write the records with findings to '<metachest_dir>/<ds_name>.csv' without the index column.
wf_filepath = join(metachest_dir, f'{ds_name}.csv')
wf_df.to_csv(wf_filepath, index=False)
# Show the output path as the cell result.
wf_filepath

No findings and MTL partition:

In [None]:
def generate_mtl_nf_partition(nf_df, seed=0, mset=(0, 1, 2),
                              out_dir=None, dataset_name=None):
    """Split the no-finding records into meta-train/val/test subsets and save them.

    The records are shuffled, split using the proportions given by the
    reference subset sizes hard-coded below, tagged with an ``mset`` column
    and written to ``<out_dir>/<dataset_name>_nf.csv``. A summary of the
    reference and resulting proportions is printed.

    Args:
        nf_df: DataFrame of records; only its first five columns are kept.
        seed: Random state of the shuffle, so the same input always yields
            the same partition.
        mset: Values stored in the ``mset`` column for the train, validation
            and test subsets, in that order.
        out_dir: Directory of the output CSV; defaults to the notebook-level
            ``metachest_dir``.
        dataset_name: Prefix of the output file name; defaults to the
            notebook-level ``ds_name``.
    """
    if out_dir is None:
        out_dir = metachest_dir
    if dataset_name is None:
        dataset_name = ds_name

    # Reference subset sizes; only their ratios are used below.
    n_mtrn = 380503
    n_mval = 6793
    n_mtst = 209198
    n_total = n_mtrn + n_mval + n_mtst
    pct_mtrn = n_mtrn / n_total
    pct_mval = n_mval / n_total
    pct_mtst = n_mtst / n_total

    nf_df = nf_df.iloc[:, :5]
    # Seeded shuffle: without random_state the partition changed on every run.
    nf_df = nf_df.sample(frac=1, random_state=seed)

    n_total = nf_df.shape[0]
    n_mtrn = int(n_total * pct_mtrn)
    n_mtst = int(n_total * pct_mtst)
    # Validation takes the remainder so the three subsets cover every record.
    n_mval = n_total - (n_mtrn + n_mtst)

    mtrn_df = nf_df.iloc[:n_mtrn].copy()
    mval_df = nf_df.iloc[n_mtrn:n_mtrn+n_mval].copy()
    mtst_df = nf_df.iloc[n_mtrn+n_mval:].copy()
    mtrn_df['mset'] = mset[0]
    mval_df['mset'] = mset[1]
    mtst_df['mset'] = mset[2]

    nf_mtl_df = pd.concat([mtrn_df, mval_df, mtst_df])
    nf_filepath = join(out_dir, f'{dataset_name}_nf.csv')
    nf_mtl_df.to_csv(nf_filepath, index=False)

    # Avoid a ZeroDivisionError in the summary when there are no records.
    n_saved = max(nf_mtl_df.shape[0], 1)
    final_pct_mtrn = mtrn_df.shape[0] / n_saved
    final_pct_mval = mval_df.shape[0] / n_saved
    final_pct_mtst = mtst_df.shape[0] / n_saved

    print(f'Original: '
        f'mtrn={pct_mtrn*100:5.2f}% '
        f'mval={pct_mval*100:5.2f}% '
        f'mtst={pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={final_pct_mtrn*100:5.2f}% '
        f'mval={final_pct_mval*100:5.2f}% '
        f'mtst={final_pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={mtrn_df.shape[0]} '
        f'mval={mval_df.shape[0]} '
        f'mtst={mtst_df.shape[0]}'
    )

    print(f'Saved to {nf_filepath}')


# Partition the no-finding records and write '<ds_name>_nf.csv' into metachest_dir.
generate_mtl_nf_partition(nf_df)
