# PadChest CSV

This notebook generates `padchest.csv` and `padchest_nf.csv` assuming the following are downloaded:

* [PadChest](https://bimcv.cipf.es/bimcv-projects/padchest/) dataset


In [1]:
from os.path import join

import pandas as pd

from common import read_toml
from common import AGE_INTERVAL
from common import PADCHEST_PATHOLOGIES as PATHOLOGIES

# When True, the labels frame is collapsed to one record per PatientID before labels are built.
UNIQUE_STUDIES = False

### Listing files

In [2]:
# Dataset key: selects the `<ds_name>_dir` config entry and prefixes the output CSV names.
ds_name = 'padchest'
# Paths are read from config.toml through the project helper read_toml.
config = read_toml('config.toml')
# Directory the generated CSVs are written to.
metachest_dir = config['metachest_dir']
# Directory holding the downloaded PadChest files.
base_dir = config[f'{ds_name}_dir']
# IPython shell escape: list the dataset directory with human-readable sizes.
!ls -hs1 {base_dir}

total 985G
 21G 0.zip
240K 0.zip.unzip-l.txt
 21G 10.zip
236K 10.zip.unzip-l.txt
 21G 11.zip
240K 11.zip.unzip-l.txt
 21G 12.zip
284K 12.zip.unzip-l.txt
 19G 13.zip
288K 13.zip.unzip-l.txt
 19G 14.zip
288K 14.zip.unzip-l.txt
 19G 15.zip
288K 15.zip.unzip-l.txt
 20G 16.zip
288K 16.zip.unzip-l.txt
 20G 17.zip
288K 17.zip.unzip-l.txt
 20G 18.zip
288K 18.zip.unzip-l.txt
 20G 19.zip
288K 19.zip.unzip-l.txt
 21G 1.zip
240K 1.zip.unzip-l.txt
 20G 20.zip
288K 20.zip.unzip-l.txt
 19G 21.zip
288K 21.zip.unzip-l.txt
 21G 22.zip
288K 22.zip.unzip-l.txt
 19G 23.zip
288K 23.zip.unzip-l.txt
 19G 24.zip
288K 24.zip.unzip-l.txt
 18G 25.zip
288K 25.zip.unzip-l.txt
 20G 26.zip
288K 26.zip.unzip-l.txt
 20G 27.zip
288K 27.zip.unzip-l.txt
 20G 28.zip
288K 28.zip.unzip-l.txt
 20G 29.zip
288K 29.zip.unzip-l.txt
 21G 2.zip
236K 2.zip.unzip-l.txt
 20G 30.zip
288K 30.zip.unzip-l.txt
 19G 31.zip
288K 31.zip.unzip-l.txt
 20G 32.zip
288K 32.zip.unzip-l.txt
 21G 33.zip
288K 33.zip.unzip-l.txt
 21G 34.zip
284K 34.zip

### Generating CSV

In [3]:
# Gzip-compressed labels table located in the PadChest directory.
padchest_df_path = join(base_dir, 'PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv.gz')
# index_col=0 uses the file's first column as the row index; low_memory=False parses the
# file in one pass so column dtypes are not inferred chunk by chunk.
padchest_df = pd.read_csv(padchest_df_path, index_col=0, compression='gzip', low_memory=False)
# Display the loaded frame.
padchest_df

Unnamed: 0,ImageID,ImageDir,StudyDate_DICOM,StudyID,PatientID,PatientBirth,PatientSex_DICOM,ViewPosition_DICOM,Projection,MethodProjection,...,ExposureTime,RelativeXRayExposure_DICOM,ReportID,Report,MethodLabel,Labels,Localizations,LabelsLocalizationsBySentence,labelCUIS,LocalizationsCUIS
0,20536686640136348236148679891455886468_k6ga29.png,0,20140915,20536686640136348236148679891455886468,839860488694292331637988235681460987,1930.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,10.0,-1.42,4765777,sin hallazg patolog edad pacient .,Physician,['normal'],[],"[['normal'], ['normal']]",[],[]
1,135803415504923515076821959678074435083_fzis7d...,0,20150914,135803415504923515076821959678074435083,313572750430997347502932654319389875966,1929.0,M,LATERAL,L,Manual review of DICOM fields,...,25.0,,4991845,cambi pulmonar cronic sever . sign fibrosis b...,Physician,"['pulmonary fibrosis', 'chronic changes', 'kyp...","['loc basal', 'loc basal bilateral']","[['pulmonary fibrosis', 'loc basal bilateral']...",['C0034069' 'C0742362' 'C2115817' 'C3544344'],['C1282378']
2,135803415504923515076821959678074435083_fzis7b...,0,20150914,135803415504923515076821959678074435083,313572750430997347502932654319389875966,1929.0,M,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,10.0,,4991845,cambi pulmonar cronic sever . sign fibrosis b...,Physician,"['pulmonary fibrosis', 'chronic changes', 'kyp...","['loc basal', 'loc basal bilateral']","[['pulmonary fibrosis', 'loc basal bilateral']...",['C0034069' 'C0742362' 'C2115817' 'C3544344'],['C1282378']
3,113855343774216031107737439268243531979_3k951l...,0,20150717,113855343774216031107737439268243531979,50783093527901818115346441867348318648,1925.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,8.0,,4955977,. . siluet cardi mediastin dentr normal . cam...,Physician,['chronic changes'],"['loc cardiac', 'loc mediastinum', 'loc costop...","[['chronic changes'], ['chronic changes'], ['n...",['C0742362'],['C1522601' 'C0025066' 'C0230151']
4,113855343774216031107737439268243531979_3k951n...,0,20150717,113855343774216031107737439268243531979,50783093527901818115346441867348318648,1925.0,F,LATERAL,L,Manual review of DICOM fields,...,20.0,,4955977,. . siluet cardi mediastin dentr normal . cam...,Physician,['chronic changes'],"['loc cardiac', 'loc mediastinum', 'loc costop...","[['chronic changes'], ['chronic changes'], ['n...",['C0742362'],['C1522601' 'C0025066' 'C0230151']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160856,1284011361929414522814654121696751542351444145...,49,20110321,1284011361929414522814654121696751542351444145...,112930952416074060371371014599496493673,1948.0,M,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,10.0,-0.69,4018689,import sign radiolog epoc . pinzamient ambos ...,RNN_model,"['COPD signs', 'costophrenic angle blunting']",['loc costophrenic angle'],"['COPD signs', 'costophrenic angle blunting', ...",['C0024117' 'C0742855'],['C0230151']
160857,1284011361929414522094646571696751542351444145...,49,20090609,1284011361929414522094646571696751542351444145...,282743729971423358706056731890510600934,1944.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,10.0,-0.85,3639865,dentr normal .,RNN_model,['normal'],[],['normal'],[],[]
160858,1284011361929414522086390631696751542351444145...,49,20110415,1284011361929414522086390631696751542351444145...,52648743308541843883453242716226652771,1965.0,M,,AP_horizontal,Manual review of DICOM fields,...,,784.00,4035503,tub endotraqueal sond nasogastr situacion cor...,RNN_model,"['NSG tube', ' endotracheal tube']",['loc tracheal'],"['NSG tube', ' endotracheal tube', 'loc trache...",['C0336630'],['C0040578']
160859,1284011361929414522084108901696751542351444145...,49,20101214,1284011361929414522084108901696751542351444145...,228646130593152933811948996634154201216,1943.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,10.0,-1.00,3958189,dentr normal .,RNN_model,['normal'],[],['normal'],[],[]


Drop records with missing attributes:

In [4]:
# Attributes that every record needs for the later steps (view, sex, age and labels).
required_cols = [
    'Projection',
    'PatientSex_DICOM',
    'PatientBirth',
    'StudyDate_DICOM',
    'Labels',
]

# Row count before and after discarding records with a missing required attribute.
original_size = padchest_df.shape[0]
padchest_df = padchest_df.dropna(subset=required_cols)
dropped_size = padchest_df.shape[0]
print(f'padchest_df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')

padchest_df size: original 160861, new 160748, dropped 113


In [5]:
# Optional de-duplication: collapse the frame to one record per PatientID.
# PatientID becomes the index of the grouped frame; the reset_index(drop=True) in the
# label-building cell discards it.
# NOTE(review): groupby(...).first() returns, for each column, the first non-null value in
# the group (pandas default), so a resulting row may combine values from several images.
# Also, the grouping key is PatientID while the flag is named UNIQUE_STUDIES — confirm
# both are intended.
if UNIQUE_STUDIES:
    padchest_df = padchest_df.groupby('PatientID').first()

Build labels df and join:

In [6]:
# Renumber rows 0..n-1 so the positionally built labels frame lines up with them on concat.
padchest_df = padchest_df.reset_index(drop=True)

# Additional PadChest label strings that also count towards a pathology.
aliases = {
    'infiltration': {
        'infiltrates',
        'interstitial pattern',
        'ground glass pattern',
        'reticular interstitial pattern',
        'reticulonodular interstitial pattern',
        'alveolar pattern',
        'consolidation',
        'air bronchogram'
    },
    'pleural_thickening' : {
        'pleural thickening'
    }
}

# One 0/1 column per pathology: 1 when the raw `Labels` text contains the pathology
# name or any of its aliases (substring match on the stringified label list).
labels_col = padchest_df['Labels']
labels = {}
for pathology in PATHOLOGIES:
    search_terms = [pathology, *aliases.get(pathology, ())]
    hit = None
    for term in search_terms:
        term_hit = labels_col.str.contains(term)
        hit = term_hit if hit is None else hit | term_hit
    labels[pathology] = hit.values.astype(int)

labels_df = pd.DataFrame(labels)
# Append the pathology columns to the right of the original attributes.
df = pd.concat([padchest_df, labels_df], axis=1)
df

Unnamed: 0,ImageID,ImageDir,StudyDate_DICOM,StudyID,PatientID,PatientBirth,PatientSex_DICOM,ViewPosition_DICOM,Projection,MethodProjection,...,effusion,emphysema,fibrosis,hernia,infiltration,mass,nodule,pleural_thickening,pneumonia,pneumothorax
0,20536686640136348236148679891455886468_k6ga29.png,0,20140915,20536686640136348236148679891455886468,839860488694292331637988235681460987,1930.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,0,0,0,0,0,0,0,0,0,0
1,135803415504923515076821959678074435083_fzis7d...,0,20150914,135803415504923515076821959678074435083,313572750430997347502932654319389875966,1929.0,M,LATERAL,L,Manual review of DICOM fields,...,0,0,1,0,1,0,1,0,0,0
2,135803415504923515076821959678074435083_fzis7b...,0,20150914,135803415504923515076821959678074435083,313572750430997347502932654319389875966,1929.0,M,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,0,0,1,0,1,0,1,0,0,0
3,113855343774216031107737439268243531979_3k951l...,0,20150717,113855343774216031107737439268243531979,50783093527901818115346441867348318648,1925.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,0,0,0,0,0,0,0,0,0,0
4,113855343774216031107737439268243531979_3k951n...,0,20150717,113855343774216031107737439268243531979,50783093527901818115346441867348318648,1925.0,F,LATERAL,L,Manual review of DICOM fields,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160743,1284011361929414522814654121696751542351444145...,49,20110321,1284011361929414522814654121696751542351444145...,112930952416074060371371014599496493673,1948.0,M,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,0,0,0,0,0,0,0,0,0,0
160744,1284011361929414522094646571696751542351444145...,49,20090609,1284011361929414522094646571696751542351444145...,282743729971423358706056731890510600934,1944.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,0,0,0,0,0,0,0,0,0,0
160745,1284011361929414522086390631696751542351444145...,49,20110415,1284011361929414522086390631696751542351444145...,52648743308541843883453242716226652771,1965.0,M,,AP_horizontal,Manual review of DICOM fields,...,0,0,0,0,0,0,0,0,0,0
160746,1284011361929414522084108901696751542351444145...,49,20101214,1284011361929414522084108901696751542351444145...,228646130593152933811948996634154201216,1943.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,0,0,0,0,0,0,0,0,0,0


Rename columns and filter:

In [7]:
# Row count before filtering, for the summary printed at the end of the cell.
original_size = len(df)

# rename cols
df = df.rename(columns={
    'ImageID': 'name',
    'Projection': 'view',
    'PatientSex_DICOM': 'sex',
    'PatientBirth': 'birth_date',
    'StudyDate_DICOM': 'study_date'
})
# keep views
print('Available views: ', df.view.unique())
# .copy() gives an independent frame, so the column assignment below is not a
# chained assignment on a filtered slice (avoids SettingWithCopyWarning).
df = df[df['view'].isin(['AP', 'PA', 'AP_horizontal'])].copy()
# compute age in the study: year of the study (first 4 characters of the date) minus birth year
df['age'] = df['study_date'].astype(str).str[:4].astype(int) - df['birth_date'].astype(int)
# keep only ages inside AGE_INTERVAL, bounds included
df = df[df['age'].between(*AGE_INTERVAL, inclusive='both')]
# keep cols; copy again because the columns are rewritten right below
df = df[['name', 'age', 'sex', 'view'] + PATHOLOGIES].copy()

# remove image extension (drops the last 4 characters — assumes names end in a
# 3-letter extension such as '.png'; TODO confirm for every record)
df['name'] = df['name'].str[:-4]
# AP_horizontal is folded into the plain 'ap' view
df['view'] = df['view'].replace('AP_horizontal','ap')
df['sex'] = df['sex'].str.lower()
df['view'] = df['view'].str.lower()

# dataset name as first column; uses ds_name like the saving cells do
df.insert(0, 'dataset', ds_name, True)

dropped_size = len(df)
print(f'df size: original {original_size}, new {dropped_size}, dropped {original_size - dropped_size}')

df

Available views:  ['PA' 'L' 'AP' 'AP_horizontal' 'COSTAL' 'UNK' 'EXCLUDE']
df size: original 160748, new 93781, dropped 66967


Unnamed: 0,dataset,name,age,sex,view,atelectasis,cardiomegaly,consolidation,edema,effusion,emphysema,fibrosis,hernia,infiltration,mass,nodule,pleural_thickening,pneumonia,pneumothorax
5,padchest,313903302629300007485735352869488750471_75sg0k,39,m,ap,0,0,0,0,0,0,0,0,1,0,0,0,0,0
7,padchest,3137231742710829928-247610802266403640553_kine6a,58,m,pa,1,0,0,0,1,0,0,0,0,0,0,1,0,0
8,padchest,313723174271082992847610802266403640553-4_hhi4rq,58,m,ap,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,padchest,313723174271082992847610802266403640553-2_40kxq3,58,m,ap,0,0,0,0,1,0,0,0,1,0,0,0,0,0
10,padchest,313723174271082992847610802266403640553_w8dk8c,58,m,ap,0,0,0,0,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160743,padchest,1284011361929414522814654121696751542351444145...,63,m,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
160744,padchest,1284011361929414522094646571696751542351444145...,65,f,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
160745,padchest,1284011361929414522086390631696751542351444145...,46,m,ap,0,0,0,0,0,0,0,0,0,0,0,0,0,0
160746,padchest,1284011361929414522084108901696751542351444145...,67,f,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Removing records with wrong labels or corrupted images:

In [8]:
# Image names (without extension) whose labels are considered wrong.
incorrect_labels = [
    '216840111366964012558082906712010102112808556_03-181-172',
]
# Image names (without extension) whose image files are unusable.
corrupted = [
    # https://github.com/mlmed/torchxrayvision/blob/7879060cbe0a8172d8f91ddab786ba707fbfa5ec/torchxrayvision/datasets.py#L746
    "216840111366964012819207061112010307142602253_04-014-084",
    "216840111366964012989926673512011074122523403_00-163-058",
    "216840111366964012959786098432011033083840143_00-176-115",
    "216840111366964012558082906712009327122220177_00-102-064",
    "216840111366964012339356563862009072111404053_00-043-192",
    "216840111366964013076187734852011291090445391_00-196-188",
    "216840111366964012373310883942009117084022290_00-064-025",
    "216840111366964012283393834152009033102258826_00-059-087",
    "216840111366964012373310883942009170084120009_00-097-074",
    "216840111366964012819207061112010315104455352_04-024-184",
    "216840111366964012819207061112010306085429121_04-020-102",
    # truncated
    '216840111366964013590140476722013058110301622_02-056-111',
    '216840111366964013590140476722013043111952381_02-065-198',
    '216840111366964013829543166512013353113303615_02-092-190',
    '216840111366964012373310883942009180082307973_00-097-011',
    # corrupted nf
    '216840111366964012989926673512011083134050913_00-168-009',
    '216840111366964012989926673512011151082430686_00-157-045',
    '216840111366964012487858717522009280135853083_00-075-001',
    '216840111366964013962490064942014134093945580_01-178-104',
    '216840111366964012373310883942009152114636712_00-102-045',
    '216840111366964013686042548532013208193054515_02-026-007',
]
# Drop every record whose name appears in either list, then renumber the rows.
wrong = incorrect_labels + corrupted
df = df[~df['name'].isin(wrong)]
df = df.reset_index(drop=True)
df

Unnamed: 0,dataset,name,age,sex,view,atelectasis,cardiomegaly,consolidation,edema,effusion,emphysema,fibrosis,hernia,infiltration,mass,nodule,pleural_thickening,pneumonia,pneumothorax
0,padchest,313903302629300007485735352869488750471_75sg0k,39,m,ap,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,padchest,3137231742710829928-247610802266403640553_kine6a,58,m,pa,1,0,0,0,1,0,0,0,0,0,0,1,0,0
2,padchest,313723174271082992847610802266403640553-4_hhi4rq,58,m,ap,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,padchest,313723174271082992847610802266403640553-2_40kxq3,58,m,ap,0,0,0,0,1,0,0,0,1,0,0,0,0,0
4,padchest,313723174271082992847610802266403640553_w8dk8c,58,m,ap,0,0,0,0,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93758,padchest,1284011361929414522814654121696751542351444145...,63,m,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93759,padchest,1284011361929414522094646571696751542351444145...,65,f,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93760,padchest,1284011361929414522086390631696751542351444145...,46,m,ap,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93761,padchest,1284011361929414522084108901696751542351444145...,67,f,pa,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# True for rows where at least one pathology column is non-zero.
has_finding = df[PATHOLOGIES].any(axis=1)
# wf = with findings, nf = no findings
wf_df = df[has_finding]
nf_df = df[~has_finding]

### Overview

In [10]:
# Record counts: all records, those with at least one finding, those with none.
n_all, n_wf, n_nf = len(df), len(wf_df), len(nf_df)
print(
    f'        Total: {n_all}\n'
    f'With findings: {n_wf:5d}\n'
    f'  No findings: {n_nf:5d}'
)
# Number of positive records per pathology.
df[PATHOLOGIES].sum()

        Total: 93763
With findings: 29025
  No findings: 64738


atelectasis            4808
cardiomegaly           6782
consolidation          1197
edema                   865
effusion               5075
emphysema               939
fibrosis                489
hernia                 1034
infiltration          10455
mass                    738
nodule                 3429
pleural_thickening     2691
pneumonia              3548
pneumothorax            306
dtype: int64

### Saving


With findings:

In [11]:
# Records with at least one finding are saved as `<ds_name>.csv` in the output directory.
wf_filepath = join(metachest_dir, f'{ds_name}.csv')
# index=False: the row index is not written as a column.
wf_df.to_csv(wf_filepath, index=False)
# Show where the file was written.
wf_filepath

'/data/datasets/metachest/padchest.csv'

No findings and MTL partition:

In [12]:
def generate_mtl_nf_partition(nf_df, seed=0, mset=(0, 1, 2), out_dir=None, name=None):
    """Shuffle the no-findings records, split them into three subsets and save them as CSV.

    The subsets (mtrn / mval / mtst) keep the proportions of a reference partition
    of 380503 / 6793 / 209198 records.
    NOTE(review): the origin of these reference counts is not shown here, and the
    subset names presumably stand for meta-train / meta-validation / meta-test — confirm.

    Args:
        nf_df: DataFrame with the no-findings records. Only its first five columns
            are kept (selected by position).
        seed: Random state of the shuffle; the same seed yields the same partition.
        mset: Three ids written to the `mset` column of the mtrn, mval and mtst rows,
            in that order.
        out_dir: Directory for the output CSV. Defaults to the notebook-level
            `metachest_dir`.
        name: Dataset name used as file-name prefix. Defaults to the notebook-level
            `ds_name`.

    Returns:
        None. Writes `<name>_nf.csv` into `out_dir` and prints a summary of the split.
    """
    # fall back to the notebook globals when no explicit destination is given
    if out_dir is None:
        out_dir = metachest_dir
    if name is None:
        name = ds_name

    # reference subset sizes; only their ratios are used below
    n_mtrn = 380503
    n_mval = 6793
    n_mtst = 209198
    n_total = n_mtrn + n_mval + n_mtst
    pct_mtrn = n_mtrn / n_total
    pct_mval = n_mval / n_total
    pct_mtst = n_mtst / n_total

    # keep the first five columns, then shuffle all rows; passing the seed makes
    # the shuffle (and therefore the saved partition) reproducible
    nf_df = nf_df.iloc[:, :5]
    nf_df = nf_df.sample(frac=1, random_state=seed)

    n_total = nf_df.shape[0]
    n_mtrn = int(n_total * pct_mtrn)
    n_mtst = int(n_total * pct_mtst)
    # mval takes the remainder so the three subsets always add up to n_total
    n_mval = n_total - (n_mtrn + n_mtst)

    # consecutive slices of the shuffled frame: mtrn, then mval, then mtst
    mtrn_df = nf_df.iloc[:n_mtrn].copy()
    mval_df = nf_df.iloc[n_mtrn:n_mtrn+n_mval].copy()
    mtst_df = nf_df.iloc[n_mtrn+n_mval:].copy()
    mtrn_df['mset'] = mset[0]
    mval_df['mset'] = mset[1]
    mtst_df['mset'] = mset[2]

    nf_mtl_df = pd.concat([mtrn_df, mval_df, mtst_df])
    nf_filepath = join(out_dir, f'{name}_nf.csv')
    nf_mtl_df.to_csv(nf_filepath, index=False)

    # an empty input would otherwise divide by zero in the report below
    n_saved = nf_mtl_df.shape[0] or 1
    final_pct_mtrn = mtrn_df.shape[0] / n_saved
    final_pct_mval = mval_df.shape[0] / n_saved
    final_pct_mtst = mtst_df.shape[0] / n_saved

    print(f'Original: '
        f'mtrn={pct_mtrn*100:5.2f}% '
        f'mval={pct_mval*100:5.2f}% '
        f'mtst={pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={final_pct_mtrn*100:5.2f}% '
        f'mval={final_pct_mval*100:5.2f}% '
        f'mtst={final_pct_mtst*100:5.2f}%\n'
        f'     New: '
        f'mtrn={mtrn_df.shape[0]} '
        f'mval={mval_df.shape[0]} '
        f'mtst={mtst_df.shape[0]}'
    )

    print(f'Saved to {nf_filepath}')


# Partition and save the no-findings records using the function's default arguments.
generate_mtl_nf_partition(nf_df)


Original: mtrn=63.79% mval= 1.14% mtst=35.07%
     New: mtrn=63.79% mval= 1.14% mtst=35.07%
     New: mtrn=41296 mval=738 mtst=22704
Saved to /data/datasets/metachest/padchest_nf.csv
