In [1]:
import pandas as pd
import numpy as np
import glob
import json
import glob
from PIL import Image
from multiprocessing import Pool
from tqdm import tqdm
import os
import random
import shutil

from sklearn.model_selection import train_test_split
from collections import Counter


Label Dictionaries for multi-label classification:

COVID: 0 for negative, 1 for positive

CheXechonet: 0 for negative, 1 for positive

TBX11K: 0 for healthy, 1 for sick, 2 for tb

RSNA: 0 for normal, 1 for not normal & no lung opacity, 2 for lung opacity

SIIM: 0 for negative, 1 for positive

#### CheXpert

In [2]:
!ls ../data/CheXpert\ Plus

chexbert_labels      df_chexpert_plus_240401.csv  radgraph-XL-annotations.zip
chexbert_labels.zip  radgraph-XL-annotations


In [3]:
!ls ../data/CheXpert

archive.zip			     df_chexpert_plus_240401.csv
chexpert_5x200.csv		     final-datasets
chexpert_plus_labels_with_5x200.csv  README.md
chexpert_processed.csv		     train_cheXbert.xls
CheXpert-v1.0-small		     train_visualCheXbert.xls


In [4]:
chexpert_plus = "../data/CheXpert Plus/df_chexpert_plus_240401.csv"
chexpert_plus = pd.read_csv(chexpert_plus)
print(chexpert_plus.shape)

def get_report_from_row(row):
    def replace_nan(t):
        return "" if pd.isna(t) else t
    
    section_impression = replace_nan(row['section_impression'])
    section_findings = replace_nan(row['section_findings'])
    section_summary = replace_nan(row['section_summary'])
    section_report = replace_nan(row['report'])    
    
    if section_impression and section_findings and section_summary:
        text = f"IMPRESSION: {section_impression}\nFINDINGS: {section_findings}\nSUMMARY: {section_summary}"
    elif section_impression and section_findings:
        text = f"IMPRESSION: {section_impression}\nFINDINGS: {section_findings}"
    elif section_impression:
        text = f"IMPRESSION: {section_impression}"
    elif section_findings:
        text = f"FINDINGS: {section_findings}"
    elif section_report:
        text = f"REPORT: {section_report}"
    else:
        text = ""
        
    text = " ".join(text.split()).strip().lower()
    
    return text

#keep the following columns: path_to_image age sex race split report

chexpert_plus['report'] = chexpert_plus.apply(get_report_from_row, axis=1)
chexpert_plus = chexpert_plus[['path_to_image', 'age', 'sex', 'race', 'split', 'report']]
chexpert_plus.head()


chexpert_plus.head()


(223462, 27)


Unnamed: 0,path_to_image,age,sex,race,split,report
0,train/patient42142/study5/view1_frontal.jpg,62.0,Female,White,train,impression: 1.tracheostomy tube remains in pla...
1,train/patient42142/study8/view1_frontal.jpg,62.0,Female,White,train,impression: 1.tracheostomy and subdiaphragmati...
2,train/patient42142/study2/view1_frontal.jpg,62.0,Female,White,train,impression: 1.single portable semiupright ap v...
3,train/patient42142/study4/view1_frontal.jpg,62.0,Female,White,train,impression: 1. patchy subsegmental bibasilar (...
4,train/patient42142/study3/view1_frontal.jpg,62.0,Female,White,train,impression: 1.upright frontal view of the ches...


In [4]:
# Load labels for chexpert plus
chexpert_plus_labels = "../data/CheXpert Plus/chexbert_labels/report_fixed.json"
chexpert_plus_labels = pd.read_json(chexpert_plus_labels, lines=True)
chexpert_plus_labels.iloc[:, 1:] = chexpert_plus_labels.iloc[:, 1:].fillna(0) # Replace nan with 0 in columns 1 onwards
chexpert_plus_labels.iloc[:, 1:] = chexpert_plus_labels.iloc[:, 1:].replace(-1.0, 1.0) # Replace -1.0 with 1.0
print(chexpert_plus_labels.shape)
chexpert_plus_labels.head()

(223462, 15)


Unnamed: 0,path_to_image,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,train/patient42142/study5/view1_frontal.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,train/patient42142/study8/view1_frontal.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,train/patient42142/study2/view1_frontal.jpg,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,train/patient42142/study4/view1_frontal.jpg,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,train/patient42142/study3/view1_frontal.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
chexpert_plus_labels.columns

Index(['path_to_image', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
       'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
       'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
       'Fracture', 'Support Devices', 'No Finding'],
      dtype='object')

In [6]:
chexpert_original = "../data/CheXpert/train_cheXbert.xls"
chexpert_original = pd.read_csv(chexpert_original)
chexpert_original.iloc[:, 5:] = chexpert_original.iloc[:, 5:].fillna(0) # Replace NaNs from columns 5 onwards with 0
chexpert_original.iloc[:, 5:] = chexpert_original.iloc[:, 5:].replace(-1.0, 1.0) # Replace -1.0 with 1.0
# Replace CheXpert-v1.0 with empty string
chexpert_original['Path'] = chexpert_original['Path'].apply(lambda x: x.replace("CheXpert-v1.0/", ""))
print(chexpert_original.shape)
chexpert_original.head()

(223414, 19)


Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,train/patient00001/study1/view1_frontal.jpg,Female,68,Frontal,AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,train/patient00002/study2/view1_frontal.jpg,Female,87,Frontal,AP,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,train/patient00002/study1/view1_frontal.jpg,Female,83,Frontal,AP,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,train/patient00002/study1/view2_lateral.jpg,Female,83,Lateral,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,train/patient00003/study1/view1_frontal.jpg,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Check if there is any row in path_to_image column that has the word "train"
print(chexpert_plus[chexpert_plus['path_to_image'].str.contains('train')].shape)
print(chexpert_original[chexpert_original['Path'].str.contains('train')].shape)


(223228, 6)
(223414, 19)


In [8]:
# check for overlap in path_to_image column and Path column
print(chexpert_plus[chexpert_plus['path_to_image'].isin(chexpert_original['Path'])].shape)
print(chexpert_original[chexpert_original['Path'].isin(chexpert_plus['path_to_image'])].shape)


(223228, 6)
(223228, 19)


In [9]:
# Import required libraries
import pandas as pd
import numpy as np
from collections import defaultdict

# Define the common labels between datasets
chexpert_labels = [
    'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 
    'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
    'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
    'Fracture', 'Support Devices', 'No Finding'
]

# Create a comparison dictionary to store statistics
comparison_stats = defaultdict(lambda: {'matching': 0, 'different': 0, 'total': 0})

# Get overlapping image paths
common_paths = set(chexpert_plus_labels['path_to_image']).intersection(
    set(chexpert_original['Path'])
)

print(len(chexpert_plus_labels), len(chexpert_original), len(common_paths))

# Print label distributions side by side
print("\nLabel Distribution Comparison:")
print(f"{'Label':<30} {'CheXpert Plus':<25} {'Original CheXpert':<25}")
print("-" * 80)
for label in chexpert_labels:
    plus_count = chexpert_plus_labels[label].sum()
    plus_total = len(chexpert_plus_labels)
    plus_pct = f"{plus_count} ({(plus_count/plus_total*100):.2f}%)"
    
    orig_count = chexpert_original[label].sum() 
    orig_total = len(chexpert_original)
    orig_pct = f"{orig_count} ({(orig_count/orig_total*100):.2f}%)"
    
    print(f"{label:<30} {plus_pct:<25} {orig_pct:<25}")

# Vectorized comparison for each label
for label in chexpert_labels:
    # Create mapping series for fast lookup
    plus_labels = pd.Series(
        chexpert_plus_labels[label].values,
        index=chexpert_plus_labels['path_to_image']
    )
    orig_labels = pd.Series(
        chexpert_original[label].values,
        index=chexpert_original['Path']
    )
    
    # Get labels for common paths
    plus_common = plus_labels[list(common_paths)]
    orig_common = orig_labels[list(common_paths)]
    
    # Compare labels
    matching = (plus_common == orig_common).sum()
    total = len(common_paths)
    different = total - matching
    
    comparison_stats[label] = {
        'matching': matching,
        'different': different,
        'total': total,
        'match_percentage': (matching/total * 100) if total > 0 else 0
    }

# Convert to DataFrame for better visualization
comparison_df = pd.DataFrame(comparison_stats).T
print("\nLabel Comparison Statistics:")
print(comparison_df.sort_values('match_percentage', ascending=False))

# Print overall statistics
total_comparisons = sum(stats['total'] for stats in comparison_stats.values())
total_matching = sum(stats['matching'] for stats in comparison_stats.values())
print(f"\nOverall match percentage: {(total_matching/total_comparisons * 100):.2f}%")

223462 223414 223228

Label Distribution Comparison:
Label                          CheXpert Plus             Original CheXpert        
--------------------------------------------------------------------------------
Enlarged Cardiomediastinum     28468.0 (12.74%)          22664.0 (10.14%)         
Cardiomegaly                   43195.0 (19.33%)          34483.0 (15.43%)         
Lung Opacity                   111130.0 (49.73%)         103407.0 (46.28%)        
Lung Lesion                    14510.0 (6.49%)           10919.0 (4.89%)          
Edema                          67287.0 (30.11%)          65269.0 (29.21%)         
Consolidation                  42810.0 (19.16%)          40371.0 (18.07%)         
Pneumonia                      29008.0 (12.98%)          24242.0 (10.85%)         
Atelectasis                    73550.0 (32.91%)          68278.0 (30.56%)         
Pneumothorax                   22014.0 (9.85%)           20304.0 (9.09%)          
Pleural Effusion               10154

In [6]:
# map chexpert_plus to chexpert_plus_labels
chexpert_plus = chexpert_plus.merge(chexpert_plus_labels, on='path_to_image')
print(chexpert_plus.shape)
chexpert_plus.head()

(223462, 20)


Unnamed: 0,path_to_image,age,sex,race,split,report,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,train/patient42142/study5/view1_frontal.jpg,62.0,Female,White,train,impression: 1.tracheostomy tube remains in pla...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,train/patient42142/study8/view1_frontal.jpg,62.0,Female,White,train,impression: 1.tracheostomy and subdiaphragmati...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,train/patient42142/study2/view1_frontal.jpg,62.0,Female,White,train,impression: 1.single portable semiupright ap v...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,train/patient42142/study4/view1_frontal.jpg,62.0,Female,White,train,impression: 1. patchy subsegmental bibasilar (...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,train/patient42142/study3/view1_frontal.jpg,62.0,Female,White,train,impression: 1.upright frontal view of the ches...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
chexpert_5x200 = "../data/CheXpert/chexpert_5x200.csv"
chexpert_5x200 = pd.read_csv(chexpert_5x200)
chexpert_5x200.drop(columns=['Unnamed: 0'], inplace=True)
print(chexpert_5x200.shape)
chexpert_5x200['Path'] = chexpert_5x200['Path'].apply(lambda x: x.replace("CheXpert-v1.0/", ""))
chexpert_5x200_paths = set(chexpert_5x200['Path'].to_list())
chexpert_5x200.head()

(1000, 20)


Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,Report Impression
0,train/patient38614/study1/view1_frontal.jpg,Male,26,Frontal,AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,\n1. bilateral apices of the lungs are obscure...
1,train/patient53625/study1/view1_frontal.jpg,Female,46,Frontal,AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,\n1. ap supine chest radiograph. there has...
2,train/patient37669/study1/view1_frontal.jpg,Female,60,Frontal,AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,\n \n1.ap chest radiograph taken in recovery h...
3,train/patient15715/study1/view1_frontal.jpg,Male,41,Frontal,PA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,\n \n1. mild right mid and lower lung atelect...
4,train/patient48702/study1/view1_frontal.jpg,Male,49,Frontal,AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,\n \n1.low lung volumes with presumed right b...


In [8]:
# find the rows in chexpert_5x200 that are present in chexpert_plus. then replace the "split" column in chexpert_plus with "test" for those rows
# x = chexpert_5x200[chexpert_5x200['Path'].isin(chexpert_plus['path_to_image'])]
# chexpert_plus.loc[chexpert_plus['path_to_image'].isin(x['Path']), 'split'] = 'test'

In [9]:
# drop the rows in chexpert_plus that are present in chexpert_5x200 (chexpert_5x200_paths)
chexpert_plus = chexpert_plus[~chexpert_plus['path_to_image'].isin(chexpert_5x200_paths)]
print(chexpert_plus.shape)
chexpert_plus.split.value_counts()

(222464, 20)


split
train    222230
valid       234
Name: count, dtype: int64

In [11]:
#check if the image paths in chexpert_5x200 are present in chexpert_plus 

x = chexpert_5x200[~chexpert_5x200['Path'].isin(chexpert_plus['path_to_image'])]
x.drop(columns=["Frontal/Lateral", "AP/PA"], inplace=True)
x['race'] = "Unknown"
x['split'] = "test"
x.rename(columns={'Path': 'path_to_image', 'Report Impression': 'report', 'Age': 'age', 'Sex': 'sex'}, inplace=True)
#copy x into chexpert_plus. first format it according to the format of chexpert_plus and then append it to chexpert_plus
chexpert_plus = pd.concat([chexpert_plus, x])
chexpert_plus.split.value_counts()

split
train    222230
test       1000
valid       234
Name: count, dtype: int64

In [3]:
# chexpert_plus.to_csv("../data/CheXpert/chexpert_plus_labels_with_5x200.csv", index=False, header=True, sep="\t")
chexpert_plus = pd.read_csv("../data/CheXpert/CheXpert-v1.0-small/chexpert_plus_labels_with_5x200.csv", sep="\t")
print(chexpert_plus.shape)
chexpert_plus.split.value_counts()
chexpert_plus.head()

(223464, 20)


Unnamed: 0,path_to_image,age,sex,race,split,report,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,train/patient42142/study5/view1_frontal.jpg,62.0,Female,White,train,impression: 1.tracheostomy tube remains in pla...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,train/patient42142/study8/view1_frontal.jpg,62.0,Female,White,train,impression: 1.tracheostomy and subdiaphragmati...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,train/patient42142/study2/view1_frontal.jpg,62.0,Female,White,train,impression: 1.single portable semiupright ap v...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,train/patient42142/study4/view1_frontal.jpg,62.0,Female,White,train,impression: 1. patchy subsegmental bibasilar (...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,train/patient42142/study3/view1_frontal.jpg,62.0,Female,White,train,impression: 1.upright frontal view of the ches...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
chexpert_plus[chexpert_plus['split'] == 'test'][['Cardiomegaly', 'Edema', 'Consolidation', 'Atelectasis','Pleural Effusion']].sum()
# chexpert_plus


Cardiomegaly        200.0
Edema               200.0
Consolidation       200.0
Atelectasis         200.0
Pleural Effusion    200.0
dtype: float64

In [14]:
images = glob.glob("../data/CheXpert/CheXpert-v1.0-small/*/*/*/*.jpg")
images = [name.replace("../data/CheXpert/CheXpert-v1.0-small/", "") for name in images]
print(len(images))

223648


In [15]:
s1 = set(chexpert_5x200['Path'].to_list())
s2 = set(chexpert_plus['path_to_image'].to_list())
s3 = set(images)
s4 = set(chexpert_plus[chexpert_plus['split'] == 'test']['path_to_image'].to_list())
print(len(s1.intersection(s2)))
print(len(s1.intersection(s3)))
print(len(s1.intersection(s4)))

1000
1000
1000


In [16]:
print(chexpert_5x200.columns)
chexpert_5x200[['Cardiomegaly', 'Edema', 'Consolidation', 'Atelectasis','Pleural Effusion']].sample(5)


Index(['Path', 'Sex', 'Age', 'Frontal/Lateral', 'AP/PA', 'No Finding',
       'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
       'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
       'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
       'Support Devices', 'Report Impression'],
      dtype='object')


Unnamed: 0,Cardiomegaly,Edema,Consolidation,Atelectasis,Pleural Effusion
297,1.0,0.0,0.0,0.0,0.0
103,0.0,0.0,0.0,1.0,0.0
822,0.0,0.0,0.0,0.0,1.0
288,1.0,0.0,0.0,0.0,0.0
58,0.0,0.0,0.0,1.0,0.0


In [18]:
chexpert_5x200[['Cardiomegaly', 'Edema', 'Consolidation', 'Atelectasis','Pleural Effusion']].sum()

Cardiomegaly        200.0
Edema               200.0
Consolidation       200.0
Atelectasis         200.0
Pleural Effusion    200.0
dtype: float64

In [14]:
chexpert_5x200[chexpert_5x200['Path'].isin(chexpert_plus['path_to_image'])]

# Inverse of the above
chexpert_plus[chexpert_plus['path_to_image'].isin(chexpert_5x200['Path'])]

Unnamed: 0,path_to_image,path_to_dcm,frontal_lateral,ap_pa,deid_patient_id,patient_report_date_order,report,section_narrative,section_clinical_history,section_history,...,section_accession_number,age,sex,race,ethnicity,interpreter_needed,insurance_type,recent_bmi,deceased,split
236,train/patient12349/study17/view2_frontal.jpg,train/patient12349/study17/view2_frontal.dcm,Frontal,AP,patient12349,21,NARRATIVE:\nRADIOGRAPHIC EXAMINATION OF THE CH...,\nRADIOGRAPHIC EXAMINATION OF THE CHEST: 1/13/...,"57 years of age, Female, rule out infiltrate....",,...,\n18918811\nThis report has been anonymized. A...,58.0,Female,Other,Hispanic/Latino,No,Medicare,31.9,No,train
317,train/patient26886/study1/view1_frontal.jpg,train/patient26886/study1/view1_frontal.dcm,Frontal,PA,patient26886,2,NARRATIVE:\nCHEST: Two views. 1/11/2007\nCLI...,\nCHEST: Two views. 1/11/2007\n,48 -year-old male with hepatocellular carcino...,,...,\n#2-4-0-7-9-0-6-5-3-4-7-7\nThis report has be...,49.0,Male,White,Hispanic/Latino,No,Unknown,31.8,Yes,train
1118,train/patient25525/study9/view1_frontal.jpg,train/patient25525/study9/view1_frontal.dcm,Frontal,AP,patient25525,12,"NARRATIVE:\nExam: Chest 1 View, 11/22/2001\n \...","\nExam: Chest 1 View, 11/22/2001\n \n",42 years old Female with Post bronchoscopy\n \n,,...,\n36817447047\nThis report has been anonymized...,40.0,Female,Other,Hispanic/Latino,Yes,Medicare,13.6,Yes,train
1140,train/patient08963/study10/view2_frontal.jpg,train/patient08963/study10/view2_frontal.dcm,Frontal,AP,patient08963,16,NARRATIVE:\nPORTABLE CHEST: 12/24/2010 1713 ho...,\nPORTABLE CHEST: 12/24/2010 1713 hours\n,66 -year-old male status post- heart transpla...,,...,\n#017367\nThis report has been anonymized. Al...,66.0,Male,Unknown,Unknown,Unknown,Unknown,,Yes,train
1141,train/patient08963/study10/view1_frontal.jpg,train/patient08963/study10/view1_frontal.dcm,Frontal,AP,patient08963,17,NARRATIVE:\nPORTABLE CHEST: 3/29/2008 1713 hou...,\nPORTABLE CHEST: 3/29/2008 1713 hours\n,66 -year-old male status post- heart transpla...,,...,\n0_3_2_1_0_9_9_8\nThis report has been anonym...,66.0,Male,Unknown,Unknown,Unknown,Unknown,,Yes,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222803,train/patient18399/study1/view1_frontal.jpg,train/patient18399/study1/view1_frontal.dcm,Frontal,AP,patient18399,1,NARRATIVE:\nChest 1 View 5-14-2010\n \nCLINICA...,\nChest 1 View 5-14-2010\n \n,73 years-old Male. Cp \n \n,,...,\n63789586172\nThis report has been anonymized...,74.0,Male,White,Non-Hispanic/Non-Latino,No,Medicare,26.4,No,train
222907,train/patient09787/study1/view1_frontal.jpg,train/patient09787/study1/view1_frontal.dcm,Frontal,AP,patient09787,1,"NARRATIVE:\nCHEST, ONE VIEW: 1-18-2011\n \n ...","\nCHEST, ONE VIEW: 1-18-2011\n \n",,,...,\nCJRSQXYBX0\nThis report has been anonymized....,69.0,Female,Asian,Non-Hispanic/Non-Latino,Yes,Medicare,22.7,No,train
223010,train/patient52455/study1/view1_frontal.jpg,train/patient52455/study1/view1_frontal.dcm,Frontal,AP,patient52455,1,NARRATIVE:\nSINGLE VIEW PORTABLE CHEST: 3/26/0...,\nSINGLE VIEW PORTABLE CHEST: 3/26/08 AT 0955 ...,,,...,\n#uhwplxbxoul\nThis report has been anonymize...,22.0,Male,Unknown,Unknown,Unknown,Medicare,,No,train
223457,train/patient59696/study1/view1_frontal.jpg,train/patient59696/study1/view1_frontal.dcm,Frontal,AP,patient59696,1,NARRATIVE:\nRADIOGRAPHIC EXAMINATION OF THE CH...,\nRADIOGRAPHIC EXAMINATION OF THE CHEST: 12/31...,"80 years of age, Female, Pulmonary edema.\n \n",,...,\n#7n4778536837\nThis report has been anonymiz...,81.0,Female,White,Non-Hispanic/Non-Latino,No,Medicare,24.0,No,train


#### MIMIC-CXR

In [2]:
!ls ../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/

clip-files		      mimic-cxr-2.1.0-test-set-labeled.csv
final-datasets		      mimic_cxr_labels_reports_splits.csv
IMAGE_FILENAMES		      mimic-cxr-lt
images			      mimic_cxr_sectioned.csv
LICENSE.txt		      mimic_cxr_sections.csv
mimic-cxr-2.0.0-chexpert.csv  patients.csv
mimic-cxr-2.0.0-merged.csv    README
mimic-cxr-2.0.0-metadata.csv  reports
mimic-cxr-2.0.0-negbio.csv    resize_mimic.py
mimic-cxr-2.0.0-split.csv


In [3]:
# count total images 

images = glob.glob("../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/images/files/*/*/*/*.jpg")
print(len(images))


377110


In [4]:
merged = "../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/mimic-cxr-2.0.0-merged.csv"
merged = pd.read_csv(merged)
merged = merged.drop(columns=['Unnamed: 0'])
merged['subject_id'] = merged['subject_id'].astype(str)
print(merged.shape)
print(merged.columns)
merged['split'].value_counts()

(377095, 18)
Index(['dicom_id', 'study_id', 'subject_id', 'split', 'Atelectasis',
       'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
       'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding',
       'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax',
       'Support Devices'],
      dtype='object')


split
train       368945
test          5159
validate      2991
Name: count, dtype: int64

In [5]:
cxr_lt_train = "../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/mimic-cxr-lt/train.csv"
cxr_lt_train = pd.read_csv(cxr_lt_train)
cxr_lt_val = "../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/mimic-cxr-lt/development.csv"
cxr_lt_val = pd.read_csv(cxr_lt_val)
cxr_lt_test = "../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/mimic-cxr-lt/test.csv"
cxr_lt_test = pd.read_csv(cxr_lt_test)

cxr_lt_train['split'] = 'train'
cxr_lt_val['split'] = 'validate'
cxr_lt_test['split'] = 'test'

# Combine all three
cxr_lt = pd.concat([cxr_lt_train, cxr_lt_val, cxr_lt_test])

# Write to csv
cxr_lt.to_csv("../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/mimic-cxr-lt/mimic-cxr-lt-merged.csv", index=False, header=True, sep="\t")

print(cxr_lt_train.shape, cxr_lt_val.shape, cxr_lt_test.shape, cxr_lt.shape)
print(cxr_lt_train.columns)


(264849, 33) (36769, 33) (75492, 33) (377110, 33)
Index(['dicom_id', 'subject_id', 'study_id', 'ViewPosition',
       'ViewCodeSequence_CodeMeaning', 'path', 'Atelectasis',
       'Calcification of the Aorta', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Emphysema', 'Enlarged Cardiomediastinum', 'Fibrosis', 'Fracture',
       'Hernia', 'Infiltration', 'Lung Lesion', 'Lung Opacity', 'Mass',
       'No Finding', 'Nodule', 'Pleural Effusion', 'Pleural Other',
       'Pleural Thickening', 'Pneumomediastinum', 'Pneumonia',
       'Pneumoperitoneum', 'Pneumothorax', 'Subcutaneous Emphysema',
       'Support Devices', 'Tortuous Aorta', 'split'],
      dtype='object')


In [6]:
# find unique dicom_ids in cxr_lt
print(len(cxr_lt['dicom_id'].unique()))


377110


In [7]:
# check the split column in the original merged for cxr_lt_val and cxr_lt_test. 
# first create a new df for the overlapping dicom_ids and then do value counts on the split column

overlapping_dicom_ids = set(cxr_lt_val['dicom_id']).intersection(set(merged['dicom_id']))
overlapping_df = merged[merged['dicom_id'].isin(overlapping_dicom_ids)]
overlapping_df['split'].value_counts()

split
train       36014
test          449
validate      303
Name: count, dtype: int64

In [8]:
overlapping_dicom_ids = set(cxr_lt_test['dicom_id']).intersection(set(merged['dicom_id']))
overlapping_df = merged[merged['dicom_id'].isin(overlapping_dicom_ids)]
overlapping_df['split'].value_counts()


split
train       73905
test          998
validate      589
Name: count, dtype: int64

In [7]:
patients = pd.read_csv("../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/patients.csv")
patients['subject_id'] = patients['subject_id'].astype(str)
print(patients.shape)
patients.head()

patients_age = dict(zip(patients['subject_id'], patients['anchor_age']))
patients_gender = dict(zip(patients['subject_id'], patients['gender']))

# # create a "Age" column in merged df and copy the value from the patients_age dictionary
merged['age'] = merged['subject_id'].map(patients_age)
merged['sex'] = merged['subject_id'].map(patients_gender)

# Check how many subject_ids are present in the merged df
print(len(merged['subject_id'].unique()))
print(len(patients['subject_id'].unique()))

# Check how many subject_ids are present in the merged df but not in the patients df
print(len(set(merged['subject_id'].unique()) - set(patients['subject_id'].unique())))


(364627, 6)
65379
364627
3511


In [10]:
# create a "path" column in merged df and copy the value from the path column in cxr_lt. first create a dictionary to map dicom_id to path

dicom_id_to_path = dict(zip(cxr_lt['dicom_id'], cxr_lt['path']))

# create a "path" column in merged df and copy the value from the path column in cxr_lt
merged['path'] = merged['dicom_id'].map(dicom_id_to_path)

# Write to csv
merged.to_csv("../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/mimic-cxr-2.0.0-merged-with-paths.csv", index=False, header=True, sep="\t")


#### NIH

In [2]:
!ls ../data/ChestXRay-14/CXR8/

'AAA Job Opportunity!!! Cloud Computing - Masters or PhD Degree.pdf'
'AAA Physician AI Research Opportunity!!!.pdf'
'AAA Postdoctoral Fellowship Opportunity!!! - NIH Medical Image Analysis Postdoc.pdf'
 ARXIV_V5_CHESTXRAY.pdf
 BBox_List_2017.csv
 Data_Entry_2017_v2020.csv
 FAQ_CHESTXRAY.pdf
 final-datasets
 images
 LOG_CHESTXRAY.pdf
 LongTailCXR
 PruneCXR
 README_CHESTXRAY.pdf
 test_list.txt
 train_val_list.txt


In [4]:
images = glob.glob("../data/ChestXRay-14/CXR8/images/files/*.jpg")
print(len(images))


112120


In [5]:
random.sample(images, 10)

['../data/ChestXRay-14/CXR8/images/files/00005566_003.jpg',
 '../data/ChestXRay-14/CXR8/images/files/00026727_000.jpg',
 '../data/ChestXRay-14/CXR8/images/files/00000236_000.jpg',
 '../data/ChestXRay-14/CXR8/images/files/00022704_001.jpg',
 '../data/ChestXRay-14/CXR8/images/files/00014006_001.jpg',
 '../data/ChestXRay-14/CXR8/images/files/00013966_017.jpg',
 '../data/ChestXRay-14/CXR8/images/files/00002835_003.jpg',
 '../data/ChestXRay-14/CXR8/images/files/00015732_006.jpg',
 '../data/ChestXRay-14/CXR8/images/files/00014790_017.jpg',
 '../data/ChestXRay-14/CXR8/images/files/00004857_032.jpg']

In [14]:

# def resize_image(input_path):
#     with Image.open(input_path) as img:
#         img = img.resize((224, 224), Image.BICUBIC)
#         img.save(input_path)  # Overwrite the original image

# num_workers = 8

# with Pool(processes=num_workers) as pool:
#     list(tqdm(
#         pool.imap(resize_image, images),
#         total=len(images),
#         desc="Resizing images",
#         unit="image"
#     ))

In [3]:
nih_lt_train = "../data/ChestXRay-14/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_train.csv"
nih_lt_train = pd.read_csv(nih_lt_train)
nih_lt_train['split'] = 'train'
print(nih_lt_train.shape)
nih_lt_train.head()

nih_lt_val = "../data/ChestXRay-14/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_val.csv"
nih_lt_val = pd.read_csv(nih_lt_val)
nih_lt_val['split'] = 'valid'
print(nih_lt_val.shape)
nih_lt_val.head()

nih_lt_test = "../data/ChestXRay-14/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_test.csv"
nih_lt_test = pd.read_csv(nih_lt_test)
nih_lt_test['split'] = 'test'
print(nih_lt_test.shape)
nih_lt_test.head()

(78506, 23)
(12533, 23)
(21081, 23)


Unnamed: 0,id,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,Pneumonia,Pneumothorax,Pneumoperitoneum,Pneumomediastinum,Subcutaneous Emphysema,Tortuous Aorta,Calcification of the Aorta,No Finding,subj_id,split
0,00000013_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,13,test
1,00000013_001.png,0,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,13,test
2,00000013_002.png,0,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,13,test
3,00000013_003.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,13,test
4,00000013_004.png,0,0,0,0,1,1,0,0,1,...,0,1,0,0,1,0,0,0,13,test


In [4]:
#Merge all three in a single df called nih_lt
nih_lt = pd.concat([nih_lt_train, nih_lt_val, nih_lt_test])
nih_lt.rename(columns={'id': 'path'}, inplace=True)
print(nih_lt.shape)
print(nih_lt.split.value_counts())
nih_lt.head()

(112120, 23)
split
train    78506
test     21081
valid    12533
Name: count, dtype: int64


Unnamed: 0,path,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,Pneumonia,Pneumothorax,Pneumoperitoneum,Pneumomediastinum,Subcutaneous Emphysema,Tortuous Aorta,Calcification of the Aorta,No Finding,subj_id,split
0,00000001_000.png,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,train
1,00000001_001.png,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,train
2,00000001_002.png,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,train
3,00000002_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,2,train
4,00000004_000.png,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,train


In [5]:
# Write to csv
nih_lt.to_csv("../data/ChestXRay-14/CXR8/nih-lt-merged.csv", index=False, header=True, sep="\t")


In [7]:
nih_train = "../data/ChestXRay-14/CXR8/train_val_list.txt"
nih_train = pd.read_csv(nih_train, header=None)
nih_train.columns = ["path"]
print(nih_train.shape)
nih_train.head()

nih_test = "../data/ChestXRay-14/CXR8/test_list.txt"
nih_test = pd.read_csv(nih_test, header=None)
nih_test.columns = ["path"]
print(nih_test.shape)
nih_test.head()

nih_labels = "../data/ChestXRay-14/CXR8/Data_Entry_2017_v2020.csv"
nih_labels = pd.read_csv(nih_labels)
print(nih_labels.shape)
nih_labels.head()

# Create a mapping from Image Index to "Finding Labels"
nih_labels_mapping = dict(zip(nih_labels['Image Index'], nih_labels['Finding Labels']))

def split_label(label):
    return label.split("|")

for k, v in nih_labels_mapping.items():
    nih_labels_mapping[k] = split_label(v)



(86524, 1)
(25596, 1)
(112120, 11)


In [8]:
# load all values into a list and make it a set 
all_labels = [item for sublist in nih_labels_mapping.values() for item in sublist]
print(set(all_labels))


{'No Finding', 'Hernia', 'Pleural_Thickening', 'Consolidation', 'Mass', 'Atelectasis', 'Infiltration', 'Emphysema', 'Pneumonia', 'Edema', 'Effusion', 'Pneumothorax', 'Nodule', 'Cardiomegaly', 'Fibrosis'}


In [9]:
# nih_labels_mapping looks like this
# {'00000001_000.png': ['Cardiomegaly'],
#  '00000001_001.png': ['Cardiomegaly', 'Emphysema'],
#  '00000001_002.png': ['Cardiomegaly', 'Effusion'],
#  '00000002_000.png': ['No Finding'],
#  ...
#  }

# create a new df for nih_train and nih_test with the path and the labels. 
# each label should be a column and the value should be 1 if the label is present and 0 otherwise

from sklearn.model_selection import train_test_split


nih_train_new = nih_train.copy()
nih_test_new = nih_test.copy()

label_cols = list(set(all_labels))

for label in label_cols:
    nih_train_new[label] = nih_train_new['path'].apply(lambda x: 1 if label in nih_labels_mapping[x] else 0)
    nih_test_new[label] = nih_test_new['path'].apply(lambda x: 1 if label in nih_labels_mapping[x] else 0)
    
# Copy Patient Age and Patient Gender to the new dfs. they are in the original nih_labels df. 
# first make two dictionaries to map Image Index to Patient Age and Patient Gender
nih_labels_mapping_age = dict(zip(nih_labels['Image Index'], nih_labels['Patient Age']))
nih_labels_mapping_gender = dict(zip(nih_labels['Image Index'], nih_labels['Patient Gender']))
nih_train_new['age'] = nih_train_new['path'].map(nih_labels_mapping_age)
nih_train_new['sex'] = nih_train_new['path'].map(nih_labels_mapping_gender)
nih_train_new['split'] = 'train'

nih_test_new['age'] = nih_test_new['path'].map(nih_labels_mapping_age)
nih_test_new['sex'] = nih_test_new['path'].map(nih_labels_mapping_gender)
nih_test_new['split'] = 'test'


nih_train_new, nih_val_new = train_test_split(nih_train_new, test_size=0.05, random_state=42)
nih_val_new['split'] = 'valid'


nih_merged = pd.concat([nih_train_new, nih_val_new, nih_test_new])

# in path, replace png with jpg
nih_merged['path'] = nih_merged['path'].apply(lambda x: x.replace(".png", ".jpg"))

nih_merged.head()

Unnamed: 0,path,No Finding,Hernia,Pleural_Thickening,Consolidation,Mass,Atelectasis,Infiltration,Emphysema,Pneumonia,Edema,Effusion,Pneumothorax,Nodule,Cardiomegaly,Fibrosis,age,sex,split
10032,00003213_000.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,61,M,train
52497,00016256_000.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,M,train
62656,00019474_000.jpg,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,29,F,train
70633,00022235_001.jpg,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,46,M,train
61167,00018972_058.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52,F,train


In [11]:
print(nih_merged.split.value_counts())
# Write to csv
nih_merged.to_csv("../data/ChestXRay-14/CXR8/nih-merged.csv", index=False, header=True, sep="\t")


split
train    82197
test     25596
valid     4327
Name: count, dtype: int64


In [12]:
nih_merged

Unnamed: 0,path,No Finding,Hernia,Pleural_Thickening,Consolidation,Mass,Atelectasis,Infiltration,Emphysema,Pneumonia,Edema,Effusion,Pneumothorax,Nodule,Cardiomegaly,Fibrosis,age,sex,split
10032,00003213_000.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,61,M,train
52497,00016256_000.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,M,train
62656,00019474_000.jpg,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,29,F,train
70633,00022235_001.jpg,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,46,M,train
61167,00018972_058.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52,F,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25591,00030800_000.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,33,F,test
25592,00030802_000.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,M,test
25593,00030803_000.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42,F,test
25594,00030804_000.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,F,test


In [11]:
#load mimic, chexpert, nih, covid data to compare value counts of split 

mimic = pd.read_csv("../data/MIMIC-CXR/mimic-cxr-jpg-2.0.0-small/mimic-cxr-lt/mimic-cxr-lt-merged.csv", sep="\t")
chexpert = pd.read_csv("../data/CheXpert/CheXpert-v1.0-small/chexpert_plus_labels_with_5x200.csv", sep="\t")
nih = pd.read_csv("../data/ChestXRay-14/CXR8/images/nih-lt-merged.csv", sep="\t")

print(mimic.split.value_counts())
print(chexpert.split.value_counts())
print(nih.split.value_counts())



split
train       264849
test         75492
validate     36769
Name: count, dtype: int64
split
train    222230
test       1000
valid       234
Name: count, dtype: int64
split
train    78506
test     21081
valid    12533
Name: count, dtype: int64


In [13]:
chexpert

Unnamed: 0,path_to_image,age,sex,race,split,report,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,train/patient42142/study5/view1_frontal.jpg,62.0,Female,White,train,impression: 1.tracheostomy tube remains in pla...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,train/patient42142/study8/view1_frontal.jpg,62.0,Female,White,train,impression: 1.tracheostomy and subdiaphragmati...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,train/patient42142/study2/view1_frontal.jpg,62.0,Female,White,train,impression: 1.single portable semiupright ap v...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,train/patient42142/study4/view1_frontal.jpg,62.0,Female,White,train,impression: 1. patchy subsegmental bibasilar (...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,train/patient42142/study3/view1_frontal.jpg,62.0,Female,White,train,impression: 1.upright frontal view of the ches...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223459,train/patient29065/study20/view1_frontal.jpg,60.0,Male,Unknown,test,\n \n1. small right pleural effusion.\n \n2. ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
223460,train/patient50590/study2/view1_frontal.jpg,67.0,Male,Unknown,test,"\n \n 1. though the patient is rotated, aga...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
223461,train/patient09101/study1/view1_frontal.jpg,43.0,Female,Unknown,test,\n \n1. large right pleural effusion with a p...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
223462,train/patient31752/study3/view1_frontal.jpg,48.0,Male,Unknown,test,\n \n1.postsurgical changes of the right lower...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


#### COVID

In [2]:
train_images = glob.glob("../data/COVIDx-CXR-4/train/*")
val_images = glob.glob("../data/COVIDx-CXR-4/val/*")
test_images = glob.glob("../data/COVIDx-CXR-4/test/*")

print(len(train_images), len(val_images), len(test_images))

67863 8473 8482


In [32]:
covid_train = "../data/COVIDx-CXR-4/train.txt"
covid_train = pd.read_csv(covid_train, header=None, sep=" ")
covid_train.columns = ["idx", "path", "label", "source"]
covid_train['split'] = 'train'
covid_train.head()
print(covid_train.label.value_counts())

covid_val = "../data/COVIDx-CXR-4/val.txt"
covid_val = pd.read_csv(covid_val, header=None, sep=" ")
covid_val.columns = ["idx", "path", "label", "source"]
covid_val['split'] = 'valid'
covid_val.head()
print(covid_val.label.value_counts())

covid_test = "../data/COVIDx-CXR-4/test.txt"
covid_test = pd.read_csv(covid_test, header=None, sep=" ")
covid_test.columns = ["idx", "path", "label", "source"]
covid_test['split'] = 'test'
covid_test.head()
print(covid_test.label.value_counts())

# combine all three
covid = pd.concat([covid_train, covid_val, covid_test])
covid.drop(columns=['idx', 'source'], inplace=True)

#replace "negative" with 0 and "positive" with 1
covid['label'] = covid['label'].apply(lambda x: 0 if x == "negative" else 1)

print(covid.shape)
print(covid.split.value_counts())
print(covid.label.value_counts())



label
positive    57199
negative    10664
Name: count, dtype: int64
label
positive    4241
negative    4232
Name: count, dtype: int64
label
positive    4241
negative    4241
Name: count, dtype: int64
(84818, 3)
split
train    67863
test      8482
valid     8473
Name: count, dtype: int64
label
1    65681
0    19137
Name: count, dtype: int64


In [4]:
train_paths = set(covid_train['path'].to_list())
val_paths = set(covid_val['path'].to_list())
test_paths = set(covid_test['path'].to_list())

#check overlap between all the pairs of sets 
print(len(train_paths.intersection(val_paths)))
print(len(train_paths.intersection(test_paths)))
print(len(val_paths.intersection(test_paths)))

0
0
0


In [5]:
# first check if the images exist in the original folder
not_found = []
for image in train_images:
    if not os.path.exists(image):
        not_found.append(image)
print(len(not_found))

for image in val_images:
    if not os.path.exists(image):
        not_found.append(image)
print(len(not_found))

for image in test_images:
    if not os.path.exists(image):
        not_found.append(image)
print(len(not_found))

0
0
0


In [6]:
#move all the images from train/ val/ and test/ to a new folder called images

new_folder = "../data/COVIDx-CXR-4/images"
os.makedirs(new_folder, exist_ok=True)

for image in tqdm(train_images):
    shutil.move(image, new_folder)

for image in tqdm(val_images):
    shutil.move(image, new_folder)

for image in tqdm(test_images):
    shutil.move(image, new_folder)

100%|██████████| 67863/67863 [00:02<00:00, 28975.28it/s]
100%|██████████| 8473/8473 [00:00<00:00, 31983.85it/s]
100%|██████████| 8482/8482 [00:00<00:00, 31444.33it/s]


In [7]:
images = glob.glob("../data/COVIDx-CXR-4/images/*")

# Print all the extensions of the images
print(set([os.path.splitext(image)[1] for image in images]))

# print the counts of the extensions
from collections import Counter
print(Counter([os.path.splitext(image)[1] for image in images]))


{'.jpg', '.PNG', '.jpeg', '.JPG', '.png'}
Counter({'.png': 70747, '.jpg': 13947, '.jpeg': 122, '.JPG': 1, '.PNG': 1})


In [8]:
def resize_image(input_path):
    try:
        with Image.open(input_path) as img:
            img = img.resize((224, 224), Image.BICUBIC)
            # Convert to RGB mode for all images
            img = img.convert('RGB')
            # Convert to jpg if not already jpg/jpeg
            if not (input_path.lower().endswith('.jpg') or input_path.lower().endswith('.jpeg')):
                output_path = os.path.splitext(input_path)[0] + '.jpg'
                img.save(output_path, 'JPEG')  # Save as jpg
                os.remove(input_path)  # Remove original file
            else:
                img.save(input_path)  # Overwrite original jpg/jpeg
    except Exception as e:
        print(f"Error processing {input_path}: {e}")

num_workers = 8

with Pool(processes=num_workers) as pool:
    list(tqdm(
        pool.imap(resize_image, images),
        total=len(images),
        desc="Resizing images",
        unit="image"
    ))

Resizing images: 100%|██████████| 84818/84818 [01:48<00:00, 780.33image/s]


In [9]:
# Replace all the jpeg with jpg
for image in tqdm(images):
    if os.path.splitext(image)[1] == '.jpeg':
        os.rename(image, os.path.splitext(image)[0] + '.jpg')



100%|██████████| 84818/84818 [00:00<00:00, 440956.79it/s]


In [33]:
# In the path column, replace all extensions with jpg
covid['path'] = covid['path'].apply(lambda x: os.path.splitext(x)[0] + '.jpg')
covid = covid.drop_duplicates(subset='path', keep=False)
covid

Unnamed: 0,path,label,split
0,1e64990d1b40c1758a2aaa9c7f7a85_jumbo.jpg,0,train
1,7223b8ad031187d9a142d7f7ca02c9_jumbo.jpg,0,train
2,3392dc7d262e28423caca517f98c2e_jumbo.jpg,0,train
3,ec3a480c0926ded74429df416cfb05_jumbo.jpg,0,train
4,a72aeb349a63c79ed24e473c434efe_jumbo.jpg,0,train
...,...,...,...
8477,sub-S24538_ses-E50624_run-1_bp-chest_vp-ap_dx-...,0,test
8478,sub-S24539_ses-E50625_run-1_bp-chest_vp-pa_cr-...,0,test
8479,sub-S24540_ses-E50626_run-1_bp-chest_vp-ap_cr-...,0,test
8480,sub-S24540_ses-E60362_run-1_bp-chest_vp-ap_dx-...,0,test


In [34]:
covid.to_csv("../data/COVIDx-CXR-4/covid-merged.csv", index=False, header=True, sep="\t")

In [35]:
# check if all the images in the images/ folder end with a .jpg
images = glob.glob("../data/COVIDx-CXR-4/images/*")
print(set([os.path.splitext(image)[1] for image in images]))


{'.JPG', '.jpg'}


In [36]:
# replace the extension .JPG with .jpg
for image in images:
    if os.path.splitext(image)[1] == '.JPG':
        os.rename(image, os.path.splitext(image)[0] + '.jpg')

# check if all the images in the images/ folder end with a .jpg
images = glob.glob("../data/COVIDx-CXR-4/images/*")
print(set([os.path.splitext(image)[1] for image in images]))


{'.jpg'}


In [37]:
# check if all the images in the covid-merged.csv file end with a .jpg
print(set([os.path.splitext(image)[1] for image in covid['path'].to_list()]))

{'.jpg'}


#### CheXechonet

In [3]:
images = glob.glob("../data/CheXchoNet/images/*")

In [4]:
# if the height and widith are not 224, print the path
for image in tqdm(images):
    if Image.open(image).size != (224, 224):
        print(image)


# Load and resize all the images in the images/ folder
def resize_image(input_path):
    with Image.open(input_path) as img:
        img = img.resize((224, 224), Image.BICUBIC)
        img.save(input_path, 'JPEG')

num_workers = 8

with Pool(processes=num_workers) as pool:
    list(tqdm(
        pool.imap(resize_image, images),
        total=len(images),
        desc="Resizing images",
        unit="image"
    ))

100%|██████████| 71589/71589 [00:03<00:00, 19967.78it/s]
Resizing images: 100%|██████████| 71589/71589 [00:07<00:00, 9188.59image/s]


In [14]:
metadata = pd.read_csv("../data/CheXchoNet/metadata.csv")
metadata.head()

Unnamed: 0,patient_id,cxr_filename,cxr_time_offset,cxr_year,cxr_path,cxr_pixel_spacing_x,cxr_pixel_spacing_y,age,sex,ivsd,lvpwd,lvidd,slvh,dlv,composite_slvh_dlv,heart_transplant,lung_transplant,pacemaker_or_icd
0,0001a10b30d6cff4ce1c2bff5be86958,0001a10b30d6cff4ce1c2bff5be86958_6aee7800.jpg,4314 days 09:20:05.326807,2018,./cxrs/0001a10b30d6cff4ce1c2bff5be86958_6aee78...,0.2,0.2,31,F,0.73626,0.754162,4.58632,0,0,0,0,0,0
1,0008eb0430637bd17906cb0a5f1126e1,0008eb0430637bd17906cb0a5f1126e1_000bd69b.jpg,3431 days 15:41:04.059338,2017,./cxrs/0008eb0430637bd17906cb0a5f1126e1_000bd6...,0.194556,0.194556,74,F,1.07021,1.13182,4.34344,0,0,0,0,0,0
2,0008eb0430637bd17906cb0a5f1126e1,0008eb0430637bd17906cb0a5f1126e1_301ec85b.jpg,3439 days 17:09:04.059338,2017,./cxrs/0008eb0430637bd17906cb0a5f1126e1_301ec8...,0.194556,0.194556,74,F,1.07021,1.13182,4.34344,0,0,0,0,0,0
3,000a92e15c0a8df9f107f09acf295e36,000a92e15c0a8df9f107f09acf295e36_32b2a209.jpg,4880 days 13:29:52.822593,2015,./cxrs/000a92e15c0a8df9f107f09acf295e36_32b2a2...,0.194556,0.194556,67,M,1.25907,1.25368,5.02448,0,0,0,0,0,0
4,00163791112923cb52834f0e75123225,00163791112923cb52834f0e75123225_32f21836.jpg,5113 days 09:27:04.110448,2016,./cxrs/00163791112923cb52834f0e75123225_32f218...,0.194556,0.194556,76,F,1.01069,1.04916,3.63108,0,0,0,0,0,0


In [15]:
images[0]

'../data/CheXchoNet/images/c53d8be402c3780f3ea36065c0b429b9_eaa5ba7c.jpg'

In [40]:
# take these labels ['cxr_path', 'age', 'sex', 'composite_slvh_dlv']
chexchonet = metadata[['cxr_path','composite_slvh_dlv']]
# in cxr_path, replace the "./cxrs/" with nothing
chexchonet['cxr_path'] = chexchonet['cxr_path'].apply(lambda x: x.replace("./cxrs/", ""))
chexchonet.rename(columns={'cxr_path': 'path', 'composite_slvh_dlv': 'label'}, inplace=True)
print(chexchonet.label.value_counts())

# First split out the positive and negative samples
pos_samples = chexchonet[chexchonet.label == 1]
neg_samples = chexchonet[chexchonet.label == 0]

# Determine size of test set based on minority class
n_test = min(len(pos_samples), len(neg_samples)) // 13  # 10% for test set

# Sample equal numbers from each class for test set
test_pos = pos_samples.sample(n=n_test, random_state=42)
test_neg = neg_samples.sample(n=n_test, random_state=42)

# Combine test samples and mark as test
test_samples = pd.concat([test_pos, test_neg])
test_samples['split'] = 'test'

# Get remaining samples for training
train_samples = chexchonet[~chexchonet.index.isin(test_samples.index)]
train_samples['split'] = 'train'

# Combine train and test
chexchonet = pd.concat([train_samples, test_samples])

chexchonet.head()


label
0    61728
1     9861
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chexchonet['cxr_path'] = chexchonet['cxr_path'].apply(lambda x: x.replace("./cxrs/", ""))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chexchonet.rename(columns={'cxr_path': 'path', 'composite_slvh_dlv': 'label'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_samples['split'] = 'train'


Unnamed: 0,path,label,split
0,0001a10b30d6cff4ce1c2bff5be86958_6aee7800.jpg,0,train
1,0008eb0430637bd17906cb0a5f1126e1_000bd69b.jpg,0,train
2,0008eb0430637bd17906cb0a5f1126e1_301ec85b.jpg,0,train
3,000a92e15c0a8df9f107f09acf295e36_32b2a209.jpg,0,train
4,00163791112923cb52834f0e75123225_32f21836.jpg,0,train


In [41]:
chexchonet.pivot_table(index='split', columns='label', values='path', aggfunc='count')

label,0,1
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,758,758
train,60970,9103


In [42]:
chexchonet.to_csv("../data/CheXchoNet/chexchonet-merged.csv", index=False, header=True, sep="\t")

#### TBX11K

In [2]:
tbx11k_train = pd.read_csv("../data/TBX11K/lists/TBX11K_train.txt", header=None)
tbx11k_val = pd.read_csv("../data/TBX11K/lists/TBX11K_val.txt", header=None)

tbx11k_train.columns = ["path"]
tbx11k_val.columns = ["path"]

tbx11k_train['split'] = 'train'
tbx11k_val['split'] = 'test'

tbx11k = pd.concat([tbx11k_train, tbx11k_val])
print(tbx11k.shape)
print(tbx11k.split.value_counts())



(8400, 2)
split
train    6600
test     1800
Name: count, dtype: int64


In [3]:
#tbx11k[tbx11k['path'].str.contains("sick")]

# if path contains "sick" then label is 1, if path contains "healthy" then label is 0, if path contains tb then label is 2
tbx11k['label'] = tbx11k['path'].apply(lambda x: 1 if "sick" in x else 0 if "health" in x else 2 if "tb" in x else None)
print(tbx11k.label.value_counts())
print(tbx11k.pivot_table(index='split', columns='label', values='path', aggfunc='count'))

label
0    3800
1    3800
2     800
Name: count, dtype: int64
label     0     1    2
split                 
test    800   800  200
train  3000  3000  600


In [4]:
da_db_train = glob.glob("../data/TBX11K/imgs/extra/da+db/train/*")
da_db_test = glob.glob("../data/TBX11K/imgs/extra/da+db/val/*")

print(len(da_db_train), len(da_db_test))

def get_label(path):
    # if filename starts with "n" label is 0, if path starts with p label is 2
    filename = os.path.basename(path)
    if filename.startswith("n"):
        return 0
    elif filename.startswith("p"):
        return 2
    else:
        return None


da_db_train_labels = [get_label(path) for path in da_db_train]
da_db_test_labels = [get_label(path) for path in da_db_test]

print(Counter(da_db_train_labels))
print(Counter(da_db_test_labels))

da_db_train_df = pd.DataFrame({'path': da_db_train, 'label': da_db_train_labels})
da_db_test_df = pd.DataFrame({'path': da_db_test, 'label': da_db_test_labels})

da_db_train_df['split'] = 'train'
da_db_test_df['split'] = 'test'

da_db = pd.concat([da_db_train_df, da_db_test_df])
print(da_db.shape)
print(da_db.split.value_counts())
print(da_db.label.value_counts())
print(da_db.pivot_table(index='split', columns='label', values='path', aggfunc='count'))


# remove ../data/TBX11K/imgs/ from the path column
da_db['path'] = da_db['path'].apply(lambda x: x.replace("../data/TBX11K/imgs/", ""))

88 88
Counter({0: 54, 2: 34})
Counter({0: 48, 2: 40})
(176, 3)
split
train    88
test     88
Name: count, dtype: int64
label
0    102
2     74
Name: count, dtype: int64
label   0   2
split        
test   48  40
train  54  34


In [5]:


mc_shenzhen_train = glob.glob("../data/TBX11K/imgs/extra/mc+shenzhen/train/*")
mc_shenzhen_test = glob.glob("../data/TBX11K/imgs/extra/mc+shenzhen/val/*")

print(len(mc_shenzhen_train), len(mc_shenzhen_test))

def get_label(path):
    #CXRs with  filename having "_0" are TB negative and  with "_1" are TB positive.
    # path = only filename no extension
    filename = os.path.basename(path).split(".")[0]
    if filename.endswith("_0"):
        return 0
    elif filename.endswith("_1"):
        return 2
    else:
        return None

mc_shenzhen_train_labels = [get_label(path) for path in mc_shenzhen_train]
mc_shenzhen_test_labels = [get_label(path) for path in mc_shenzhen_test]

print(Counter(mc_shenzhen_train_labels))
print(Counter(mc_shenzhen_test_labels))

mc_shenzhen_train_df = pd.DataFrame({'path': mc_shenzhen_train, 'label': mc_shenzhen_train_labels})
mc_shenzhen_test_df = pd.DataFrame({'path': mc_shenzhen_test, 'label': mc_shenzhen_test_labels})

mc_shenzhen_train_df['split'] = 'train'
mc_shenzhen_test_df['split'] = 'test'

mc_shenzhen = pd.concat([mc_shenzhen_train_df, mc_shenzhen_test_df])
print(mc_shenzhen.shape)
print(mc_shenzhen.split.value_counts())
print(mc_shenzhen.label.value_counts())
print(mc_shenzhen.pivot_table(index='split', columns='label', values='path', aggfunc='count'))

# remove ../data/TBX11K/imgs/ from the path column
mc_shenzhen['path'] = mc_shenzhen['path'].apply(lambda x: x.replace("../data/TBX11K/imgs/", ""))



200 200
Counter({2: 103, 0: 97})
Counter({0: 105, 2: 95})
(400, 3)
split
train    200
test     200
Name: count, dtype: int64
label
0    202
2    198
Name: count, dtype: int64
label    0    2
split          
test   105   95
train   97  103


In [6]:
# now combine all the dataframes

mc_shenzhen['source'] = 'mc_shenzhen'
da_db['source'] = 'da_db'
tbx11k['source'] = 'tbx11k'

tbx11k_merged = pd.concat([ tbx11k, mc_shenzhen, da_db])
# tbx11k_merged.to_csv("../data/TBX11K/tbx11k-merged.csv", index=False, header=True, sep="\t")

print(tbx11k_merged.pivot_table(index='split', columns='source', values='path', aggfunc='count'))
print(tbx11k_merged.pivot_table(index='split', columns='label', values='path', aggfunc='count'))

source  da_db  mc_shenzhen  tbx11k
split                             
test       88          200    1800
train      88          200    6600
label     0     1    2
split                 
test    953   800  335
train  3151  3000  737


In [54]:
# images = glob.glob("../data/TBX11K/imgs/*")
images = tbx11k_merged.path.to_list()
add_path = "../data/TBX11K/imgs/"
images = [add_path + image for image in images]


def resize_image(input_path):
    try:
        with Image.open(input_path) as img:
            img = img.resize((224, 224), Image.BICUBIC)
            # Convert to RGB mode for all images
            img = img.convert('RGB')
            # Convert to jpg if not already jpg/jpeg
            if not (input_path.lower().endswith('.jpg') or input_path.lower().endswith('.jpeg')):
                output_path = os.path.splitext(input_path)[0] + '.jpg'
                img.save(output_path, 'JPEG')  # Save as jpg
                os.remove(input_path)  # Remove original file
            else:
                img.save(input_path)  # Overwrite original jpg/jpeg
    except Exception as e:
        print(f"Error processing {input_path}: {e}")

num_workers = 8

with Pool(processes=num_workers) as pool:
    list(tqdm(
        pool.imap(resize_image, images),
        total=len(images),
        desc="Resizing images",
        unit="image"
    ))

Resizing images: 100%|██████████| 8976/8976 [00:09<00:00, 969.80image/s] 


In [7]:
# find all the extensions in tbx11k_merged
print(set([os.path.splitext(image)[1] for image in tbx11k_merged['path'].to_list()]))
print(Counter([os.path.splitext(image)[1] for image in tbx11k_merged['path'].to_list()]))

# replace all the extensions with .jpg
tbx11k_merged['path'] = tbx11k_merged['path'].apply(lambda x: os.path.splitext(x)[0] + '.jpg')

# find all the extensions in tbx11k_merged
print(set([os.path.splitext(image)[1] for image in tbx11k_merged['path'].to_list()]))



{'.jpg', '.png'}
Counter({'.png': 8400, '.jpg': 576})
{'.jpg'}


In [8]:
tbx11k_merged.drop(columns=['source'], inplace=True)
tbx11k_merged.to_csv("../data/TBX11K/tbx11k-merged.csv", index=False, header=True, sep="\t")

#### RSNA Pneumonia

In [6]:
labels = pd.read_csv("../data/RSNA-Pneumonia/stage_2_detailed_class_info.csv")
print(labels.columns)
print(labels['class'].value_counts())

# replace No Lung Opacity / Not Normal with 1, Normal with 0, and Lung Opacity with 2
labels['class'] = labels['class'].apply(lambda x: 1 if x == "No Lung Opacity / Not Normal" else 0 if x == "Normal" else 2 if x == "Lung Opacity" else None)
print(labels['class'].value_counts())

labels.head()


Index(['patientId', 'class'], dtype='object')
class
No Lung Opacity / Not Normal    11821
Lung Opacity                     9555
Normal                           8851
Name: count, dtype: int64
class
1    11821
2     9555
0     8851
Name: count, dtype: int64


Unnamed: 0,patientId,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,1
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,1
2,00322d4d-1c29-4943-afc9-b6754be640eb,1
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,0
4,00436515-870c-4b36-a041-de91049b9ab4,2


In [8]:
images = glob.glob("../data/RSNA-Pneumonia/stage_2_train_images/*")
print(len(images))

26684


In [9]:
images[0]

'../data/RSNA-Pneumonia/stage_2_train_images/60ae6dde-b322-4c93-ad89-1b061fa48098.dcm'

In [14]:
import os
import pydicom
import cv2
import gc
from tqdm import tqdm
from PIL import Image

def resize_image(input_path, output_path, size=(224, 224)):
    """
    Resize an image using PIL's BICUBIC interpolation.
    
    Args:
        input_path: Path to input image or a PIL Image object
        output_path: Path to save the resized image
        size: Tuple of (width, height) for the output image
    """
    with Image.open(input_path) as img:
        img = img.resize(size, Image.BICUBIC)
        img = img.convert('RGB')  # Ensure the image is in RGB mode
        img.save(output_path, 'JPEG')

def convert_dicom_to_cv2(dcm_file, temp_jpg_path):
    """
    Convert a DICOM file to a CV2 image and save it as a JPEG.
    
    Args:
        dcm_file: A file-like object containing DICOM data
        temp_jpg_path: Path to save the temporary JPEG image
    """
    dcm = pydicom.dcmread(dcm_file)
    rescaled_image = cv2.convertScaleAbs(dcm.pixel_array, alpha=(255.0 / dcm.pixel_array.max()))

    # Handle MONOCHROME1
    if dcm.PhotometricInterpretation == "MONOCHROME1":
        rescaled_image = cv2.bitwise_not(rescaled_image)

    # Apply histogram equalization
    adjusted_image = cv2.equalizeHist(rescaled_image)

    # Save the adjusted image as a temporary JPEG
    cv2.imwrite(temp_jpg_path, adjusted_image)

def process_single_file(args):
    """
    Process a single DICOM file.
    
    Args:
        args: Tuple of (dicom_path, output_dir, size)
    """
    dicom_path, output_dir, size = args
    try:
        # Create output paths
        filename = os.path.basename(dicom_path)
        jpg_path = os.path.join(output_dir, filename.replace('.dcm', '.jpg'))
        temp_jpg_path = os.path.join(output_dir, 'temp', filename.replace('.dcm', '.jpg'))
        
        # Convert DICOM to CV2 Image and save as temporary JPEG
        convert_dicom_to_cv2(dicom_path, temp_jpg_path)
        
        # Resize using PIL and replace the original JPEG
        resize_image(temp_jpg_path, jpg_path, size=size)
        
        # Remove the temporary JPEG file
        os.remove(temp_jpg_path)
        return True
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")
        return False
    
def process_all_files(images, output_dir, size=(224, 224)):
    with Pool(processes=8) as pool:
        list(tqdm(
            pool.imap(process_single_file, [(image, output_dir, size) for image in images]),
            total=len(images),
            desc="Processing images",
            unit="image"
        ))
        
os.makedirs("../data/RSNA-Pneumonia/stage_2_train_images_resized", exist_ok=True)
os.makedirs("../data/RSNA-Pneumonia/stage_2_train_images_resized/temp", exist_ok=True)
process_all_files(images, "../data/RSNA-Pneumonia/stage_2_train_images_resized")

Processing images: 100%|██████████| 26684/26684 [04:04<00:00, 109.25image/s]


In [15]:
images = glob.glob("../data/RSNA-Pneumonia/stage_2_train_images_resized/*")
print(len(images))
images[0]


26685


'../data/RSNA-Pneumonia/stage_2_train_images_resized/43605721-d0e0-4d58-b212-b05a295b21be.jpg'

In [17]:
# print the size of the first image
print(Image.open(images[0]).size)

(224, 224)


In [21]:
labels.rename(columns={'patientId': 'path', 'class': 'label'}, inplace=True)

train, test = train_test_split(labels, test_size=0.1, random_state=42)

print(train.label.value_counts(), test.label.value_counts())

train['split'] = 'train'
test['split'] = 'test'

combined = pd.concat([train, test])

# add .jpg to the path
combined['path'] = combined['path'].apply(lambda x: x + '.jpg')

combined.to_csv("../data/RSNA-Pneumonia/rsna-pneumonia-merged.csv", index=False, header=True, sep="\t")



label
1    10591
2     8633
0     7980
Name: count, dtype: int64 label
1    1230
2     922
0     871
Name: count, dtype: int64


#### SIIM-Pneumothorax

In [10]:
train_images = glob.glob("../data/SIIM-Pneumothorax/dicom-images-train/*/*/*")
print(train_images[0])
train_image_ids = [".".join(os.path.basename(image).split(".")[:-1]) for image in train_images] # get only the filename from the path
print(len(train_image_ids))
train_image_ids[0]

../data/SIIM-Pneumothorax/dicom-images-train/1.2.276.0.7230010.3.1.2.8323329.31920.1517875157.501098/1.2.276.0.7230010.3.1.3.8323329.31920.1517875157.501097/1.2.276.0.7230010.3.1.4.8323329.31920.1517875157.501099.dcm
10712


'1.2.276.0.7230010.3.1.4.8323329.31920.1517875157.501099'

In [11]:
test_images = glob.glob("../data/SIIM-Pneumothorax/dicom-images-test/*/*/*")
test_image_ids = [".".join(os.path.basename(image).split(".")[:-1]) for image in test_images] # get only the filename from the path
print(len(test_image_ids))
test_image_ids[0]


1377


'1.2.276.0.7230010.3.1.4.8323329.6585.1517875199.37328'

In [16]:
images = train_images + test_images
image_ids = train_image_ids + test_image_ids

id_to_paths = {image_id: image for image_id, image in zip(image_ids, images)}

print(len(images), len(image_ids))
images[0], image_ids[0]


12089 12089


('../data/SIIM-Pneumothorax/dicom-images-train/1.2.276.0.7230010.3.1.2.8323329.31920.1517875157.501098/1.2.276.0.7230010.3.1.3.8323329.31920.1517875157.501097/1.2.276.0.7230010.3.1.4.8323329.31920.1517875157.501099.dcm',
 '1.2.276.0.7230010.3.1.4.8323329.31920.1517875157.501099')

In [5]:
labels = pd.read_csv("../data/SIIM-Pneumothorax/train-rle.csv")
labels = labels.rename(columns={' EncodedPixels': 'label'})

# if label is -1, then convert to 0, else convert to 1
labels['label'] = labels['label'].apply(lambda x: 0 if x == ' -1' else 1)
print(labels['label'].value_counts())
print(labels.shape)
labels.head()




label
0    8296
1    3286
Name: count, dtype: int64
(11582, 2)


Unnamed: 0,ImageId,label
0,1.2.276.0.7230010.3.1.4.8323329.5597.151787518...,0
1,1.2.276.0.7230010.3.1.4.8323329.12515.15178752...,0
2,1.2.276.0.7230010.3.1.4.8323329.4904.151787518...,1
3,1.2.276.0.7230010.3.1.4.8323329.32579.15178751...,1
4,1.2.276.0.7230010.3.1.4.8323329.32579.15178751...,1


In [6]:
10712+1377

12089

In [27]:
stage_2_train = pd.read_csv("../data/SIIM-Pneumothorax/stage_2_train.csv")

# rename columns
stage_2_train.rename(columns={'EncodedPixels': 'label'}, inplace=True)

# Drop Unnamed: 0
stage_2_train.drop(columns=['Unnamed: 0'], inplace=True)
# if label is -1, then convert to 0, else convert to 1
stage_2_train['label'] = stage_2_train['label'].apply(lambda x: 0 if x == '-1' else 1)
print(stage_2_train['label'].value_counts())
print(stage_2_train.shape)
stage_2_train.head()

label
0    9378
1    3576
Name: count, dtype: int64
(12954, 2)


Unnamed: 0,ImageId,label
0,1.2.276.0.7230010.3.1.4.8323329.3678.151787517...,1
1,1.2.276.0.7230010.3.1.4.8323329.4200.151787518...,0
2,1.2.276.0.7230010.3.1.4.8323329.4862.151787518...,1
3,1.2.276.0.7230010.3.1.4.8323329.12313.15178752...,0
4,1.2.276.0.7230010.3.1.4.8323329.14214.15178752...,0


In [8]:
12954 - 11582

1372

In [15]:
not_found = []
for image_id in stage_2_train['ImageId'].to_list():
    if image_id not in image_ids:
        not_found.append(image_id)
print(len(not_found))
not_found


0


[]

In [9]:
#check imageids that are in stage_2_train but not in labels 

x = set(stage_2_train['ImageId'].to_list()) - set(labels['ImageId'].to_list())
y = set(labels['ImageId'].to_list()) - set(stage_2_train['ImageId'].to_list())
print(len(x))
print(len(y))
# x



1372
0


In [18]:
import os
import pydicom
import cv2
import gc
from tqdm import tqdm
from PIL import Image

def resize_image(input_path, output_path, size=(224, 224)):
    """
    Resize an image using PIL's BICUBIC interpolation.
    
    Args:
        input_path: Path to input image or a PIL Image object
        output_path: Path to save the resized image
        size: Tuple of (width, height) for the output image
    """
    with Image.open(input_path) as img:
        img = img.resize(size, Image.BICUBIC)
        img = img.convert('RGB')  # Ensure the image is in RGB mode
        img.save(output_path, 'JPEG')

def convert_dicom_to_cv2(dcm_file, temp_jpg_path):
    """
    Convert a DICOM file to a CV2 image and save it as a JPEG.
    
    Args:
        dcm_file: A file-like object containing DICOM data
        temp_jpg_path: Path to save the temporary JPEG image
    """
    dcm = pydicom.dcmread(dcm_file)
    rescaled_image = cv2.convertScaleAbs(dcm.pixel_array, alpha=(255.0 / dcm.pixel_array.max()))

    # Handle MONOCHROME1
    if dcm.PhotometricInterpretation == "MONOCHROME1":
        rescaled_image = cv2.bitwise_not(rescaled_image)

    # Apply histogram equalization
    adjusted_image = cv2.equalizeHist(rescaled_image)

    # Save the adjusted image as a temporary JPEG
    cv2.imwrite(temp_jpg_path, adjusted_image)

def process_single_file(args):
    """
    Process a single DICOM file.
    
    Args:
        args: Tuple of (dicom_path, output_dir, size)
    """
    dicom_path, output_dir, size = args
    try:
        # Create output paths
        filename = os.path.basename(dicom_path)
        jpg_path = os.path.join(output_dir, filename.replace('.dcm', '.jpg'))
        temp_jpg_path = os.path.join(output_dir, 'temp', filename.replace('.dcm', '.jpg'))
        
        # Convert DICOM to CV2 Image and save as temporary JPEG
        convert_dicom_to_cv2(dicom_path, temp_jpg_path)
        
        # Resize using PIL and replace the original JPEG
        resize_image(temp_jpg_path, jpg_path, size=size)
        
        # Remove the temporary JPEG file
        os.remove(temp_jpg_path)
        return True
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")
        return False
    
def process_all_files(images, output_dir, size=(224, 224)):
    with Pool(processes=8) as pool:
        list(tqdm(
            pool.imap(process_single_file, [(image, output_dir, size) for image in images]),
            total=len(images),
            desc="Processing images",
            unit="image"
        ))
        
os.makedirs("../data/SIIM-Pneumothorax/images", exist_ok=True)
os.makedirs("../data/SIIM-Pneumothorax/images/temp", exist_ok=True)
process_all_files(images, "../data/SIIM-Pneumothorax/images")

Processing images: 100%|██████████| 12089/12089 [02:24<00:00, 83.42image/s] 


In [31]:
stage_2_train.rename(columns={'ImageId': 'path'}, inplace=True)

# add .jpg to path
stage_2_train['path'] = stage_2_train['path'].apply(lambda x: x + '.jpg')

train, test = train_test_split(stage_2_train, test_size=0.1, random_state=42)

train['split'] = 'train'
test['split'] = 'test'

combined = pd.concat([train, test])
combined
combined.to_csv("../data/SIIM-Pneumothorax/siim-pneumothorax-merged.csv", index=False, header=True, sep="\t")


In [32]:
images = glob.glob("../data/SIIM-Pneumothorax/images/*")
print(len(images))
images[0]

12089


'../data/SIIM-Pneumothorax/images/1.2.276.0.7230010.3.1.4.8323329.4434.1517875182.843610.jpg'

In [34]:
paths = stage_2_train['path'].to_list()
add_path = "../data/SIIM-Pneumothorax/images/"
paths = [add_path + path for path in paths]

# check in images if all the paths are present
for path in paths:
    if path not in images:
        print(path)


#### VinDR-CXR

In [19]:
train_labels = pd.read_csv("../data/VinDr-CXR/image_labels_train.csv")
test_labels = pd.read_csv("../data/VinDr-CXR/image_labels_test.csv")

train_labels['split'] = 'train'
test_labels['split'] = 'test'

drop_columns = ['rad_id', 'Other diseases', 'Other lesion']
train_labels.drop(columns=drop_columns, inplace=True)
drop_columns = [ 'Other disease', 'Other lesion']
test_labels.drop(columns=drop_columns, inplace=True)


combined = pd.concat([train_labels, test_labels])
combined.rename(columns={'image_id': 'path'}, inplace=True)
combined['path'] = combined['path'].apply(lambda x: x + '.jpg')

print(train_labels.columns)
print(test_labels.columns)
print(combined.columns)

print(train_labels.shape, test_labels.shape, combined.shape)

train_labels.head()
test_labels.head()
combined.head()



Index(['image_id', 'Aortic enlargement', 'Atelectasis', 'Calcification',
       'Cardiomegaly', 'Clavicle fracture', 'Consolidation', 'Edema',
       'Emphysema', 'Enlarged PA', 'ILD', 'Infiltration', 'Lung Opacity',
       'Lung cavity', 'Lung cyst', 'Mediastinal shift', 'Nodule/Mass',
       'Pleural effusion', 'Pleural thickening', 'Pneumothorax',
       'Pulmonary fibrosis', 'Rib fracture', 'COPD', 'Lung tumor', 'Pneumonia',
       'Tuberculosis', 'No finding', 'split'],
      dtype='object')
Index(['image_id', 'Aortic enlargement', 'Atelectasis', 'Calcification',
       'Cardiomegaly', 'Clavicle fracture', 'Consolidation', 'Edema',
       'Emphysema', 'Enlarged PA', 'ILD', 'Infiltration', 'Lung Opacity',
       'Lung cavity', 'Lung cyst', 'Mediastinal shift', 'Nodule/Mass',
       'Pleural effusion', 'Pleural thickening', 'Pneumothorax',
       'Pulmonary fibrosis', 'Rib fracture', 'COPD', 'Lung tumor', 'Pneumonia',
       'Tuberculosis', 'No finding', 'split'],
      dtype='objec

Unnamed: 0,path,Aortic enlargement,Atelectasis,Calcification,Cardiomegaly,Clavicle fracture,Consolidation,Edema,Emphysema,Enlarged PA,...,Pleural thickening,Pneumothorax,Pulmonary fibrosis,Rib fracture,COPD,Lung tumor,Pneumonia,Tuberculosis,No finding,split
0,000434271f63a053c4128a0ba6352c7f.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,train
1,000434271f63a053c4128a0ba6352c7f.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,train
2,000434271f63a053c4128a0ba6352c7f.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,train
3,00053190460d56c53cc3e57321387478.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,train
4,00053190460d56c53cc3e57321387478.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,train


In [20]:
combined.to_csv("../data/VinDr-CXR/vin-dr-cxr-merged.csv", index=False, header=True, sep="\t")

In [24]:
images = glob.glob("../data/VinDr-CXR/images/*/*.jpg")
print(len(images))
images[0]

# remove ../data/VinDr-CXR/images/test/ and ../data/VinDr-CXR/images/train from the paths to create image_ids
image_ids = [os.path.basename(image).split(".")[0] for image in images]
print(len(image_ids))
image_ids[0]

image_ids_to_paths = {image_id: image for image_id, image in zip(image_ids, images)}


18000
18000


In [25]:
combined_paths = combined['path'].to_list()


# remove .jpg from the end of the paths
combined_paths = [path.replace('.jpg', '') for path in combined_paths]

# check if all the paths are present in image_ids_to_paths
for path in combined_paths:
    if path not in image_ids_to_paths:
        print(path)


In [26]:
len(image_ids), len(set(image_ids))

(18000, 18000)

In [28]:
# copy all the images from the train and test folders to a single folder
os.makedirs("../data/VinDr-CXR/images_eval", exist_ok=True)
for path in tqdm(images):
    shutil.copy(path, "../data/VinDr-CXR/images_eval")

100%|██████████| 18000/18000 [00:02<00:00, 8312.54it/s] 


In [21]:
labels = ['Aortic enlargement', 'Atelectasis', 'Calcification',
       'Cardiomegaly', 'Clavicle fracture', 'Consolidation', 'Edema',
       'Emphysema', 'Enlarged PA', 'ILD', 'Infiltration', 'Lung Opacity',
       'Lung cavity', 'Lung cyst', 'Mediastinal shift', 'Nodule/Mass',
       'Pleural effusion', 'Pleural thickening', 'Pneumothorax',
       'Pulmonary fibrosis', 'Rib fracture', 'COPD', 'Lung tumor', 'Pneumonia',
       'Tuberculosis', 'No finding']

#print column sums and percentage of positive labels in both train and test separately
print("Column sums:")
for label in labels:
    print(f"{label}: {train_labels[label].sum()}, {test_labels[label].sum()}")

print("\nPercentage of positive labels:")
for label in labels:
    print(f"{label}: {train_labels[label].sum() / len(train_labels):.4f}, {test_labels[label].sum() / len(test_labels):.4f}")



Column sums:
Aortic enlargement: 7188, 220
Atelectasis: 267, 86
Calcification: 692, 194
Cardiomegaly: 5430, 309
Clavicle fracture: 29, 2
Consolidation: 505, 96
Edema: 14, 0
Emphysema: 101, 3
Enlarged PA: 156, 8
ILD: 620, 221
Infiltration: 949, 58
Lung Opacity: 2045, 84
Lung cavity: 80, 9
Lung cyst: 38, 2
Mediastinal shift: 270, 20
Nodule/Mass: 1431, 176
Pleural effusion: 2111, 111
Pleural thickening: 3257, 169
Pneumothorax: 199, 18
Pulmonary fibrosis: 3271, 217
Rib fracture: 150, 11
COPD: 46, 2
Lung tumor: 478, 80
Pneumonia: 1570, 246
Tuberculosis: 1479, 164
No finding: 31685, 2051

Percentage of positive labels:
Aortic enlargement: 0.1597, 0.0733
Atelectasis: 0.0059, 0.0287
Calcification: 0.0154, 0.0647
Cardiomegaly: 0.1207, 0.1030
Clavicle fracture: 0.0006, 0.0007
Consolidation: 0.0112, 0.0320
Edema: 0.0003, 0.0000
Emphysema: 0.0022, 0.0010
Enlarged PA: 0.0035, 0.0027
ILD: 0.0138, 0.0737
Infiltration: 0.0211, 0.0193
Lung Opacity: 0.0454, 0.0280
Lung cavity: 0.0018, 0.0030
Lung cyst: 

#### VinDr-PCXR

In [6]:
# Do the same as above for VinDr-PCXR

train_labels = pd.read_csv("../data/VinDr-PCXR/image_labels_train.csv")
test_labels = pd.read_csv("../data/VinDr-PCXR/image_labels_test.csv")
train_labels['split'] = 'train'
test_labels['split'] = 'test'

combined = pd.concat([train_labels, test_labels])
combined.rename(columns={'image_id': 'path'}, inplace=True)
combined['path'] = combined['path'].apply(lambda x: x + '.jpg')
combined.drop(columns=['rad_ID', 'Other disease'], inplace=True)

print(train_labels.columns)
print(test_labels.columns)
print(combined.columns)

print(train_labels.shape, test_labels.shape, combined.shape)

train_labels.head()
test_labels.head()
combined.head()

Index(['image_id', 'rad_ID', 'No finding', 'Bronchitis', 'Brocho-pneumonia',
       'Other disease', 'Bronchiolitis', 'Situs inversus', 'Pneumonia',
       'Pleuro-pneumonia', 'Diagphramatic hernia', 'Tuberculosis',
       'Congenital emphysema', 'CPAM', 'Hyaline membrane disease',
       'Mediastinal tumor', 'Lung tumor', 'split'],
      dtype='object')
Index(['image_id', 'rad_ID', 'No finding', 'Bronchitis', 'Brocho-pneumonia',
       'Other disease', 'Bronchiolitis', 'Situs inversus', 'Pneumonia',
       'Pleuro-pneumonia', 'Diagphramatic hernia', 'Tuberculosis',
       'Congenital emphysema', 'CPAM', 'Hyaline membrane disease',
       'Mediastinal tumor', 'Lung tumor', 'split'],
      dtype='object')
Index(['path', 'No finding', 'Bronchitis', 'Brocho-pneumonia', 'Bronchiolitis',
       'Situs inversus', 'Pneumonia', 'Pleuro-pneumonia',
       'Diagphramatic hernia', 'Tuberculosis', 'Congenital emphysema', 'CPAM',
       'Hyaline membrane disease', 'Mediastinal tumor', 'Lung tumor',

Unnamed: 0,path,No finding,Bronchitis,Brocho-pneumonia,Bronchiolitis,Situs inversus,Pneumonia,Pleuro-pneumonia,Diagphramatic hernia,Tuberculosis,Congenital emphysema,CPAM,Hyaline membrane disease,Mediastinal tumor,Lung tumor,split
0,6cb53aff85c71b98ad13d67a131708c6.jpg,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
1,40414c05687cdb156823c156967b13f0.jpg,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
2,0e4a464dfbf8abc6333c82f1b77b6455.jpg,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
3,f4d3fab0b71381e6b237dc36301e85a0.jpg,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
4,b58c9b1c89978a0b1f8533b7a2ca1088.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train


In [7]:
images = glob.glob("../data/VinDr-PCXR/images/*/*.jpg")
print(len(images))
images[0]

image_ids = [os.path.basename(image).split(".")[0] for image in images]
print(len(image_ids))
image_ids[0]

image_ids_to_paths = {image_id: image for image_id, image in zip(image_ids, images)}

9123
9123


In [8]:
combined_paths = combined['path'].to_list()
print(len(combined_paths))
combined_paths = [path.replace('.jpg', '') for path in combined_paths]

# check if all the paths are present in image_ids_to_paths
not_found = []
for path in combined_paths:
    if path not in image_ids:
        not_found.append(path)
print(len(not_found))
# len(image_ids), len(set(image_ids))
not_found = [path+'.jpg' for path in not_found]
# Drop the rows in combined where path is in not_found
combined = combined[~combined['path'].isin(not_found)]
combined
combined.to_csv("../data/VinDr-PCXR/vin-dr-pcxr-merged.csv", index=False, header=True, sep="\t")


9125
2


In [5]:
# copy all the images from the train and test folders to a single folder
os.makedirs("../data/VinDr-PCXR/images_eval", exist_ok=True)
for path in tqdm(images):
    shutil.copy(path, "../data/VinDr-PCXR/images_eval")

100%|██████████| 9123/9123 [00:00<00:00, 11201.66it/s]


In [11]:
labels = ['No finding', 'Bronchitis', 'Brocho-pneumonia', 'Bronchiolitis',
       'Situs inversus', 'Pneumonia', 'Pleuro-pneumonia',
       'Diagphramatic hernia', 'Tuberculosis', 'Congenital emphysema', 'CPAM',
       'Hyaline membrane disease', 'Mediastinal tumor', 'Lung tumor']

#print column sums and percentage of positive labels in both train and test separately
print("Column sums:")
for label in labels:
    print(f"{label}: {train_labels[label].sum()}, {test_labels[label].sum()}")

print("\nPercentage of positive labels:")
for label in labels:
    print(f"{label}: {train_labels[label].sum() / len(train_labels):.4f}, {test_labels[label].sum() / len(test_labels):.4f}")

Column sums:
No finding: 5143.0, 907.0
Bronchitis: 842.0, 174.0
Brocho-pneumonia: 545.0, 84.0
Bronchiolitis: 497.0, 90.0
Situs inversus: 11.0, 2.0
Pneumonia: 392.0, 89.0
Pleuro-pneumonia: 6.0, 0.0
Diagphramatic hernia: 3.0, 0.0
Tuberculosis: 14.0, 1.0
Congenital emphysema: 2.0, 0.0
CPAM: 5.0, 1.0
Hyaline membrane disease: 19.0, 3.0
Mediastinal tumor: 8.0, 1.0
Lung tumor: 5.0, 0.0

Percentage of positive labels:
No finding: 0.6655, 0.6492
Bronchitis: 0.1090, 0.1246
Brocho-pneumonia: 0.0705, 0.0601
Bronchiolitis: 0.0643, 0.0644
Situs inversus: 0.0014, 0.0014
Pneumonia: 0.0507, 0.0637
Pleuro-pneumonia: 0.0008, 0.0000
Diagphramatic hernia: 0.0004, 0.0000
Tuberculosis: 0.0018, 0.0007
Congenital emphysema: 0.0003, 0.0000
CPAM: 0.0006, 0.0007
Hyaline membrane disease: 0.0025, 0.0021
Mediastinal tumor: 0.0010, 0.0007
Lung tumor: 0.0006, 0.0000
