In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
jtplot.style(theme='onedork', context='notebook')

In [2]:
img_metadata = pd.read_csv("../dataset/Data_Entry_2017.csv")
unusable_images = pd.read_csv("../dataset/cxr14_bad_labels.csv").drop(["Index"], axis=1)

In [3]:
finding_labels = set()
for labels in pd.unique(img_metadata["Finding Labels"]):
    for label in labels.split("|"):
        finding_labels.add(label.lower().replace(" ", "_"))

print("All labels:")
for i, label in enumerate(finding_labels):
    print(f"{i + 1}. {label}")
print(f"Total labels: {len(finding_labels)}")

All labels:
1. emphysema
2. hernia
3. pleural_thickening
4. edema
5. cardiomegaly
6. atelectasis
7. fibrosis
8. nodule
9. effusion
10. pneumothorax
11. infiltration
12. mass
13. no_finding
14. consolidation
15. pneumonia
Total labels: 15


In [4]:
dx_series = pd.Series(list(finding_labels))
dx_series.to_csv("../dataset/dx_labels.csv", index=False, header=["dx_labels"])

In [5]:
img_metadata.rename(columns = {"Image Index": "img_filename", "Patient ID": "pt_id", "Patient Age": "pt_age", "Patient Gender": "pt_sex", "View Position": "view_position", "Image Width": "img_width", "Image Height": "img_height", "Spacing X": "x_spacing", "Spacing Y": "y_spacing"}, inplace=True)
img_metadata.columns

Index(['img_filename', 'Finding Labels', 'Follow-up #', 'pt_id', 'pt_age',
       'pt_sex', 'view_position', 'img_width', 'img_height', 'x_spacing',
       'y_spacing'],
      dtype='object')

In [6]:
def has_label(label, row):
    return 1 if label in row["Finding Labels"].split("|") else 0

for label in finding_labels:
    img_metadata[label] = img_metadata.apply(lambda row: has_label(label, row), axis=1)

img_metadata = img_metadata.drop(["Finding Labels", "Follow-up #"], axis=1)

In [7]:
unusable_images.head()

Unnamed: 0,File label,Inverted,Not frontal,Rotated
0,00000583_024.png,1.0,0.0,0.0
1,00002180_000.png,1.0,0.0,0.0
2,00002300_026.png,1.0,0.0,0.0
3,00002371_015.png,1.0,0.0,0.0
4,00006209_001.png,1.0,0.0,0.0


In [8]:
to_drop = []
unusable_list = unusable_images["File label"].tolist()
for index, row in img_metadata.iterrows():
    if row["img_filename"] in unusable_list:
        to_drop.append(index)
print(f"Total images to drop: {len(pd.unique(unusable_images['File label']))}")
print(f"Total indexes found: {len(to_drop)}")


Total images to drop: 257
Total indexes found: 257


In [9]:
usable_imgs = img_metadata.drop(to_drop, axis=0)
print(f"Total images before drop: {len(img_metadata)}")
print(f"Images usable: {len(usable_imgs)}")
print(f"Total dropped: {len(img_metadata) - len(usable_imgs)}")

Total images before drop: 112120
Images usable: 111863
Total dropped: 257


In [10]:
usable_imgs = usable_imgs[usable_imgs["pt_age"] < 100]
usable_imgs.describe()

Unnamed: 0,pt_id,pt_age,img_width,img_height,x_spacing,y_spacing,emphysema,hernia,pleural_thickening,edema,...,atelectasis,fibrosis,nodule,effusion,pneumothorax,infiltration,mass,no_finding,consolidation,pneumonia
count,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0,...,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0,111847.0
mean,14352.069854,46.873729,2646.817215,2486.516482,0.155634,0.155634,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,8404.640184,16.593094,340.900387,401.416234,0.016178,0.016178,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1.0,1143.0,966.0,0.115,0.115,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7317.0,35.0,2500.0,2048.0,0.143,0.143,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,13995.0,49.0,2520.0,2544.0,0.143,0.143,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,20673.0,59.0,2992.0,2991.0,0.168,0.168,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,30805.0,95.0,3827.0,3567.0,0.1988,0.1988,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
usable_imgs.to_csv("../dataset/usable_img_metadata.csv", index=False)