In [None]:
import os

# Relative location of the NIH_14 dataset directory.
NIH_14_DATASET_PATH = '../NIH_14/'

# Work with an absolute path from here on, and stop immediately if the
# directory does not exist.
dataset_path = os.path.abspath(NIH_14_DATASET_PATH)
assert os.path.exists(dataset_path)

# Display the entries found directly inside the dataset directory.
os.listdir(dataset_path)

In [None]:
import pandas as pd

# Read the 'Data_Entry_2017.csv' table stored in the dataset directory.
data_entry_csv_path = os.path.join(dataset_path, 'Data_Entry_2017.csv')
data = pd.read_csv(data_entry_csv_path)

# Report the table size, then preview the first rows.
print(f"Data Shape : {data.shape}")
data.head()

In [None]:
# Keep only rows whose 'Patient Age' is below 100; rows with an age of 100 or
# more are dropped.
is_age_below_100 = data['Patient Age'] < 100
data = data[is_age_below_100]

print(f"New dataset dimensions: {data.shape}")

In [None]:
# Keep only the image file name and its raw '|'-separated label string.
# .copy() makes `data` an independent frame, so the columns added in later
# cells are written to it directly instead of to a slice of the age-filtered
# frame (which makes pandas emit SettingWithCopyWarning).
data = data[['Image Index', 'Finding Labels']].copy()
print(data.shape)

In [None]:
import glob2

# Recursively collect every PNG file below the dataset directory.
all_images = sorted(glob2.glob(os.path.join(dataset_path, '**', '*.png')))
print(f'Number of Images: {len(all_images)}')

# Lookup table: image file name -> full path on disk.
all_image_paths = {os.path.basename(x): x for x in all_images}

# Add path of images as column to the dataset. assign() returns a new frame,
# so nothing is written into a slice of an earlier frame.
data = data.assign(Path=data['Image Index'].map(all_image_paths.get))

# Rows whose image file was not found get a missing Path. The later cell that
# copies the test images cannot handle a missing path, so report such rows
# and drop them here.
missing_path_mask = data['Path'].isna()
if missing_path_mask.any():
    print(f'Dropping {int(missing_path_mask.sum())} rows without an image file on disk')
    data = data.loc[~missing_path_mask].copy()

data.sample(5, random_state=3)

In [None]:
import numpy as np
from itertools import chain

# Split every '|'-separated 'Finding Labels' entry into its individual labels,
# flatten the pieces into one list and keep the sorted distinct values.
split_findings = (finding.split('|') for finding in data['Finding Labels'])
all_labels = np.unique(list(chain.from_iterable(split_findings)))
print(all_labels)

In [None]:
# Drop the 'No Finding' entry and turn the array into a plain list.
no_finding_positions = np.where(all_labels == 'No Finding')
all_labels = list(np.delete(all_labels, no_finding_positions))
all_labels

In [None]:
# Split each '|'-separated label string once, so every label can be matched
# exactly against the list of findings. A substring test on the raw string
# would also fire when one label name happens to be contained in another.
findings_per_image = data['Finding Labels'].map(lambda finding: finding.split('|'))

for c_label in all_labels:
    if len(c_label) > 1:  # leave out empty labels
        # Add a column for each disease: 1 when the label is one of the
        # image's findings, 0 otherwise.
        data[c_label] = findings_per_image.map(
            lambda findings, label=c_label: 1 if label in findings else 0
        )

print(f"Dataset Dimension: {data.shape}")
data.head()

In [None]:
# Number of rows for each distinct 'Finding Labels' string.
label_counts = data.loc[:, 'Finding Labels'].value_counts()
label_counts

In [None]:
# Keep only the rows whose exact 'Finding Labels' string occurs more than
# 11 times in the dataset.
rows_per_label_string = data.groupby('Finding Labels')['Finding Labels'].transform('size')
data = data[rows_per_label_string > 11]

# Recompute the per-label-string counts on the reduced dataset.
label_counts = data['Finding Labels'].value_counts()
print(label_counts.shape)
print(label_counts)

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 30% of all rows as the test set.
# NOTE(review): neither split passes `stratify`; confirm a purely random
# split is intended.
train_and_valid_df, test_df = train_test_split(data, test_size=0.30, random_state=2018)

# Second split: 30% of the remaining rows become the validation set.
train_df, valid_df = train_test_split(train_and_valid_df, test_size=0.30, random_state=2018)

print(f'Training: {train_df.shape[0]} Validation: {valid_df.shape[0]} Testing: {test_df.shape[0]}')

In [None]:
# Array with the image path of every row in the test split.
test_image_paths = test_df['Path'].to_numpy()

In [None]:
# shutil is not imported anywhere else in this notebook, so bring it into
# scope here; without it the copy loop below raises NameError.
import shutil

# Save images to test_images directory. This will be copied to coral dev board for the inference purpose.
dst_dir = os.path.abspath('./test_images/')

# exist_ok=True makes the cell safe to re-run when the directory is already there.
os.makedirs(dst_dir, exist_ok=True)

for img_path_item in test_image_paths:
    src = img_path_item
    # os.path.basename handles the platform's path separator, unlike split('/').
    base_name = os.path.basename(img_path_item)
    dst = os.path.join(dst_dir, base_name)
    shutil.copy(src, dst)

In [None]:
# Columns kept for the test split: the image path first, followed by the
# per-label indicator columns in a fixed order.
test_columns = [
    'Path',
    'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
    'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule',
    'Pleural_Thickening', 'Pneumonia', 'Pneumothorax',
]
test_df = test_df[test_columns]

In [None]:
# Map the first column of test_df (the image path) to the list of values held
# in the remaining columns of the same row.
# Columns are addressed by position through .iloc: integer keys such as
# row[0] on a label-indexed Series rely on a deprecated positional fallback
# in recent pandas. tolist() also converts the values to plain Python
# objects, and avoids the slow row-by-row iterrows() loop.
label_rows = test_df.iloc[:, 1:].to_numpy().tolist()
actual_labels = dict(zip(test_df.iloc[:, 0], label_rows))

In [None]:
import json

In [None]:
# Write the actual labels to 'actual.json'. They will be compared with the
# predictions made on the edge TPU to calculate the AUC score.
with open('actual.json', mode='w') as actual_file:
    json.dump(actual_labels, fp=actual_file)