# Chest-prediction - Creation of Binary model to predict DISEASE / NO DISEASE - using MobileNetV2

In [1]:
import tensorflow as tf

tf.__version__

'2.15.0'

In [2]:
AUTOTUNE = tf.data.AUTOTUNE

In [3]:
import os
from pathlib import Path

project_name = "chest-predictor"
username = os.environ.get('USER')

#Set variables (origin url, destination folder, folder name)
data_dir = f"/Users/{username}/code/sachamagier/{project_name}/raw_data"
data_fname = 'resized_dataset.zip'

data_root = Path(os.path.join(data_dir, 'resized_dataset'))

all_image_paths = [str(path) for path in (data_root/"images"/"set_full").iterdir()]

In [4]:
import pandas as pd
# Load labels
labels_df = pd.read_csv(data_root/"Data_Entry_2017.csv")
labels_df.set_index('Image Index', inplace=True)

In [5]:
labels_df.head(5)

Unnamed: 0_level_0,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [6]:
# Create list of all image labels
all_image_labels = labels_df.loc[[os.path.basename(path) for path in all_image_paths], 'Finding Labels'].values

In [7]:
all_image_labels[:50]

array(['No Finding', 'No Finding', 'No Finding', 'Pneumonia',
       'No Finding', 'Pneumonia', 'No Finding', 'No Finding',
       'No Finding', 'No Finding', 'No Finding',
       'Atelectasis|Infiltration', 'Fibrosis', 'Infiltration',
       'No Finding', 'No Finding', 'No Finding', 'No Finding',
       'No Finding', 'Atelectasis', 'Cardiomegaly', 'Infiltration',
       'Consolidation', 'No Finding', 'No Finding', 'No Finding', 'Mass',
       'No Finding', 'Nodule', 'Edema|Infiltration|Nodule',
       'Consolidation', 'No Finding', 'No Finding', 'No Finding',
       'Effusion|Infiltration|Pneumothorax',
       'Infiltration|Pleural_Thickening', 'Consolidation|Infiltration',
       'Effusion', 'Infiltration', 'No Finding', 'No Finding',
       'No Finding', 'Nodule', 'Pneumothorax',
       'Effusion|Pleural_Thickening|Pneumothorax', 'No Finding',
       'Atelectasis|Infiltration', 'No Finding', 'Mass', 'No Finding'],
      dtype=object)

In [8]:
new_labels = []
for label in all_image_labels:
    if label == 'No Finding':
        new_labels.append(0)
    else:
        new_labels.append(1)

print(new_labels[:50])

[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0]


In [9]:
len(new_labels)

111601

In [10]:
#labels_df = pd.DataFrame(new_labels)

In [11]:
#labels_df.to_csv("../raw_data/encoded_labels_binary.csv",index=False)

In [12]:
#df_labels = pd.read_csv("../raw_data/encoded_labels_binary.csv")

In [13]:
new_encoded_values = tf.convert_to_tensor(new_labels, dtype=tf.float32)

In [14]:
encoded_values = new_encoded_values

In [15]:
label_ds = tf.data.Dataset.from_tensor_slices(encoded_values)

In [16]:
label_ds

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.float32, name=None)>

In [17]:
img_path = all_image_paths[0]
img_path

'/Users/arnodebelle/code/sachamagier/chest-predictor/raw_data/resized_dataset/images/set_full/00018805_001.png'

In [18]:
def preprocess_image(image):
    # convert it to rgb => VGG16 takes RGB image only
    image = tf.image.grayscale_to_rgb(image)

    # Resize the image to a width and a height of 256 pixels
    image = tf.image.resize(image, [224, 224], method='nearest')
    # Normalize the pixel values of the image to the range of [0, 1] => WE DON'T NEED FOR VGG16 (but we finally use another one...)
    image /= 255

    return image

In [19]:
def load_and_preprocess_image(path):
    image = tf.io.read_file(path)
    # Decode the image contents
    img_tensor = tf.image.decode_png(image)[:,:,0:1]
    # return img_tensor
    return preprocess_image(img_tensor)

In [20]:
path_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)

In [21]:
image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)

In [22]:
path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, encoded_values))

# The tuples are unpacked into the positional arguments of the mapped function
def load_and_preprocess_from_path_label(path, label):
    return load_and_preprocess_image(path), label

image_label_ds = path_label_ds.map(load_and_preprocess_from_path_label)
image_label_ds

<_MapDataset element_spec=(TensorSpec(shape=(224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.float32, name=None))>

In [23]:
train_size = int(0.6 * len(all_image_paths))
val_size = int(0.2 * len(all_image_paths))
test_size = int(0.2 * len(all_image_paths))

In [24]:
train_ds = image_label_ds.take(train_size)

In [25]:
val_ds = image_label_ds.skip(train_size).take(val_size)

In [26]:
test_ds = image_label_ds.skip(train_size + val_size).take(test_size)

In [27]:
BATCH_SIZE = 32

def creating_batch_dataset(dataset, BATCH_SIZE, AUTOTUNE):
    ds = dataset.repeat()
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds

In [28]:
ds = creating_batch_dataset(image_label_ds, BATCH_SIZE, AUTOTUNE)
train_ds = creating_batch_dataset(train_ds, BATCH_SIZE, AUTOTUNE)
val_ds = creating_batch_dataset(val_ds, BATCH_SIZE, AUTOTUNE)
test_ds = creating_batch_dataset(test_ds, BATCH_SIZE, AUTOTUNE)

In [29]:
# rearrange to -1 to 1
def change_range(image,label):
  return 2*image-1, label

keras_ds = ds.map(change_range)

In [30]:
image_batch, label_batch = next(iter(keras_ds))

In [31]:
full_ds = ds.map(change_range)
train_converted_ds = train_ds.map(change_range)
val_converted_ds = val_ds.map(change_range)
test_converted_ds = test_ds.map(change_range)

In [32]:
mobile_net = tf.keras.applications.MobileNetV2(input_shape=(224, 224, 3), include_top=False)
mobile_net.trainable=False

In [33]:
model = tf.keras.Sequential([
  mobile_net,
  tf.keras.layers.GlobalAveragePooling2D(),
  tf.keras.layers.Dense(1, activation = 'sigmoid')])

In [34]:
from tensorflow.keras import optimizers
model.compile(optimizer=optimizers.legacy.Adam(),
              loss='binary_crossentropy',
              metrics=["accuracy"])

In [35]:
STEPS_PER_EPOCH=tf.math.ceil(train_size/BATCH_SIZE).numpy() #alternative = len(all_image_paths)
STEPS_PER_EPOCH

2093.0

In [36]:
STEPS_PER_EPOCH_VAL=tf.math.ceil(val_size/BATCH_SIZE).numpy() #alternative = len(all_image_paths)
STEPS_PER_EPOCH_VAL

698.0

In [37]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

es = EarlyStopping(monitor='accuracy', #'val_multilabel_f1_score',  # the name of the metric will be its prefix + '_' + its function name
                   mode='max',
                   patience=2,
                   verbose=1,
                   restore_best_weights=True)

model_checkpoint = ModelCheckpoint(filepath= os.path.join(data_dir, 'best_binary_model.keras'),
                                   save_best_only=True,
                                   monitor='accuracy')

In [38]:
EPOCHS = 30

history = model.fit(train_converted_ds, 
                    validation_data=val_converted_ds, 
                    epochs=EPOCHS, 
                    steps_per_epoch=STEPS_PER_EPOCH, 
                    validation_steps=STEPS_PER_EPOCH_VAL,
                    callbacks=[es, model_checkpoint])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 12: early stopping


In [40]:
from tensorflow.keras.models import save_model

models_dir = '../models'

# Ensure that the models directory exists
os.makedirs(data_dir, exist_ok=True)


# Save the trained model to the models directory
save_model(model, os.path.join(data_dir, 'ADE_final_binary_model.keras'))

In [41]:
STEPS_PER_EPOCH_TEST=tf.math.ceil(test_size/BATCH_SIZE).numpy() #alternative = len(all_image_paths)
STEPS_PER_EPOCH_TEST

698.0

In [42]:
results = model.evaluate(test_ds, steps=STEPS_PER_EPOCH_TEST, verbose=0)
print(f"Test Accuracy: {results[0]}")

Test Accuracy: 0.6568540930747986


In [43]:
results

[0.6568540930747986, 0.6355658769607544]