# Chest-prediction

## PRELIMINARY ACTION !

⚠️ Please go to ➤ https://drive.google.com/file/d/1lLrHbpUQE-Kd-jZ68Uk7SFwawbzqf6Av/view?usp=drive_link

and download the dataset.

Put the zip file into your "*raw_data*" folder

## data loading

In [56]:
import numpy as np
import pandas as pd
import pdb

import os
from pathlib import Path
from PIL import Image

### Loading images

In [57]:
PROJECT_NAME = "chest-predictor" #to be adapted depending on the name of the Project Name in your system
NUMBER_OF_IMAGES = 3000 # nbr of images to be loaded or 'full' to load the entire dataset (+100k images)

In [58]:
USERNAME = os.environ.get('USER')

In [59]:
# I changed this so that it works on both windows and mac, but if it doesn't work uncomment the other one


# #LOCAL_DATA_PATH = Path(f"/Users/{USERNAME}/code/sachamagier/{PROJECT_NAME}/raw_data/resized_dataset")
LOCAL_DATA_PATH = Path(os.path.expanduser(f"~/code/sachamagier/{PROJECT_NAME}/raw_data/resized_dataset"))

In [60]:
print(f"LOCAL_DATA_PATH: {LOCAL_DATA_PATH}")

LOCAL_DATA_PATH: /home/rvnmll/code/sachamagier/chest-predictor/raw_data/resized_dataset


In [61]:
def loading_data():
    """This function either get all the images if the user set NUMBER_OF_IMAGES
    to 'full' or the number of imgaes otherwise """

    images_data = []

    # Define the path to the folder
    folder_path = f'../raw_data/resized_dataset/images/set_full/'


    # Get a list of all files in the folder
    file_list = os.listdir(folder_path)

    # Filter the list to only include image files
    image_files = [f for f in file_list if f.endswith('.png') or f.endswith('.jpg') or f.endswith('.jpeg')]


    # Loop through the first NUMBER_OF_IMAGES
    for i, image_file in enumerate(image_files):

        # Stop the loop after NUMBER_OF_IMAGES iterations
        if i == NUMBER_OF_IMAGES:
            break

        # Open the image file
        with Image.open(folder_path + image_file) as image:
            # Add the image to the list
            images_data.append((image_file, np.array(image)))

    return images_data

In [62]:
import urllib.request
import zipfile
import shutil


if LOCAL_DATA_PATH.is_dir():
    print("Load local data...")
    # loading data into data
    images_data = loading_data()
else:
    print("Unziping file and loading the data...")

    output_path = "../raw_data/resized_dataset.zip"
    # unzip the file
    with zipfile.ZipFile(output_path, "r") as zip_ref:
        for file_info in zip_ref.infolist():
            zip_ref.extract(file_info, "../raw_data/")

    if Path("../raw_data/__MACOSX").is_dir():
        # remove the __MACOSX folder if it exists
        shutil.rmtree("../raw_data/__MACOSX")

    # remove the zip file
    os.remove(output_path)
    images_data = loading_data()

print("data loaded.")

Load local data...
data loaded.


In [63]:
# Create a dataframe from the list of images and their indices
images_df = pd.DataFrame(images_data, columns=['Image Index', 'image'])

# Set the index of the dataframe to the 'Image Index' column
images_df = images_df.set_index('Image Index').sort_index(ascending=True)

In [64]:
images_df.shape

(3000, 1)

In [65]:
images_df['image'][0].shape

  images_df['image'][0].shape


(256, 256)

### Loading labels data

In [66]:
labels_df = pd.read_csv('../raw_data/resized_dataset/Data_Entry_2017.csv')

In [67]:
labels_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [68]:
unique_lab = labels_df['Finding Labels'].unique()

In [69]:
len(unique_lab)

836

### Merging Images with labels and creating a new DF

In [70]:
# Merge the image_df and labels_df dataframes on the 'Image Index' column
merged_df = pd.merge(images_df, labels_df[['Image Index', 'Finding Labels']], left_index=True, right_on='Image Index', how='inner')

# Rename the 'Finding Labels' column to 'labels'
merged_df = merged_df.rename(columns={'Finding Labels': 'labels'})

# Set the index of the dataframe to the 'Image Index' column
merged_df = merged_df.set_index('Image Index').sort_index(ascending=True)

In [71]:
merged_df.head()

Unnamed: 0_level_0,image,labels
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1
00000005_003.png,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",No Finding
00000006_000.png,"[[45, 43, 38, 34, 30, 27, 24, 21, 19, 17, 16, ...",No Finding
00000013_031.png,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",Emphysema|Mass
00000017_002.png,"[[14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, ...",No Finding
00000018_000.png,"[[32, 30, 26, 24, 24, 23, 23, 22, 22, 22, 21, ...",No Finding


### droping the rows with images of shape (256, 256, 4)

In [72]:
# Find the images with shape (256, 256, 4)
images_with_shape_4 = [img for img in merged_df['image'] if np.shape(img) == (256, 256, 4)]

# Print the number of images with shape (256, 256, 4)
print(len(images_with_shape_4))


9


In [73]:
# Find the indices of the images with shape (256, 256, 4)
indices_to_drop = merged_df[merged_df['image'].apply(lambda x: np.shape(x) == (256, 256, 4))].index

# Drop the rows with the images with shape (256, 256, 4)
merged_df = merged_df.drop(indices_to_drop)

In [74]:
# Find the images with shape (256, 256, 4)
images_with_shape_4 = [img for img in merged_df['image'] if np.shape(img) == (256, 256, 4)]

# Print the number of images with shape (256, 256, 4)
print(len(images_with_shape_4))

0


### Encoding labels

In [75]:
# Define the list of labels
labels = ['Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema',
           'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening',
           'Cardiomegaly', 'Nodule', 'Mass', 'Hernia', 'No Finding']

# Create a new dataframe with one-hot encoded columns for the labels
one_hot_df = merged_df['labels'].str.get_dummies(sep='|')

# Concatenate the one-hot encoded dataframe with the original dataframe
merged_df = pd.concat([merged_df, one_hot_df], axis=1)


In [76]:
merged_df.columns

Index(['image', 'labels', 'Atelectasis', 'Cardiomegaly', 'Consolidation',
       'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration',
       'Mass', 'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia',
       'Pneumothorax'],
      dtype='object')

## Creating 'X' and 'y'

In [77]:
y = one_hot_df

In [78]:
X = merged_df['image']

In [79]:
X.info(memory_usage='deep')

<class 'pandas.core.series.Series'>
Index: 2991 entries, 00000005_003.png to 00030797_000.png
Series name: image
Non-Null Count  Dtype 
--------------  ----- 
2991 non-null   object
dtypes: object(1)
memory usage: 187.5 MB


In [80]:
X = np.array([np.reshape(img, (256, 256, 1)) for img in X])

In [81]:
X.shape

(2991, 256, 256, 1)

## Creating a simple model

In [82]:
#Imports
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Rescaling, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [83]:
##### Splitting X and y in train/val/test #####

# First split: split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Second split: split training data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=69)

In [84]:
#Image parameters
IMG_HEIGHT = 256
IMG_WIDTH = 256
BATCH_SIZE = 32
NUM_CLASSES = 15

In [85]:
#import metrics
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score

# Some extra metrics that we'll use to evaluate our model
def EMR(y_pred, y_test):
    """ Exact Match ratio, this takes the ratio of exact matches per samble.
    A prediction of a sample that accurately predicts ALL lables will be considered
    an Exact match, we cound all the exact matches and divide them by the sample
    amount to get the EM ratio
    """
    return np.all(y_pred == y_test, axis=1).mean()

def hamming_score_value(y_pred, y_test):
    """Proportion of the predicted correct labels to the total number (predicted and actual) of labels
    for that instance. Overall accuracy is the average across all instances.
    We use the hamming loss because it's the opposite of the hamming score, it reports
    how many times on average, the relevance of an example to a class label is incorrectly predicted.
    """
    return 1 - hamming_loss(y_pred, y_test)

def average_precision(y_pred, y_test):
    return precision_score(y_test, y_pred, average='samples', zero_division=0)
def average_recall(y_pred, y_test):
    return recall_score(y_test, y_pred, average='samples')

def evaluating_model(y_pred, y_test):
    print('Exact Match Ratio:', EMR(y_pred, y_test)),
    print('Hamming Score:', hamming_score_value(y_pred, y_test)),
    print('Average Precision:', average_precision(y_pred, y_test)),
    print('Average Recall:', average_recall(y_pred, y_test))


In [99]:
# Define a simple model
def simple_model():

    model = Sequential()
    model.add(Rescaling(1./255, input_shape=(IMG_HEIGHT,IMG_WIDTH,1)))

    model.add(Conv2D(16, kernel_size=10, activation='relu'))
    model.add(MaxPooling2D(3))

    model.add(Conv2D(32, kernel_size=8, activation="relu"))
    model.add(MaxPooling2D(3))

    model.add(Conv2D(32, kernel_size=6, activation="relu"))
    model.add(MaxPooling2D(3))

    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [89]:
models_dir = '../models' #This is where the best models will be saved

# Ensure that the directory exists
os.makedirs(models_dir, exist_ok=True)

es = EarlyStopping(monitor='val_loss',
                   patience=10,
                   verbose=0,
                   restore_best_weights=True)

model_checkpoint = ModelCheckpoint(filepath= os.path.join(models_dir, 'simple_model_best.h5'),
                                   save_best_only=True,
                                   monitor='val_loss',
                                   mode='min')


In [105]:
s_model = simple_model()
s_model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling_8 (Rescaling)     (None, 256, 256, 1)       0         
                                                                 
 conv2d_24 (Conv2D)          (None, 247, 247, 16)      1616      
                                                                 
 max_pooling2d_24 (MaxPooli  (None, 82, 82, 16)        0         
 ng2D)                                                           
                                                                 
 conv2d_25 (Conv2D)          (None, 75, 75, 32)        32800     
                                                                 
 max_pooling2d_25 (MaxPooli  (None, 25, 25, 32)        0         
 ng2D)                                                           
                                                                 
 conv2d_26 (Conv2D)          (None, 20, 20, 32)       

In [106]:
# Train the model
history = s_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=BATCH_SIZE,
    callbacks=[es, model_checkpoint])

Epoch 1/10
Epoch 2/10


  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [117]:
test_loss, test_accuracy = s_model.evaluate(X_test, y_test)
val_loss, val_accuracy = s_model.evaluate(X_val, y_val)
print('val accuracy:', val_accuracy)
print('test accuracy:', test_accuracy)

val accuracy: 0.5469728708267212
test accuracy: 0.5141903162002563


In [111]:
y_pred = s_model.predict(X_test)



In [112]:
y_pred[0]

array([0.1611082 , 0.08431179, 0.04302759, 0.0482697 , 0.23415886,
       0.0681626 , 0.03675291, 0.01000772, 0.30514473, 0.14327462,
       0.501417  , 0.10649241, 0.03310282, 0.00968514, 0.08492978],
      dtype=float32)

In [113]:
y.head()

Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
00000005_003.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
00000006_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
00000013_031.png,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
00000017_002.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
00000018_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [121]:
#Because we need binary results to evaluate the model we'll put a threshold
#that will transform the values to either 0 or 1

threshold = 0.5
y_pred_binary = (y_pred >= threshold).astype(int)

print(evaluating_model(y_pred_binary, y_test))

Exact Match Ratio: 0.44574290484140233
Hamming Score: 0.9225375626043406
Average Precision: 0.44741235392320533
Average Recall: 0.44629938786866996
None


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## We train a more complex model to see how our scores change 

In [162]:
# Define a more complex model
model = Sequential()
model.add(Rescaling(1./255, input_shape=(IMG_HEIGHT,IMG_WIDTH,1)))

model.add(Conv2D(16, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(2))
model.add(Dropout(0.3))

model.add(Conv2D(32, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(2))
model.add(Dropout(0.3))

model.add(Conv2D(64, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(2))
model.add(Dropout(0.4))

model.add(Conv2D(128, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(2))
model.add(Dropout(0.5))

model.add(Flatten())

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(25, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(NUM_CLASSES, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])


In [138]:
# model_checkpoint = ModelCheckpoint(filepath= os.path.join(models_dir, 'better_model.h5'),
#                                    save_best_only=True,
#                                    monitor='val_loss',
#                                    mode='min')


In [157]:
model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling_15 (Rescaling)    (None, 256, 256, 1)       0         
                                                                 
 conv2d_48 (Conv2D)          (None, 254, 254, 16)      160       
                                                                 
 batch_normalization_6 (Bat  (None, 254, 254, 16)      64        
 chNormalization)                                                
                                                                 
 max_pooling2d_48 (MaxPooli  (None, 127, 127, 16)      0         
 ng2D)                                                           
                                                                 
 dropout_6 (Dropout)         (None, 127, 127, 16)      0         
                                                                 
 conv2d_49 (Conv2D)          (None, 125, 125, 32)    

In [163]:
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=BATCH_SIZE,
    callbacks=[es])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [164]:
y_pred = model.predict(X_test)



In [165]:
threshold = 0.5
y_pred_binary = (y_pred >= threshold).astype(int)

print(evaluating_model(y_pred_binary, y_test))

Exact Match Ratio: 0.2938230383973289
Hamming Score: 0.9228714524207011
Average Precision: 0.2938230383973289
Average Recall: 0.2938230383973289
None


## I try also vgg16 architecture to see if the performance increases 

In [86]:
def vg16_based__model():
    model = Sequential()

    model.add(Rescaling(1./255, input_shape=(IMG_HEIGHT,IMG_WIDTH,1)))

    model.add(Conv2D(64, kernel_size=3, padding = 'same', activation='relu'))
    model.add(Conv2D(64, kernel_size=3, padding = 'same', activation='relu'))

    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    model.add(Conv2D(128, kernel_size=3, padding = 'same', activation='relu'))
    model.add(Conv2D(128, kernel_size=3, padding = 'same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    model.add(Conv2D(256, kernel_size=3, padding = 'same', activation='relu'))
    model.add(Conv2D(256, kernel_size=3, padding = 'same', activation='relu'))
    model.add(Conv2D(256, kernel_size=3, padding = 'same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    model.add(Conv2D(512, kernel_size=3, padding = 'same', activation='relu'))
    model.add(Conv2D(512, kernel_size=3, padding = 'same', activation='relu'))
    model.add(Conv2D(512, kernel_size=3, padding = 'same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    model.add(Conv2D(512, kernel_size=3, padding = 'same', activation='relu'))
    model.add(Conv2D(512, kernel_size=3, padding = 'same', activation='relu'))
    model.add(Conv2D(512, kernel_size=3, padding = 'same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    model.add(Flatten())

    model.add(Dense(256, activation='relu', name = 'fc1'))
    model.add(Dense(128, activation='relu', name = 'fc2'))
    model.add(Dense(25, activation='relu', name = 'fc3'))
    model.add(Dense(NUM_CLASSES, activation='sigmoid', name = 'ouput'))

    model.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return model

In [87]:
model = vg16_based__model()
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling_1 (Rescaling)     (None, 256, 256, 1)       0         
                                                                 
 conv2d_13 (Conv2D)          (None, 256, 256, 64)      640       
                                                                 
 conv2d_14 (Conv2D)          (None, 256, 256, 64)      36928     
                                                                 
 max_pooling2d_5 (MaxPoolin  (None, 128, 128, 64)      0         
 g2D)                                                            
                                                                 
 conv2d_15 (Conv2D)          (None, 128, 128, 128)     73856     
                                                                 
 conv2d_16 (Conv2D)          (None, 128, 128, 128)     147584    
                                                      

In [1]:
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=128,
    callbacks=[es])

NameError: name 'model' is not defined

## We can also try to create a transfer model


In [169]:
from tensorflow.keras.applications.vgg16 import VGG16

def load_model():

    # $CHALLENGIFY_BEGIN

    model = VGG16(weights="imagenet", include_top=False, input_shape=X_train[0].shape)

    # $CHALLENGIFY_END

    return model

In [170]:
transfer_model = load_model()

transfer_model.summary()

ValueError: The input must have 3 channels; Received `input_shape=(256, 256, 1)`