# Chest-prediction

## PRELIMINARY ACTION !

⚠️ Please go to ➤ https://drive.google.com/file/d/1lLrHbpUQE-Kd-jZ68Uk7SFwawbzqf6Av/view?usp=drive_link

and download the dataset.

Put the zip file into your "*raw_data*" folder

## data loading

In [2]:
import numpy as np
import pandas as pd
import pdb

import os
from pathlib import Path
from PIL import Image

### Loading images

In [3]:
PROJECT_NAME = "chest-predictor" #to be adapted depending on the name of the Project Name in your system
NUMBER_OF_IMAGES = 1000 # nbr of images to be loaded or 'full' to load the entire dataset (+100k images)

In [4]:
USERNAME = os.environ.get('USER')

In [5]:
LOCAL_DATA_PATH = Path(f"/Users/{USERNAME}/code/sachamagier/{PROJECT_NAME}/raw_data/resized_dataset")

In [6]:
print(f"LOCAL_DATA_PATH: {LOCAL_DATA_PATH}")

LOCAL_DATA_PATH: /Users/arnodebelle/code/sachamagier/chest-predictor/raw_data/resized_dataset


In [7]:
def loading_data():
    """This function either get all the images if the user set NUMBER_OF_IMAGES
    to 'full' or the number of imgaes otherwise """

    images_data = []

    # Define the path to the folder
    folder_path = f'../raw_data/resized_dataset/images/set_full/'

    
    # Get a list of all files in the folder
    file_list = os.listdir(folder_path)

    # Filter the list to only include image files
    image_files = [f for f in file_list if f.endswith('.png') or f.endswith('.jpg') or f.endswith('.jpeg')]


    # Loop through the first NUMBER_OF_IMAGES
    for i, image_file in enumerate(image_files):

        # Stop the loop after NUMBER_OF_IMAGES iterations
        if i == NUMBER_OF_IMAGES:
            break

        # Open the image file
        with Image.open(folder_path + image_file) as image:
            # Add the image to the list
            images_data.append((image_file, np.array(image)))

    return images_data

In [8]:
import urllib.request
import zipfile
import shutil


if LOCAL_DATA_PATH.is_dir():
    print("Load local data...")
    # loading data into data
    images_data = loading_data()
else:
    print("Unziping file and loading the data...")

    output_path = "../raw_data/resized_dataset.zip"
    # unzip the file
    with zipfile.ZipFile(output_path, "r") as zip_ref:
        for file_info in zip_ref.infolist():
            zip_ref.extract(file_info, "../raw_data/")
            
    if Path("../raw_data/__MACOSX").is_dir():
        # remove the __MACOSX folder if it exists
        shutil.rmtree("../raw_data/__MACOSX")
        
    # remove the zip file
    os.remove(output_path)
    images_data = loading_data()

print("data loaded.")

Load local data...
data loaded.


In [9]:
# Create a dataframe from the list of images and their indices
images_df = pd.DataFrame(images_data, columns=['Image Index', 'image'])

# Set the index of the dataframe to the 'Image Index' column
images_df = images_df.set_index('Image Index').sort_index(ascending=True)

In [10]:
images_df.shape

(1000, 1)

In [11]:
images_df['image'][0].shape

  images_df['image'][0].shape


(256, 256)

### Loading labels data

In [12]:
labels_df = pd.read_csv('../raw_data/resized_dataset/Data_Entry_2017.csv')

In [13]:
labels_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [14]:
unique_lab = labels_df['Finding Labels'].unique()

In [15]:
len(unique_lab)

836

### Merging Images with labels and creating a new DF

In [16]:
# Merge the image_df and labels_df dataframes on the 'Image Index' column
merged_df = pd.merge(images_df, labels_df[['Image Index', 'Finding Labels']], left_index=True, right_on='Image Index', how='inner')

# Rename the 'Finding Labels' column to 'labels'
merged_df = merged_df.rename(columns={'Finding Labels': 'labels'})

# Set the index of the dataframe to the 'Image Index' column
merged_df = merged_df.set_index('Image Index').sort_index(ascending=True)

In [17]:
merged_df.head()

Unnamed: 0_level_0,image,labels
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1
00000013_008.png,"[[5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6,...",No Finding
00000013_020.png,"[[43, 34, 29, 27, 25, 24, 25, 24, 25, 25, 26, ...",Pneumothorax
00000013_034.png,"[[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, ...",Emphysema|Pneumothorax
00000038_001.png,"[[27, 14, 6, 3, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, ...",Atelectasis|Infiltration
00000039_004.png,"[[198, 192, 188, 185, 180, 176, 174, 171, 169,...",Effusion


### droping the rows with images of shape (256, 256, 4)

In [18]:
# Find the images with shape (256, 256, 4)
images_with_shape_4 = [img for img in merged_df['image'] if np.shape(img) == (256, 256, 4)]

# Print the number of images with shape (256, 256, 4)
print(len(images_with_shape_4))


5


In [19]:
# Find the indices of the images with shape (256, 256, 4)
indices_to_drop = merged_df[merged_df['image'].apply(lambda x: np.shape(x) == (256, 256, 4))].index

# Drop the rows with the images with shape (256, 256, 4)
merged_df = merged_df.drop(indices_to_drop)

In [20]:
# Find the images with shape (256, 256, 4)
images_with_shape_4 = [img for img in merged_df['image'] if np.shape(img) == (256, 256, 4)]

# Print the number of images with shape (256, 256, 4)
print(len(images_with_shape_4))

0


### Encoding labels

In [21]:
# Define the list of labels
labels = ['Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema',
           'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening',
           'Cardiomegaly', 'Nodule', 'Mass', 'Hernia', 'No Finding']

# Create a new dataframe with one-hot encoded columns for the labels
one_hot_df = merged_df['labels'].str.get_dummies(sep='|')

# Concatenate the one-hot encoded dataframe with the original dataframe
merged_df = pd.concat([merged_df, one_hot_df], axis=1)


In [22]:
merged_df.columns

Index(['image', 'labels', 'Atelectasis', 'Cardiomegaly', 'Consolidation',
       'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration',
       'Mass', 'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia',
       'Pneumothorax'],
      dtype='object')

## Creating 'X' and 'y'

In [23]:
y = one_hot_df

In [24]:
y

Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
00000013_008.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
00000013_020.png,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
00000013_034.png,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
00000038_001.png,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
00000039_004.png,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
00030413_003.png,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
00030536_002.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
00030636_022.png,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
00030651_007.png,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
X = merged_df['image']

In [26]:
X.info(memory_usage='deep')

<class 'pandas.core.series.Series'>
Index: 995 entries, 00000013_008.png to 00030753_001.png
Series name: image
Non-Null Count  Dtype 
--------------  ----- 
995 non-null    object
dtypes: object(1)
memory usage: 62.4 MB


In [27]:
X = np.array([np.reshape(img, (256, 256, 1)) for img in X])

In [28]:
X.shape

(995, 256, 256, 1)