In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

import os

import cv2 as cv
import pydicom

from glob import glob
from os.path import join
from tqdm.autonotebook import tqdm


# TRAINING SET
# The training data is provided as a set of patientIds and bounding boxes (x-min y-min width height)
# There is also a binary target column, Target, indicating pneumonia or non-pneumonia.
# There may be multiple rows per patientId
# stage_2_train_labels.csv -> 1st column = patientID, 6th column = 1 if pneumonia / 0 if not

# imgList: list of paths to the images in 'stage_2_train_images'; each image is named '<patientId>.dcm'
# infoAboutData: DataFrame read from the CSV file with information about the available data (1st column: patientId, 6th column: Target, 1/0)

def MakeDataFrame(imgList,infoAboutData):
    """Build a table with one row per image: path, file name, label and class.

    Parameters
    ----------
    imgList : iterable of str
        Image paths. Each file is expected to be named ``<patientId>.dcm``.
    infoAboutData : pandas.DataFrame
        Label table with a ``Target`` column (1 = pneumonia, 0 = normal).
        If it also has a ``patientId`` column, every image is matched to its
        row through the patient ID taken from the file name. Otherwise the
        i-th image is paired with the i-th row, so both inputs must then be
        ordered consistently.

    Returns
    -------
    pandas.DataFrame
        Columns ``path`` (as given), ``X`` (file name as a plain string),
        ``y`` (0 or 1) and ``class`` ('NORMAL' or 'PNEUMONIA').

    Raises
    ------
    ValueError
        If an image has no row in ``infoAboutData`` (ID matching) or if a
        ``Target`` value is neither 0 nor 1.
    IndexError
        If there are more images than rows (positional pairing).
    """
    filepath = list(imgList)
    # Plain path handling: the file name is stored as a str, not as a tensor.
    filename = [os.path.basename(os.fspath(p)) for p in filepath]

    if 'patientId' in infoAboutData.columns:
        # Match by patient ID so that a difference in ordering or content
        # between the two inputs cannot silently shift the labels.
        unique_rows = infoAboutData.drop_duplicates(subset='patientId')
        target_by_id = dict(
            zip(unique_rows['patientId'].astype(str), unique_rows['Target'])
        )
        targets = []
        for name in filename:
            patient_id = os.path.splitext(name)[0]
            if patient_id not in target_by_id:
                raise ValueError(
                    f"No label found for image {name!r} (patientId {patient_id!r})"
                )
            targets.append(target_by_id[patient_id])
    else:
        # No ID column available: fall back to pairing by position.
        targets = [
            infoAboutData.iloc[position]['Target']
            for position in range(len(filepath))
        ]

    label = []
    classe = []
    for name, target in zip(filename, targets):
        if target == 1:
            label.append(1)
            classe.append('PNEUMONIA')
        elif target == 0:
            label.append(0)
            classe.append('NORMAL')
        else:
            # Fail here with a clear message instead of producing columns of
            # different lengths.
            raise ValueError(
                f"Unexpected Target value {target!r} for image {name!r}; expected 0 or 1"
            )

    dataFrame = pd.DataFrame({
        "path": filepath,
        "X": filename,
        "y": label,
        "class": classe
        })

    return dataFrame

In [None]:
# Number of images drawn at random from each class (pneumonia / normal).
NUM_IMG=2000
# When True, the sampled images are exported to disk as JPEG files.
DOWNLOAD_TEST = True


In [None]:
# Alphabetically ordered list of every file in the training-image folder.
imgNames = glob('../input/rsna-pneumonia-detection-challenge/stage_2_train_images/*')
imgNames.sort()

# Label table: keep one row per patient, then order the rows by patientId.
info = (
    pd.read_csv('../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv')
    .drop_duplicates(subset=['patientId'], ignore_index=True)
    .sort_values(by=['patientId'])
)

df = MakeDataFrame(imgNames, info)

In [None]:
# Sanity check: size of the table, then a preview of its first 30 rows.
print(df.shape)
df.head(30)

In [None]:
# SAMPLING: randomly choose NUM_IMG pneumonia and NUM_IMG healthy samples.
# (axis=0 because rows, i.e. patients, are selected)
# A fixed seed makes the draw reproducible: re-running the notebook selects
# the same patients every time.
SAMPLING_SEED = 42

tot_pneumonia = df.query('y==1')
samp_pneumonia = tot_pneumonia.sample(n=NUM_IMG, axis=0, random_state=SAMPLING_SEED)
print(samp_pneumonia.shape)
print(samp_pneumonia.head())
print()

tot_normal = df.query('y==0')
samp_normal = tot_normal.sample(n=NUM_IMG, axis=0, random_state=SAMPLING_SEED)
print(samp_normal.shape)
print(samp_normal.head())

In [None]:
def load_image(fname):
    """Read the DICOM file at `fname` and return its pixel data.

    The file is parsed with `pydicom.dcmread`; the value returned is the
    `pixel_array` attribute of the resulting dataset (a numpy array).
    """
    return pydicom.dcmread(fname).pixel_array

In [None]:
def scarico_dati(fname, y, n, folder):
    """Save one DICOM image as a JPEG inside the sub-folder of its class.

    Parameters
    ----------
    fname : str
        Path of the DICOM file to convert.
    y : int
        Label of the image: 1 writes to ``<folder>/Pneumonia``, 0 writes to
        ``<folder>/Normale``. Any other value writes nothing.
    n : any
        Unused; kept so existing calls keep working.
    folder : str
        Destination root. Both class sub-folders are created if missing.

    Raises
    ------
    OSError
        If OpenCV reports that the image could not be written.
    """
    img = load_image(fname)
    # basename/splitext keep dots that belong to the name and do not depend
    # on "/" being the path separator.
    img_name = os.path.splitext(os.path.basename(fname))[0]

    path_NORM = join(folder, 'Normale')
    path_PNEU = join(folder, 'Pneumonia')
    # exist_ok avoids the separate check-then-create step.
    os.makedirs(path_NORM, exist_ok=True)
    os.makedirs(path_PNEU, exist_ok=True)

    if y == 1:
        # Pneumonia
        out_dir = path_PNEU
    elif y == 0:
        # Normal
        out_dir = path_NORM
    else:
        # Unknown label: nothing is written.
        return

    out_file = join(out_dir, f"{img_name}_label{y}.jpg")
    # cv.imwrite signals failure through its return value; surface it instead
    # of leaving a silently incomplete export.
    if not cv.imwrite(out_file, img):
        raise OSError(f"Could not write image to {out_file}")

    return

In [None]:
# Destination root for the exported JPEG images.
path_test = "/kaggle/working/Test"
if os.path.exists(path_test):
    print('the file exist')
else:
    os.makedirs(path_test)

if DOWNLOAD_TEST:
    # Export the pneumonia sample first, then the normal one.
    for sampled_df in (samp_pneumonia, samp_normal):
        for fname, y in tqdm(sampled_df[["path", "y"]].values):
            scarico_dati(fname, y, n=1, folder=path_test)