In [None]:
from pathlib import Path
import pydicom
import numpy as np
import cv2
import imageio
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Load both label tables in one go. Edit the two paths below to point at your
# own CSV files: the first holds the train/val labels, the second the test labels.
train_labels, test_labels = (
    pd.read_csv(csv_path)
    for csv_path in ("./train_labels.csv", "./test_labels.csv")
)

In [None]:
# Keep only the first row seen for each patientId so that every patient
# appears exactly once in the label table.
train_labels = train_labels.drop_duplicates(subset="patientId", keep="first")

In [None]:
# Output location: folder where the processed images will be saved.
SAVE_PATH = Path("./processed/")

# Input locations: folder with the train/val images and folder with the test images.
TRAIN_ROOT_PATH = Path("./train_images")
TEST_ROOT_PATH = Path("./test_images")

In [None]:
# Preprocess the train/val images.
#
# Every DICOM file listed in train_labels is scaled, resized to 224x224, cast to
# float16 and written to SAVE_PATH/<train|val>/<label>/<patientId>.npy.
# While doing so, running sums of the per-image mean and mean-of-squares are
# accumulated over the training split only; a later cell turns them into the
# dataset mean and standard deviation.

# Float accumulators: starting from a Python float keeps the running totals in
# double precision regardless of NumPy's scalar promotion rules.
train_sums = 0.0
train_sums_squared = 0.0

# Labels in row order, used to stratify the split.
targets = train_labels.Target.tolist()
count = len(targets)

train_idx, validation_idx = train_test_split(np.arange(count), test_size=0.15, random_state=43,
                                             shuffle=True, stratify=targets)

# Boolean lookup by row position. `c in train_idx` on a NumPy array is a linear
# scan, which made the loop below quadratic in the number of images.
is_train = np.zeros(count, dtype=bool)
is_train[train_idx] = True

rows = zip(train_labels.patientId, train_labels.Target)
for c, (patient_id, label) in enumerate(tqdm(rows, total=count)):
    img_path = (TRAIN_ROOT_PATH/patient_id).with_suffix(".dcm")

    # dcmread is the supported pydicom reader; read_file is a deprecated alias of it.
    # NOTE(review): dividing by 255 assumes 8-bit pixel data - confirm for this dataset.
    img = pydicom.dcmread(img_path).pixel_array / 255
    img_array = cv2.resize(img, (224, 224)).astype(np.float16)

    train_or_val = "train" if is_train[c] else "val"

    current_save_path = SAVE_PATH/train_or_val/str(label)
    current_save_path.mkdir(parents=True, exist_ok=True)
    np.save(current_save_path/patient_id, img_array)

    if train_or_val == "train":
        # The statistics describe the float16 values that were just saved, but the
        # reductions run in float64: summing ~50k pixels (and their squares) in
        # float16 loses most of the available precision.
        pixels = img_array.astype(np.float64)
        normalizer = pixels.shape[0] * pixels.shape[1]
        train_sums += pixels.sum() / normalizer
        train_sums_squared += np.square(pixels).sum() / normalizer

In [None]:
# Turn the accumulated per-image sums into dataset-level statistics for the
# training split: mean = E[x], std = sqrt(E[x^2] - E[x]^2).
n_train = len(train_idx)
train_mean = train_sums / n_train
mean_of_squares = train_sums_squared / n_train
train_std = np.sqrt(mean_of_squares - (train_mean**2))

In [None]:
# Report the normalisation statistics computed from the training split.
stats_summary = f"Mean of Train Dataset: {train_mean}, STD: {train_std}"
print(stats_summary)

In [None]:
# Preprocess the test images.
#
# Every file listed in test_labels.fileName is scaled, resized to 224x224, cast
# to float16 and written to SAVE_PATH/test/<label>/<file name without extension>.npy.
labelled_files = zip(test_labels.fileName, test_labels.pneumonia)
for image_name, label in tqdm(labelled_files, total=len(test_labels)):
    img_path = TEST_ROOT_PATH/image_name

    # NOTE(review): dividing by 255 assumes 8-bit pixel data - confirm for this dataset.
    img = imageio.v2.imread(img_path) / 255
    img_array = cv2.resize(img, (224, 224)).astype(np.float16)

    # Only images that are 2-D after resizing are kept; anything else
    # (e.g. multi-channel images) is skipped without being saved.
    if img_array.ndim != 2:
        continue

    # Drop the extension whatever its length. The previous `[:-5]` slice was only
    # correct for 5-character extensions such as ".jpeg" and mangled other names.
    filename = Path(image_name).with_suffix("")

    current_save_path = SAVE_PATH/"test"/str(label)
    current_save_path.mkdir(parents=True, exist_ok=True)
    np.save(current_save_path/filename, img_array)