# Preprocessing and saving images as NumPy (.npy) files for faster model training
For those who are still loading .dcm images inside the DataLoader.


Save preprocessed numpy images

In [None]:
import os
import glob
import pydicom
import numpy as np
import cv2
from tqdm import tqdm
import math
import pandas as pd

In [None]:
def load_slices(patient, mri_type='FLAIR'):
    """Read every DICOM file of one MRI series of a training patient.

    Args:
        patient: Name of the patient's folder inside the training directory.
        mri_type: Name of the series sub-folder to read (default ``'FLAIR'``).

    Returns:
        A list with one ``pydicom.dcmread`` result per ``.dcm`` file, ordered
        by file name with digit runs compared numerically. The list is empty
        when no file matches.
    """
    import re  # local import: ``re`` is not among the file-level imports

    def natural_key(path):
        # re.split with a capturing group alternates text / digit-run tokens,
        # so every odd position is a number. Comparing those as integers makes
        # a name ending in "10" sort after one ending in "2", which a plain
        # string sort gets wrong.
        tokens = re.split(r'(\d+)', os.path.basename(path))
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    pattern = ('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/'
               + patient + '/' + mri_type + '/*.dcm')
    paths = sorted(glob.glob(pattern), key=natural_key)
    # dcmread replaces the deprecated read_file alias (removed in pydicom 3.0).
    return [pydicom.dcmread(path) for path in paths]

def get_pixels_hu(slices):
    '''
    Stack the slices' pixel data and apply each slice's DICOM rescale.

    Every element of ``slices`` must expose ``pixel_array``, ``RescaleSlope``
    and ``RescaleIntercept``. The pixel arrays are stacked into one int16
    volume, entries equal to -2000 are set to 0, and each slice is then
    multiplied by its slope (only when the slope is not 1; the product is
    truncated back to int16) and shifted by its intercept cast to int16.

    Returns a new int16 array with one leading entry per slice.
    '''
    volume = np.stack([dcm.pixel_array for dcm in slices]).astype(np.int16)
    volume[volume == -2000] = 0

    for idx, dcm in enumerate(slices):
        offset = dcm.RescaleIntercept
        factor = dcm.RescaleSlope
        if factor != 1:
            # Scale in float64; storing into the int16 volume truncates.
            volume[idx] = factor * volume[idx].astype(np.float64)
        volume[idx] += np.int16(offset)

    return np.array(volume, dtype=np.int16)

def remove_blanks(hu_images):
    """Drop the slices whose pixels all hold one single value.

    Args:
        hu_images: Array whose first axis indexes the slices.

    Returns:
        A new int16 array with only the slices where the minimum and the
        maximum differ. The trailing (per-slice) dimensions are kept even
        when every slice is dropped, so the result always has the same
        rank as the input.
    """
    # Reduce over every axis except the first to get one min/max per slice.
    per_slice_axes = tuple(range(1, hu_images.ndim))
    lowest = np.min(hu_images, axis=per_slice_axes)
    highest = np.max(hu_images, axis=per_slice_axes)
    # Boolean indexing already copies, so astype need not copy again.
    return hu_images[lowest != highest].astype(np.int16, copy=False)

### Saving loop

In [None]:
# Load the labels table, indexed by patient id (ids are kept as strings so
# they match the folder names returned by os.listdir).
mri_type = 'FLAIR'
labels_df = pd.read_csv('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv', converters={'BraTS21ID': lambda x: str(x)})
labels_df = labels_df.set_index('BraTS21ID')
patients = os.listdir('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train')

# Write one '<patient>.npy' file per patient into the current directory.
# Each file holds a (1, 2) object array: [[images, label]].
for patient in tqdm(patients):
    slices = load_slices(patient, mri_type)
    if len(slices) < 10:
        # Series with fewer than 10 slices are skipped.
        continue
    images = remove_blanks(get_pixels_hu(slices))
    images = np.array([cv2.resize(img, (256, 256)) for img in images])
    # DO PREPROCESSING AUGMENTATION CALLS HERE

    # .at is the public pandas accessor for a single cell.
    label = labels_df.at[patient, 'MGMT_value']
    # Change label shape depending on your loss function
    if label == 1:
        label = np.array(1)
    elif label == 0:
        label = np.array(0)

    # images and label have different shapes, so the container must be an
    # explicit object array; without dtype=object NumPy >= 1.24 raises
    # ValueError for such ragged input.
    all_data_numpy = np.array([[images, label]], dtype=object)
    filename = patient + '.npy'
    np.save(filename, all_data_numpy)

# Move the saved .npy files into ./data/ (NpData below loads them from there)

In [None]:
import torch
from torch.utils.data import Dataset


class NpData(Dataset):
    """Dataset over per-patient ``.npy`` files holding ``[[images, label]]``.

    The ``.npy`` files found in ``data_dir`` are sorted by name; the last 50
    form the test split and all the others form the training split.

    Args:
        test: When true, serve the test split instead of the training split.
        data_dir: Directory that contains the ``.npy`` files.
    """

    def __init__(self, test=False, data_dir='./data/'):
        self.data_dir = data_dir
        # List the directory the files are actually loaded from, keep only
        # .npy files, and sort so that the split is identical on every run
        # (os.listdir returns entries in arbitrary order).
        files = sorted(
            name for name in os.listdir(data_dir) if name.endswith('.npy')
        )
        self.patients = files[-50:] if test else files[:-50]

    def __len__(self):
        return len(self.patients)

    def __getitem__(self, x):
        """Return ``(images, label)`` tensors for the file at position ``x``."""
        filename = os.path.join(self.data_dir, self.patients[x])
        # NOTE: allow_pickle is needed because the files store object arrays.
        # Unpickling can execute arbitrary code, so only load trusted files.
        record = np.load(filename, allow_pickle=True)

        images = torch.from_numpy(record[0][0])
        # The volume must hold exactly 64 slices of 256x256, otherwise this
        # reshape raises; a leading channel axis of size 1 is added.
        images = torch.reshape(images, (1, 64, 256, 256))

        # np.asarray accepts both a 0-d array and a plain scalar label.
        label = np.asarray(record[0][1]).item()
        label = torch.tensor(abs(label), dtype=torch.float)

        return images, label