The entire dataset is roughly 343.51 GB, which is far more than the disk space available on Google Colab. One way to work around this is to mount the competition's Google Cloud Storage (GCS) bucket in Google Colab instead of downloading the data.

Relevant links:

- [RSNA 2022 Cervical Spine Fracture Detection](https://www.kaggle.com/competitions/rsna-2022-cervical-spine-fracture-detection/overview)

- [How to access kaggle competition data without using Google Drive / colab disk space](https://slash-z.com/google-colab-mount-kaggle-competition-dataset/)


All credits go to the original author [Qishen Ha](https://www.kaggle.com/code/haqishen/rsna-2022-1st-place-solution-train-stage1/notebook)

In [None]:
## Upload kaggle.json & Save kaggle.json to ~/.kaggle/
!mkdir ~/.kaggle && mv kaggle.json ~/.kaggle/

In [None]:
# Authenticate this Colab runtime with a Google account
# (presumably so the gcsfuse mount below can use these credentials — confirm).
from google.colab import auth
auth.authenticate_user()

In [None]:
!echo "deb http://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
!apt-get -y -q update
!apt-get -y -q install gcsfuse

In [None]:
# Create the mount point ./tmp and mount the bucket there with gcsfuse.
# "ADD_GCS_PATH" is a placeholder (presumably the competition's GCS bucket
# name) that has to be replaced before running this cell.
!mkdir -p tmp
!gcsfuse --implicit-dirs --limit-bytes-per-sec -1 --limit-ops-per-sec -1 "ADD_GCS_PATH" tmp

We should now have access to the dataset under `tmp/`.

Extra datasets & files we need.

In [None]:
### Extra files
# Download two Kaggle datasets as zip archives into the current directory;
# they are unpacked in the next cell.
! kaggle datasets download -d boliu0/covn3d-same
! kaggle datasets download -d haqishen/pylibjpeg140py3

In [None]:
# Extract both downloaded archives into the current directory.
! unzip covn3d-same.zip && unzip pylibjpeg140py3.zip

In [None]:
# Python dependencies. The two .whl files are local files (presumably
# extracted from the pylibjpeg140py3 archive above — confirm).
!pip -q install monai
!pip -q install segmentation-models-pytorch==0.2.1
!pip -q install pylibjpeg-1.4.0-py3-none-any.whl
# NOTE(review): this wheel is tagged cp37 (CPython 3.7); it will presumably
# not install on a runtime with a different Python version — confirm.
!pip -q install python_gdcm-3.0.17.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip -q install pydicom

In [None]:
DEBUG = False

import os
import sys

# Put the covn3d-same folder first on the module search path.
sys.path = ['/content/covn3d-same', *sys.path]

In [None]:
import os
import sys
import gc
import ast
import cv2
import time
import timm
import pickle
import random
import pydicom
import argparse
import warnings
import numpy as np
import pandas as pd
from glob import glob
import nibabel as nib
from PIL import Image
from tqdm import tqdm
import albumentations
from pylab import rcParams
import matplotlib.pyplot as plt
import segmentation_models_pytorch as smp
from sklearn.model_selection import KFold, StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.cuda.amp as amp
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from monai.transforms import Resize
import  monai.transforms as transforms

# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Default figure size as (width, height).
rcParams['figure.figsize'] = 20, 8
# NOTE(review): hard-codes the CUDA device; assumes a GPU runtime — confirm.
device = torch.device('cuda')
# Turn on cuDNN benchmark mode.
torch.backends.cudnn.benchmark = True

# Config

In [None]:
# --- Experiment identity / checkpoint handling ---
# NOTE(review): the name embeds "lr3e4" while init_lr below is 3e-3 — confirm
# which of the two is intended.
kernel_type = 'timm3d_res18d_unet4b_128_128_128_dsv2_flip12_shift333p7_gd1p5_bs4_lr3e4_20x50ep'
load_kernel = None
load_last = True

# --- Model ---
backbone = 'resnet18d'
n_blocks = 4
out_dim = 7
drop_rate = 0.
drop_path_rate = 0.

# --- Data ---
data_dir = './tmp/'
n_folds = 5
num_workers = 4
image_sizes = [128, 128, 128]
# MONAI resizer to `image_sizes`; load_sample() applies it to the masks.
R = Resize(image_sizes)

# --- Optimisation ---
init_lr = 3e-3
batch_size = 4
n_epochs = 1000
use_amp = True
loss_weights = [1, 1]
p_mixup = 0.1

# --- Output folders (created if they do not exist yet) ---
log_dir = './logs'
model_dir = './models'
os.makedirs(log_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

In [None]:
# Training-time augmentation, applied jointly to the "image" and "mask" entries
# of the sample dictionary.
transforms_train = transforms.Compose([
    # Independent random flips along spatial axes 1 and 2.
    transforms.RandFlipd(keys=["image", "mask"], spatial_axis=1, prob=0.5),
    transforms.RandFlipd(keys=["image", "mask"], spatial_axis=2, prob=0.5),
    # Random translation; the range is 30% of each target edge length.
    transforms.RandAffined(
        keys=["image", "mask"],
        prob=0.7,
        translate_range=[int(size * 0.3) for size in image_sizes],
        padding_mode='zeros',
    ),
    # Mild random grid distortion.
    transforms.RandGridDistortiond(
        keys=("image", "mask"),
        prob=0.5,
        distort_limit=(-0.01, 0.01),
        mode="nearest",
    ),
])

# No transforms at validation time.
transforms_valid = transforms.Compose([])

In [None]:
df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))

# One row per segmentation file. The study UID is the file name without its
# last four characters (presumably the ".nii" extension — confirm).
mask_files = os.listdir(os.path.join(data_dir, 'segmentations'))
df_mask = pd.DataFrame({
    'mask_file': mask_files,
})
df_mask['StudyInstanceUID'] = df_mask['mask_file'].apply(lambda x: x[:-4])
df_mask['mask_file'] = df_mask['mask_file'].apply(lambda x: os.path.join(data_dir, 'segmentations', x))

# Left join: studies without a segmentation get a missing mask_file.
df = df_train.merge(df_mask, on='StudyInstanceUID', how='left')
df['image_folder'] = df['StudyInstanceUID'].apply(lambda x: os.path.join(data_dir, 'train_images', x))
# Assign the result back instead of `df['mask_file'].fillna('', inplace=True)`:
# the chained in-place form operates on a temporary under pandas Copy-on-Write
# and would leave the NaNs in `df`, so the filter below would keep every study.
df['mask_file'] = df['mask_file'].fillna('')

# Keep only the studies that have a segmentation mask.
df_seg = df.query('mask_file != ""').reset_index(drop=True)

# Assign each segmented study to one of `n_folds` validation folds.
kf = KFold(n_folds)
df_seg['fold'] = -1
for fold, (train_idx, valid_idx) in enumerate(kf.split(df_seg, df_seg)):
    df_seg.loc[valid_idx, 'fold'] = fold

df_seg.tail()

# Dataset

In [1]:
# StudyInstanceUIDs whose mask is reversed along its last axis in
# SaveToDiskSEGDataset.__getitem__.
revert_list = [
    f'1.2.826.0.1.3680043.{suffix}'
    for suffix in (1363, 20120, 2243, 24606, 32071)
]

In [2]:
def load_dicom(path):
    """Read one DICOM file and return its pixel data resized in-plane.

    Args:
        path: Path of the DICOM file.

    Returns:
        The slice's pixel array resized with bilinear interpolation, using
        ``image_sizes[0]`` and ``image_sizes[1]`` as the target size.
    """
    # `pydicom.read_file` is a deprecated alias that was removed in pydicom 3;
    # `dcmread` is the supported reader.
    dicom = pydicom.dcmread(path)
    data = dicom.pixel_array
    data = cv2.resize(data, (image_sizes[0], image_sizes[1]), interpolation = cv2.INTER_LINEAR)
    return data


def load_dicom_line_par(path):
    """Load a study folder as a fixed-depth uint8 volume.

    The files in ``path`` are ordered by the integer at the start of their
    file name, ``image_sizes[2]`` of them are picked at evenly spaced
    positions, each one is loaded with ``load_dicom`` and the slices are
    stacked along the last axis. The whole volume is then min-max scaled
    to 0..255.

    Args:
        path: Folder holding the slice files of one study.

    Returns:
        ``np.uint8`` array with the selected slices on the last axis.

    Raises:
        FileNotFoundError: If ``path`` contains no files (for example when the
            bucket is not mounted).
    """
    # Sort numerically: a plain string sort would put "10" before "2".
    t_paths = sorted(
        glob(os.path.join(path, "*")),
        key=lambda x: int(os.path.basename(x).split(".")[0]),
    )
    if not t_paths:
        raise FileNotFoundError(f"No slice files found in {path!r}")

    # Evenly spaced slice indices covering the first to the last slice.
    n_scans = len(t_paths)
    indices = np.quantile(list(range(n_scans)), np.linspace(0., 1., image_sizes[2])).round().astype(int)
    t_paths = [t_paths[i] for i in indices]

    images = np.stack([load_dicom(filename) for filename in t_paths], -1)

    # Min-max normalise over the whole volume; the epsilon avoids a division
    # by zero for a constant volume.
    images = images - np.min(images)
    images = images / (np.max(images) + 1e-4)
    images = (images * 255).astype(np.uint8)

    return images


def load_sample(row, has_mask=True):
    """Load the image volume of a study and, optionally, its mask.

    Args:
        row: Record exposing ``image_folder`` and, when ``has_mask`` is true,
            ``mask_file``.
        has_mask: Whether to also load and return the segmentation mask.

    Returns:
        ``image`` when ``has_mask`` is false, otherwise ``(image, mask)``.
        ``image`` is the volume from ``load_dicom_line_par`` with a leading
        axis of three identical channels. ``mask`` has one channel per label
        1..7 with values 0/255 before resizing, and is resized by ``R``.
    """
    image = load_dicom_line_par(row.image_folder)
    if image.ndim < 4:
        image = np.expand_dims(image, 0).repeat(3, 0)  # to 3ch

    if not has_mask:
        return image

    mask_org = nib.load(row.mask_file).get_fdata()
    mask_org = mask_org.transpose(1, 0, 2)[::-1, :, ::-1]  # (d, w, h)
    # Build the per-label channels from the *transposed* volume. Sizing a
    # buffer from the pre-transpose shape fails as soon as the first two axes
    # differ, and a float64 buffer is far larger than the uint8 result needs.
    mask = np.stack([mask_org == (cid + 1) for cid in range(7)])
    mask = mask.astype(np.uint8) * 255
    mask = R(mask).numpy()

    return image, mask

In [None]:
## Used for saving preprocessed data to .npy
class SaveToDiskSEGDataset(Dataset):
    """Dataset yielding ``(StudyInstanceUID, image, mask)`` float tensors."""

    def __init__(self, df, mode, transform):
        # reset_index() gives positional 0..n-1 labels; the previous index is
        # kept as a column.
        self.df = df.reset_index()
        self.mode = mode
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        uid = row.StudyInstanceUID

        # Always loaded from the source files; nothing is cached.
        image, mask = load_sample(row, has_mask=True)

        # Studies in revert_list get their mask reversed along the last axis.
        if uid in revert_list:
            mask = mask[..., ::-1]

        sample = self.transform({'image': image, 'mask': mask})
        # Scale the image by 1/255 and binarise the mask at 127.
        scaled_image = sample['image'] / 255.
        binary_mask = (sample['mask'] > 127).astype(np.float32)

        return uid, torch.tensor(scaled_image).float(), torch.tensor(binary_mask).float()

In [None]:
rcParams['figure.figsize'] = 20,8

df_show = df_seg
# Export with the empty validation pipeline. The training pipeline applies
# random flips, translations and grid distortion, which would be frozen into
# the saved .npy files and make the export non-reproducible.
dataset_show = SaveToDiskSEGDataset(df_show, 'train', transform=transforms_valid)

# Save to .npy

In [None]:
from pathlib import Path

# Output layout: data/train_images_npy/<uid>.npy and data/segmentations_npy/<uid>.npy
data = Path('data')
train_image_path_data = data / 'train_images_npy'
segmentation_path_data = data / 'segmentations_npy'
for out_dir in (train_image_path_data, segmentation_path_data):
    out_dir.mkdir(parents=True, exist_ok=True)

# Index explicitly instead of iterating the dataset object, which only stops
# once __getitem__ raises IndexError.
for idx in tqdm(range(len(dataset_show))):
    study_instance_uid, image, mask = dataset_show[idx]
    image = image.cpu().detach().numpy()
    mask = mask.cpu().detach().numpy()

    # The UID itself contains dots, so the ".npy" suffix is appended by hand
    # rather than with Path.with_suffix(), which would replace the last UID
    # component.
    np.save(train_image_path_data / f'{study_instance_uid}.npy', image)
    np.save(segmentation_path_data / f'{study_instance_uid}.npy', mask)

In [None]:
## zip data folder
# Recursively archive the data/ folder into data.zip in the current directory.
! zip -r data.zip data

In [None]:
## Upload dataset to Kaggle datasets
! kaggle datasets init -p data/
! kaggle datasets create -p data/