[Next notebook](https://www.kaggle.com/keremt/04-generate-sequences)

In [None]:
from fastai.vision.all import *

In [None]:
# Show up to 100 columns when displaying DataFrames.
pd.options.display.max_columns = 100

In [None]:
# Competition CSVs live under datapath; imagepath is a separate input directory
# that is scanned for image files further down.
datapath = Path("/kaggle/input/rsna-str-pulmonary-embolism-detection/")
train_df = pd.read_csv(datapath/'train.csv')
test_df = pd.read_csv(datapath/'test.csv')
imagepath = Path("/kaggle/input/rsna-str-pe-detection-jpeg-256/")

In [None]:
# Quick look at the first two training rows.
train_df.head(2)

In [None]:
# Per-study mean of `pe_present_on_image`.
# NOTE(review): train_qi is not referenced again in the visible cells.
train_qi = train_df.groupby(['StudyInstanceUID'])['pe_present_on_image'].agg('mean')

In [None]:
# Mean of `pe_present_on_image` over all training rows.
train_df.pe_present_on_image.mean()

In [None]:
# Map each SOPInstanceUID to its `pe_present_on_image` value; get_label reads from this dict.
labels_dict = dict(zip(train_df['SOPInstanceUID'], train_df['pe_present_on_image']))

In [None]:
len(labels_dict)

In [None]:
# Unique study ids; the train/valid split below is made per study, not per image.
unique_pids = train_df.StudyInstanceUID.unique()

In [None]:
n = len(unique_pids)
# Hold out 5% of the studies for validation.
nvalid = int(n*0.05); nvalid, n

In [None]:
# NOTE(review): no random seed is set in the visible cells, so this shuffle
# (and therefore the split) changes between runs.
unique_pids = np.random.permutation(unique_pids)
# The first nvalid shuffled ids are the validation studies, the rest are training studies.
train_pids = unique_pids[nvalid:]
valid_pids = unique_pids[:nvalid]
len(train_pids), len(valid_pids)

In [None]:
# Persist the study-level split to disk under ./pids.
os.makedirs("pids", exist_ok=True)
# Each file must hold its own id array: train_pids.pkl was previously written
# from valid_pids, so the training ids were never actually saved.
pd.to_pickle(train_pids, "pids/train_pids.pkl")
pd.to_pickle(valid_pids, "pids/valid_pids.pkl")

In [None]:
# Collect the image files found under imagepath.
files = get_image_files(imagepath)

### Smart Sample

We don't have to use every slice of each patient as input data. Instead, we can sample a fixed number of evenly spaced slices per patient, which keeps enough variability within that patient, and use that data for CNN feature training.

In [None]:
from fastai.medical.imaging import *

In [None]:
# Group image paths by the name of their grandparent directory.
# The dict is later indexed with StudyInstanceUID values, so that directory name acts as the study id.
files_dict = defaultdict(list)
for o in files:
    files_dict[o.parent.parent.name].append(o)

In [None]:
# Order each study's files by the integer prefix of the file name (the text before the first "_").
# presumably this prefix is the slice position within the study — TODO confirm
for k in files_dict:
    files_dict[k] = sorted(files_dict[k], key=lambda o: int(o.name.split('_')[0]))

In [None]:
# Distribution of the number of image files per study.
plt.hist([len(files_dict[k]) for k in files_dict]);

In [None]:
def sample_patient_slices(pid, num_slice_samples):
    """Return at most `num_slice_samples` evenly spaced files for study `pid`.

    The files are read from the module-level `files_dict`. When the study has
    no more than `num_slice_samples` files, all of them are returned.
    """
    slices = array(files_dict[pid])
    total = len(slices)
    # Small studies are returned whole.
    if total <= num_slice_samples:
        return slices
    # linspace runs up to `total` inclusive, so positions are clipped to the last valid index.
    positions = np.linspace(0, total, num_slice_samples)
    picked = [np.clip(int(pos), 0, total - 1) for pos in positions]
    return slices[picked]

In [None]:
# Pick at most 120 evenly spaced slices for every training study (run through `parallel`).
train_sampled_files = parallel(partial(sample_patient_slices, num_slice_samples=120), train_pids)

In [None]:
# Flatten the per-study results into a single list of training files.
train_files = []
for o in train_sampled_files: train_files += list(o)

In [None]:
# Validation studies are not subsampled: every file of each held-out study is used.
valid_files = []
for o in valid_pids: valid_files += files_dict[o]

In [None]:
len(train_files), len(valid_files)

### Data

In [None]:
def aug_transforms(mult=1.0, do_flip=True, flip_vert=False, max_rotate=10., min_zoom=1., max_zoom=1.1,
                   max_lighting=0.2, max_warp=0.2, p_affine=0.75, p_lighting=0.75, xtra_tfms=None, size=None,
                   mode='bilinear', pad_mode=PadMode.Reflection, align_corners=True, batch=False, min_scale=1.):
    """Utility func to easily create a list of flip, rotate, zoom, warp, lighting transforms.

    This notebook-local definition shadows the `aug_transforms` brought in by the
    fastai star import. It always places a square (`ratio=(1,1)`)
    `RandomResizedCropGPU(size, min_scale=min_scale)` in front of `xtra_tfms`.

    `max_rotate`, `max_lighting` and `max_warp` are scaled by `mult`; a falsy value
    disables the corresponding transform. `xtra_tfms` may be `None`, a single
    transform, or a collection of transforms.

    Returns the built-in transforms, followed by the crop, followed by `xtra_tfms`.
    """
    res,tkw = [],dict(size=size if min_scale==1. else None, mode=mode, pad_mode=pad_mode, batch=batch, align_corners=align_corners)
    max_rotate,max_lighting,max_warp = array([max_rotate,max_lighting,max_warp])*mult
    if do_flip: res.append(Dihedral(p=0.5, **tkw) if flip_vert else Flip(p=0.5, **tkw))
    if max_warp:   res.append(Warp(magnitude=max_warp, p=p_affine, **tkw))
    if max_rotate: res.append(Rotate(max_deg=max_rotate, p=p_affine, **tkw))
    if min_zoom<1 or max_zoom>1: res.append(Zoom(min_zoom=min_zoom, max_zoom=max_zoom, p=p_affine, **tkw))
    if max_lighting:
        res.append(Brightness(max_lighting=max_lighting, p=p_lighting, batch=batch))
        res.append(Contrast(max_lighting=max_lighting, p=p_lighting, batch=batch))
    # Normalise through L first: with the default xtra_tfms=None the previous
    # `[...] + xtra_tfms` raised TypeError (list + None).
    xtra_tfms = [RandomResizedCropGPU(size, min_scale=min_scale, ratio=(1,1))] + list(L(xtra_tfms))
    return res + L(xtra_tfms)

In [None]:
# Per-label sampling weight used by get_dls for the training items: label 0 -> 1, label 1 -> 10.
wgtdict = {0:1, 1:10}

In [None]:
def get_label(o):
    "Return the `labels_dict` entry for file `o`, keyed by the second `_`-separated token of its stem."
    # labels_dict is keyed by SOPInstanceUID, so that token is the image's SOPInstanceUID.
    sop_uid = o.stem.split("_")[1]
    return labels_dict[sop_uid]

In [None]:
class FlipUD(RandTransform):
    "Random transform (probability `p`) that reverses a `TensorImage` along dim -2."
    def __init__(self, p=0.5):
        super().__init__(p=p)

    def encodes(self, x:TensorImage):
        # dim -2 is presumably the height axis, i.e. an up/down flip as the class name suggests.
        return x.flip(-2)

In [None]:
from time import time
def get_dls(train_files, valid_files, resize=256, size=224, bs=128):
    """Build weighted train/valid dataloaders from two lists of image files.

    `train_files` and `valid_files` are concatenated and split by position.
    `resize` is the size given to the item-level `RandomResizedCrop`; `size` is
    forwarded to `aug_transforms` for the batch-level transforms; `bs` is the
    batch size. Training items are weighted by `wgtdict[get_label(file)]`.
    """
    n_train = len(train_files)
    all_files = train_files + valid_files
    # Training items occupy the first n_train positions, validation items the rest.
    splits = (list(range(0, n_train)), list(range(n_train, len(all_files))))
    trn_wgts = [wgtdict[get_label(f)] for f in train_files]
    print(f"Collected idxs")

    image_pipeline = [PILImage.create, ToTensor, RandomResizedCrop(resize, min_scale=0.9)]
    label_pipeline = [get_label, Categorize()]
    dsets = Datasets(all_files, tfms=[image_pipeline, label_pipeline], splits=splits)
    print(f"Created dset")

    extra_tfms = [RandomErasing(sh=0.2, min_aspect=0.15), FlipUD(p=0.3)]
    aug_tfms = aug_transforms(size=size, max_lighting=False, max_warp=False, flip_vert=False,
                              min_scale=0.85, xtra_tfms=extra_tfms)
    # Only the first (training) loader receives explicit weights.
    dls = dsets.dataloaders(
        bs=bs,
        after_batch=[IntToFloatTensor] + aug_tfms,
        dl_type=WeightedDL,
        dl_kwargs=[{"wgts": trn_wgts}, {}],
    )
    print(f"DLs ready")
    return dls

In [None]:
# Build the dataloaders; `size` is left at get_dls's default of 224.
dls = get_dls(train_files, valid_files, resize=256, bs=128)

In [None]:
# Number of items in the training and validation datasets.
len(dls.train.dataset), len(dls.valid.dataset)

In [None]:
# Visual sanity check of up to 16 items from a batch.
dls.show_batch(max_n=16)

In [None]:
# `ranger` optimizer factory with its hyper-parameters fixed via partial.
opt_func = partial(ranger, **dict(sqrmom=0.99, mom=0.95, beta=0., eps=1e-4))

In [None]:
# Label-smoothed cross entropy (eps=0.05) on an xresnet34 created with pretrained=True.
# SaveModelCallback is configured with every_epoch=True and the file name "xresnet34-256".
loss_func = LabelSmoothingCrossEntropyFlat(eps=0.05)
learn = cnn_learner(dls, xresnet34, opt_func=opt_func, pretrained=True, loss_func=loss_func,
                    metrics=[accuracy], cbs=[SaveModelCallback("accuracy", fname="xresnet34-256", every_epoch=True)])
# Switch the learner to fp16 (mixed precision) training.
learn.to_fp16();

In [None]:
# learn.lr_find()

In [None]:
# Point the learner at the current working directory.
# presumably so saved models land in a writable location — TODO confirm
learn.path = Path(".")

In [None]:
# Two-stage schedule: 1 epoch after learn.freeze(), then 4 epochs after learn.unfreeze().
base_lr = 2e-3
lr_mult = 100
learn.freeze()
learn.fit_flat_cos(1, slice(base_lr))
# Halve the learning rate for the second stage.
base_lr /= 2
learn.unfreeze()
# The lr slice runs from base_lr/lr_mult up to base_lr.
learn.fit_flat_cos(4, slice(base_lr/lr_mult, base_lr))