In [None]:
# Copy the GDCM archive from the attached input dataset, unpack it, and install the
# bundled conda package without network access (--offline).
# NOTE(review): presumably required so compressed DICOM pixel data can be decoded — confirm.
!cp /kaggle/input/gdcm-conda-install/gdcm.tar .
!tar  -xzf gdcm.tar
!conda install -q --offline ./gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2

In [None]:
from fastai.vision.all import *
from fastai.medical.imaging import *

In [None]:
# Root of the competition data and the directory that holds the test studies.
datapath = Path("/kaggle/input/rsna-str-pulmonary-embolism-detection/")
testdatapath = datapath/'test'

# Input datasets with saved weights: the 2D CNN folds and the sequence models are loaded
# from the first and third path below; `cnn3dmodelpath` is not read in the visible cells.
cnn2dmodelpath = Path("/kaggle/input/rsnape2dmodels/")
cnn3dmodelpath = Path("/kaggle/input/rsnape3dmodels/")
seqmodelpath = Path("/kaggle/input/rsnapeseqmodels/")

# test.csv supplies the Study/Series/SOP instance UIDs used to locate each slice.
test_df = pd.read_csv(datapath/'test.csv')
sub_df = pd.read_csv(datapath/'sample_submission.csv')

In [None]:
[o for o in sub_df['id'].values if "df06fad17bc3" in o]

In [None]:
device = default_device()

RAM usage up to this point: 1 GB

### Predict Study

In [None]:
# One directory path per unique StudyInstanceUID in test.csv; the prediction loop iterates over this list.
test_study_dirnames = [datapath/'test'/o for o in test_df['StudyInstanceUID'].unique()]
# First study, kept as a handy sample for interactive experiments.
study_dirname = test_study_dirnames[0]

In [None]:
# Three intensity windows, one per output channel.
# NOTE(review): each pair is unpacked into `dcm.windowed(*w)`; presumably (width, level) — confirm.
lung_window = (1500, -600)
pe_window = (700, 100)
mediastinal_window = (400, 40)
windows = (lung_window, pe_window, mediastinal_window)

def read_dcm_img(dcm, windows=windows):
    "Return one slice as a stacked tensor with one windowed channel per entry of `windows`"
    channels = []
    for window_args in windows:
        channels.append(dcm.windowed(*window_args))
    return torch.stack(channels)

#### CNN Model

In [None]:
# Load CNN model
def get_dls(tensors, size=256, bs=128):
    "Wrap `tensors` in a fastai `Datasets` and return its dataloaders (items 0-1 / 2-3 are the two splits)"
    # Two transform lists are passed to `Datasets`: a random resized crop to `size`, and an empty one.
    item_tfms = [[RandomResizedCropGPU(size, min_scale=0.9)], []]
    fixed_splits = ([0,1], [2,3])
    study_dsets = Datasets(tensors, tfms=item_tfms, splits=fixed_splits)

    # No batch transforms are applied (an ImageNet `Normalize` step used to be here and is disabled).
    return study_dsets.dataloaders(bs=bs, after_batch=[], num_workers=2)

# Placeholder dataloaders built from four all-zero 3x512x512 tensors (get_dls hard-codes
# indices 0-1 and 2-3 as its splits); they only exist so the learners below can be created.
dls = get_dls(torch.zeros(4, 3, 512, 512), size=480, bs=32)
# NOTE(review): presumably the class count that cnn_learner reads to size the head — confirm.
dls.c = 2

In [None]:
# Fold 0: create an xresnet34 learner without pretrained weights on the placeholder `dls`,
# point its `path` at the 2D-model input dataset, then load 'xresnet34-512-PR-fold0'.
learn2d_fold0 = cnn_learner(dls, xresnet34, pretrained=False, loss_func=nn.CrossEntropyLoss(), model_dir='.')
learn2d_fold0.path = cnn2dmodelpath
learn2d_fold0.load('xresnet34-512-PR-fold0');

In [None]:
# Fold 1: create an xresnet34 learner without pretrained weights on the placeholder `dls`,
# point its `path` at the 2D-model input dataset, then load 'xresnet34-512-PR-fold1'.
learn2d_fold1 = cnn_learner(dls, xresnet34, pretrained=False, loss_func=nn.CrossEntropyLoss(), model_dir='.')
learn2d_fold1.path = cnn2dmodelpath
learn2d_fold1.load('xresnet34-512-PR-fold1');

In [None]:
# Fold 2: create an xresnet34 learner without pretrained weights on the placeholder `dls`,
# point its `path` at the 2D-model input dataset, then load 'xresnet34-512-PR-fold2'.
learn2d_fold2 = cnn_learner(dls, xresnet34, pretrained=False, loss_func=nn.CrossEntropyLoss(), model_dir='.')
learn2d_fold2.path = cnn2dmodelpath
learn2d_fold2.load('xresnet34-512-PR-fold2');

In [None]:
# Fold 3: create an xresnet34 learner without pretrained weights on the placeholder `dls`,
# point its `path` at the 2D-model input dataset, then load 'xresnet34-512-PR-fold3'.
learn2d_fold3 = cnn_learner(dls, xresnet34, pretrained=False, loss_func=nn.CrossEntropyLoss(), model_dir='.')
learn2d_fold3.path = cnn2dmodelpath
learn2d_fold3.load('xresnet34-512-PR-fold3');

In [None]:
# Fold 4: create an xresnet34 learner without pretrained weights on the placeholder `dls`,
# point its `path` at the 2D-model input dataset, then load 'xresnet34-512-PR-fold4'.
learn2d_fold4 = cnn_learner(dls, xresnet34, pretrained=False, loss_func=nn.CrossEntropyLoss(), model_dir='.')
learn2d_fold4.path = cnn2dmodelpath
learn2d_fold4.load('xresnet34-512-PR-fold4');

In [None]:
# Take the underlying model out of each fold learner, put it in eval mode and move it to `device`.
# predict_study calls these models directly and hooks their `[1][1]` sub-module.
model0 = learn2d_fold0.model.eval().to(device)
model1 = learn2d_fold1.model.eval().to(device)
model2 = learn2d_fold2.model.eval().to(device)
model3 = learn2d_fold3.model.eval().to(device)
model4 = learn2d_fold4.model.eval().to(device)

RAM usage up to this point: 3.2 GB

#### Sequence Model

In [None]:
from fastai.text.all import *

In [None]:
# Default padding index handed to `SentenceEncoder` by the sequence classifiers below.
# The `predict` path used in this notebook builds its own all-False mask and does not read it.
input_pad_idx = None

In [None]:
class AWD_LSTM(Module):
    "AWD-LSTM inspired by https://arxiv.org/abs/1708.02182"
    # Stack of `n_layers` single-layer LSTMs (each wrapped in WeightDropout) applied to
    # per-step feature vectors. The hidden state is stored on the instance and is reused
    # by the next `forward` call unless `reset()` is called in between.
    initrange=0.1

    def __init__(self, emb_sz,n_hid, n_layers, hidden_p=0.2, input_p=0.6, weight_p=0.5, bidir=False):
        store_attr('emb_sz,n_hid,n_layers')
        self.bs = 1
        self.n_dir = 2 if bidir else 1
        
        # First layer consumes `emb_sz` features, later layers consume `n_hid`; each direction
        # gets n_hid // n_dir units so the concatenated output stays `n_hid` wide.
        self.rnns = nn.ModuleList([self._one_rnn(emb_sz if l == 0 else n_hid, (n_hid)//self.n_dir, bidir, weight_p, l) for l in range(n_layers)])

        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])
        self.reset()

    def forward(self, x, from_embeds=False):
        "Run the LSTM stack on `x`; with `from_embeds=True`, `x` is used as the input features directly"
        
        if from_embeds: inp = x
        # NOTE(review): `combined_embeddings` is a global that is not defined in the visible cells;
        # this branch is not taken by the `predict` path, which always passes from_embeds=True.
        else: inp = combined_embeddings[x].to(device)
        bs,sl = inp.shape[:2]
        # Resize the stored hidden state when the batch size differs from the previous call.
        if bs!=self.bs: self._change_hidden(bs)

        output = self.input_dp(inp)
        new_hidden = []
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            output, new_h = rnn(output, self.hidden[l])
            new_hidden.append(new_h)
            # Hidden dropout is applied between layers, not after the last one.
            if l != self.n_layers - 1: output = hid_dp(output)
        # Keep the final states (detached) as the starting point of the next call.
        self.hidden = to_detach(new_hidden, cpu=False, gather=False)
        return output

    def _change_hidden(self, bs):
        "Resize every layer's stored hidden state to batch size `bs` and remember `bs`"
        self.hidden = [self._change_one_hidden(l, bs) for l in range(self.n_layers)]
        self.bs = bs

    def _one_rnn(self, n_in, n_out, bidir, weight_p, l):
        "Return one of the inner rnn"
        rnn = nn.LSTM(n_in, n_out, 1, batch_first=True, bidirectional=bidir, bias=False)
        return WeightDropout(rnn, weight_p)

    def _one_hidden(self, l):
        "Return one hidden state"
        # A pair of zero tensors of shape (n_dir, bs, n_hid // n_dir), moved to `device`.
        nh = (self.n_hid) // self.n_dir
        return (one_param(self).new_zeros(self.n_dir, self.bs, nh).to(device), one_param(self).new_zeros(self.n_dir, self.bs, nh).to(device))

    def _change_one_hidden(self, l, bs):
        "Grow layer `l`'s state with zero rows, or truncate it, along the batch dimension"
        if self.bs < bs:
            nh = (self.n_hid) // self.n_dir
            return tuple(torch.cat([h, h.new_zeros(self.n_dir, bs-self.bs, nh)], dim=1) for h in self.hidden[l])
        if self.bs > bs: return (self.hidden[l][0][:,:bs].contiguous(), self.hidden[l][1][:,:bs].contiguous())
        return self.hidden[l]

    def reset(self):
        "Reset the hidden states"
        # Also forwards `reset()` to every inner rnn wrapper that defines it.
        [r.reset() for r in self.rnns if hasattr(r, 'reset')]
        self.hidden = [self._one_hidden(l) for l in range(self.n_layers)]

In [None]:
# Hidden width of the sequence LSTM used by MultiHeadedSequenceClassifier.
lstm_width = 512 
# Layer sizes given to PoolingLinearClassifier: input, hidden, and 9 exam-level outputs.
# NOTE(review): the 3x input width presumably matches the classifier's concatenated pooling — confirm.
layers = [lstm_width * 3] + [lstm_width] + [9]

class MultiHeadedSequenceClassifier(Module):
    """Sequence model with an image-level head and a 9-output exam-level head.

    dim: input sequence feature dim
    n_meta: number of extra per-step features appended to the `dim` features
    bptt: chunk length handed to `SentenceEncoder` and `PoolingLinearClassifier`
    """
    def __init__(self, bptt=72, input_pad_idx=input_pad_idx, n_meta=1, dim=1024):

        store_attr('input_pad_idx')
        self.awd_lstm = AWD_LSTM(dim+n_meta, lstm_width, 2, bidir=True)
        self.encoder = SentenceEncoder(bptt=bptt, module=self.awd_lstm, pad_idx=input_pad_idx)

        # image level preds
        self.seq_head = LinearDecoder(1, lstm_width, bias=True)

        # exam level preds
        self.exam_head = PoolingLinearClassifier(layers, ps=[0.4, 0.1], bptt=bptt)

    def _image_logits(self, out):
        "One logit per sequence step, with the trailing singleton dimension removed"
        seq_cls_out,_,_ = self.seq_head(out)
        return seq_cls_out.squeeze(-1)

    def forward(self, x):
        "Training path: `x` goes through `SentenceEncoder`, which also returns the padding mask"
        out, mask = self.encoder(x)

        # img level out
        seq_cls_out = self._image_logits(out)

        # exam level out
        exam_out,_,_ = self.exam_head((out,mask))

        return (seq_cls_out, exam_out)

    def predict(self, x):
        """Inference path: `x` already holds the per-step feature vectors.

        The LSTM is called directly (not through `SentenceEncoder`), so its hidden state is
        reset here explicitly. Without the reset, the state left by the previous call would
        seed this one and the result would depend on what was predicted before.
        """
        self.awd_lstm.reset()
        out = self.awd_lstm(x, from_embeds=True)

        # img level out
        seq_cls_out = self._image_logits(out)

        # exam level out; the all-False mask marks no step as padding
        mask = torch.zeros(x.shape[:-1]).bool().to(device)
        exam_out,_,_ = self.exam_head((out, mask))
        return (seq_cls_out, exam_out)


In [None]:
# Layer sizes for the exam head of MultiHeadedSoftmaxSequenceClassifier: the 12 outputs are
# sliced there into four groups of three (posneg, rvlv, lrc, chroacute).
layers1 = [512 * 3] + [512] + [12]

class MultiHeadedSoftmaxSequenceClassifier(Module):
    """Sequence model with an image-level head and a 12-output exam-level head.

    The 12 exam logits are returned as four groups of three:
    posneg (columns 0-2), rvlv (3-5), lrc (6-8) and chroacute (9-11).

    dim: input sequence feature dim
    n_meta: number of extra per-step features appended to the `dim` features
    bptt: chunk length handed to `SentenceEncoder` and `PoolingLinearClassifier`
    """
    def __init__(self, bptt=72, input_pad_idx=input_pad_idx, n_meta=2, dim=1024):

        store_attr('input_pad_idx')
        self.awd_lstm = AWD_LSTM(dim+n_meta, 512, 2, bidir=True)
        self.encoder = SentenceEncoder(bptt=bptt, module=self.awd_lstm, pad_idx=input_pad_idx)

        # image level preds
        self.seq_head = LinearDecoder(1, 512, bias=True)

        # exam level preds
        self.exam_head = PoolingLinearClassifier(layers1, ps=[0.4, 0.1], bptt=bptt)

    def _image_logits(self, out):
        "One logit per sequence step, with the trailing singleton dimension removed"
        seq_cls_out,_,_ = self.seq_head(out)
        return seq_cls_out.squeeze(-1)

    @staticmethod
    def _split_exam_logits(exam_out):
        "Slice the 12 exam logits into (posneg, rvlv, lrc, chroacute)"
        return (exam_out[:,:3], exam_out[:,3:6], exam_out[:,6:9], exam_out[:,9:])

    def forward(self, x):
        "Training path: `x` goes through `SentenceEncoder`, which also returns the padding mask"
        out, mask = self.encoder(x)

        # img level out
        seq_cls_out = self._image_logits(out)

        # exam level out
        exam_out,_,_ = self.exam_head((out,mask))
        posneg_out, rvlv_out, lrc_out, chroacute_out = self._split_exam_logits(exam_out)

        return (seq_cls_out, posneg_out, rvlv_out, lrc_out, chroacute_out)

    def predict(self, x):
        """Inference path: `x` already holds the per-step feature vectors.

        The LSTM is called directly (not through `SentenceEncoder`), so its hidden state is
        reset here explicitly. Without the reset, the state left by the previous call (another
        study, or the flipped TTA pass) would seed this one and make predictions order-dependent.
        """
        self.awd_lstm.reset()
        out = self.awd_lstm(x, from_embeds=True)

        # img level out
        seq_cls_out = self._image_logits(out)

        # exam level out; the all-False mask marks no step as padding
        mask = torch.zeros(x.shape[:-1]).bool().to(device)
        exam_out,_,_ = self.exam_head((out, mask))
        posneg_out, rvlv_out, lrc_out, chroacute_out = self._split_exam_logits(exam_out)

        return (seq_cls_out, posneg_out, rvlv_out, lrc_out, chroacute_out)

In [None]:
def _load_seq_model(fold):
    "Build the softmax sequence classifier inside a `SequentialRNN` and load the weights saved for `fold`"
    wrapped = SequentialRNN(MultiHeadedSoftmaxSequenceClassifier(bptt=256, dim=1024, n_meta=2))
    weights = torch.load(seqmodelpath/f"nometa_sequence_softmax_with_preds_fulldata_fold{fold}.pth")
    wrapped.load_state_dict(weights)
    return wrapped

# One sequence model per cross-validation fold.
seq_model0 = _load_seq_model(0)
seq_model1 = _load_seq_model(1)
seq_model2 = _load_seq_model(2)
seq_model3 = _load_seq_model(3)
seq_model4 = _load_seq_model(4)

In [None]:
# Unwrap each classifier from its SequentialRNN container (element 0) so that `.predict`
# can be called on it, then move it to `device` and switch to eval mode.
seq_model0 = seq_model0[0].to(device).eval()
seq_model1 = seq_model1[0].to(device).eval()
seq_model2 = seq_model2[0].to(device).eval()
seq_model3 = seq_model3[0].to(device).eval()
seq_model4 = seq_model4[0].to(device).eval()

RAM usage up to this point: 3.4 GB

### Metadata

In [None]:
# [mean, std] pairs used to standardise per-slice metadata features.
# Only 'scaled_position' is active; the other features are kept commented out.
# NOTE(review): the values are presumably statistics of the training set — confirm.
mean_std_dict = {
#             'img_min': [-1542.35551498553, 849.3331965009891],
#             'img_max': [3209.4925326455914, 1138.112174280331],
#             'img_mean': [165.7994337836255, 278.9659609535833],
#             'img_std': [994.3087633304141, 293.05859626364196],
#             'img_pct_window': [0.43983955119726603, 0.11747102851831802],
            'scaled_position': [0.5078721739409284, 0.29139548181397823]
}

#### Predict

In [None]:
from fastai.medical.imaging import *
import pydicom

In [None]:
class EmbeddingHook:
    "Forward hook on module `m` that concatenates every output of `m` into `self.embeddings`"
    def __init__(self, m):
        self.m = m
        self.embeddings = tensor([]).to(device)
        # A module that already carries forward hooks gets them all cleared first.
        if m._forward_hooks: self.reset()
        self.hook = Hook(m, self.hook_fn, cpu=False)

    def hook_fn(self, m, inp, out):
        "Append the outputs of this forward pass to the stored embeddings"
        self.embeddings = torch.cat((self.embeddings, out))

    def reset(self):
        "Remove every forward hook registered on the hooked module"
        self.m._forward_hooks = OrderedDict()

In [None]:
# Metadata features fed to the sequence model. Only the scaled slice position is active;
# the longer list below is the disabled variant.
# meta_feat_cols = ['img_min', 'img_max', 'img_mean', 'img_std', 'img_pct_window', 'scaled_position']
meta_feat_cols = ['scaled_position']

In [None]:
def minmax_scaler(o):
    """Rescale the values of `o` linearly to the range [0, 1].

    Args:
        o: non-empty array-like of numbers.

    Returns:
        Float array with the minimum mapped to 0 and the maximum mapped to 1. When every
        value is identical (for example a single-slice study) the range is zero; an array of
        zeros is returned instead of the NaNs that 0/0 would produce.
    """
    values = np.asarray(o, dtype=float)
    lowest = values.min()
    span = values.max() - lowest
    if span == 0: return np.zeros_like(values)
    return (values - lowest) / span

In [None]:
# Standardisation constants for the scaled slice position; read as globals by predict_study.
mean, std = mean_std_dict['scaled_position']

In [None]:
def read_dcm_img_v2(dcm, windows=windows):
    "Return the slice as a list of windowed channels, one per entry of `windows` (not stacked)"
    channels = []
    for window_args in windows:
        channels.append(dcm.windowed(*window_args))
    return channels

In [None]:
def predict_study(study_dirname):
    """Predict image-level and exam-level probabilities for one test study.

    Steps:
      1. Read every slice listed for the study in `test_df` and order the slices by the
         integer-truncated last component of `ImagePositionPatient`.
      2. Run the five fold CNNs (`model0`..`model4`) on the windowed slices while an
         `EmbeddingHook` on each model's `[1][1]` sub-module collects its activations.
      3. For each fold, feed [activations | standardised slice position | CNN output column 1]
         to the matching sequence model, once in slice order and once reversed; average the
         two sets of logits.
      4. Apply sigmoid / softmax per fold and average over the folds.

    Args:
        study_dirname: path whose `.stem` is the StudyInstanceUID.

    Returns:
        (sop_ids, seq_img_preds, seq_exam_preds): SOPInstanceUIDs in sorted slice order, the
        per-slice predictions on CPU, and nine exam-level predictions on CPU stacked in the order
        negative, rv_lv>=1, rv_lv<1, left, chronic, right, acute_and_chronic, central, indeterminate.
    """
    # --- metadata and slice loading ---
    study_df = test_df.query(f'StudyInstanceUID == "{study_dirname.stem}"')
    sop_ids = study_df['SOPInstanceUID'].values
    study_files = str(testdatapath) + "/" + study_df['StudyInstanceUID'] + "/" + study_df['SeriesInstanceUID'] + '/' + sop_ids + '.dcm'
    dcm_ds = array([pydicom.read_file(o) for o in study_files])

    # NOTE(review): int() truncates the z position, so slices less than 1 unit apart can tie
    # and end up in arbitrary relative order. Left unchanged; presumably it has to match the
    # preprocessing used when the sequence models were trained — confirm.
    z_positions = array([int(o.ImagePositionPatient[-1]) for o in dcm_ds])
    sortidxs = np.argsort(z_positions)
    sop_ids, dcm_ds, z_positions = sop_ids[sortidxs], dcm_ds[sortidxs], z_positions[sortidxs]

    # Three windowed channels per slice, regrouped as (n_slices, 3, h, w).
    imgs = []
    for o in dcm_ds: imgs += read_dcm_img_v2(o)
    h,w = imgs[0].size()
    imgs = torch.stack(imgs).view(-1,3,h,w)

    meta_embeddings = tensor((minmax_scaler(z_positions) - mean)/std).unsqueeze(1)

    # --- per-fold models; entry k of every list below belongs to fold k ---
    cnn_models = (model0, model1, model2, model3, model4)
    seq_models = (seq_model0, seq_model1, seq_model2, seq_model3, seq_model4)
    n_folds = len(cnn_models)
    emb_hooks = [EmbeddingHook(m[1][1]) for m in cnn_models]

    with torch.no_grad():
        test_dl = learn2d_fold0.dls.test_dl(imgs.numpy(), bs=32, num_workers=2)

        cnn_outs = [[] for _ in cnn_models]
        for xb in test_dl:
            for fold_outs, cnn in zip(cnn_outs, cnn_models): fold_outs.append(cnn(*xb))

        meta_on_device = meta_embeddings.to(device)
        fold_logits = []
        for seq_model, hook, fold_outs in zip(seq_models, emb_hooks, cnn_outs):
            cnn_col1 = torch.cat(fold_outs)[:,1].view(-1,1)
            seq_inp = torch.cat([hook.embeddings, meta_on_device, cnn_col1], 1)

            # sequence pred, then TTA pred with the slice order flipped
            fwd = seq_model.predict(seq_inp[None,...])
            rev = seq_model.predict(seq_inp.flip(dims=[0])[None,...])
            # flip the image-level TTA logits back so they line up with the slices again
            rev = (rev[0].flip(dims=[1]),) + tuple(rev[1:])

            fold_logits.append([(f + r)/2 for f, r in zip(fwd, rev)])

    def fold_mean(pos, activation):
        "Average `activation(logits)` of output number `pos` over the folds; drop the batch dim"
        return (sum(activation(logits[pos]) for logits in fold_logits) / n_folds)[0]

    def softmax1(t): return t.softmax(1)

    seq_img_preds = fold_mean(0, torch.sigmoid)
    posneg_preds = fold_mean(1, softmax1)
    rvlv_preds = fold_mean(2, softmax1)
    # left / right / central get independent sigmoids rather than a softmax
    lrc_preds = fold_mean(3, torch.sigmoid)
    chroacute_preds = fold_mean(4, softmax1)

    # pos, neg, ind
    _, negative, indeterminate = posneg_preds

    # rvlv >= 1, rvlv < 1, neither
    rvlv_gte, rvlv_lt, _ = rvlv_preds

    # left, right, central
    left_pe, right_pe, central_pe = lrc_preds

    # chronic, acute_and_chronic, neither
    chro_pe, acute_chro_pe, _ = chroacute_preds

    seq_exam_preds = torch.stack([negative, rvlv_gte, rvlv_lt, left_pe, chro_pe, right_pe, acute_chro_pe, central_pe, indeterminate])

    return (sop_ids, to_cpu(seq_img_preds), to_cpu(seq_exam_preds))

RAM usage up to this point: 4.3 GB

In [None]:
def get_study_res(sid, sop_ids, img_preds, exam_preds):
    "Turn one study's predictions into (id, label) rows: one per slice, then one per exam-level target"
    # exam probas, in the order in which `exam_preds` is stacked
    target_cols = [
        'negative_exam_for_pe', # exam level
        'rv_lv_ratio_gte_1', # exam level
        'rv_lv_ratio_lt_1', # exam level
        'leftsided_pe', # exam level
        'chronic_pe', # exam level
        'rightsided_pe', # exam level
        'acute_and_chronic_pe', # exam level
        'central_pe', # exam level
        'indeterminate' # exam level
    ]

    image_rows = [(sopid, p) for sopid, p in zip(sop_ids, to_np(img_preds))]
    exam_rows = [(f"{sid}_{tcol}", p) for tcol, p in zip(target_cols, to_np(exam_preds))]
    return image_rows + exam_rows

In [None]:
%%time
do_full = False
n = 20

# When the train folder is present and a full run was not requested, only `n` randomly
# chosen studies are predicted.
train_dir_present = Path('../input/rsna-str-pulmonary-embolism-detection/train').exists()
if train_dir_present and not do_full:
    test_study_dirnames = np.random.choice(
        [datapath/'test'/o for o in test_df['StudyInstanceUID'].unique()],
        n, replace=False)

# Collect the (id, label) rows of every study into one flat list.
sub_res = []
for study_dirname in test_study_dirnames:
    sop_ids, seq_img_preds, seq_exam_preds = predict_study(study_dirname)
    study_res = get_study_res(study_dirname.stem, sop_ids, seq_img_preds, seq_exam_preds)
    sub_res.extend(study_res)

In [None]:
# One row per predicted id (slice or exam-level target).
final_sub_df = pd.DataFrame(sub_res, columns=['id', 'label'])
# Keep every label strictly inside (0, 1).
final_sub_df['label']  = np.clip(final_sub_df['label'], 0.0001, 0.9999)

In [None]:
# First write, before any consistency fixes; the file is overwritten later when `submit_fixed` is True.
final_sub_df.to_csv("submission.csv", index=False)

### Consistency Check

In [None]:
def check_consistency(sub, test):
    '''
    Check the submission against the label-consistency rules and return the offending rows.

    Args:
    sub   = submission dataframe (pandas) with columns `id` and `label`
    test  = test.csv dataframe (pandas)

    Returns:
    One row per (image, broken rule), tagged in the `broken_rule` column with
    1a-1d (exams whose highest image label is > 0.5) or 2a-2b (all other exams).
    '''

    # EXAM LEVEL: ids of the form "<StudyInstanceUID>_<label_type>"
    exam_rows = sub.loc[sub.id.str.contains('_', regex = False)].reset_index(drop = True)
    id_parts = [str(v).split('_', 1) for v in exam_rows['id']]
    exam_rows['StudyInstanceUID'] = [parts[0] for parts in id_parts]
    exam_rows['label_type'] = [parts[1] for parts in id_parts]
    df_exam = exam_rows.pivot(index = 'StudyInstanceUID', columns = 'label_type', values = 'label')

    # IMAGE LEVEL: ids that are SOPInstanceUIDs of the test set
    image_rows = sub.loc[sub.id.isin(test.SOPInstanceUID)].reset_index(drop = True)
    df_image = image_rows.merge(test, how = 'left', left_on = 'id', right_on = 'SOPInstanceUID')
    df_image = df_image.rename(columns = {"label": "pe_present_on_image"})
    df_image = df_image[[c for c in df_image.columns if c != 'id']]

    # MERGER: one row per image carrying the labels of its exam, id columns first
    merged = df_exam.merge(df_image, how = 'left', on = 'StudyInstanceUID')
    ids    = ['StudyInstanceUID', 'SeriesInstanceUID', 'SOPInstanceUID']
    labels = [c for c in merged.columns if c not in ids]
    df = merged[ids + labels]

    # SPLIT NEGATIVE AND POSITIVE EXAMS by the highest image-level label of the exam
    max_image_label = df.groupby(['StudyInstanceUID']).pe_present_on_image.max()
    df['positive_images_in_exam'] = df['StudyInstanceUID'].map(max_image_label)
    df_pos = df.loc[df.positive_images_in_exam >  0.5]
    df_neg = df.loc[df.positive_images_in_exam <= 0.5]

    def tagged(frame, mask, rule):
        "Rows of `frame` selected by `mask`, re-indexed and labelled with the broken rule"
        broken = frame.loc[mask].reset_index(drop = True)
        broken['broken_rule'] = rule
        return broken

    p, q = df_pos, df_neg
    checks = [
        # positive exams: exactly one rv/lv ratio label must be set
        (p, ((p.rv_lv_ratio_lt_1 > 0.5) & (p.rv_lv_ratio_gte_1 > 0.5)) |
            ((p.rv_lv_ratio_lt_1 <= 0.5) & (p.rv_lv_ratio_gte_1 <= 0.5)), '1a'),
        # positive exams: at least one location label must be set
        (p, (p.central_pe <= 0.5) & (p.rightsided_pe <= 0.5) & (p.leftsided_pe <= 0.5), '1b'),
        # positive exams: chronic and acute_and_chronic must not both be set
        (p, (p.acute_and_chronic_pe > 0.5) & (p.chronic_pe > 0.5), '1c'),
        # positive exams: neither indeterminate nor negative may be set
        (p, (p.indeterminate > 0.5) | (p.negative_exam_for_pe > 0.5), '1d'),
        # negative exams: exactly one of indeterminate / negative must be set
        (q, ((q.indeterminate > 0.5) & (q.negative_exam_for_pe > 0.5)) |
            ((q.indeterminate <= 0.5) & (q.negative_exam_for_pe <= 0.5)), '2a'),
        # negative exams: no positive-exam label may be set
        (q, (q.rv_lv_ratio_lt_1 > 0.5) | (q.rv_lv_ratio_gte_1 > 0.5) |
            (q.central_pe > 0.5) | (q.rightsided_pe > 0.5) | (q.leftsided_pe > 0.5) |
            (q.acute_and_chronic_pe > 0.5) | (q.chronic_pe > 0.5), '2b'),
    ]

    # MERGING INCONSISTENT PREDICTIONS
    errors = pd.concat([tagged(frame, mask, rule) for frame, mask, rule in checks], axis = 0)

    # OUTPUT
    print('Found', len(errors), 'inconsistent predictions')
    if len(errors) > 0:
        print(errors.broken_rule.value_counts())

    return errors

In [None]:
%%time
consistency_df0 = check_consistency(final_sub_df, test_df)

In [None]:
consistency_df0.StudyInstanceUID.unique()

In [None]:
consistency_df0.head()

In [None]:
consistency_df0['broken_rule'].value_counts()

In [None]:
from scipy.special import softmax

In [None]:
def solve_rule_1a (sub, errors, log=False):
    '''
    Repair rule 1a: rv_lv_ratio_lt_1 and rv_lv_ratio_gte_1 are both > 0.5 or both <= 0.5.

    For every study in `errors` matching that condition, the two rv/lv labels in `sub`
    are replaced in place by their softmax, which moves the larger one above 0.5 and the
    smaller one below it.

    Args:
    sub    = submission dataframe; for each study, the rows whose id contains the study UID
             are expected in `get_study_res` order, so positions 1 and 2 are
             rv_lv_ratio_gte_1 and rv_lv_ratio_lt_1
    errors = output of check_consistency
    log    = print the affected studies and the old / new values

    Note: two equal labels give softmax [0.5, 0.5], so ties are not resolved. For the pair
    (0.5, 0.5) this leaves the values as they were.
    '''

    rule_query = ("((rv_lv_ratio_lt_1 > 0.5) & (rv_lv_ratio_gte_1 > 0.5)) | "
                  "((rv_lv_ratio_lt_1 <= 0.5) & (rv_lv_ratio_gte_1 <= 0.5))")
    studies_break_rule = errors.query(rule_query)['StudyInstanceUID'].unique()

    if log:
        print (studies_break_rule,studies_break_rule.shape)

    for i in studies_break_rule:

        idx = sub[sub['id'].str.contains(i)].index.values
        old = sub.loc[idx[[1,2]], 'label'].values
        # Always computed, so `new` is never unbound or left over from the previous study.
        new = softmax (old)

        if log: print (old , new)
        sub.loc[idx[[1,2]], 'label'] = new

In [None]:
def solve_rule_1b (sub, errors, log=False):
    '''
    Repair rule 1b: central_pe, rightsided_pe and leftsided_pe are all <= 0.5.

    For every matching study in `errors`, the largest of the three location labels in `sub`
    (positions 3, 5, 7 of the study's exam rows: left, right, central) is raised by 0.5 and
    all three are then clipped to [0, 0.51]. A triple of exactly 0.5 is left untouched.
    `sub` is modified in place.
    '''

    offending = errors.query("(central_pe <= 0.5) & (rightsided_pe <= 0.5) & (leftsided_pe  <= 0.5)")['StudyInstanceUID'].unique()

    if log:
        print (offending,offending.shape)

    for study_uid in offending:
        exam_idx = sub[sub['id'].str.contains(study_uid)].index.values
        location_idx = exam_idx[[3,5,7]]
        old = sub.loc[location_idx, 'label'].values
        new = old.copy()

        all_exactly_half = (old[0] == 0.5) and (old[1] == 0.5) and (old[2] == 0.5)
        if not all_exactly_half:
            new[np.argmax (old)] += 0.5
            new = np.clip (new , 0, 0.51)

        if log:
            print (old, new)
        sub.loc[location_idx, 'label'] = new

In [None]:
def solve_rule_1c (sub, errors, log=False):
    '''
    Repair rule 1c: acute_and_chronic_pe and chronic_pe are both > 0.5.

    For every matching study in `errors`, the two labels in `sub` (positions 4 and 6 of the
    study's exam rows: chronic_pe, acute_and_chronic_pe) are replaced in place by their
    softmax, so that at most one of them stays above 0.5.

    Args:
    sub    = submission dataframe, modified in place
    errors = output of check_consistency
    log    = print the affected studies and the old / new values
    '''

    studies_break_rule = errors.query("(acute_and_chronic_pe > 0.5) & (chronic_pe > 0.5)")['StudyInstanceUID'].unique()

    if log:
        print (studies_break_rule, studies_break_rule.shape)

    for i in studies_break_rule:

        idx = sub[sub['id'].str.contains(i)].index.values
        old = sub.loc[idx[[4,6]], 'label'].values
        new = softmax (old)
        if log:
            print (old , new)

        # Assignment (the original used `==`, a comparison whose result was thrown away,
        # so the corrected values were never stored).
        sub.loc[idx[[4,6]], 'label'] = new

In [None]:
def solve_rule_1d(sub, errors, log=False):
    '''
    Repair rule 1d: indeterminate or negative_exam_for_pe is > 0.5.

    For every matching study in `errors`, the first and last of the study's exam rows in `sub`
    (negative_exam_for_pe and indeterminate) are clipped to [0, 0.4] in place.
    '''

    flagged = errors.query("(indeterminate > 0.5) | (negative_exam_for_pe > 0.5)")['StudyInstanceUID'].unique()

    if log:
        print (flagged, flagged.shape)

    for study_uid in flagged:
        exam_idx = sub[sub['id'].str.contains(study_uid)].index.values
        neg_ind_idx = exam_idx[[0,-1]]

        old = sub.loc[neg_ind_idx, 'label'].values
        new = np.clip (old, 0. , 0.4)
        if log:
            print (old , new)

        sub.loc[neg_ind_idx, 'label'] = new

In [None]:
def solve_negative (sub, errors):
    "Clip to [0, 0.5], in place, the image labels of every exam in `errors` with negative_exam_for_pe > 0.5"
    negative_rows = errors.query("negative_exam_for_pe > 0.5")
    is_negative_image = sub['id'].isin(negative_rows['SOPInstanceUID'].values)
    clip_idx = sub[is_negative_image].index
    clipped = np.clip(sub.loc[clip_idx, 'label'], 0, 0.5)
    sub.loc[clip_idx, 'label'] = clipped

In [None]:
%%time
solve_negative (sub=final_sub_df, errors=consistency_df0)

In [None]:
%%time
consistency_df0 = check_consistency(sub=final_sub_df, test=test_df)

In [None]:
%%time
solve_rule_1a (sub=final_sub_df, errors=consistency_df0, log=False)
solve_rule_1b (sub=final_sub_df, errors=consistency_df0, log=False)
solve_rule_1c (sub=final_sub_df, errors=consistency_df0, log=False)
solve_rule_1d (sub=final_sub_df, errors=consistency_df0, log=False)

In [None]:
consistency_df_final = check_consistency(final_sub_df, test_df)

In [None]:
# When set, submission.csv is overwritten with the labels after the consistency fixes above.
submit_fixed = True
if submit_fixed:
    final_sub_df.to_csv("submission.csv", index=False)

### Submission Stats

In [None]:
# Exam-level ids look like "<StudyInstanceUID>_<label>"; image-level ids contain no "_",
# so their prefix is the empty string.
final_sub_df['label_prefix'] = final_sub_df['id'].map(lambda o: "_".join(o.split("_")[1:]))
# Aggregate the numeric `label` column only: 'mean' / 'median' are undefined for the string
# `id` column, and pandas >= 2.0 raises instead of silently dropping such columns.
stats = final_sub_df.groupby("label_prefix")[['label']].agg(['min', 'max', 'mean', 'median'])
stats

In [None]:
final_sub_df.query("label_prefix == ''")['label'].hist();