In [None]:
# Offline install of the GDCM conda package shipped in the attached dataset;
# the `import gdcm` in the next cell depends on it.
# Presumably needed so pydicom can decode compressed pixel data - TODO confirm.
!cp ../input/gdcm-conda-install/gdcm.tar .
!tar -xvzf gdcm.tar
!conda install --offline ./gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2

In [None]:
import gdcm

In [None]:
import ast
import gc
import math
import os
import numpy as np
import pandas as pd
import pydicom
from tqdm import tqdm
from joblib import Parallel, delayed

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.optim as optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from torchvision import models

from fastprogress import progress_bar

print(os.cpu_count())
n_gpu = torch.cuda.device_count()

In [None]:
def set_seeds(SEED):
    """Seed NumPy and PyTorch (CPU and all CUDA devices) and make cuDNN deterministic.

    The function is self-contained: it does not read any module-level state,
    so it can be called regardless of which other cells have been executed.

    Args:
        SEED: Integer seed applied to every random number generator.
    """
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    # Both CUDA seeding calls are silently ignored when CUDA is unavailable,
    # so no device-count guard is required.
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True


set_seeds(SEED=2020)

In [None]:
batch_size = 8
num_workers = os.cpu_count()

In [None]:
# Flag used to keep notebook commits fast: the heavy inference cells are gated
# on `do_all`, which is only True when the test directory holds more than 700 entries.
do_all = len(os.listdir("../input/rsna-str-pulmonary-embolism-detection/test/")) > 700
# print(do_all)

In [None]:
# do_all = True

## Extract meta data from test data

In [None]:
test_df = pd.read_csv('../input/rsna-str-pulmonary-embolism-detection/test.csv')
# test_df = test_df[:100]

In [None]:
print(test_df.shape)

In [None]:
%%time

def get_file_paths(df, base_dir):
    """Add a ``file_path`` column locating the DICOM file of every row.

    The path is built as
    ``<competition root>/<base_dir><StudyInstanceUID>/<SeriesInstanceUID>/<SOPInstanceUID>.dcm``.

    Args:
        df: DataFrame with string columns ``StudyInstanceUID``,
            ``SeriesInstanceUID`` and ``SOPInstanceUID``. It is modified in place.
        base_dir: Sub-directory of the competition data, including the
            trailing slash (e.g. ``"test/"``).

    Returns:
        The same DataFrame object, with the ``file_path`` column added.
    """
    PATH = "../input/rsna-str-pulmonary-embolism-detection/"
    # Column-wise string concatenation: avoids a Python-level call per row and
    # also works when the frame has no rows.
    df["file_path"] = (PATH + base_dir
                       + df["StudyInstanceUID"] + "/"
                       + df["SeriesInstanceUID"] + "/"
                       + df["SOPInstanceUID"] + ".dcm")
    return df

test_df = get_file_paths(test_df, "test/")

In [None]:
# https://www.kaggle.com/kozodoi/extract-meta-features-from-training-images
def extract_meta_feats(file_path):
    """Read the identifying header tags of one DICOM file.

    Args:
        file_path: Path of the ``.dcm`` file to read.

    Returns:
        A list ``[StudyInstanceUID, SeriesInstanceUID, SOPInstanceUID,
        ImagePositionPatient]`` taken from the file's header.
    """
    # Only header elements are used, so stop parsing before the pixel data
    # instead of loading every image into memory.
    image = pydicom.dcmread(file_path, stop_before_pixels=True)

    return [
        image.StudyInstanceUID,
        image.SeriesInstanceUID,
        image.SOPInstanceUID,
        image.ImagePositionPatient,
    ]

In [None]:
%%time

if do_all:
    # Read the header of every test file in parallel on all available cores,
    # then persist the result so later cells can reload it from disk.
    test_files = test_df["file_path"].tolist()
    results = Parallel(n_jobs=-1, verbose=1)(
        delayed(extract_meta_feats)(path) for path in test_files
    )
    test_meta_df = pd.DataFrame(
        results,
        columns=["StudyInstanceUID", "SeriesInstanceUID", "SOPInstanceUID", "ImagePositionPatient"],
    )
    test_meta_df.to_csv('test_metadata.csv', index=False)

In [None]:
# test_meta_df is only created when do_all is True, so guard the inspection
# to keep the notebook runnable top to bottom in the fast (commit) mode.
if do_all:
    print(test_meta_df.shape)
    print(test_meta_df.columns)

In [None]:
# The metadata frame has been written to test_metadata.csv; release it.
# It only exists when do_all is True, so the deletion is guarded.
if do_all:
    del test_meta_df
gc.collect()

## Extract ResNet18 Features for Sequence Model

In [None]:
def pixel_array(d):
    """Return the ``pixel_array`` attribute of ``d``."""
    pixels = d.pixel_array
    return pixels

In [None]:
def load_dicom_array(file_path):
    """Load a DICOM file and return its pixels after the linear rescale.

    The result is ``pixels * RescaleSlope + RescaleIntercept``, with both
    rescale values read from the file and converted to ``float``.
    """
    dataset = pydicom.dcmread(file_path)
    slope = float(dataset.RescaleSlope)
    intercept = float(dataset.RescaleIntercept)
    return pixel_array(dataset) * slope + intercept

In [None]:
def window(img, WL=50, WW=350):
    """Clip an image to an intensity window and rescale it to ``[0, 1]``.

    Args:
        img: Array of intensities. It is not modified.
        WL: Window level parameter.
        WW: Window width parameter.

    Returns:
        A new array of the same shape: values are clipped to
        ``[(WL - WW) // 2, (WL + WW) // 2]``, shifted so the minimum is 0 and
        divided by the resulting maximum. A constant image (after clipping)
        yields all zeros instead of NaN.
    """
    # NOTE(review): the bounds are (WL +/- WW) // 2 rather than the usual
    # WL +/- WW / 2. Kept as is because the model checkpoint was presumably
    # trained with this exact preprocessing - confirm before changing.
    upper, lower = (WL+WW)//2, (WL-WW)//2
    # np.clip already returns a new array, so no explicit copy is needed.
    X = np.clip(img, lower, upper)
    X = X - np.min(X)
    peak = np.max(X)
    # Guard against 0/0 when every pixel has the same clipped value.
    X = X / (peak if peak != 0 else 1)
    return X

In [None]:
def get_3ch_image(image, transform=None):
    """Build a channel-first 3-channel float32 image from three windows.

    The channels are, in order, the lung, mediastinal and PE-specific windows
    of ``image``. When ``transform`` is given it is called on each channel as
    ``transform(image=channel)['image']``.
    """
    # (WL, WW) per channel: lung, mediastinal, PE-specific.
    window_settings = ((-600, 1500), (40, 400), (100, 700))

    channels = []
    for level, width in window_settings:
        channels.append(window(image, WL=level, WW=width).astype(np.float32))

    if transform:
        channels = [transform(image=channel)['image'] for channel in channels]

    # Stack as the last axis, then move the channel axis to the front.
    return np.stack(channels, axis=-1).transpose(2, 0, 1)

In [None]:
class ImageDataset(data.Dataset):
    """Dataset yielding one windowed 3-channel image plus its identifiers.

    ``df`` supplies the ``StudyInstanceUID`` / ``SeriesInstanceUID`` /
    ``SOPInstanceUID`` of each sample and ``img_list`` the matching file
    paths; both are indexed by the same position.
    """

    def __init__(self, df, img_list, img_size=512, transform=None):
        self.df, self.img_list = df, img_list
        self.img_size, self.transform = img_size, transform

    def __len__(self):
        # The number of samples is driven by the path list.
        return len(self.img_list)

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]
        pixels = load_dicom_array(self.img_list[idx])

        item = {"img": get_3ch_image(pixels, self.transform)}
        for column in ("StudyInstanceUID", "SeriesInstanceUID", "SOPInstanceUID"):
            item[column] = row[column]
        return item

In [None]:
# Identifier columns carried through with every image.
id_columns = ["StudyInstanceUID", "SeriesInstanceUID", "SOPInstanceUID"]

# Unshuffled loader over every test image, in test_df row order.
test_loader = data.DataLoader(
    dataset=ImageDataset(df=test_df[id_columns], img_list=test_df["file_path"].values),
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
test_loader.dataset[4]["img"].shape

## CNN Net Definition

In [None]:
class ResNet(nn.Module):
    """ResNet18 backbone with global max pooling and a small MLP head.

    Args:
        pretrained: Forwarded to the torchvision ``resnet18`` constructor.
        num_classes: Number of outputs of the classification head
            (default 2, which matches the previously hard-coded head).
    """

    def __init__(self, pretrained=False,
                 num_classes=2):
        super().__init__()
        base_model = models.resnet18(pretrained=pretrained)
        # Drop the last two children of the torchvision model and pool the
        # remaining feature map with a global max instead.
        layers = list(base_model.children())[:-2]
        layers.append(nn.AdaptiveMaxPool2d(1))
        self.encoder = nn.Sequential(*layers)

        in_features = base_model.fc.in_features

        # The head honours num_classes rather than a hard-coded 2.
        self.classifier = nn.Sequential(
            nn.Linear(in_features, 512), nn.ReLU(), nn.Dropout(p=0.5),
            nn.Linear(512, num_classes))

    def forward(self, x):
        """Return ``(probabilities, features)`` for a batch of images.

        ``probabilities`` is the softmax over the head's outputs and
        ``features`` the flattened pooled encoder output.
        """
        batch_size = x.size(0)
        features = self.encoder(x).view(batch_size, -1)
        x = self.classifier(features)
        return F.softmax(x, dim=1), features

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ResNet()

In [None]:
# Restore the trained CNN weights, move the model to the target device and
# switch it to evaluation mode.
checkpoint = torch.load(
    "../input/0927-resnet18-loss03220/train.4.pth",
    map_location=device,
)
model.load_state_dict(checkpoint["model_state_dict"])
model = model.to(device).eval()
print("model load finish")

In [None]:
print(model)

## CNN Net Inference

In [None]:
%%time

if do_all:
    embs, predictions = [], []
    StudyInstanceUIDs, SeriesInstanceUIDs, SOPInstanceUIDs = [], [], []

    # Inference only: disable autograd once for the whole loop.
    with torch.no_grad():
        for batch in progress_bar(test_loader):
            # The model returns softmax probabilities (named "logits" here)
            # together with the pooled features used as embeddings.
            logits, emb = model(batch["img"].to(device, dtype=torch.float, non_blocking=True))
            embs.append(emb.detach().cpu().numpy())
            # Keep the probability of class index 1 for every image.
            predictions.append(logits[:, 1].detach().cpu().numpy())

            StudyInstanceUIDs.append(batch["StudyInstanceUID"])
            SeriesInstanceUIDs.append(batch["SeriesInstanceUID"])
            SOPInstanceUIDs.append(batch["SOPInstanceUID"])

    predictions = np.concatenate(predictions, 0).astype(np.float32)

    out_embs = np.concatenate(embs, 0).astype(np.float32)
    print("Write embeddings: shape {} {}".format(*out_embs.shape))

    # Flatten the per-batch identifier lists into one array each.
    out_StudyInstanceUIDs = np.concatenate(StudyInstanceUIDs, 0)
    out_SeriesInstanceUIDs = np.concatenate(SeriesInstanceUIDs, 0)
    out_SOPInstanceUIDs = np.concatenate(SOPInstanceUIDs, 0)

    # Every embedding must have exactly one identifier of each kind.
    assert len(out_embs) == len(out_StudyInstanceUIDs), "{} {}".format(
        len(out_embs), len(out_StudyInstanceUIDs))
    assert len(out_StudyInstanceUIDs) == len(out_SeriesInstanceUIDs), "{} {}".format(
        len(out_StudyInstanceUIDs), len(out_SeriesInstanceUIDs))
    assert len(out_SOPInstanceUIDs) == len(out_SeriesInstanceUIDs), "{} {}".format(
        len(out_SOPInstanceUIDs), len(out_SeriesInstanceUIDs))
    print("MODE: test Size: {}".format(len(out_StudyInstanceUIDs)))

    out_dict = dict(
        embeddings=out_embs,
        StudyInstanceUID=out_StudyInstanceUIDs,
        SeriesInstanceUID=out_SeriesInstanceUIDs,
        SOPInstanceUID=out_SOPInstanceUIDs,
    )

    # Saved next to the notebook; reloaded later as "emb_test_embdim512.npz".
    output_filename = "emb_test_embdim512"
    print("Embedding file name: {}".format(output_filename))
    np.savez_compressed(output_filename, **out_dict)
    print("file Saved.")
    gc.collect()

In [None]:
if do_all:
    # Image-level predictions keyed by SOPInstanceUID. The loader is not
    # shuffled, so predictions follow test_df row order.
    preds_df = pd.DataFrame(
        data={
            "id": test_df["SOPInstanceUID"].values[:len(predictions)],
            "pred": predictions,
        },
        columns=["id", "pred"],
    )
    print(preds_df.head())

In [None]:
del test_loader, model
gc.collect()

In [None]:
# Exam-level targets, in the order the sequence model outputs them.
label_cols = [
    "negative_exam_for_pe",
    "rv_lv_ratio_gte_1",
    "rv_lv_ratio_lt_1",
    "leftsided_pe",
    "chronic_pe",
    "rightsided_pe",
    "acute_and_chronic_pe",
    "central_pe",
    "indeterminate",
]

# One output per exam-level label.
n_classes = len(label_cols)
batch_size = 8
num_workers = os.cpu_count()

# LSTM_UNITS = 512
lr = 1e-5
lrgamma = 0.95
DECAY = 0.0

## Transformer Test preprocessing

In [None]:
tstmdf = pd.read_csv('test_metadata.csv')

In [None]:
%%time

# Series-level key "<SeriesInstanceUID>__<StudyInstanceUID>", built with
# column-wise string concatenation instead of a Python call per row.
tstmdf['SliceID'] = (tstmdf['SeriesInstanceUID'].astype(str)
                     + '__'
                     + tstmdf['StudyInstanceUID'].astype(str))

In [None]:
tstmdf.head()

In [None]:
%%time

# The CSV round trip stored ImagePositionPatient as text, so parse each entry
# back into three float coordinates, one column per axis.
poscols = ['ImagePos{}'.format(axis) for axis in (1, 2, 3)]
tstmdf[poscols] = pd.DataFrame(
    [[float(coord) for coord in ast.literal_eval(raw)]
     for raw in tstmdf['ImagePositionPatient']]
)

# Order the slices of every series by position and keep only the columns
# needed downstream.
tstmdf = (
    tstmdf
    .sort_values(['SliceID'] + poscols)
    [['StudyInstanceUID', 'SliceID', 'SOPInstanceUID'] + poscols]
    .reset_index(drop=True)
)

In [None]:
tstmdf['seq'] = (tstmdf.groupby(['SliceID']).cumcount() + 1)

In [None]:
tstmdf.head()

In [None]:
keepcols = ['StudyInstanceUID', 'SliceID', 'SOPInstanceUID', 'seq']
tstmdf = tstmdf[keepcols]

In [None]:
%%time

# The object returned by np.load for an .npz archive keeps the file open
# until it is closed, so read every array inside a context manager.
with np.load("emb_test_embdim512.npz") as test_emb_f:
    # Row i of test_emb belongs to row i of tstdf.
    test_emb = test_emb_f["embeddings"]
    tstdf = pd.DataFrame({
        "StudyInstanceUID": test_emb_f["StudyInstanceUID"],
        "SeriesInstanceUID": test_emb_f["SeriesInstanceUID"],
        "SOPInstanceUID": test_emb_f["SOPInstanceUID"]})

In [None]:
tstdf = tstdf[["SOPInstanceUID"]].merge(tstmdf, on="SOPInstanceUID", how="left")

In [None]:
print(tstdf.shape)

In [None]:
tstdf.head()

In [None]:
tstdf['embidx'] = range(tstdf.shape[0])

In [None]:
print(tstdf.shape)

In [None]:
tstdf.head()

In [None]:
del test_df, tstmdf
gc.collect()

## Data Loader

In [None]:
class PEDataset(data.Dataset):
    """Dataset yielding the embedding sequence of one series per item.

    Args:
        df: DataFrame with one row per slice and the columns ``SliceID``,
            ``StudyInstanceUID``, ``seq`` and ``embidx`` (plus the columns in
            the module-level ``label_cols`` when ``labels`` is True).
        mat: Array of slice embeddings; ``embidx`` indexes its first axis.
        labels: When True, each item also carries the label values of the
            first slice of the series.
    """

    def __init__(self, df, mat, labels=True):
        self.mat = mat
        self.labels = labels
        self.patients = df.SliceID.unique()
        # Index by SliceID so a whole series can be selected with .loc.
        self.data = df.set_index('SliceID')

    def __len__(self):
        return len(self.patients)

    def __getitem__(self, idx):
        """Return the features of series ``idx`` as a dict.

        Keys: ``emb`` (array of shape ``(n_slices, 3 * embedding_dim)`` holding
        the embedding, its difference to the previous slice and its difference
        to the next slice), ``embidx`` (tensor of embedding row indices),
        ``StudyInstanceUID`` and, when enabled, ``labels``.
        """
        patidx = self.patients[idx]
        # A list-style key always yields a DataFrame. A scalar key returns a
        # Series when the series has a single slice, which breaks `.values`
        # and `sort_values('seq')` below.
        patrows = self.data.loc[[patidx]]
        study_id = patrows["StudyInstanceUID"].values[0]
        patdf = patrows.sort_values('seq')
        patemb = self.mat[patdf['embidx'].values]

        # First/last slice have no predecessor/successor: their delta stays 0.
        patdeltalag  = np.zeros(patemb.shape)
        patdeltalead = np.zeros(patemb.shape)
        patdeltalag[1:] = patemb[1:] - patemb[:-1]
        patdeltalead[:-1] = patemb[:-1] - patemb[1:]

        patemb = np.concatenate((patemb, patdeltalag, patdeltalead), -1)

        ids = torch.tensor(patdf['embidx'].values)

        assert len(patemb) == len(ids), "emb size: {} id size: {}".format(len(patemb), len(ids))

        if self.labels:
            labels = torch.tensor(patdf[label_cols].values[0])
            return {'emb': patemb, 'embidx' : ids, 'labels': labels, "StudyInstanceUID": study_id}
        else:
            return {'emb': patemb, 'embidx' : ids, "StudyInstanceUID": study_id}

In [None]:
def collatefn(batch):
    """Collate variable-length series into one front-padded batch.

    Each item of ``batch`` is a dict with ``emb`` (2-D array), ``embidx``
    (1-D long tensor), ``StudyInstanceUID`` and optionally ``labels``. Items
    are padded in place at the front up to the longest sequence: ``emb`` with
    zeros, ``embidx`` with -1, and a ``mask`` entry marks padded positions
    with 0 and real ones with 1.

    Returns:
        Dict with float tensors ``emb``, ``mask`` and ``embidx`` (and
        ``labels`` when present) plus the list ``StudyInstanceUID``.
    """
    longest = max(item['emb'].shape[0] for item in batch)
    width = batch[0]['emb'].shape[1]
    has_labels = 'labels' in batch[0]

    for item in batch:
        pad = longest - len(item['emb'])
        item['emb'] = np.concatenate([np.zeros((pad, width)), item['emb']], axis=0)
        item['embidx'] = torch.cat(
            (torch.full((pad,), -1, dtype=torch.long), item['embidx']))
        padding_mask = np.ones((longest))
        padding_mask[:pad] = 0.
        item['mask'] = padding_mask

    collated = {
        'emb': torch.tensor(np.stack([item['emb'] for item in batch])).float(),
        'mask': torch.tensor(np.stack([item['mask'] for item in batch])).float(),
        'embidx': torch.stack([item['embidx'] for item in batch]).float(),
    }
    if has_labels:
        collated["labels"] = torch.tensor(np.vstack([item["labels"] for item in batch])).float()

    collated["StudyInstanceUID"] = [item["StudyInstanceUID"] for item in batch]
    return collated

In [None]:
# Unlabelled, unshuffled loader: one item per test series, padded per batch
# by collatefn.
test_dataset = PEDataset(df=tstdf, mat=test_emb, labels=False)
test_loader = data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    collate_fn=collatefn,
    num_workers=num_workers,
    shuffle=False,
)

## Transformer Definition

In [None]:
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal encoding to the input, followed by dropout.

    A table of shape ``(max_len, 1, d_model)`` is registered as the buffer
    ``pe``: even feature indices hold sines and odd ones cosines.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        frequencies = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * frequencies

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        # Insert a singleton middle axis so the table broadcasts over dim 1.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        # The encoding is selected by the size of the FIRST axis of x.
        return self.dropout(x + self.pe[:x.size(0), :])

In [None]:
class PETransformerModel(nn.Module):
    """Transformer encoder over slice embeddings with a linear output head.

    Args:
        n_classes: Number of outputs of the final linear layer.
        ninp: Feature size of the input (and of the encoder).
        nhead: Number of attention heads.
        nhid: Width of the encoder's feed-forward layers.
        nlayers: Number of stacked encoder layers.
        dropout: Dropout used by the positional encoding and encoder layers.
    """

    def __init__(self, n_classes, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.ninp = ninp
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(ninp, nhead, nhid, dropout), nlayers)
        self.decoder = nn.Linear(ninp, n_classes)

        self.init_weigths()

    def init_weigths(self):
        """Zero the head's bias and draw its weights uniformly from [-0.1, 0.1]."""
        initrange = 0.1
        with torch.no_grad():
            self.decoder.bias.zero_()
            self.decoder.weight.uniform_(-initrange, initrange)

    def forward(self, x):
        """Return the head's raw outputs after averaging the encoder output over dim 1."""
        # NOTE(review): this file's collate function produces (batch, seq, feature)
        # tensors, while the encoder layer is built with its default layout and
        # the positional encoding is indexed by dim 0. It looks like dim 0 is
        # treated as the sequence axis here - verify against the training code
        # before changing, since the checkpoint was presumably trained this way.
        scaled = x * math.sqrt(self.ninp)
        hidden = self.transformer_encoder(self.pos_encoder(scaled))
        return self.decoder(hidden.mean(1))

In [None]:
nhid = 768
nlayers = 2
nhead = 2
dropout = 0.2

In [None]:
# Each slice is fed as [embedding, lag delta, lead delta] (see PEDataset),
# hence three times the width of the stored CNN embeddings.
embed_size = test_emb.shape[-1] * 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# `model` is rebound here: from this point on it is the sequence model.
model = PETransformerModel(n_classes, embed_size, nhead, nhid, nlayers, dropout=dropout)
# This checkpoint is loaded directly as a state dict.
checkpoint = torch.load("../input/pe-transformer-model/transformer_epoch15.pth", map_location=device)
model.load_state_dict(checkpoint)
model.to(device)

In [None]:
model.eval()

In [None]:
# Submission id templates, "<StudyInstanceUID>_<label>". Derived from
# label_cols so the order always matches the model's output order instead of
# relying on a second hand-maintained list.
classes = ["{}_" + label for label in label_cols]

In [None]:
sub = pd.read_csv("../input/rsna-str-pulmonary-embolism-detection/sample_submission.csv")
print(sub.shape)

In [None]:
%%time

# One small id/pred frame per study, plus the image-level frame at the end.
preds_all_df = []

if do_all:
    # Inference only: disable autograd once for the whole loop.
    with torch.no_grad():
        for batch in tqdm(test_loader):
            inputs = batch["emb"].to(device, dtype=torch.float, non_blocking=True)
            # The model returns raw scores; the sigmoid maps them to [0, 1].
            preds = torch.sigmoid(model(inputs)).cpu().numpy()

            # One row per entry of `classes` for every study in the batch.
            for StudyInstanceUID, pred in zip(batch["StudyInstanceUID"], preds):
                id_names = [c.format(StudyInstanceUID) for c in classes]
                preds_all_df.append(pd.DataFrame({"id": id_names, "pred": pred.tolist()}))

    # Image-level predictions from the CNN share the same id/pred layout.
    preds_all_df.append(preds_df)
    # `axis` is passed by keyword: recent pandas versions no longer accept it
    # positionally.
    pred_sub = pd.concat(preds_all_df, axis=0)

    # Align with the sample submission's row order; ids without a prediction
    # fall back to 0.5.
    sub = sub.merge(pred_sub, on="id", how="left")
    sub = sub[["id", "pred"]]
    sub.columns = ["id", "label"]
    print(sub.isna().sum())
    sub = sub.fillna(0.5)
    sub.to_csv("submission.csv", index=False)
else:
    # Fast mode: write the sample submission unchanged.
    sub.to_csv("submission.csv", index=False)

In [None]:
sub.head()