In [None]:
# Install GDCM from the attached dataset archive; `--offline` keeps conda from
# contacting any remote channel.
# NOTE(review): presumably GDCM is needed by pydicom to decode compressed pixel data — confirm.
!cp ../input/gdcm-conda-install/gdcm.tar .
!tar -xvzf gdcm.tar
!conda install --offline ./gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, glob, pickle, time, gc, copy, sys, multiprocessing
from joblib import Parallel, delayed

import warnings
import cv2, pydicom
# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100) # maximum number of columns shown when displaying a DataFrame

In [None]:
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.optim.lr_scheduler import _LRScheduler
from sklearn import metrics


# timm and pretrainedmodels are imported from source trees attached as input
# datasets, so their directories are added to the import path first.
sys.path.append('../input/timm-efficientnet/pytorch-image-models-master/') # Done path OK
sys.path.append('../input/pretrainedmodels/pretrained-models.pytorch-master/') # Done path OK
import timm
import pretrainedmodels

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Let cuDNN auto-tune its convolution algorithms.
torch.backends.cudnn.benchmark = True

# test data loading

In [None]:
# Column holding the per-image identifier.
col_index = 'SOPInstanceUID'
# Column holding the per-study identifier; images are grouped by it.
col_groupby = 'StudyInstanceUID'
df_test_path = "../input/rsna-str-pulmonary-embolism-detection/test.csv"
# df_test_path = "../input/rsna-str-pulmonary-embolism-detection/train.csv"
df_train_path = "../input/rsna-str-pulmonary-embolism-detection/train.csv"
df_sub_path = "../input/rsna-str-pulmonary-embolism-detection/sample_submission.csv"
test_image_path = "../input/rsna-str-pulmonary-embolism-detection/test/"
# test_image_path = "../input/rsna-str-pulmonary-embolism-detection/train/"
train_image_path = "../input/rsna-str-pulmonary-embolism-detection/train/"
# NOTE(review): LEN_PUBLIC and DEBUG are not referenced anywhere else in the visible notebook.
LEN_PUBLIC = 146853
DEBUG = False
# Number of DataLoader worker processes.
num_cpu = multiprocessing.cpu_count()
# Length of the per-image feature vector fed to the 1D CNN.
# NOTE(review): presumably this must equal the backbone's `num_features` — confirm.
num_features1_1 = 1408
# Directory with the per-fold image-model weights (weight_epoch_16_fold{N}.pth).
weight_dir1_1 = "../input/rsna2020-1/201024_5_b2_1loss_512_fp16_bs80_lr1e3/201024_5_b2_1loss_512_fp16_bs80_lr1e3"
# Directory with the per-fold 1D-CNN weights (cnn_weight_best_fold{N}.pth).
weight_dir2_1 = "../input/rsna2020-1/201026_3_2ndNNs_features_pool_flip_b0b2/201026_3_2ndNNs_features_pool_flip_b0b2"
# When False and the train directory exists, only the first 2000 test rows are predicted.
do_full = False
# Number of slices pushed through the image model per forward pass.
BATCH_SIZE = 64
# Number of cross-validation folds whose predictions are averaged.
NUM_FOLD = 5

In [None]:
# Target names. The first nine are predicted once per study and the last one
# ('pe_present_on_image') once per image. The order matters: study-level model
# outputs and `mean_targets` are matched to these names by position.
col_targets = [
    'negative_exam_for_pe',
    'indeterminate',
    'chronic_pe',
    'acute_and_chronic_pe',
    'central_pe',
    'leftsided_pe',
    'rightsided_pe',
    'rv_lv_ratio_gte_1',
    'rv_lv_ratio_lt_1',
    'pe_present_on_image',
]

In [None]:
# Read the test CSV once and derive every other frame from it.
df_test_full = pd.read_csv(df_test_path)
# One row per study: the first image row of each study.
df_test_full_study = df_test_full[~df_test_full[col_groupby].duplicated()]
if os.path.exists('../input/rsna-str-pulmonary-embolism-detection/train') and not do_full:
    # Quick run: predict only the first 2000 image rows.
    df_test = df_test_full.head(2000).copy()
else:
    df_test = df_test_full.copy()
# `.copy()` keeps df_test independent of df_test_full, because later cells add
# columns to df_test.
print(df_test.shape)
df_test.head()

In [None]:
# Sort so that all images of a study are contiguous and re-number the rows:
# DicomDataset later selects a study with a positional slice
# [start_index, start_index + num_images).
df_test = df_test.sort_values([col_groupby, col_index]).reset_index(drop=True)
df_test.head()

In [None]:
# Build the file path of every slice: <root>/<study>/<series>/<image>.dcm
# NOTE(review): test_image_path already ends with "/", so the result contains "//".
df_test['dicom_path'] = test_image_path + "/" +\
    df_test[col_groupby].values + "/" + \
    df_test['SeriesInstanceUID'].values + "/" + \
    df_test[col_index].values + ".dcm"
print(df_test['dicom_path'][0])
# NOTE(review): the commented-out line below presumably simulates an unreadable file — confirm.
# df_test['dicom_path'][0] = "asdfja:sdfk"
df_test.head()

In [None]:
# Study-level table: one row per study with the position of its first image in
# df_test ('start_index') and its number of images ('num_images').
study_sizes = df_test.groupby(col_groupby)[col_index].agg(len).reset_index()
study_sizes.columns = [col_groupby, 'num_images']

first_rows = df_test[~df_test[col_groupby].duplicated()]
first_rows = first_rows.assign(start_index=first_rows.index.values)

df_test_study = first_rows.merge(study_sizes, on=col_groupby)
print(df_test_study.shape)
df_test_study.head()

# Prediction

In [None]:
class DicomDataset(Dataset):
    """Dataset that yields one complete study (all of its slices) per item.

    Args:
        X_study: DataFrame with one row per study. Rows are looked up by label,
            so it must be indexed 0..n-1, and it must provide the column named
            by ``col_groupby`` plus ``'start_index'`` and ``'num_images'``.
        X_image: DataFrame with one row per image, ordered so that each study
            occupies the positional range ``[start_index, start_index + num_images)``;
            must provide a ``'dicom_path'`` column.
        transform: Stored on the instance; not applied by ``__getitem__``.
        meta: Accepted for call compatibility; not used.
        verbose: When true, print array shapes and metadata for each study.
    """

    def __init__(self, X_study, X_image, transform=None, meta=False, verbose=False):
        self.X_study = X_study
        self.X_image = X_image
        self.transform = transform
        self.verbose = verbose

    def __getitem__(self, index):
        """Load, order and rescale every slice of the study at ``index``.

        Returns:
            tuple: ``(images, df_study)`` where ``images`` is a float16 array of
            shape ``(num_slices, 1, 512, 512)`` holding
            ``(pixel * RescaleSlope + RescaleIntercept) / 1000`` and ``df_study``
            is the study's image rows sorted by z position, with added
            ``'z_pos'`` and ``'series_index'`` columns.
        """
        # get df_study
        study = self.X_study[col_groupby][index]
        start_index = self.X_study['start_index'][index]
        end_index = start_index + self.X_study['num_images'][index]
        df_study = self.X_image.iloc[start_index:end_index].reset_index(drop=True)

        # Fallback metadata, used when the first slice cannot be read.
        RescaleSlope = 1
        RescaleIntercept = -1024
        PatientPosition = 'HFS'

        # load dicoms
        images_study = []
        z_pos = []
        for i in range(len(df_study)):
            tmp_path = df_study['dicom_path'][i]
            try:
                # Read everything that can fail BEFORE touching the output
                # lists, so a partially readable file can never contribute two
                # entries for the same slice.
                tmp_dcm = pydicom.dcmread(tmp_path)
                tmp_npy = np.asarray(tmp_dcm.pixel_array)
                tmp_z = tmp_dcm['ImagePositionPatient'].value[-1]
                if i==0:
                    first_meta = (
                        tmp_dcm['RescaleSlope'].value,
                        tmp_dcm['RescaleIntercept'].value,
                        tmp_dcm['PatientPosition'].value,
                    )
            except Exception:
                # Best effort: substitute a blank slice and push it to the front
                # of the z ordering instead of aborting the whole study.
                print("loading error!!!, study: {}, index: {}".format(study, i))
                tmp_npy = np.zeros([512, 512], np.int16)
                tmp_z = -10000-i
            else:
                if i==0:
                    RescaleSlope, RescaleIntercept, PatientPosition = first_meta
            images_study.append(tmp_npy)
            z_pos.append(tmp_z)

        images_study = np.array(images_study)
        z_pos = np.array(z_pos)
        # Order the slices (and their metadata rows) by z position.
        images_study = images_study[np.argsort(z_pos)]
        df_study['z_pos'] = z_pos
        df_study = df_study.sort_values('z_pos').reset_index(drop=True)
        df_study['series_index'] = np.arange(len(df_study))
        if self.verbose: print(images_study.shape)
        if self.verbose: print(z_pos)
        if self.verbose: print(RescaleIntercept, RescaleSlope, PatientPosition)

        # process images
        images_study_processed = (images_study.astype(np.float32) * RescaleSlope + RescaleIntercept)/1000
        if PatientPosition=='FFP':
            # Flip both in-plane axes for this patient position.
            images_study_processed = images_study_processed[:, ::-1, ::-1]
        # Slices are expected to be 512x512; add a channel axis and halve the memory.
        images_study_processed = images_study_processed.reshape([-1, 1, 512, 512]).astype(np.float16)

        return images_study_processed, df_study

    def __len__(self):
        """Number of studies."""
        return len(self.X_study)

In [None]:
def my_collate(batch):
    """Unwrap a single-item batch into ``(image tensor, metadata)``.

    Only the first item of ``batch`` is used; its image array is converted to a
    default-dtype ``torch.Tensor`` and its second element is passed through
    unchanged.
    """
    first_item = batch[0]
    images_array, study_frame = first_item[0], first_item[1]
    return torch.Tensor(images_array), study_frame

In [None]:
class nnWindow(nn.Module):
    """Map a 1-channel image to 3 soft-windowed, instance-normalised channels.

    Each output channel applies ``sigmoid((x - center) / width + 0.5)`` with its
    own (center, width) pair, implemented as a 1x1 convolution with fixed
    initial weights followed by a sigmoid and ``InstanceNorm2d``.
    """

    def __init__(self):
        super().__init__()
        # (center, width) per output channel, divided by 1000.
        center_width = np.array(((40, 80), (80, 200), (40, 400))) / 1000
        centers = center_width[:, 0]
        widths = center_width[:, 1]

        to_channels = nn.Conv2d(1, 3, kernel_size=(1, 1))
        # weight = 1 / width, bias = 0.5 - center / width
        to_channels.weight.data.copy_(torch.from_numpy(1. / widths).reshape(3, 1, 1, 1))
        to_channels.bias.data.copy_(torch.from_numpy(0.5 - centers / widths))

        self.window = nn.Sequential(to_channels, nn.Sigmoid(), nn.InstanceNorm2d(3))

    def forward(self, input1):
        """Apply the windowing stack to ``input1`` (N, 1, H, W)."""
        return self.window(input1)
        
        
class MyEffNet_b0(nn.Module):
    """Image classifier: windowing -> timm backbone features -> pooling -> linear head.

    Only ``forward_features`` of the timm model is used, so the backbone's own
    classifier does not contribute to the outputs.

    Args:
        num_classes: Number of outputs of ``last_linear``.
        base_model: Name passed to ``timm.create_model``.
    """

    def __init__(self, num_classes=10, base_model='tf_efficientnet_b0_ns'):
        super().__init__()

        self.num_classes = num_classes
        self.mode = 'train'
        self.window = nnWindow()
        self.base_model = timm.create_model(base_model, pretrained=False, num_classes=10).to(device, non_blocking=True)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.last_linear = nn.Linear(self.base_model.num_features, num_classes)

    def forward(self, input1):
        """Return the logits for ``input1`` (N, 1, H, W)."""
        logits, _ = self.feature(input1)
        return logits

    def feature(self, input1):
        """Return ``(logits, pooled_features)`` for ``input1`` (N, 1, H, W)."""
        batch_size, _, _, _ = input1.size()
        backbone_maps = self.base_model.forward_features(self.window(input1))
        pooled = self.avgpool(backbone_maps).view(batch_size, -1)
        return self.last_linear(pooled), pooled

In [None]:
"""
gdcm
Use try/except when reading DICOM (.dcm) files

"""

In [None]:
class SEModule(nn.Module):
    """Squeeze-and-excitation gate for (N, C, L) sequences.

    The input is averaged over its last axis, passed through a two-layer
    bottleneck of 1x1 convolutions (C -> C // reduction -> C) and a sigmoid,
    and the result rescales the input channel-wise.
    """

    def __init__(self, channels, reduction):
        super().__init__()
        squeezed = channels // reduction
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Conv1d(channels, squeezed, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv1d(squeezed, channels, kernel_size=1, padding=0)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` multiplied by its per-channel gate."""
        gate = self.fc1(self.avg_pool(x))
        gate = self.fc2(self.relu(gate))
        return x * self.sigmoid(gate)
    
class CNN1D(nn.Module):
    """1D CNN over a sequence of per-image feature vectors.

    Input is ``(N, input_ch, L)``. The channel axis is first max-pooled by a
    factor of 4, then four conv stages (64/128/256/512 channels) are applied
    with stride-2 max pooling between them.

    Args:
        num_classes: Accepted but not used; the sequence-level head always has
            9 outputs and the per-position head 2 channels.
        input_ch: Number of input channels before the channel pooling.
        verbose: Stored on the instance; not used in ``forward``.
    """

    def __init__(self, num_classes=400, input_ch=1, verbose=False):
        super().__init__()
        feature_pool = 4
        pooled_ch = input_ch // feature_pool
        self.verbose = verbose

        self.layer1 = self._conv_stage(pooled_ch, 64, kernel_size=7)
        self.fpool = nn.MaxPool1d(kernel_size=feature_pool, stride=feature_pool, padding=0)
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
        self.layer2 = self._conv_stage(64, 128, kernel_size=3)
        self.layer3 = self._conv_stage(128, 256, kernel_size=3)
        self.layer4 = self._conv_stage(256, 512, kernel_size=3)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        # Per-position head over the concatenation of the pooled input and all
        # four stage outputs.
        self.fc2 = nn.Conv1d(pooled_ch + 64 + 128 + 256 + 512, 2, kernel_size=1)
        # Sequence-level head.
        self.fc = nn.Sequential(
            nn.Linear(512, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, 9),
        )

    @staticmethod
    def _conv_stage(in_ch, out_ch, kernel_size):
        """Build one conv -> batch norm -> ReLU -> SE stage with 'same' padding."""
        return nn.Sequential(
            nn.Conv1d(in_ch, out_ch, kernel_size=kernel_size, stride=1,
                      padding=kernel_size // 2, bias=False),
            nn.BatchNorm1d(out_ch),
            nn.ReLU(inplace=True),
            SEModule(out_ch, 16),
        )

    def forward(self, x_input):
        """Return ``(y, y2)``: sequence-level logits (N, 9) and per-position logits (N, 2, L)."""
        batch_size, _, seq_len = x_input.size()

        # Max-pool along the channel axis by moving it to the last position.
        x0 = self.fpool(x_input.transpose(1, 2)).transpose(1, 2)

        x1 = self.maxpool(self.layer1(x0))
        x2 = self.maxpool(self.layer2(x1))
        x3 = self.maxpool(self.layer3(x2))
        x4 = self.layer4(x3)

        # Bring every stage back to the input length before concatenating.
        resampled = [F.adaptive_avg_pool1d(stage, seq_len) for stage in (x1, x2, x3, x4)]
        y2 = self.fc2(torch.cat([x0] + resampled, dim=1))

        y = self.fc(self.avgpool(x4).view(batch_size, -1))
        return y, y2

In [None]:
# Every dataset item is already a whole study, so the loader uses batch_size=1
# and `my_collate` simply unwraps that single item. Worker processes load the
# DICOM files in parallel; shuffle=False keeps the study order.
dataset_test = DicomDataset(df_test_study, df_test)
test_loader = DataLoader(
    dataset_test,
    batch_size=1,
    shuffle=False,
    num_workers=num_cpu,
    pin_memory=True,
    collate_fn=my_collate
)

In [None]:
# Image-level model. Despite the class name, the backbone requested here is
# 'tf_efficientnet_b2_ns'.
model_b0 = MyEffNet_b0(base_model='tf_efficientnet_b2_ns').to(device, non_blocking=True)
# Study-level model that consumes the sequence of per-image feature vectors.
model_1dcnn = CNN1D(input_ch=num_features1_1).to(device, non_blocking=True)
# Inference only: switch both models to eval mode.
model_b0.eval()
model_1dcnn.eval()
# Converts logits to probabilities.
lastfunc = nn.Sigmoid().to(device, non_blocking=True)

In [None]:
def batch_padding(batch, multiple=64):
    """Zero-pad the last axis of ``batch`` up to the next multiple of ``multiple``.

    Args:
        batch: Tensor of shape ``(bs, ch, d)``.
        multiple: The padded length is the smallest multiple of this value that
            is >= ``d``. Defaults to 64.

    Returns:
        A new float32 tensor of shape ``(bs, ch, d_new)`` on the same device as
        ``batch``; positions ``[:d]`` hold the input values and the rest is zero.
    """
    bs, ch, d = batch.shape
    # Integer ceil-division avoids a float round trip.
    d_new = -(-d // multiple) * multiple
    # Allocate directly on the input's device rather than building a host
    # buffer and transferring it.
    batch_new = torch.zeros([bs, ch, d_new], dtype=torch.float32, device=batch.device)
    batch_new[:, :, :d] = batch
    return batch_new

In [None]:
# Read every fold's weights from disk once. The loop below switches folds by
# copying these in-memory state dicts into the two models, instead of
# re-reading both checkpoint files for every study and fold.
fold_states_b0 = [
    torch.load("{}/weight_epoch_16_fold{}.pth".format(weight_dir1_1, fold+1), map_location=device)
    for fold in range(NUM_FOLD)
]
fold_states_1dcnn = [
    torch.load("{}/cnn_weight_best_fold{}.pth".format(weight_dir2_1, fold+1), map_location=device)
    for fold in range(NUM_FOLD)
]

df_pred_image = []
df_pred_study = []
starttime = time.time()
verbose = False
for study_index, (images, df_study) in enumerate(test_loader):
    if verbose: print("load index {} done".format(study_index), time.time()-starttime)
    if (study_index+1)%10==0:
        print("{}/{}, sec: {:.1f}".format(study_index+1, len(df_test_study), time.time()-starttime))
    num_batches = int(np.ceil(images.shape[0]/BATCH_SIZE))
    num_images = len(df_study)
    # Copies, because per-fold prediction columns are added to both frames.
    df_study_image = df_study[[col_groupby, col_index, 'SeriesInstanceUID']].copy()
    df_study_study = df_study[[col_groupby]].iloc[:1].copy()
    for fold in range(NUM_FOLD):
        model_b0.load_state_dict(fold_states_b0[fold])
        model_1dcnn.load_state_dict(fold_states_1dcnn[fold])

        # Stage 1: per-image feature vectors, computed in chunks of BATCH_SIZE.
        features = []
        with torch.no_grad():
            with torch.cuda.amp.autocast():
                for batch_index in range(num_batches):
                    batch = images[batch_index*BATCH_SIZE:(batch_index+1)*BATCH_SIZE].to(device, non_blocking=True)
                    _, feature = model_b0.feature(batch)
                    features.append(feature)
        features = torch.cat(features, dim=0) # (num_images, num_features)
        # -> (1, num_features, num_images), then zero-pad the sequence axis.
        features = torch.transpose(features, 0,1).reshape([1, num_features1_1, -1])
        features = batch_padding(features)

        # Stage 2: study-level outputs and per-position outputs.
        with torch.no_grad():
            output1, output2 = model_1dcnn(features)
            # Only the last channel of the per-position head is used.
            output2 = output2[:,-1:]
            output1 = lastfunc(output1)
            # Drop the padded positions again.
            output2 = lastfunc(output2)[:,:,:num_images]
        for i, col in enumerate(col_targets[:-1]):
            df_study_study["{}_pred_fold{}".format(col, fold+1)] = output1[0, i].data.cpu().numpy()
        df_study_image["{}_pred_fold{}".format(col_targets[-1], fold+1)] = output2[0, 0].data.cpu().numpy()
    df_pred_study.append(df_study_study)
    df_pred_image.append(df_study_image)
# Rough timing reference from the original run (weights reloaded per study): 10/650 studies, 45.6 sec

# Postprocessing

In [None]:
# Stack the per-study frames into one image-level and one study-level table.
# NOTE: this rebinds both names from lists to DataFrames, so the cell cannot be
# re-run without first re-running the prediction loop.
df_pred_image = pd.concat(df_pred_image).reset_index(drop=True)
df_pred_study = pd.concat(df_pred_study).reset_index(drop=True)
print(df_pred_image.shape, df_pred_study.shape)
df_pred_image.head()

In [None]:
# Preview the study-level per-fold predictions.
df_pred_study.head()

In [None]:
# Ensemble: each study-level target is the plain mean of its per-fold predictions.
for target in col_targets[:-1]:
    fold_columns = ["{}_pred_fold{}".format(target, k+1) for k in range(NUM_FOLD)]
    df_pred_study[target] = df_pred_study[fold_columns].values.mean(axis=1)
df_pred_study.head(30)

In [None]:
# Ensemble: the image-level target is the plain mean of its per-fold predictions.
image_target = col_targets[-1]
cols_tmp = ["{}_pred_fold{}".format(image_target, k+1) for k in range(NUM_FOLD)]
df_pred_image[image_target] = df_pred_image[cols_tmp].values.mean(axis=1)
df_pred_image.head()

In [None]:
# process conflict

def solve_conflict(df_pred_s, df_pred, TH_NEGATIVE=0.5, TH_INDETERMINATE = 0.5, verbose=True, group_col=None):
    """Nudge predictions across 0.5 so the labels are mutually consistent.

    Every study is first put into exactly one class:
    indeterminate (``indeterminate > TH_INDETERMINATE``), otherwise negative
    (``negative_exam_for_pe > TH_NEGATIVE``), otherwise positive. Predictions
    that contradict that class are then moved to 0.5 (when they must not exceed
    0.5) or to 0.5001 (when they must exceed 0.5):

    * the class's own label is > 0.5 and the other class label is <= 0.5;
    * non-positive studies: all rv/lv, location and chronicity labels and all
      image labels are <= 0.5;
    * positive studies: exactly one rv/lv label is > 0.5, the largest location
      label is > 0.5, chronic_pe and acute_and_chronic_pe are not both > 0.5,
      and the highest-scoring image(s) of the study are > 0.5.

    Args:
        df_pred_s: Study-level predictions, one row per study. Modified in place;
            a ``'positive'`` column (0/1) is added.
        df_pred: Image-level predictions with ``'pe_present_on_image'``.
            Modified in place.
        TH_NEGATIVE: Threshold on ``negative_exam_for_pe`` for the negative class.
        TH_INDETERMINATE: Threshold on ``indeterminate`` for the indeterminate class.
        verbose: Print how many rows each rule touched.
        group_col: Name of the study-id column shared by both frames. Defaults to
            the module-level ``col_groupby``.

    Returns:
        tuple: ``(df_pred_s, df_pred)`` — the same objects that were passed in.
    """
    if group_col is None:
        group_col = col_groupby

    # All writes use `.loc[mask, column]`: chained assignment
    # (`df[column][mask] = value`) is not guaranteed to modify the frame.
    index_indeterminate = df_pred_s['indeterminate']>TH_INDETERMINATE
    index_negative = (~index_indeterminate) & (df_pred_s['negative_exam_for_pe']>TH_NEGATIVE)
    index_positive = (~index_indeterminate) & (~index_negative)
    index_not_positive = ~index_positive

    # ---- make the three-class decision visible in the two class labels ----
    index_negative_and_negative_lte_05 = index_negative & (df_pred_s['negative_exam_for_pe']<=0.5)
    df_pred_s.loc[index_negative_and_negative_lte_05, 'negative_exam_for_pe'] = 0.5001

    index_indeterminate_and_indeterminate_lte_05 = index_indeterminate & (df_pred_s['indeterminate']<=0.5)
    df_pred_s.loc[index_indeterminate_and_indeterminate_lte_05, 'indeterminate'] = 0.5001

    index_indeterminate_and_negative_gt_05 = index_indeterminate & (df_pred_s['negative_exam_for_pe']>0.5)
    df_pred_s.loc[index_indeterminate_and_negative_gt_05, 'negative_exam_for_pe'] = 0.5

    index_negative_and_indeterminate_gt_05 = index_negative & (df_pred_s['indeterminate']>0.5)
    df_pred_s.loc[index_negative_and_indeterminate_gt_05, 'indeterminate'] = 0.5

    index_positive_and_negative_gt_05 = index_positive & (df_pred_s['negative_exam_for_pe']>0.5)
    df_pred_s.loc[index_positive_and_negative_gt_05, 'negative_exam_for_pe'] = 0.5

    index_positive_and_indeterminate_gt_05 = index_positive & (df_pred_s['indeterminate']>0.5)
    df_pred_s.loc[index_positive_and_indeterminate_gt_05, 'indeterminate'] = 0.5

    # ---- non-positive studies: no PE-specific label may exceed 0.5 ----
    index_negative_and_rv_lv_ratio_lt_1_gt_05 = index_not_positive & (df_pred_s['rv_lv_ratio_lt_1']>0.5)
    df_pred_s.loc[index_negative_and_rv_lv_ratio_lt_1_gt_05, 'rv_lv_ratio_lt_1'] = 0.5

    index_negative_and_rv_lv_ratio_gte_1_gt_05 = index_not_positive & (df_pred_s['rv_lv_ratio_gte_1']>0.5)
    df_pred_s.loc[index_negative_and_rv_lv_ratio_gte_1_gt_05, 'rv_lv_ratio_gte_1'] = 0.5

    # central_pe must be capped like the other location labels; previously this
    # mask was computed but never applied.
    index_negative_and_central_pe_gt_05 = index_not_positive & (df_pred_s['central_pe']>0.5)
    df_pred_s.loc[index_negative_and_central_pe_gt_05, 'central_pe'] = 0.5

    index_negative_and_rightsided_pe_gt_05 = index_not_positive & (df_pred_s['rightsided_pe']>0.5)
    df_pred_s.loc[index_negative_and_rightsided_pe_gt_05, 'rightsided_pe'] = 0.5

    index_negative_and_leftsided_pe_gt_05 = index_not_positive & (df_pred_s['leftsided_pe']>0.5)
    df_pred_s.loc[index_negative_and_leftsided_pe_gt_05, 'leftsided_pe'] = 0.5

    index_negative_and_chronic_pe_gt_05 = index_not_positive & (df_pred_s['chronic_pe']>0.5)
    df_pred_s.loc[index_negative_and_chronic_pe_gt_05, 'chronic_pe'] = 0.5

    index_negative_and_acute_and_chronic_pe_gt_05 = index_not_positive & (df_pred_s['acute_and_chronic_pe']>0.5)
    df_pred_s.loc[index_negative_and_acute_and_chronic_pe_gt_05, 'acute_and_chronic_pe'] = 0.5

    # ---- positive studies: exactly one rv/lv label above 0.5 ----
    index_positive_and_rv_gte_lv = index_positive & (df_pred_s['rv_lv_ratio_lt_1']<=df_pred_s['rv_lv_ratio_gte_1'])
    index_positive_and_rv_lt_lv = index_positive & (df_pred_s['rv_lv_ratio_lt_1']>df_pred_s['rv_lv_ratio_gte_1'])

    index_positive_and_rv_gte_lv_and_rv_lv_ratio_gte_1_lte_05 =\
        (index_positive_and_rv_gte_lv) & (df_pred_s['rv_lv_ratio_gte_1']<=0.5)
    df_pred_s.loc[index_positive_and_rv_gte_lv_and_rv_lv_ratio_gte_1_lte_05, 'rv_lv_ratio_gte_1'] = 0.5001

    index_positive_and_rv_gte_lv_and_rv_lv_ratio_lt_1_gt_05 =\
        (index_positive_and_rv_gte_lv) & (df_pred_s['rv_lv_ratio_lt_1']>0.5)
    df_pred_s.loc[index_positive_and_rv_gte_lv_and_rv_lv_ratio_lt_1_gt_05, 'rv_lv_ratio_lt_1'] = 0.5

    index_positive_and_rv_lt_lv_and_rv_lv_ratio_lt_1_lte_05 =\
        (index_positive_and_rv_lt_lv) & (df_pred_s['rv_lv_ratio_lt_1']<=0.5)
    df_pred_s.loc[index_positive_and_rv_lt_lv_and_rv_lv_ratio_lt_1_lte_05, 'rv_lv_ratio_lt_1'] = 0.5001

    index_positive_and_rv_lt_lv_and_rv_lv_ratio_gte_1_gt_05 =\
        (index_positive_and_rv_lt_lv) & (df_pred_s['rv_lv_ratio_gte_1']>0.5)
    df_pred_s.loc[index_positive_and_rv_lt_lv_and_rv_lv_ratio_gte_1_gt_05, 'rv_lv_ratio_gte_1'] = 0.5

    # ---- positive studies: the largest location label must exceed 0.5 ----
    index_positive_and_central_is_greatest = index_positive & (df_pred_s['central_pe']>=df_pred_s['rightsided_pe']) & (df_pred_s['central_pe']>=df_pred_s['leftsided_pe'])
    index_positive_and_right_is_greatest = index_positive & (~index_positive_and_central_is_greatest) & (df_pred_s['rightsided_pe']>=df_pred_s['leftsided_pe'])
    index_positive_and_left_is_greatest = index_positive & (~index_positive_and_central_is_greatest) & (~index_positive_and_right_is_greatest)

    index_positive_and_central_is_greatest_and_central_pe_lte_05 = (index_positive_and_central_is_greatest) & (df_pred_s['central_pe']<=0.5)
    df_pred_s.loc[index_positive_and_central_is_greatest_and_central_pe_lte_05, 'central_pe'] = 0.5001

    index_positive_and_right_is_greatest_and_rightsided_pe_lte_05 = (index_positive_and_right_is_greatest) & (df_pred_s['rightsided_pe']<=0.5)
    df_pred_s.loc[index_positive_and_right_is_greatest_and_rightsided_pe_lte_05, 'rightsided_pe'] = 0.5001

    index_positive_and_left_is_greatest_and_leftsided_pe_lte_05 = (index_positive_and_left_is_greatest) & (df_pred_s['leftsided_pe']<=0.5)
    df_pred_s.loc[index_positive_and_left_is_greatest_and_leftsided_pe_lte_05, 'leftsided_pe'] = 0.5001

    # acute_and_chronic_pe and chronic_pe: only one of them can have p > 0.5; neither having p > 0.5 is allowed.
    # When both exceed 0.5, the smaller one is lowered to 0.5.
    index_double_positive = index_positive & (df_pred_s['chronic_pe']>0.5) & (df_pred_s['acute_and_chronic_pe']>0.5)

    index_double_positive_and_chronic_lte_acute_and_chronic = index_double_positive & (df_pred_s['chronic_pe']<=df_pred_s['acute_and_chronic_pe'])
    df_pred_s.loc[index_double_positive_and_chronic_lte_acute_and_chronic, 'chronic_pe'] = 0.5

    index_double_positive_and_chronic_gt_acute_and_chronic = index_double_positive & (df_pred_s['chronic_pe']>df_pred_s['acute_and_chronic_pe'])
    df_pred_s.loc[index_double_positive_and_chronic_gt_acute_and_chronic, 'acute_and_chronic_pe'] = 0.5

    # ---- image level ----
    df_pred_s['positive'] = index_positive.astype(int)
    df_pred2 = pd.merge(df_pred, df_pred_s[[group_col, 'positive']], on=group_col, how='left')

    df_agg = df_pred.groupby(group_col)['pe_present_on_image'].agg('max').reset_index()
    df_agg.columns = [group_col, 'pe_present_on_image_pred_max']
    df_pred2 = pd.merge(df_pred2, df_agg, on=group_col, how='left')
    # 'peak' marks the image(s) carrying the highest score within their study.
    df_pred2['peak'] = df_pred2['pe_present_on_image']==df_pred2['pe_present_on_image_pred_max']

    index_positive_i = df_pred2['positive']==1

    # df_pred2 has the same row order as df_pred (left merges), so its masks are
    # applied to df_pred by position.
    index_negative_and_pe_present_on_image_gt_05_i = (~index_positive_i) & (df_pred2['pe_present_on_image']>0.5)
    df_pred.loc[index_negative_and_pe_present_on_image_gt_05_i.to_numpy(), 'pe_present_on_image'] = 0.5

    index_positive_and_peak_and_pe_present_on_image_lte_05_i = index_positive_i & (df_pred2['peak']) & (df_pred2['pe_present_on_image']<=0.5)
    df_pred.loc[index_positive_and_peak_and_pe_present_on_image_lte_05_i.to_numpy(), 'pe_present_on_image'] = 0.5001

    if verbose:
        print("num study", len(df_pred_s))
        print("num image", len(df_pred))
        print("split to 3 classes")
        print(" num predicted_as_negative:", index_negative.sum())
        print(" num predicted_as_indeterminate:", index_indeterminate.sum())
        print(" num predicted_as_positive:", index_positive.sum())
        print("process 3 class conflict")
        print(" num predicted_as_negative and negative<=0.5:", index_negative_and_negative_lte_05.sum())
        print(" num predicted_as_indeterminate and indeterminate<=0.5:", index_indeterminate_and_indeterminate_lte_05.sum())

        print(" num predicted_as_indeterminate and negative_exam_for_pe>0.5:", index_indeterminate_and_negative_gt_05.sum())
        print(" num predicted_as_negative and indeterminate>0.5:", index_negative_and_indeterminate_gt_05.sum())
        print(" num predicted_as_positive and negative_exam_for_pe>0.5:", index_positive_and_negative_gt_05.sum())
        print(" num predicted_as_positive and indeterminate>0.5:", index_positive_and_indeterminate_gt_05.sum())

        print("process negative case")
        print(" num predicted_as_not_positive and rv_lv_ratio_lt_1>0.5:", index_negative_and_rv_lv_ratio_lt_1_gt_05.sum())
        print(" num predicted_as_not_positive and rv_lv_ratio_gte_1>0.5:", index_negative_and_rv_lv_ratio_gte_1_gt_05.sum())
        print(" num predicted_as_not_positive and central_pe>0.5:", index_negative_and_central_pe_gt_05.sum())
        print(" num predicted_as_not_positive and rightsided_pe>0.5:", index_negative_and_rightsided_pe_gt_05.sum())
        print(" num predicted_as_not_positive and leftsided_pe>0.5:", index_negative_and_leftsided_pe_gt_05.sum())
        print(" num predicted_as_not_positive and chronic_pe>0.5:", index_negative_and_chronic_pe_gt_05.sum())
        print(" num predicted_as_not_positive and acute_and_chronic_pe>0.5:", index_negative_and_acute_and_chronic_pe_gt_05.sum())

        print("process positive case")
        print(" num predicted_as_positive and rv_lv_ratio_lt_1<=rv_lv_ratio_gte_1:", index_positive_and_rv_gte_lv.sum())
        print(" num predicted_as_positive and rv_lv_ratio_lt_1>rv_lv_ratio_gte_1:", index_positive_and_rv_lt_lv.sum())
        print(" num predicted_as_positive and (rv_lv_ratio_lt_1<=rv_lv_ratio_gte_1) and (rv_lv_ratio_gte_1<=0.5): ",
               index_positive_and_rv_gte_lv_and_rv_lv_ratio_gte_1_lte_05.sum())
        print(" num predicted_as_positive and (rv_lv_ratio_lt_1<=rv_lv_ratio_gte_1) and rv_lv_ratio_lt_1>0.5: ",
               index_positive_and_rv_gte_lv_and_rv_lv_ratio_lt_1_gt_05.sum())
        print(" num predicted_as_positive and (rv_lv_ratio_lt_1>rv_lv_ratio_gte_1) and rv_lv_ratio_lt_1<=0.5: ",
               index_positive_and_rv_lt_lv_and_rv_lv_ratio_lt_1_lte_05.sum())
        print(" num predicted_as_positive and (rv_lv_ratio_lt_1>rv_lv_ratio_gte_1) and rv_lv_ratio_gte_1>0.5: ",
               index_positive_and_rv_lt_lv_and_rv_lv_ratio_gte_1_gt_05.sum())
        print(" num predicted_as_positive and central is greatest:", index_positive_and_central_is_greatest.sum())
        print(" num predicted_as_positive and right is greatest:", index_positive_and_right_is_greatest.sum())
        print(" num predicted_as_positive and left is greatest:", index_positive_and_left_is_greatest.sum())
        print(" num predicted_as_positive and central is greatest and central_pe<=0.5:", index_positive_and_central_is_greatest_and_central_pe_lte_05.sum())
        print(" num predicted_as_positive and right is greatest and rightsided_pe<=0.5:", index_positive_and_right_is_greatest_and_rightsided_pe_lte_05.sum())
        print(" num predicted_as_positive and left is greatest and leftsided_pe<=0.5:", index_positive_and_left_is_greatest_and_leftsided_pe_lte_05.sum())
        print(" num both chronic_pe and acute_and_chronic_pe is positive:", index_double_positive.sum())
        print(" num both chronic_pe and acute_and_chronic_pe is positive and chronic<=acute_and_chronic:", index_double_positive_and_chronic_lte_acute_and_chronic.sum())
        print(" num both chronic_pe and acute_and_chronic_pe is positive and chronic>acute_and_chronic:", index_double_positive_and_chronic_gt_acute_and_chronic.sum())

        print("process image level")
        print(" num img of predicted_as_positive:", index_positive_i.sum())
        print(" num img of predicted_as_negative:", (~index_positive_i).sum())
        print(" num img of peak:", df_pred2['peak'].sum())
        print(" num img of predicted_as_negative and pe_present_on_image>0.5:", index_negative_and_pe_present_on_image_gt_05_i.sum())
        print(" num img of predicted_as_positive and peak and pe_present_on_image<=0.5:", index_positive_and_peak_and_pe_present_on_image_lte_05_i.sum())

    return df_pred_s, df_pred


In [None]:
# Enforce label consistency. solve_conflict assigns into the frames it receives
# and returns those same objects, so df_pred_study / df_pred_image are changed too.
df_pred_study_const, df_pred_image_const = solve_conflict(df_pred_study, df_pred_image)
df_pred_study_const.head()

# make submission

In [None]:
# Long-format submission rows: one per image (id = image id) followed by one
# per study and study-level target (id = "<study id>_<target>").
image_rows = df_pred_image_const[[col_index, col_targets[-1]]].copy()
image_rows.columns = ['id', 'label']

submission_parts = [image_rows]
for col in col_targets[:-1]:
    study_rows = df_pred_study_const[[col_groupby, col]].copy()
    study_rows.columns = ['id', 'label']
    study_rows['id'] = study_rows['id'] + '_{}'.format(col)
    submission_parts.append(study_rows)

df_sub_pred = pd.concat(submission_parts).reset_index(drop=True)
print(df_sub_pred.shape)
df_sub_pred.head()

In [None]:
# The sample submission supplies the ids, and their order, used for the final file.
df_sub = pd.read_csv(df_sub_path)
print(df_sub.shape)
df_sub.head()

In [None]:
# Attach predictions to the sample-submission ids; ids without a prediction
# get NaN here and are filled in a later cell.
df_sub = pd.merge(df_sub[['id']], df_sub_pred, on='id', how='left')
# df_sub = df_sub.fillna(0.5)
print(df_sub.shape)
df_sub.head(30)

In [None]:
# NaN filling: constant fallback labels for ids that received no prediction.
# One value per entry of col_targets, in the same order.
# NOTE(review): presumably these are the training-set label means — confirm.
mean_targets = [
    0.674681,
    0.021569,
    0.040115,
    0.019920,
    0.055090,
    0.212117,
    0.257590,
    0.129139,
    0.174612,
    0.289885,
]

# Image-level rows for the full test set.
image_fallback = df_test_full[[col_index]].copy()
image_fallback.columns = ['id']
image_fallback['label'] = mean_targets[-1]

# Study-level rows: one block per study-level target.
fallback_parts = [image_fallback]
for i, col in enumerate(col_targets[:-1]):
    study_fallback = df_test_full_study[[col_groupby]].copy()
    study_fallback.columns = ['id']
    study_fallback['label'] = mean_targets[i]
    study_fallback['id'] = study_fallback['id'] + '_{}'.format(col)
    fallback_parts.append(study_fallback)

df_sub_mean = pd.concat(fallback_parts).reset_index(drop=True)
print(df_sub_mean.shape)

In [None]:
# Fill ids that have no model prediction with the constant fallback label for
# that id. Looking the fallback up by id (rather than assigning through
# `df_sub['label'][mask] = ...`) avoids chained assignment and does not depend
# on the row order of a second merge.
fallback_by_id = df_sub_mean.drop_duplicates('id').set_index('id')['label']
df_sub['label'] = df_sub['label'].fillna(df_sub['id'].map(fallback_by_id))
df_sub.head(20)

In [None]:
def check_consistency2(df_exam, df_image, test):
    '''
    Find predictions that break the label-consistency rules.

    The exam-level frame is joined to the image-level frame on
    'StudyInstanceUID', so every returned row is one image together with the
    labels of its exam. An exam counts as positive when its highest
    'pe_present_on_image' is > 0.5.

    Args:
    df_exam  = exam-level predictions (pandas), one row per study
    df_image = image-level predictions (pandas) with 'pe_present_on_image'
    test     = not used

    Returns:
    DataFrame of offending rows with a 'broken_rule' column
    ('1a'-'1d' for positive exams, '2a'/'2b' for the others).
    '''
    id_cols = ['StudyInstanceUID', 'SeriesInstanceUID', 'SOPInstanceUID']

    # Join exam and image predictions; id columns first.
    merged = df_exam.merge(df_image, how = 'left', on = 'StudyInstanceUID')
    merged = merged[id_cols + [c for c in merged.columns if c not in id_cols]]

    # Split into exams with / without a positive image.
    exam_max = merged.groupby(['StudyInstanceUID']).pe_present_on_image.max()
    merged['positive_images_in_exam'] = merged['StudyInstanceUID'].map(exam_max)
    df_pos = merged.loc[merged.positive_images_in_exam >  0.5]
    df_neg = merged.loc[merged.positive_images_in_exam <= 0.5]

    def flagged(frame, mask, rule):
        # Rows of `frame` selected by `mask`, tagged with the rule they break.
        rows = frame.loc[mask].reset_index(drop = True)
        rows['broken_rule'] = rule
        return rows

    location_cols = ['central_pe', 'rightsided_pe', 'leftsided_pe']
    pe_only_cols = ['rv_lv_ratio_lt_1', 'rv_lv_ratio_gte_1', 'central_pe',
                    'rightsided_pe', 'leftsided_pe', 'acute_and_chronic_pe', 'chronic_pe']

    # Positive exams.
    lt_1_on = df_pos.rv_lv_ratio_lt_1 > 0.5
    gte_1_on = df_pos.rv_lv_ratio_gte_1 > 0.5
    lt_1_off = df_pos.rv_lv_ratio_lt_1 <= 0.5
    gte_1_off = df_pos.rv_lv_ratio_gte_1 <= 0.5
    rule1a = flagged(df_pos, (lt_1_on & gte_1_on) | (lt_1_off & gte_1_off), '1a')
    rule1b = flagged(df_pos, (df_pos[location_cols] <= 0.5).all(axis = 1), '1b')
    rule1c = flagged(df_pos, (df_pos.acute_and_chronic_pe > 0.5) & (df_pos.chronic_pe > 0.5), '1c')
    rule1d = flagged(df_pos, (df_pos.indeterminate > 0.5) | (df_pos.negative_exam_for_pe > 0.5), '1d')

    # Exams without a positive image.
    indet_on = df_neg.indeterminate > 0.5
    neg_on = df_neg.negative_exam_for_pe > 0.5
    indet_off = df_neg.indeterminate <= 0.5
    neg_off = df_neg.negative_exam_for_pe <= 0.5
    rule2a = flagged(df_neg, (indet_on & neg_on) | (indet_off & neg_off), '2a')
    rule2b = flagged(df_neg, (df_neg[pe_only_cols] > 0.5).any(axis = 1), '2b')

    errors = pd.concat([rule1a, rule1b, rule1c, rule1d, rule2a, rule2b], axis = 0)

    print('Found', len(errors), 'inconsistent predictions')
    return errors

In [None]:
# Verify the post-processed predictions; `error` holds one row per violation.
error = check_consistency2(df_pred_study_const, df_pred_image_const, df_test)

In [None]:
# Write the submission only when the consistency check found no violations.
# NOTE(review): on failure this only prints "error!" and no submission.csv is written.
if len(error)==0:
    df_sub.to_csv('submission.csv', index=None)
else:
    print("error!")

In [None]:
# Preview the final submission frame.
df_sub.head()

In [None]:
# NOTE(review): scratch arithmetic (about 6.65), presumably a run-time estimate in hours — confirm or remove.
3989*3/3600*2