In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the competition test metadata; its StudyInstanceUID / SOPInstanceUID
# columns are used below to tell exam-level rows from image-level rows.
test = pd.read_csv('../input/rsna-str-pulmonary-embolism-detection/test.csv')

In [None]:
# IMPORT EXAMPLE SUBMISSION
# Alternative submission file (currently disabled):
# sub = pd.read_csv('../input/peinferencelstm/submission_seresnext26_ver3.csv')
sub = pd.read_csv('../input/lightgbm-on-meta-features/submission.csv')
sub.shape

# CHECKING CONSISTENCY RULES

This function is taken from [this notebook](https://www.kaggle.com/anthracene/host-confirmed-label-consistency-check).

In [None]:
def _tag_violations(exams, mask, rule_name):
    '''Return the rows of `exams` selected by `mask`, labelled with `rule_name`.'''
    violations = exams.loc[mask].reset_index(drop = True)
    violations['broken_rule'] = rule_name
    return violations


def check_consistency(sub, test):
    
    '''
    Checks label consistency and returns the errors
    
    Args:
    sub   = submission dataframe (pandas) with columns `id` and `label`;
            exam-level ids look like "<StudyInstanceUID>_<label_type>",
            image-level ids are SOPInstanceUIDs
    test  = test.csv dataframe (pandas) with columns StudyInstanceUID,
            SeriesInstanceUID and SOPInstanceUID
    
    Returns:
    A dataframe with one row per (image row, broken rule); the `broken_rule`
    column holds one of '1a', '1b', '1c', '1d', '2a', '2b'. The number of
    rows is also printed.
    '''
    
    # Work on a freshly indexed copy so repeated index labels in the caller's
    # frame (e.g. after a plain pd.concat) cannot affect the selections below.
    sub = sub.reset_index(drop = True)
    
    # EXAM LEVEL
    # Split every id once at its first underscore and keep rows whose prefix is
    # exactly a known study UID. This replaces one substring scan of `sub` per
    # study (plus a concat per study) and cannot mistake a study UID that is a
    # substring of another id for a match.
    id_parts    = sub['id'].str.partition('_')
    is_exam_row = (id_parts[1] == '_') & id_parts[0].isin(test['StudyInstanceUID'])
    df_long = pd.DataFrame({
        'StudyInstanceUID': id_parts.loc[is_exam_row, 0].to_numpy(),
        'label_type':       id_parts.loc[is_exam_row, 2].to_numpy(),
        'label':            sub.loc[is_exam_row, 'label'].to_numpy(),
    })
    df_exam = df_long.pivot(index = 'StudyInstanceUID', columns = 'label_type', values = 'label')
    
    # IMAGE LEVEL
    df_image = (
        sub.loc[sub['id'].isin(test['SOPInstanceUID'])]
        .merge(test, how = 'left', left_on = 'id', right_on = 'SOPInstanceUID')
        .rename(columns = {'label': 'pe_present_on_image'})
        .drop(columns = 'id')
    )
    
    # MERGER
    # One row per image, carrying the exam-level labels of its study.
    df = df_exam.reset_index().merge(df_image, how = 'left', on = 'StudyInstanceUID')
    ids    = ['StudyInstanceUID', 'SeriesInstanceUID', 'SOPInstanceUID']
    labels = [c for c in df.columns if c not in ids]
    df = df[ids + labels].copy()
    
    # SPLIT NEGATIVE AND POSITIVE EXAMS
    # An exam counts as positive when its highest image-level probability is > 0.5.
    exam_max = df.groupby('StudyInstanceUID')['pe_present_on_image'].max()
    df['positive_images_in_exam'] = df['StudyInstanceUID'].map(exam_max)
    df_pos = df.loc[df.positive_images_in_exam >  0.5]
    df_neg = df.loc[df.positive_images_in_exam <= 0.5]
    
    # CHECKING CONSISTENCY OF POSITIVE EXAM LABELS
    # 1a: exactly one of the two RV/LV ratio labels must be > 0.5
    rule1a = _tag_violations(df_pos,
                             ((df_pos.rv_lv_ratio_lt_1  >  0.5)  &
                              (df_pos.rv_lv_ratio_gte_1 >  0.5)) |
                             ((df_pos.rv_lv_ratio_lt_1  <= 0.5)  &
                              (df_pos.rv_lv_ratio_gte_1 <= 0.5)), '1a')
    # 1b: at least one location label must be > 0.5
    rule1b = _tag_violations(df_pos,
                             (df_pos.central_pe    <= 0.5) &
                             (df_pos.rightsided_pe <= 0.5) &
                             (df_pos.leftsided_pe  <= 0.5), '1b')
    # 1c: acute_and_chronic_pe and chronic_pe must not both be > 0.5
    rule1c = _tag_violations(df_pos,
                             (df_pos.acute_and_chronic_pe > 0.5) &
                             (df_pos.chronic_pe           > 0.5), '1c')
    # 1d: neither indeterminate nor negative_exam_for_pe may be > 0.5
    rule1d = _tag_violations(df_pos,
                             (df_pos.indeterminate        > 0.5) |
                             (df_pos.negative_exam_for_pe > 0.5), '1d')

    # CHECKING CONSISTENCY OF NEGATIVE EXAM LABELS
    # 2a: exactly one of indeterminate / negative_exam_for_pe must be > 0.5
    rule2a = _tag_violations(df_neg,
                             ((df_neg.indeterminate        >  0.5)  &
                              (df_neg.negative_exam_for_pe >  0.5)) |
                             ((df_neg.indeterminate        <= 0.5)  &
                              (df_neg.negative_exam_for_pe <= 0.5)), '2a')
    # 2b: none of the positive-related labels may be > 0.5
    rule2b = _tag_violations(df_neg,
                             (df_neg.rv_lv_ratio_lt_1     > 0.5) |
                             (df_neg.rv_lv_ratio_gte_1    > 0.5) |
                             (df_neg.central_pe           > 0.5) |
                             (df_neg.rightsided_pe        > 0.5) |
                             (df_neg.leftsided_pe         > 0.5) |
                             (df_neg.acute_and_chronic_pe > 0.5) |
                             (df_neg.chronic_pe           > 0.5), '2b')
    
    # MERGING INCONSISTENT PREDICTIONS
    errors = pd.concat([rule1a, rule1b, rule1c, rule1d, rule2a, rule2b], axis = 0)
    
    # OUTPUT
    print('Found', len(errors), 'inconsistent predictions')
    return errors

In [None]:
# CHECK
# Run the consistency rules on the loaded submission and count violations per rule.
errors = check_consistency(sub, test)
errors['broken_rule'].value_counts()

# Post Processing

In [None]:
# Re-read test.csv (same path as the earlier cell) for the post-processing section.
test = pd.read_csv('../input/rsna-str-pulmonary-embolism-detection/test.csv')

In [None]:
# IMPORT EXAMPLE SUBMISSION
# This rebinds `sub`: from here on it is the submission that gets post-processed.
sub = pd.read_csv('../input/peinferencelstm/submission_ver12.csv')
sub.shape

In [None]:
%%time

# EXAM LEVEL
# Exam-level ids have the form "<StudyInstanceUID>_<label_type>". Split every id
# once at its first underscore and keep rows whose prefix is exactly a study UID
# from test.csv, instead of scanning `sub` once per study and concatenating the
# pieces in a loop.
id_parts    = sub['id'].str.partition('_')
is_exam_row = (id_parts[1] == '_') & id_parts[0].isin(test['StudyInstanceUID'])

# Long format: one row per (study, label_type). to_numpy() avoids any index alignment.
df = pd.DataFrame({
    'label':            sub.loc[is_exam_row, 'label'].to_numpy(),
    'StudyInstanceUID': id_parts.loc[is_exam_row, 0].to_numpy(),
    'label_type':       id_parts.loc[is_exam_row, 2].to_numpy(),
})

# Wide format: one row per study, one column per exam-level label.
df_exam = df.pivot(index = 'StudyInstanceUID', columns = 'label_type', values = 'label')

In [None]:
# IMAGE LEVEL
# Keep the submission rows whose id is an image UID, attach the study/series
# UIDs from test.csv, and expose the prediction as `pe_present_on_image`.
df_image = (
    sub.loc[sub.id.isin(test.SOPInstanceUID)]
    .reset_index(drop = True)
    .merge(test, how = 'left', left_on = 'id', right_on = 'SOPInstanceUID')
    .rename(columns = {"label": "pe_present_on_image"})
    .drop(columns = 'id')
)
df_image.head()

In [None]:
%%time

# MERGER
# One row per image, carrying the exam-level labels of its study;
# the three UID columns are moved to the front.
ids    = ['StudyInstanceUID', 'SeriesInstanceUID', 'SOPInstanceUID']
df     = df_exam.merge(df_image, how = 'left', on = 'StudyInstanceUID')
labels = [col for col in df.columns if col not in ids]
df     = df[[*ids, *labels]]
df.head()

In [None]:
# SPLIT NEGATIVE AND POSITIVE EXAMS
# An exam is treated as positive when its largest image-level probability exceeds 0.5.

max_image_prob = df.groupby(['StudyInstanceUID']).pe_present_on_image.max()
df['positive_images_in_exam'] = df['StudyInstanceUID'].map(max_image_prob)

pos_indices = df['positive_images_in_exam'] > 0.5
neg_indices = df['positive_images_in_exam'] <= 0.5
df_pos, df_neg = df.loc[pos_indices], df.loc[neg_indices]

In [None]:
# Margin used by the rule functions below: repaired probabilities are set to
# 0.5 + delta or 0.5 - delta so they land strictly on the required side of 0.5.
delta = 1e-4

# 1a ~ 1d rules

In [None]:
# either rv_lv_ratio_lt_1 or rv_lv_ratio_gte_1 must have p > 0.5; both cannot have p > 0.5.
def rule1a(row):
    """Return (rv_lv_ratio_lt_1, rv_lv_ratio_gte_1) with exactly one value above 0.5.

    The larger of the two inputs is raised to at least 0.5 + delta and the other
    is lowered to at most 0.5 - delta; on a tie `rv_lv_ratio_gte_1` is the one
    raised. `delta` is read from the notebook's global scope.
    """
    lt_1 = row["rv_lv_ratio_lt_1"]
    gte_1 = row["rv_lv_ratio_gte_1"]
    if lt_1 > gte_1:
        return max(0.5 + delta, lt_1), min(0.5 - delta, gte_1)
    return min(0.5 - delta, lt_1), max(0.5 + delta, gte_1)

def postprocess_rule1a(df, df_pos, pos_indices):
    """Repair rule 1a violations on positive exams, writing the result into `df`.

    A row of `df_pos` violates the rule when `rv_lv_ratio_lt_1` and
    `rv_lv_ratio_gte_1` are both > 0.5 or both <= 0.5. Each violating row is
    passed to `rule1a` and the returned pair overwrites the two columns in `df`.

    Args:
        df: full frame; modified in place.
        df_pos: rows of `df` to inspect (the caller passes `df.loc[pos_indices]`).
        pos_indices: boolean Series over `df` marking the rows present in `df_pos`.
    """
    lt_1 = df_pos.rv_lv_ratio_lt_1
    gte_1 = df_pos.rv_lv_ratio_gte_1
    both_high = (lt_1 > 0.5) & (gte_1 > 0.5)
    both_low = (lt_1 <= 0.5) & (gte_1 <= 0.5)
    violating = both_high | both_low

    if not np.any(violating):
        return

    columns = ["rv_lv_ratio_lt_1", "rv_lv_ratio_gte_1"]
    repaired = df_pos.loc[violating, columns].apply(rule1a, axis=1).values
    # `violating` is indexed like df_pos; combining it with pos_indices gives the
    # mask used to select rows of `df`. Values are written in row order.
    target_rows = pos_indices & violating
    for position, col in enumerate(columns):
        df.loc[target_rows, col] = tuple(fixed[position] for fixed in repaired)

# Apply the rule 1a repair to the positive exams; `df` is updated in place.
postprocess_rule1a(df, df_pos, pos_indices)

In [None]:
# at least one of central_pe, rightsided_pe and leftsided_pe must have p > 0.5; multiple having p > 0.5 is allowed.
def rule1b(row):
    """Return (central_pe, rightsided_pe, leftsided_pe) with the largest set to 0.5 + delta.

    The largest of the three values (the first one on ties) is overwritten
    unconditionally; the other two are returned unchanged. `delta` is read from
    the notebook's global scope.
    """
    probs = [row["central_pe"], row["rightsided_pe"], row["leftsided_pe"]]
    probs[np.argmax(probs)] = 0.5 + delta
    central_pe, rightsided_pe, leftsided_pe = probs
    return central_pe, rightsided_pe, leftsided_pe

def postprocess_rule1b(df, df_pos, pos_indices):
    """Repair rule 1b violations on positive exams, writing the result into `df`.

    A row of `df_pos` violates the rule when `central_pe`, `rightsided_pe` and
    `leftsided_pe` are all <= 0.5. Each violating row is passed to `rule1b` and
    the returned triple overwrites the three columns in `df`.

    Args:
        df: full frame; modified in place.
        df_pos: rows of `df` to inspect (the caller passes `df.loc[pos_indices]`).
        pos_indices: boolean Series over `df` marking the rows present in `df_pos`.
    """
    central_low = df_pos.central_pe <= 0.5
    right_low = df_pos.rightsided_pe <= 0.5
    left_low = df_pos.leftsided_pe <= 0.5
    violating = central_low & right_low & left_low

    if not np.any(violating):
        return

    columns = ["central_pe", "rightsided_pe", "leftsided_pe"]
    repaired = df_pos.loc[violating, columns].apply(rule1b, axis=1).values
    # Combine with pos_indices to obtain the mask used to select rows of `df`.
    target_rows = pos_indices & violating
    for position, col in enumerate(columns):
        df.loc[target_rows, col] = tuple(fixed[position] for fixed in repaired)

# Apply the rule 1b repair to the positive exams; `df` is updated in place.
postprocess_rule1b(df, df_pos, pos_indices)

In [None]:
# acute_and_chronic_pe and chronic_pe: only one of them can have p > 0.5; neither having p > 0.5 is allowed.
def rule1c(row):
    """Return (acute_and_chronic_pe, chronic_pe) with the smaller value set to 0.5 - delta.

    When `acute_and_chronic_pe` is strictly larger, `chronic_pe` is replaced;
    otherwise (including ties) `acute_and_chronic_pe` is replaced. `delta` is
    read from the notebook's global scope.
    """
    acute_and_chronic, chronic = row["acute_and_chronic_pe"], row["chronic_pe"]
    if acute_and_chronic > chronic:
        return acute_and_chronic, 0.5 - delta
    return 0.5 - delta, chronic

def postprocess_rule1c(df, df_pos, pos_indices):
    """Repair rule 1c violations on positive exams, writing the result into `df`.

    A row of `df_pos` violates the rule when `acute_and_chronic_pe` and
    `chronic_pe` are both > 0.5. Each violating row is passed to `rule1c` and
    the returned pair overwrites the two columns in `df`.

    Args:
        df: full frame; modified in place.
        df_pos: rows of `df` to inspect (the caller passes `df.loc[pos_indices]`).
        pos_indices: boolean Series over `df` marking the rows present in `df_pos`.
    """
    acute_high = df_pos.acute_and_chronic_pe > 0.5
    chronic_high = df_pos.chronic_pe > 0.5
    violating = acute_high & chronic_high

    if not np.any(violating):
        return

    columns = ["acute_and_chronic_pe", "chronic_pe"]
    repaired = df_pos.loc[violating, columns].apply(rule1c, axis=1).values
    # Combine with pos_indices to obtain the mask used to select rows of `df`.
    target_rows = pos_indices & violating
    for position, col in enumerate(columns):
        df.loc[target_rows, col] = tuple(fixed[position] for fixed in repaired)

# Apply the rule 1c repair to the positive exams; `df` is updated in place.
postprocess_rule1c(df, df_pos, pos_indices)

In [None]:
def rule1d(row):
    """Return (negative_exam_for_pe, indeterminate), each capped at 0.5 - delta.

    Values already below the cap are returned unchanged. `delta` is read from
    the notebook's global scope.
    """
    negative = row["negative_exam_for_pe"]
    unsure = row["indeterminate"]
    return min(0.5 - delta, negative), min(0.5 - delta, unsure)

def postprocess_rule1d(df, df_pos, pos_indices):
    """Repair rule 1d violations on positive exams, writing the result into `df`.

    A row of `df_pos` violates the rule when `indeterminate` or
    `negative_exam_for_pe` is > 0.5. Each violating row is passed to `rule1d`
    and the returned pair overwrites `negative_exam_for_pe` and `indeterminate`
    in `df`.

    Args:
        df: full frame; modified in place.
        df_pos: rows of `df` to inspect (the caller passes `df.loc[pos_indices]`).
        pos_indices: boolean Series over `df` marking the rows present in `df_pos`.
    """
    unsure_high = df_pos.indeterminate > 0.5
    negative_high = df_pos.negative_exam_for_pe > 0.5
    violating = unsure_high | negative_high

    if not np.any(violating):
        return

    columns = ["negative_exam_for_pe", "indeterminate"]
    repaired = df_pos.loc[violating, columns].apply(rule1d, axis=1).values
    # Combine with pos_indices to obtain the mask used to select rows of `df`.
    target_rows = pos_indices & violating
    for position, col in enumerate(columns):
        df.loc[target_rows, col] = tuple(fixed[position] for fixed in repaired)

# Apply the rule 1d repair to the positive exams; `df` is updated in place.
postprocess_rule1d(df, df_pos, pos_indices)

# 2a ~ 2b rules

In [None]:
# either indeterminate or negative_exam_for_pe must have p > 0.5; both cannot have p > 0.5.
def rule2a(row):
    """Return (negative_exam_for_pe, indeterminate) with one value moved across 0.5.

    If `negative_exam_for_pe` is > 0.5, the smaller of the two values is set to
    0.5 - delta. Otherwise the larger of the two is set to 0.5 + delta. On ties
    `negative_exam_for_pe` counts as "not larger". `delta` is read from the
    notebook's global scope.
    """
    negative = row["negative_exam_for_pe"]
    unsure = row["indeterminate"]
    negative_high = negative > 0.5
    negative_larger = negative > unsure
    replacement = 0.5 - delta if negative_high else 0.5 + delta
    # `indeterminate` is the value to replace when it is the smaller one in the
    # "demote" case or the larger one in the "promote" case, i.e. when both
    # flags agree; in the remaining cases `negative_exam_for_pe` is replaced.
    if negative_high == negative_larger:
        unsure = replacement
    else:
        negative = replacement
    return negative, unsure
        

def postprocess_rule2a(df, df_neg, neg_indices):
    """Repair rule 2a violations on negative exams, writing the result into `df`.

    A row of `df_neg` violates the rule when `indeterminate` and
    `negative_exam_for_pe` are both > 0.5 or both <= 0.5. Each violating row is
    passed to `rule2a` and the returned pair overwrites `negative_exam_for_pe`
    and `indeterminate` in `df`.

    Args:
        df: full frame; modified in place.
        df_neg: rows of `df` to inspect (the caller passes `df.loc[neg_indices]`).
        neg_indices: boolean Series over `df` marking the rows present in `df_neg`.
    """
    unsure = df_neg.indeterminate
    negative = df_neg.negative_exam_for_pe
    both_high = (unsure > 0.5) & (negative > 0.5)
    both_low = (unsure <= 0.5) & (negative <= 0.5)
    violating = both_high | both_low

    if not np.any(violating):
        return

    columns = ["negative_exam_for_pe", "indeterminate"]
    repaired = df_neg.loc[violating, columns].apply(rule2a, axis=1).values
    # Combine with neg_indices to obtain the mask used to select rows of `df`.
    target_rows = neg_indices & violating
    for position, col in enumerate(columns):
        df.loc[target_rows, col] = tuple(fixed[position] for fixed in repaired)

# Apply the rule 2a repair to the negative exams; `df` is updated in place.
postprocess_rule2a(df, df_neg, neg_indices)

In [None]:
# all positive-related labels: rv_lv_ratio_lt_1, rv_lv_ratio_gte_1, central_pe, rightsided_pe, leftsided_pe, acute_and_chronic_pe and chronic_pe must have p < 0.5.
def rule2b(row):
    """Return the seven positive-related probabilities, each capped at 0.5 - delta.

    Output order: rv_lv_ratio_lt_1, rv_lv_ratio_gte_1, central_pe, rightsided_pe,
    leftsided_pe, acute_and_chronic_pe, chronic_pe. Values already below the cap
    are returned unchanged. `delta` is read from the notebook's global scope.
    """
    names = (
        "rv_lv_ratio_lt_1", "rv_lv_ratio_gte_1",
        "central_pe", "rightsided_pe", "leftsided_pe",
        "acute_and_chronic_pe", "chronic_pe",
    )
    # Read every field first, then clamp each one.
    raw_values = [row[name] for name in names]
    return tuple(min(0.5 - delta, value) for value in raw_values)

def postprocess_rule2b(df, df_neg, neg_indices):
    """Repair rule 2b violations on negative exams, writing the result into `df`.

    A row of `df_neg` violates the rule when any of the seven positive-related
    labels is > 0.5. Each violating row is passed to `rule2b` and the returned
    values overwrite those seven columns in `df`.

    Args:
        df: full frame; modified in place.
        df_neg: rows of `df` to inspect (the caller passes `df.loc[neg_indices]`).
        neg_indices: boolean Series over `df` marking the rows present in `df_neg`.
    """
    columns = [
        "rv_lv_ratio_lt_1", "rv_lv_ratio_gte_1", "central_pe",
        "rightsided_pe", "leftsided_pe", "acute_and_chronic_pe", "chronic_pe"]
    # True for rows where at least one of the seven labels exceeds 0.5.
    violating = (df_neg[columns] > 0.5).any(axis=1)

    if not np.any(violating):
        return

    repaired = df_neg.loc[violating, columns].apply(rule2b, axis=1).values
    # Combine with neg_indices to obtain the mask used to select rows of `df`.
    target_rows = neg_indices & violating
    for position, col in enumerate(columns):
        df.loc[target_rows, col] = tuple(fixed[position] for fixed in repaired)

# Apply the rule 2b repair to the negative exams; `df` is updated in place.
postprocess_rule2b(df, df_neg, neg_indices)

# Restore the submission.csv format

In [None]:
# Keep only StudyInstanceUID and the exam-level label columns; the image-level
# and helper columns listed here are not part of the exam rows of a submission.
remove_columns = ["SeriesInstanceUID", "SOPInstanceUID", "positive_images_in_exam", "pe_present_on_image"]
df_columns = df.columns[~df.columns.isin(remove_columns)].tolist()
df = df[df_columns]
df.head()

In [None]:
# Back to long format: one row per original `df` row and label column.
# The output column names are spelled out because melt's default variable name
# follows `df.columns.name` when that is set, which would break the rename of
# "variable"/"value" in the next cell.
df = df.melt(id_vars=["StudyInstanceUID"], var_name="variable", value_name="value")

In [None]:
# Use the submission column names: the label name goes to `id`, the probability to `label`.
df = df.rename(columns={"variable": "id", "value": "label"})

In [None]:
# Inspect the long-format frame produced by the melt and rename above.
print(df.shape)
df.head()

In [None]:
# Rebuild the submission id as "<StudyInstanceUID>_<label name>" with a
# vectorized string concatenation instead of a Python-level row-wise apply.
df["id"] = df["StudyInstanceUID"] + "_" + df["id"]

In [None]:
# StudyInstanceUID is now embedded in `id`, so the separate column is dropped.
df = df.drop(columns="StudyInstanceUID")

In [None]:
# The frame had one row per image before the melt, so each (id, label) pair is
# repeated for every image of the exam; collapse the repeats and renumber rows.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Inspect the exam-level rows after de-duplication.
print(df.shape)
df.head()

In [None]:
# Rebind `df_image` to the image-level rows of `sub` in their original
# (id, label) submission format, without the merged test.csv columns.
df_image = sub.loc[sub.id.isin(test.SOPInstanceUID)].reset_index(drop = True)

In [None]:
# Inspect the image-level rows taken from `sub`.
print(df_image.shape)
df_image.head()

In [None]:
# Append the untouched image-level predictions to the post-processed exam rows.
# ignore_index gives the combined frame a fresh 0..n-1 index instead of
# repeating the index labels of both inputs.
df = pd.concat([df, df_image], ignore_index=True)

In [None]:
# Sanity check: the rebuilt submission has the same number of rows and columns as the input.
assert sub.shape == df.shape

In [None]:
# Preview the reassembled submission.
df.head()

In [None]:
%%time

# Re-run the consistency check on the post-processed submission and count
# any remaining violations per rule.
errors = check_consistency(df, test)
errors['broken_rule'].value_counts()