# Evaluation of the dependencies between the binary targets.

In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# read train.csv into a data frame and preview the first rows
train_csv = '../input/ranzcr-clip-catheter-line-classification/train.csv'
df_train = pd.read_csv(train_csv)
df_train.head()

In [None]:
# remember the number of rows (used later for relative counts),
# then display the (rows, columns) shape of the data frame
n_rows = len(df_train)
df_train.shape

In [None]:
# target columns, grouped by device type
targets_ETT = ['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal']

targets_NGT = [
    'NGT - Abnormal',
    'NGT - Borderline',
    'NGT - Incompletely Imaged',
    'NGT - Normal',
]

targets_CVC = ['CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal']

# all target columns: the three groups in order, followed by the
# stand-alone Swan Ganz indicator
targets = targets_ETT + targets_NGT + targets_CVC + ['Swan Ganz Catheter Present']

In [None]:
# column sums of the target columns
col_counts = df_train[targets].sum(axis=0)

# bar chart of the sums
ax = col_counts.plot.bar()
ax.set_title('Absolute frequencies')
ax.grid()
plt.show()

In [None]:
# plot relative counts; scaled by 100 so the plotted values really are
# percentages, as the title says (col_counts / n_rows alone is a 0-1 fraction)
rel_counts_pct = 100 * col_counts / n_rows
rel_counts_pct.plot(kind='bar')
plt.title('Percentages')
plt.grid()
plt.show()  # render explicitly, like the other plotting cells

### The indicators are not unique in each row (see e.g. the second row). Let's evaluate the multiplicities:

In [None]:
# row-wise sum over all target columns
df_train['Sum_Indicators'] = df_train[targets].sum(axis=1)

# how often each sum occurs, ordered by the sum itself
indicator_multiplicities = df_train['Sum_Indicators'].value_counts().sort_index()
print(indicator_multiplicities)

# the same table as a bar chart
ax = indicator_multiplicities.plot.bar()
ax.set_title('Multiplicities of indicators')
ax.grid()
plt.show()

In [None]:
# inspect the most extreme cases: rows whose indicator sum is 6
df_train.loc[df_train['Sum_Indicators'] == 6]

In [None]:
# there are also rows without any "1" (24 according to the original note):
df_train.loc[df_train['Sum_Indicators'] == 0]

# Correlation of columns

#### Pearson correlation is not really a good tool for binary variables; nevertheless, let's use it to get a first impression. At the end of this notebook you will find a better (asymmetric) alternative.

In [None]:
# pairwise Pearson correlation between the target columns
corr_pearson = df_train[targets].corr(method='pearson')

# annotated heatmap of the correlation matrix
fig = plt.figure(figsize=(12, 9))
ax = sns.heatmap(data=corr_pearson, cmap='RdYlGn', annot=True)
ax.set_title('Pearson correlation')
plt.show()

#### We can observe some connections between ETT and NGT as well as something happening within the CVC "block".

# ETT Targets (endotracheal tube)

In [None]:
# row-wise sum over the ETT target columns only
df_train['Sum_ETT'] = df_train[targets_ETT].sum(axis=1)

# how often each sum occurs, ordered by the sum itself
ett_multiplicities = df_train['Sum_ETT'].value_counts().sort_index()
print(ett_multiplicities)

# the same table as a bar chart
ax = ett_multiplicities.plot.bar()
ax.set_title('Multiplicities of ETT indicators')
ax.grid()
plt.show()

#### Indicators of ETT targets are actually mutually exclusive. So we could in theory convert the ETT columns into only one (multi-class) column:

In [None]:
# collapse the three ETT indicator columns into one categorical column;
# rows where none of the three indicators equals 1 keep the default 'NONE'
df_train['ETT'] = 'NONE'
# assignments are applied in this order, so a later one would win if a row
# had more than one ETT indicator set
df_train.loc[df_train['ETT - Abnormal']==1,'ETT'] = 'ETT_Abnormal'
df_train.loc[df_train['ETT - Borderline']==1,'ETT'] = 'ETT_Borderline'
df_train.loc[df_train['ETT - Normal']==1,'ETT'] = 'ETT_Normal'

# evaluate frequencies: print the table explicitly (a bare expression that is
# not the last line of a cell is evaluated but never displayed) and plot it
ett_counts = df_train.ETT.value_counts()
print(ett_counts)
ett_counts.plot(kind='bar')
plt.title('Frequency of ETT Targets')
plt.grid()
plt.show()

# NGT Targets (nasogastric tube)

In [None]:
# row-wise sum over the NGT target columns only
df_train['Sum_NGT'] = df_train[targets_NGT].sum(axis=1)

# how often each sum occurs, ordered by the sum itself
ngt_multiplicities = df_train['Sum_NGT'].value_counts().sort_index()
print(ngt_multiplicities)

# the same table as a bar chart
ax = ngt_multiplicities.plot.bar()
ax.set_title('Multiplicities of NGT indicators')
ax.grid()
plt.show()

#### Indicators of NGT targets are NOT mutually exclusive (however, we have "only" 45 rows with double entries). This is actually multi-label not multi-class...
#### Let's check the cross tables to identify common occurrences:

In [None]:
# print the cross table of every unordered pair of NGT targets
# (targets_TEMP and nn stay defined at notebook level for later cells)
targets_TEMP = targets_NGT
nn = len(targets_TEMP)
for pos, f1 in enumerate(targets_TEMP):
    # pair f1 only with the targets listed after it, so each pair appears once
    for f2 in targets_TEMP[pos + 1:]:
        pair_table = pd.crosstab(df_train[f1], df_train[f2])
        print(pair_table)
        print('\n')

#### Interesting: In 9 cases we have "Normal" and "Abnormal" at the same time... No contradiction, we can have more than one object within one image!

#### Let's evaluate the conditional frequencies (e. g. NGT-Normal=1 given NGT-Borderline=1 and vice versa) systematically.

In [None]:
# Conditional frequencies for the NGT targets:
# cond_NGT.loc[f1, f2] = (#rows with f1 = 1 and f2 = 1) / (#rows with f1 = 1),
# i.e. the relative frequency of f2 = 1 given f1 = 1.
# The cell works directly on targets_NGT so it does not depend on loop
# variables left over from another cell.
# Assumes the target columns hold 0/1 indicators without missing values.
ngt_flags = df_train[targets_NGT]
ngt_values = ngt_flags.to_numpy()

# co-occurrence counts: entry (a, b) = number of rows where both a and b are 1
ngt_both = pd.DataFrame(ngt_values.T @ ngt_values,
                        index=targets_NGT, columns=targets_NGT)
# number of rows with the conditioning target = 1
ngt_given = ngt_flags.sum(axis=0)

# divide every row by the count of its own (conditioning) target;
# the diagonal becomes 1 (or NaN for a target that never occurs)
cond_NGT = ngt_both.div(ngt_given, axis=0)

# report frequency of f2 given f1 for all ordered pairs
for f1 in targets_NGT:
    for f2 in targets_NGT:
        if f1 != f2:
            print('Percentage ',f2,' given ',f1,':',np.round(cond_NGT.loc[f1,f2],4))

In [None]:
# annotated heatmap of the NGT conditional frequency matrix
ax = sns.heatmap(data=cond_NGT, cmap='RdYlGn', annot=True)
ax.set_title('Conditional Frequencies')
plt.show()

#### Example how to read this:
* Relative Frequency of Borderline=1 given Normal=1 can be looked up in "Normal"-row: 0.0027.
* Relative Frequency of Normal=1 given Borderline=1 can be looked up in "Borderline"-row: 0.025.

# CVC Targets (central venous catheter)

In [None]:
# row-wise sum over the CVC target columns only
df_train['Sum_CVC'] = df_train[targets_CVC].sum(axis=1)

# how often each sum occurs, ordered by the sum itself
cvc_multiplicities = df_train['Sum_CVC'].value_counts().sort_index()
print(cvc_multiplicities)

# the same table as a bar chart
ax = cvc_multiplicities.plot.bar()
ax.set_title('Multiplicities of CVC indicators')
ax.grid()
plt.show()

#### Indicators of CVC targets are also NOT mutually exclusive. Here we have quite a few duplicates and even 71 "triples".
#### Let's check the cross tables again:

In [None]:
# print the cross table of every unordered pair of CVC targets
# (targets_TEMP and nn stay defined at notebook level for later cells)
targets_TEMP = targets_CVC
nn = len(targets_TEMP)
for pos, f1 in enumerate(targets_TEMP):
    # pair f1 only with the targets listed after it, so each pair appears once
    for f2 in targets_TEMP[pos + 1:]:
        pair_table = pd.crosstab(df_train[f1], df_train[f2])
        print(pair_table)
        print('\n')

#### We see that there are many (2607) common occurrences of "Normal" and "Borderline".

#### Let's evaluate the conditional frequencies (e. g. CVC Normal=1 given CVC Borderline=1) again.

In [None]:
# Conditional frequencies for the CVC targets:
# cond_CVC.loc[f1, f2] = (#rows with f1 = 1 and f2 = 1) / (#rows with f1 = 1),
# i.e. the relative frequency of f2 = 1 given f1 = 1.
# The cell works directly on targets_CVC so it does not depend on loop
# variables left over from another cell.
# Assumes the target columns hold 0/1 indicators without missing values.
cvc_flags = df_train[targets_CVC]
cvc_values = cvc_flags.to_numpy()

# co-occurrence counts: entry (a, b) = number of rows where both a and b are 1
cvc_both = pd.DataFrame(cvc_values.T @ cvc_values,
                        index=targets_CVC, columns=targets_CVC)
# number of rows with the conditioning target = 1
cvc_given = cvc_flags.sum(axis=0)

# divide every row by the count of its own (conditioning) target;
# the diagonal becomes 1 (or NaN for a target that never occurs)
cond_CVC = cvc_both.div(cvc_given, axis=0)

# report frequency of f2 given f1 for all ordered pairs
for f1 in targets_CVC:
    for f2 in targets_CVC:
        if f1 != f2:
            print('Percentage ',f2,' given ',f1,':',np.round(cond_CVC.loc[f1,f2],4))

In [None]:
# annotated heatmap of the CVC conditional frequency matrix
ax = sns.heatmap(data=cond_CVC, cmap='RdYlGn', annot=True)
ax.set_title('Conditional Frequencies - CVC Targets')
plt.show()

# Swan Ganz Catheter

In [None]:
# finally let's check the "Swan Ganz Catheter Present" target:
# print the frequency table explicitly (a bare expression that is not the
# last line of a cell is evaluated but never displayed), then plot it
swan_ganz_counts = df_train['Swan Ganz Catheter Present'].value_counts()
print(swan_ganz_counts)
swan_ganz_counts.plot(kind='bar')
plt.title('Swan Ganz Catheter Present')
plt.grid()
plt.show()

#### Nothing really interesting here, just a very unbalanced target.

# Finally let's apply the conditional frequency approach to all the targets

### This is the alternative to the correlation matrix promised at the beginning:

In [None]:
# Conditional frequencies for all targets:
# cond_ALL.loc[f1, f2] = (#rows with f1 = 1 and f2 = 1) / (#rows with f1 = 1),
# i.e. the relative frequency of f2 = 1 given f1 = 1.
# Assumes the target columns hold 0/1 indicators without missing values.

# kept so the notebook-level names hold the same values as before
targets_TEMP = targets
nn = len(targets)

all_flags = df_train[targets]
all_values = all_flags.to_numpy()

# co-occurrence counts in one matrix product instead of one cross table per
# pair: entry (a, b) = number of rows where both a and b are 1
all_both = pd.DataFrame(all_values.T @ all_values,
                        index=targets, columns=targets)
# number of rows with the conditioning target = 1
all_given = all_flags.sum(axis=0)

# divide every row by the count of its own (conditioning) target;
# the diagonal becomes 1 (or NaN for a target that never occurs)
cond_ALL = all_both.div(all_given, axis=0)

# plot matrix of conditional frequencies
fig = plt.figure(figsize = (12,9))
sns.heatmap(cond_ALL, annot=True, cmap='RdYlGn')
plt.title('Conditional Frequencies - All Targets')
plt.show()

#### Example:
* Conditional frequency for CVC-Normal given ETT-Normal is 0.73.
* Conditional frequency for ETT-Normal given CVC-Normal is 0.25. 

Let's check that:

In [None]:
# cross table of the two targets used in the example above
pd.crosstab(index=df_train['ETT - Normal'], columns=df_train['CVC - Normal'])

In [None]:
# frequency of CVC-Normal = 1 given ETT-Normal = 1, computed from the data
# instead of from counts typed in by hand from the cross table output, so the
# check stays valid when the input data changes
ett_normal = df_train['ETT - Normal'] == 1
cvc_normal = df_train['CVC - Normal'] == 1
freq_check_1 = (ett_normal & cvc_normal).sum() / ett_normal.sum()
print(freq_check_1)

In [None]:
# frequency of ETT-Normal = 1 given CVC-Normal = 1, computed from the data
# instead of from counts typed in by hand from the cross table output, so the
# check stays valid when the input data changes
ett_normal = df_train['ETT - Normal'] == 1
cvc_normal = df_train['CVC - Normal'] == 1
freq_check_2 = (ett_normal & cvc_normal).sum() / cvc_normal.sum()
print(freq_check_2)

#### Different visualization (R corrplot style):

In [None]:
# "flatten" matrix to a data frame with one row per (row label, column label)
# pair; reset_index names the two label columns level_0 (matrix row) and
# level_1 (matrix column)
cond_ALL_df = cond_ALL.stack().reset_index(name='cond_freq')

# remove the trivial diagonal entries (target given itself) to get a nicer
# plot. They are identified by their labels rather than by value, so a
# genuine off-diagonal frequency of exactly 1 is not thrown away with them.
is_diagonal = cond_ALL_df['level_0'] == cond_ALL_df['level_1']
# also drop undefined frequencies, which a value comparison would have excluded
is_defined = cond_ALL_df['cond_freq'].notna()
cond_ALL_df = cond_ALL_df[~is_diagonal & is_defined]

# show structure
cond_ALL_df.head()

The following code for plotting is based on this kernel: https://www.kaggle.com/drazen/heatmap-with-sized-markers.
Many thanks to the author!

In [None]:
# plot matrix in "corrplot"-style: one circle per (row, column) pair, with
# both marker size and marker color driven by the conditional frequency

color_min, color_max = 0, 1 # range of values
n_colors = 256
palette = sns.mpl_palette('seismic', n_colors)

size_scale = 1000


def value_to_color(val):
    """Return the palette entry for val, mapped linearly from [color_min, color_max]."""
    fraction = float(val - color_min) / (color_max - color_min)
    return palette[int(fraction * (n_colors - 1))]


fig, ax = plt.subplots(figsize=(7, 7))

x = cond_ALL_df['level_1'] # matrix columns
y = cond_ALL_df['level_0'] # matrix rows
size = cond_ALL_df['cond_freq']
color = cond_ALL_df['cond_freq']

# label -> coordinate mappings; both axes are built from y on purpose so they
# share the same (original) label order
x_labels = y.unique()
# y axis runs in reverse so the diagonal goes from NW to SE
y_labels = y.unique()[::-1]
x_to_num = {label: pos for pos, label in enumerate(x_labels)}
y_to_num = {label: pos for pos, label in enumerate(y_labels)}

# finally the actual plotting
marker_x = x.map(x_to_num)
marker_y = y.map(y_to_num)
marker_area = size * size_scale
marker_color = color.apply(value_to_color)
ax.scatter(x=marker_x, y=marker_y, s=marker_area, c=marker_color,
           marker='o') # use circles as markers

# one tick per label on both axes, no axis titles, square cells
ax.set_xticks(list(x_to_num.values()))
ax.set_xticklabels(x_labels, rotation=90)
ax.set_yticks(list(y_to_num.values()))
ax.set_yticklabels(y_labels)
ax.set_xlabel("")
ax.set_ylabel("")
ax.set_aspect('equal')
ax.set_title('Conditional Frequencies - All Targets')
plt.show()

In [None]:
# the ten pairs with the highest conditional frequency ("dependencies")
cond_ALL_df.sort_values(by='cond_freq', ascending=False).head(10)