# Table of Contents:
* [Univariate Frequencies](#1)
* [Check structure for an invididual Image](#2)
* [Bivariate Perspective (Classes/Radiologists)](#3)
* [Pivot Tables for Classes and Radiologists](#4)
* [Geometry](#5)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# import training data file
df = pd.read_csv('../input/vinbigdata-chest-xray-abnormalities-detection/train.csv')
df.head()

In [None]:
# dimensions
df.shape

<a id='1'></a>
# Univariate Frequencies

In [None]:
# image id
df.image_id.value_counts()

In [None]:
# image id - plot frequencies (top 25)
fig = plt.figure(figsize = (12,4))
df.image_id.value_counts()[0:25].plot(kind='bar', color='darkred')
plt.title('25 images with most rows in train.csv')
plt.ylabel('Number of rows in train.csv')
plt.grid()
plt.show()

#### We have 67'914 rows of data, but only 15'000 unique images. Each image occurs at least 3 times in train.csv.

### Classes

In [None]:
# classes (name)
print(df.class_name.value_counts())
df.class_name.value_counts().plot(kind='bar')
plt.title('Class Name')
plt.grid()
plt.show()

### Radiologists

In [None]:
# radiologists
print(df.rad_id.value_counts())
df.rad_id.value_counts().plot(kind='bar', color='darkgreen')
plt.title('Radiologists')
plt.grid()
plt.show()

#### R8, R9, R10 did much more labelling than the other radiologists.

<a id='2'></a>
# Check structure for an individual image

In [None]:
# let's pick an image with lots of labels
df_example = df[df.image_id=='fa109c087e46fe1ea27e48ce6d154d2f']
df_example

In [None]:
# classes for this image
df_example.class_name.value_counts()

In [None]:
# radiologists for this image
df_example.rad_id.value_counts()

In [None]:
# cross table
pd.crosstab(df_example.class_name, df_example.rad_id)

<a id='3'></a>
# Bivariate perspective

In [None]:
# cross table of classes / radiologists
ctab = pd.crosstab(df.class_name, df.rad_id)
ctab

In [None]:
# visualize as heatmap
fig = plt.figure(figsize = (18,10))
sns.heatmap(ctab, annot=True)
plt.show()

#### R1, R3, R4, R5, R6 and R7 have no findings at all. R2 has only 3 findings.

# > Normalize table for each radiologist (column)

In [None]:
# normalize each column
ctab_norm_rad = ctab / ctab.sum()
# and visualize result as heatmap
fig = plt.figure(figsize = (18,10))
sns.heatmap(ctab_norm_rad, annot=True)
plt.show()

#### R11, R12, R13, R14, R15, R16 and R17 have mostly (>= 80%) no findings.


In [None]:
# let's plot the individual distributions again as bar charts
my_list = ['R11','R12','R13','R14','R15','R16','R17']
for rad in my_list:
    ctab_norm_rad[rad].plot(kind='bar')
    plt.title(rad)
    plt.grid()
    plt.show()

#### Only R8, R9 and R10 show a somewhat diversified labelling.

In [None]:
# let's plot the individual distributions again as bar charts
my_list = ['R8','R9','R10']
for rad in my_list:
    ctab_norm_rad[rad].plot(kind='bar')
    plt.title(rad)
    plt.grid()
    plt.show()

# > Normalize table for each class (row)

In [None]:
# normalize each row
ctab_norm_class = (ctab.transpose() / ctab.sum(axis=1)).transpose()
# and visualize result as heatmap
fig = plt.figure(figsize = (18,10))
sns.heatmap(ctab_norm_class, annot=True)
plt.show()

In [None]:
# example: pick the row of "No finding"
nofind_dist_on_rad = ctab_norm_class[ctab_norm_class.index=='No finding']

fig = plt.figure(figsize = (12,4))
plt.bar(x=nofind_dist_on_rad.columns.to_list(),
        height=np.asarray(nofind_dist_on_rad).flatten(),
        color='darkgreen')
plt.title('Distribution of no findings across radiologists (sum=100%)')
plt.grid()
plt.show()

<a id='4'></a>
# Pivot Tables for classes and radiologists

#### We want to reduce the training data to 15'000 rows corresponding to the 15'000 distinct images. For this we aggregate the labels and store the counts in new columns. Of course, we are losing information by doing this!

In [None]:
df_pivot_class = pd.pivot_table(data=df[['image_id','class_name','class_id']],
                                # class_id is used as dummy column for counting only
                                index='image_id', # group by image_id
                                columns=['class_name'], # new columns created from classes
                                fill_value=0,
                                aggfunc='count' # count values
                               )
df_pivot_class = df_pivot_class.class_id
# add count of labels
df_pivot_class['sum_labels'] = df_pivot_class.sum(axis=1)

# preview
df_pivot_class.head(10)

#### We can use this to find out the images w/o relevant labels:

In [None]:
df_empty = df_pivot_class[df_pivot_class['No finding'] == df_pivot_class.sum_labels]
df_empty

#### Conclusion: 10'606 of 15'000 images (70.7%) have no (non-trivial) labels!

In [None]:
# visualize (subset of) pivot table
n_sub = 50
plt.figure(figsize=(8,12))
sns.heatmap(df_pivot_class.iloc[0:n_sub,0:15])
plt.show()

### Pivot Table for radiologists:

In [None]:
df_pivot_rad = pd.pivot_table(data=df[['image_id','rad_id','class_id']],
                              # class_id is used as dummy column for counting only
                              index='image_id', # group by image_id
                              columns=['rad_id'], # new columns created from rad_id's
                              fill_value=0,
                              aggfunc='count' # count values
                             )
df_pivot_rad = df_pivot_rad.class_id

# add distinct count of radiologists and count of labels
temp1 = np.count_nonzero(df_pivot_rad, axis=1)
temp2 = df_pivot_rad.sum(axis=1)
df_pivot_rad['n_rads'] = temp1
df_pivot_rad['sum_labels'] = temp2

# preview
df_pivot_rad.head(10)

#### We always have exactly 3 radiologist looking at an image (see also data description):

In [None]:
df_pivot_rad.n_rads.value_counts() # check counts

In [None]:
# visualize (subset of) pivot table
n_sub = 50
plt.figure(figsize=(8,12))
sns.heatmap(df_pivot_rad.iloc[0:n_sub,0:17])
plt.show()

In [None]:
# make pivot tables available for download
df_pivot_class.to_csv('df_pivot_class.csv')
df_pivot_rad.to_csv('df_pivot_rad.csv')

<a id='5'></a>
# Geometry

In [None]:
# use only rows having findings
dfxy = df[df.class_name != 'No finding'].copy()

In [None]:
# define some new features
dfxy['dx'] = dfxy.x_max - dfxy.x_min   # width
dfxy['dy'] = dfxy.y_max - dfxy.y_min   # height
dfxy['dxdy'] = dfxy.dx * dfxy.dy       # pixel area
dfxy['dy_over_dx'] = dfxy.dy / dfxy.dx # aspect ratio

features_num = ['x_min', 'x_max', 'y_min', 'y_max', 
                'dx', 'dy', 'dxdy', 'dy_over_dx']

In [None]:
# basic stats
dfxy[features_num].describe()

## Plot coordinate based features

In [None]:
for f in features_num:
    plt.figure(figsize=(10,4))
    dfxy[f].plot(kind='hist', bins=200)
    plt.grid()
    plt.title(f)
    plt.show()

## Scatter Plots

### Straight pairwise scatter plot

In [None]:
sns.pairplot(data=dfxy[features_num],
             plot_kws={'alpha': 0.1})
plt.show()

### Now colored by class

In [None]:
# use only the main features
sns.pairplot(dfxy[['class_name','x_min','x_max','y_min','y_max']], 
             hue='class_name',
             plot_kws={'alpha': 0.5})
plt.show()

### Nice from an aesthetic perspective, but probably too much information. Let's reduce the plot to only two classes:

In [None]:
dfxy_sub = dfxy[dfxy.class_name.isin(['Cardiomegaly','Aortic enlargement'])]

sns.pairplot(dfxy_sub[['class_name','rad_id','x_min','x_max','y_min','y_max']], 
             hue='class_name',
             plot_kws={'alpha': 0.5})
plt.show()