<h1><center>RANZCR CLiP. Data Understanding.</center></h1>

<center><img src="https://pbs.twimg.com/profile_images/1100562573001277440/paTq0zKF_400x400.png"></center>


<div class="list-group" id="list-tab" role="tablist">
<h2 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0; color:white' role="tab" aria-controls="home"><center>Quick navigation</center></h2>

* [1. Basic Data Overview](#1)
* [2. Image Overview](#2)

In [None]:
import numpy as np
import pandas as pd
import os

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import cv2

<a id="1"></a>
<h2 style='background:black; border:0; color:white'><center>1. Basic Data Overview</center></h2>

**train.csv** - contains image IDs, binary labels, and patient IDs.

* StudyInstanceUID - unique ID for each image
* ETT - Abnormal - endotracheal tube placement abnormal
* ETT - Borderline - endotracheal tube placement borderline abnormal
* ETT - Normal - endotracheal tube placement normal
* NGT - Abnormal - nasogastric tube placement abnormal
* NGT - Borderline - nasogastric tube placement borderline abnormal
* NGT - Incompletely Imaged - nasogastric tube placement inconclusive due to imaging
* NGT - Normal - nasogastric tube placement normal
* CVC - Abnormal - central venous catheter placement abnormal
* CVC - Borderline - central venous catheter placement borderline abnormal
* CVC - Normal - central venous catheter placement normal
* Swan Ganz Catheter Present
* PatientID - unique ID for each patient in the dataset

In [None]:
# Load the training labels (one row per image) and display the frame.
train_csv_path = '/kaggle/input/ranzcr-clip-catheter-line-classification/train.csv'
train = pd.read_csv(train_csv_path)
train

In [None]:
# Binary label columns of train.csv, in the order they are plotted.
cols = [
    'ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal',
    'NGT - Abnormal', 'NGT - Borderline', 'NGT - Incompletely Imaged',
    'NGT - Normal', 'CVC - Abnormal', 'CVC - Borderline',
    'CVC - Normal', 'Swan Ganz Catheter Present'
]

# Lay the labels out on a grid three subplots wide; the number of rows is
# derived from the number of labels (11 labels -> 4 rows, last cell empty).
grid_cols = 3
grid_rows = (len(cols) + grid_cols - 1) // grid_cols
fig = make_subplots(rows=grid_rows, cols=grid_cols)

n_records = len(train)

# One bar trace per label: bar heights are the number of rows equal to 0 and
# to 1, and the bar text is the same count as a percentage of all rows.
traces = []
for col in cols:
    # Count each class once and reuse it for both the height and the label.
    counts = [int((train[col] == value).sum()) for value in (0, 1)]
    traces.append(
        go.Bar(
            x=[0, 1],
            y=counts,
            name=col,
            text=[str(round(100 * count / n_records, 2)) + '%' for count in counts],
            textposition='auto'
        )
    )

# add_trace(row=, col=) replaces the deprecated append_trace; grid positions
# are 1-based and filled row by row.
for i, trace in enumerate(traces):
    fig.add_trace(trace, row=(i // grid_cols) + 1, col=(i % grid_cols) + 1)

fig.update_layout(
    title_text='Train columns',
    height=1200,
    width=1000
)

fig.show()

Let's see how many records in the training set have a non-zero value for each column.

In [None]:
# Number of positive (non-zero) records per label column, smallest first.
x = (
    train[cols]
    .sum(axis=0)
    .sort_values()
    .rename_axis('column')
    .reset_index(name='nonzero_records')
)

# Horizontal bar chart: one bar per label column.
fig = px.bar(
    x,
    x='nonzero_records',
    y='column',
    orientation='h',
    title='Columns and non zero samples',
    height=800,
    width=800,
)
fig.show()

In [None]:
# For every training row, count how many label columns are set; then count
# how many rows share each number of active labels.
data = (
    train[cols]
    .astype(bool)
    .sum(axis=1)
    .rename('count')
    .rename_axis('row')
    .reset_index()
    .groupby(['count'])['row']
    .count()
    .reset_index()
)

# Bar chart: x = number of active labels, y = number of rows with that many.
fig = px.bar(
    data,
    y=data['row'],
    x="count",
    title='Number of activations for every sample in training set',
    width=800,
    height=500,
)
fig.show()

In [None]:
# Same per-row active-label counts as the bar chart, recomputed so this cell
# can run on its own.
data = (
    train[cols]
    .astype(bool)
    .sum(axis=1)
    .rename('count')
    .rename_axis('row')
    .reset_index()
    .groupby(['count'])['row']
    .count()
    .reset_index()
)

# Share of the training set (in percent, two decimals) for each count.
percent_of_rows = (100 * data['row'] / len(train)).round(2)

fig = px.pie(
    data,
    values=percent_of_rows,
    names="count",
    title='Number of activations for every sample (Percent)',
    width=800,
    height=500,
)
fig.show()

In [None]:
# Pairwise correlation between the label columns, drawn as a matrix image.
data = train[cols]
corr_matrix = data.corr()
tick_positions = range(data.shape[1])

f = plt.figure(figsize=(12, 12))
plt.matshow(corr_matrix, fignum=f.number)

# Label both axes with the column names; rotate the x labels so they fit.
plt.xticks(tick_positions, data.columns, fontsize=13, rotation=70)
plt.yticks(tick_positions, data.columns, fontsize=13)

# Colour scale for the correlation values.
cb = plt.colorbar()
cb.ax.tick_params(labelsize=13)

<a id="2"></a>
<h2 style='background:black; border:0; color:white'><center>2. Image Overview</center></h2>

Let's check some of our randomly selected images.

In [None]:
# Show a fixed (seeded) random sample of 25 training images on a 5x5 grid.
f, plots = plt.subplots(5, 5, sharex='col', sharey='row', figsize=(17, 17))
samples = train.sample(n=25, random_state=666)['StudyInstanceUID'].values

# plots.flat walks the axes row by row, matching the sample order.
for ax, study_uid in zip(plots.flat, samples):
    ax.axis('off')
    image_path = os.path.join(
        "/kaggle/input/ranzcr-clip-catheter-line-classification/train/",
        f"{study_uid}.jpg",
    )
    # OpenCV loads images as BGR; matplotlib expects RGB.
    image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    ax.imshow(image)

Let's check the image shapes in the training set.

In [None]:
%%time

check_dict = dict()

for filename in os.listdir("/kaggle/input/ranzcr-clip-catheter-line-classification/train/"):
    img = cv2.imread("/kaggle/input/ranzcr-clip-catheter-line-classification/train/" + filename)
    try:
        check_dict[img.shape] += 1
    except:
        check_dict[img.shape] = 1

In [None]:
# Turn the shape -> count mapping into a frame with one row per distinct shape.
shapes_df = pd.DataFrame(check_dict.items(), columns=['shape', 'count'])

# Keep the 15 most frequent shapes (ascending, so the largest bar ends up on
# top of the horizontal chart) and stringify the tuples for use as axis labels.
x = (
    shapes_df
    .sort_values(['count'])
    .reset_index(drop=True)
    .assign(shape=lambda frame: frame['shape'].astype(str))
    .tail(15)
)

fig = px.bar(
    x,
    x='count',
    y='shape',
    orientation='h',
    title='Top 15 shape by number of samples',
    height=800,
    width=800,
)
fig.show()

In [None]:
# Load the annotation table for the training images and display the frame.
annotations_csv_path = '/kaggle/input/ranzcr-clip-catheter-line-classification/train_annotations.csv'
train_annotations = pd.read_csv(annotations_csv_path)
train_annotations

In [None]:
import ast

def plot_annotation(random_state=None):
    """Plot one randomly chosen annotated training image with its annotation points.

    A StudyInstanceUID is sampled from ``train_annotations``; the first
    annotation row found for that study is parsed from its ``data`` column
    and scattered over the corresponding image.

    Parameters
    ----------
    random_state : int or None, optional
        Passed to ``DataFrame.sample`` so a particular draw can be reproduced.
        The default (None) keeps the draw unseeded.

    Raises
    ------
    FileNotFoundError
        If the image for the sampled study cannot be read from disk.
    """
    sample = train_annotations.sample(n=1, random_state=random_state)['StudyInstanceUID'].values[0]
    # Only the first annotation row of the sampled study is drawn.
    data = train_annotations[train_annotations['StudyInstanceUID'] == sample]['data'].values[0]
    image_path = "/kaggle/input/ranzcr-clip-catheter-line-classification/train/" + sample + ".jpg"

    # The `data` column holds a Python-literal string; after parsing, the first
    # two columns of the array are used as x and y coordinates.
    data = np.array(ast.literal_eval(data))

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads images as BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.scatter(data[:, 0], data[:, 1])

In [None]:
plot_annotation()

### To be continued