<h2> Catheter and Line Position Prediction </h2>
<hr>

- [1. Overview](#1)
- [2. Exploratory Data Analysis](#2)
    * [2.1. Distributions](#3)   
        * [CVC - Normal](#4)
        * [CVC - Borderline](#5)
        * [CVC - Abnormal](#6)
        * [ETT - Normal](#7)
        * [ETT - Borderline](#8)
        * [ETT - Abnormal](#9)
        * [NGT - Normal](#10)
        * [NGT - Borderline](#11)
        * [NGT - Abnormal](#12)
        * [NGT - Incompletely Imaged](#13)
        * [Swan Ganz Catheter Present](#14)
- [3. Model](#15)

## <span id="1"></span> **1. Overview**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import os
import ast
import cv2
import random

In [None]:
# Root of the competition data; the EDA cells build every file path from it.
BASE_DIR = "../input/ranzcr-clip-catheter-line-classification/"
# Show the top-level entries of the data directory.
print(os.listdir(BASE_DIR))

### Files
- **train.csv** - contains image IDs, binary labels, and patient IDs.
- **sample_submission.csv** - a sample submission file in the correct format
- **test** - test images
- **train** - training images


## <span id="2"></span> **2. Exploratory Data Analysis**
Largely adapted from <a href="https://www.kaggle.com/ihelon/catheter-position-exploratory-data-analysis">Yaroslav</a>'s EDA notebook: https://www.kaggle.com/ihelon/catheter-position-exploratory-data-analysis

In [None]:
# Load the training table with the first CSV column as the index; later cells
# use those index values to build image file names.
df_train = pd.read_csv(os.path.join(BASE_DIR, "train.csv"), index_col=0)
df_train.head()

### <span id="3"></span> Distributions

In [None]:
# Column sums for every column except the last one, ordered smallest first.
label_totals = df_train.iloc[:, :-1].sum().sort_values(ascending=True)

# Horizontal bars: one per label, length = column sum.
fig = px.bar(x=label_totals.values, y=label_totals.index)
fig.update_traces(marker_color=px.colors.qualitative.Prism)
fig.update_layout(
    title={"text": "Distribution of Labels", "font_size": 18, "x": 0.5},
    xaxis_title="Count",
    yaxis_title="Label",
)

fig.show()

In [None]:
def show_values_on_bars(axs):
    """Write each bar's height as a text label at the top of the bar.

    Args:
        axs: A single axes object, or a NumPy array of axes objects (any
            shape). Every element must expose ``patches`` and ``text``.
    """
    def _annotate(ax):
        for bar in ax.patches:
            height = bar.get_height()
            x_mid = bar.get_x() + bar.get_width() / 2
            y_top = bar.get_y() + height
            # Heights are rendered without decimals.
            ax.text(x_mid, y_top, '{:.0f}'.format(height), ha="center")

    if isinstance(axs, np.ndarray):
        targets = [ax for _, ax in np.ndenumerate(axs)]
    else:
        targets = [axs]

    for ax in targets:
        _annotate(ax)

In [None]:
# 3x3 grid of count plots: one row per device (CVC, ETT, NGT) and one column
# per position status (Normal, Borderline, Abnormal).
f, axes = plt.subplots(3,3,figsize=(14,14))

_palette = "tab10"

for row, device in enumerate(("CVC", "ETT", "NGT")):
    for col, status in enumerate(("Normal", "Borderline", "Abnormal")):
        label = f"{device} - {status}"
        ax = axes[row, col]

        # Pass the column as ``x=``: a positional Series is deprecated in
        # older seaborn releases and is taken as ``data`` in newer ones.
        sns.countplot(x=df_train[label], ax=ax, palette=_palette)
        ax.set_xlabel(label, fontsize=14)

        if col == 0:
            # Only the left-most panel of each row carries the y-axis title.
            ax.set_ylabel('Count', fontsize=14)
        else:
            ax.set_ylabel('', fontsize=14)
            ax.yaxis.set_label_position("right")
        ax.yaxis.tick_left()

# Annotate exactly once, after all nine panels exist. Annotating part-way
# through and again at the end draws the first rows' labels twice.
show_values_on_bars(axes)

plt.show()


In [None]:
# Count plots for the two labels that are not part of the 3x3 device/status grid.
f, axes = plt.subplots(1,2,figsize=(14,4))

_palette = "tab10"

for idx, label in enumerate(('NGT - Incompletely Imaged', 'Swan Ganz Catheter Present')):
    ax = axes[idx]

    # Pass the column as ``x=``: a positional Series is deprecated in older
    # seaborn releases and is taken as ``data`` in newer ones.
    sns.countplot(x=df_train[label], ax=ax, palette=_palette)
    ax.set_xlabel(label, fontsize=14)

    if idx == 0:
        # Only the left panel carries the y-axis title.
        ax.set_ylabel('Count', fontsize=14)
    else:
        ax.set_ylabel('', fontsize=14)
        ax.yaxis.set_label_position("right")
    ax.yaxis.tick_left()

show_values_on_bars(axes)

plt.show()

In [None]:
# Number of distinct PatientID values in the training table.
print("There are {0} unique patients.".format(df_train["PatientID"].nunique()))

In [None]:
# Number of training rows per PatientID; the plot then counts how many
# patients share each row count.
rows_per_patient = df_train["PatientID"].value_counts()

plt.figure(figsize=(16, 6))
sns.countplot(x=rows_per_patient.values)
plt.title("Distribution of observations by PatientID", fontsize=16)
plt.xlabel("Number of observations", fontsize=15)
plt.ylabel("Number of patients", fontsize=15)
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=14);

In [None]:
# Annotation table; the plotting helper defined next reads its
# StudyInstanceUID, label and data columns.
df_annot = pd.read_csv(os.path.join(BASE_DIR, "train_annotations.csv"))
df_annot.head()

In [None]:
def plot_image_with_annotations(row_ind):
    """Show one annotated training image next to its un-annotated version.

    The left panel shows the image alone; the right panel shows the same
    image with the annotation points scattered on top. The row's ``label``
    value is used as the figure title.

    Args:
        row_ind: Positional row index into ``df_annot``.

    Raises:
        FileNotFoundError: If the image file cannot be read.
    """
    row = df_annot.iloc[row_ind]
    image_path = os.path.join(BASE_DIR, "train", row["StudyInstanceUID"] + ".jpg")
    label = row["label"]
    # The ``data`` column holds a Python-literal list of [x, y] pairs stored as text.
    data = np.array(ast.literal_eval(row["data"]))

    # Read the image before creating the figure so a failure leaves no empty
    # figure behind. cv2.imread returns None instead of raising on failure.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(16, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(image)
    plt.subplot(1, 2, 2)
    plt.imshow(image)
    plt.scatter(data[:, 0], data[:, 1])

    plt.suptitle(label, fontsize=15)

In [None]:
# Example: render the annotation stored at positional row 8 of df_annot.
plot_image_with_annotations(8)

In [None]:
# Helper functions

def visualize_batch(image_ids):
    """Display training images in a three-column grid with the axes hidden.

    Args:
        image_ids: Iterable of image identifiers; each one is resolved to
            ``<BASE_DIR>/train/<image_id>.jpg``.

    Raises:
        FileNotFoundError: If an image file cannot be read.
    """
    image_ids = list(image_ids)
    n_cols = 3
    # Keep the two-row layout for up to six images; add rows beyond that
    # instead of failing when a seventh subplot is requested.
    n_rows = max(2, (len(image_ids) + n_cols - 1) // n_cols)
    plt.figure(figsize=(16, 5 * n_rows))

    for ind, image_id in enumerate(image_ids):
        plt.subplot(n_rows, n_cols, ind + 1)
        image_path = os.path.join(BASE_DIR, "train", f"{image_id}.jpg")
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising on failure.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)
        plt.axis("off")

    plt.show()

    
def plot_statistics(df, col):
    """Draw a horizontal count plot of the values found in ``df[col]``.

    Args:
        df: Table containing the column to summarize.
        col: Column name; also used for the y-axis label and the title.
    """
    plt.figure(figsize=(16, 2))
    sns.countplot(y=df[col])

    plt.title(f"Distribution of {col}", fontsize=16)
    plt.xlabel("Number of observations", fontsize=15)
    plt.ylabel(col, fontsize=15)
    for set_tick_style in (plt.xticks, plt.yticks):
        set_tick_style(fontsize=12)

    plt.show()

def process_class(col_name):
    """Plot the value counts of one label and show random positive examples.

    Args:
        col_name: Label column of ``df_train`` to summarize.
    """
    plot_statistics(df_train, col_name)
    positive_ids = df_train[df_train[col_name] == 1].index.tolist()
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request for labels with fewer than six positive rows.
    n_examples = min(6, len(positive_ids))
    visualize_batch(random.sample(positive_ids, n_examples))

### <span id="4"></span> CVC - Normal

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("CVC - Normal")

### <span id="5"></span> CVC - Borderline

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("CVC - Borderline")

### <span id="6"></span> CVC - Abnormal

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("CVC - Abnormal")

### <span id="7"></span> ETT - Normal

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("ETT - Normal")

### <span id="8"></span> ETT - Borderline

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("ETT - Borderline")

### <span id="9"></span> ETT - Abnormal

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("ETT - Abnormal")

### <span id="10"></span> NGT - Normal

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("NGT - Normal")

### <span id="11"></span> NGT - Borderline

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("NGT - Borderline")

### <span id="12"></span> NGT - Abnormal

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("NGT - Abnormal")

### <span id="13"></span> NGT - Incompletely Imaged

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("NGT - Incompletely Imaged")

### <span id="14"></span> Swan Ganz Catheter Present

In [None]:
# Value counts for this label plus a random sample of positive images
# (no seed is set, so the sample changes on every run).
process_class("Swan Ganz Catheter Present")

## <span id="15"></span> **3. Model**
Adapted in full from <a href="https://www.kaggle.com/xhlulu">Xing Han</a>'s GPU starter notebook: https://www.kaggle.com/xhlulu/ranzcr-efficientnet-gpu-starter-train-submit

In [None]:
# import efficientnet.tfkeras as efn
import numpy as np
import pandas as pd
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.keras.applications.efficientnet as efn

In [None]:
def auto_select_accelerator():
    """Return a distribution strategy, using a TPU when one can be resolved.

    A TPU cluster is resolved, connected to and initialized; if resolving
    raises ``ValueError`` the current default strategy is returned instead.
    The number of replicas in sync is printed in both cases.

    Reference:
        * https://www.kaggle.com/mgornergoogle/getting-started-with-100-flowers-on-tpu
        * https://www.kaggle.com/xhlulu/ranzcr-efficientnet-tpu-training

    Returns:
        The selected ``tf.distribute`` strategy object.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # Use the stable class; tf.distribute.experimental.TPUStrategy is the
        # deprecated alias. This notebook already needs a TensorFlow release
        # that ships tf.keras.applications.efficientnet.
        strategy = tf.distribute.TPUStrategy(tpu)
        print("Running on TPU:", tpu.master())
    except ValueError:
        # No TPU could be resolved: fall back to the default strategy.
        strategy = tf.distribute.get_strategy()
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

    return strategy


def build_decoder(with_labels=True, target_size=(256, 256), ext='jpg'):
    """Create a function that loads an image file into a float32 tensor.

    The returned function reads the file, decodes it to three channels,
    divides pixel values by 255 and resizes the result to ``target_size``.

    Args:
        with_labels: When true, the returned function takes ``(path, label)``
            and passes ``label`` through unchanged; otherwise it takes
            ``path`` only.
        target_size: Size handed to ``tf.image.resize``.
        ext: File format to decode: ``'png'``, ``'jpg'`` or ``'jpeg'``. Any
            other value makes the decode function raise ``ValueError`` when
            it is called.

    Returns:
        The decode function matching ``with_labels``.
    """
    def decode(path):
        raw = tf.io.read_file(path)
        if ext in ['jpg', 'jpeg']:
            pixels = tf.image.decode_jpeg(raw, channels=3)
        elif ext == 'png':
            pixels = tf.image.decode_png(raw, channels=3)
        else:
            raise ValueError("Image extension not supported")

        scaled = tf.cast(pixels, tf.float32) / 255.0
        return tf.image.resize(scaled, target_size)

    def decode_with_labels(path, label):
        return decode(path), label

    if with_labels:
        return decode_with_labels
    return decode


def build_augmenter(with_labels=True):
    """Create a function that applies random flips to an image.

    The image is passed through ``tf.image.random_flip_left_right`` and then
    ``tf.image.random_flip_up_down``.

    Args:
        with_labels: When true, the returned function takes ``(img, label)``
            and passes ``label`` through unchanged; otherwise it takes
            ``img`` only.

    Returns:
        The augmentation function matching ``with_labels``.
    """
    def augment(img):
        flipped = tf.image.random_flip_left_right(img)
        return tf.image.random_flip_up_down(flipped)

    def augment_with_labels(img, label):
        return augment(img), label

    if not with_labels:
        return augment
    return augment_with_labels


def build_dataset(paths, labels=None, bsize=32, cache=True,
                  decode_fn=None, augment_fn=None,
                  augment=True, repeat=True, shuffle=1024,
                  cache_dir=""):
    """Assemble a batched, prefetched ``tf.data`` pipeline over image paths.

    Stages are applied in this fixed order: decode, cache, augment, repeat,
    shuffle, batch, prefetch. Caching sits before augmentation, and repeat
    sits before shuffle.

    Args:
        paths: Image file paths.
        labels: Optional labels aligned with ``paths``; when given, the
            dataset yields ``(image, label)`` pairs.
        bsize: Batch size.
        cache: Whether to cache decoded elements.
        decode_fn: Decode function; defaults to ``build_decoder`` configured
            for the presence or absence of labels.
        augment_fn: Augmentation function; defaults to ``build_augmenter``
            configured the same way.
        augment: Whether to apply ``augment_fn``.
        repeat: Whether to repeat the dataset indefinitely.
        shuffle: Shuffle buffer size; a falsy value disables shuffling.
        cache_dir: Location passed to ``Dataset.cache``. When non-empty and
            ``cache`` is exactly ``True`` the directory is created first.

    Returns:
        The assembled ``tf.data.Dataset``.
    """
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok=True)

    has_labels = labels is not None
    if decode_fn is None:
        decode_fn = build_decoder(has_labels)
    if augment_fn is None:
        augment_fn = build_augmenter(has_labels)

    AUTO = tf.data.experimental.AUTOTUNE
    source = (paths, labels) if has_labels else paths

    dset = tf.data.Dataset.from_tensor_slices(source)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    if cache:
        dset = dset.cache(cache_dir)
    if augment:
        dset = dset.map(augment_fn, num_parallel_calls=AUTO)
    if repeat:
        dset = dset.repeat()
    if shuffle:
        dset = dset.shuffle(shuffle)

    return dset.batch(bsize).prefetch(AUTO)

In [None]:
COMPETITION_NAME = "ranzcr-clip-catheter-line-classification"
# Use a TPU strategy when one can be resolved, otherwise the default strategy.
strategy = auto_select_accelerator()
# Global batch size: 16 samples for each replica in sync.
BATCH_SIZE = strategy.num_replicas_in_sync * 16
# Left disabled: the cells below read images from the /kaggle/input mount.
# GCS_DS_PATH = KaggleDatasets().get_gcs_path(COMPETITION_NAME)

In [None]:
load_dir = f"/kaggle/input/{COMPETITION_NAME}/"
df = pd.read_csv(load_dir + 'train.csv')
# One image path per row of train.csv, built from its StudyInstanceUID.
paths = load_dir + "train/" + df['StudyInstanceUID'] + '.jpg'

# The sample submission supplies both the test image IDs and the label column names.
sub_df = pd.read_csv(load_dir + 'sample_submission.csv')
test_paths = load_dir + "test/" + sub_df['StudyInstanceUID'] + '.jpg'

# Get the multi-labels
# Every submission column after the first is treated as a label column.
label_cols = sub_df.columns[1:]
labels = df[label_cols].values

In [None]:
# Train test split
# Random 80/20 hold-out over individual images, seeded for repeatability.
# NOTE(review): the split does not take PatientID into account, so images of
# the same patient may end up on both sides — TODO confirm whether a
# patient-grouped split is wanted.
(
    train_paths, valid_paths, 
    train_labels, valid_labels
) = train_test_split(paths, labels, test_size=0.2, random_state=42)

In [None]:
# Build the tensorflow datasets
IMSIZES = (224, 240, 260, 300, 380, 456, 528, 600)
# index i corresponds to b-i
size = IMSIZES[2]

decoder = build_decoder(with_labels=True, target_size=(size, size))
test_decoder = build_decoder(with_labels=False, target_size=(size, size))

# Each cached pipeline gets its own location. tf.data's file cache is
# identified only by its path, so pointing the train and validation pipelines
# at the same path lets one of them find the other's cache files.
CACHE_ROOT = '/kaggle/tf_cache'

# Training pipeline: build_dataset defaults apply (cache, augment, repeat, shuffle).
dtrain = build_dataset(
    train_paths, train_labels, bsize=BATCH_SIZE, 
    cache_dir=os.path.join(CACHE_ROOT, 'train'), decode_fn=decoder
)

# Validation pipeline: a single ordered pass without augmentation.
dvalid = build_dataset(
    valid_paths, valid_labels, bsize=BATCH_SIZE, 
    repeat=False, shuffle=False, augment=False, 
    cache_dir=os.path.join(CACHE_ROOT, 'valid'), decode_fn=decoder
)

# Test pipeline: no labels, no caching, a single ordered pass.
dtest = build_dataset(
    test_paths, bsize=BATCH_SIZE, repeat=False, 
    shuffle=False, augment=False, cache=False, 
    decode_fn=test_decoder
)

In [None]:
model_path = '../input/tfkeras-efficientnet-weights/efficientnetb2_notop.h5'  # imagenet
n_labels = labels.shape[1]

# Create and compile the model inside the strategy scope.
with strategy.scope():
    # Convolutional base without its top layers, initialized from the weights file.
    backbone = efn.EfficientNetB2(
        include_top=False,
        weights=model_path,
        input_shape=(size, size, 3),
        drop_connect_rate=0.5,
    )
    pooling = tf.keras.layers.GlobalAveragePooling2D()
    # One sigmoid output per label column.
    classifier = tf.keras.layers.Dense(n_labels, activation='sigmoid')

    model = tf.keras.Sequential([backbone, pooling, classifier])
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        # NOTE(review): the callbacks monitor 'val_auc', which presumably
        # relies on this metric keeping its default name — confirm.
        metrics=[tf.keras.metrics.AUC(multi_label=True)],
    )
    model.summary()

In [None]:
# ############### Train the model ###############
# Number of whole batches in the training paths; a trailing partial batch is not counted.
steps_per_epoch = len(train_paths) // BATCH_SIZE

# Both callbacks watch 'val_auc' and treat larger values as better.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'model.h5',
    monitor='val_auc',
    mode='max',
    save_best_only=True,
)
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_auc",
    mode='max',
    patience=3,
    min_lr=1e-6,
)

In [None]:
# Train for 20 epochs. dtrain is built with repeat enabled, so the length of
# an epoch is given explicitly through steps_per_epoch.
history = model.fit(
    dtrain, 
    epochs=20,
    verbose=1,
    callbacks=[checkpoint, lr_reducer],
    steps_per_epoch=steps_per_epoch,
    validation_data=dvalid)

In [None]:
# Reload the file written by the ModelCheckpoint callback (save_best_only on val_auc).
model.load_weights('model.h5')

In [None]:
# Save the per-epoch values recorded by fit() to history.csv.
hist_df = pd.DataFrame(history.history)
hist_df.to_csv('history.csv')

In [None]:
# Fill the label columns of the sample-submission frame with the model's
# predictions for the test images, then write the submission file.
sub_df[label_cols] = model.predict(dtest, verbose=1)
sub_df.to_csv('submission.csv', index=False)

sub_df.head()