### Welcome to RANZCR CLiP - Catheter and Line Position Challenge
### In short, this notebook contains the following information:
* How many times does each of the classes appear in the training set?
* Distribution of each of the 4 class categories (CVC, NGT, ETT and SGC) in the training set
* How many tubes of each category can a training image contain, and how do they co-occur?
* A proper 5-fold CV split, and a look at the number of positives per class in each of the folds

In [None]:
import ast

import cv2
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GroupKFold

In [None]:
#This function is taken from here: https://www.kaggle.com/c/ranzcr-clip-catheter-line-classification/discussion/204638 Thanks to @gunesevitan
def visualize_annotations(filename):
    """Display a training image with its annotated line points drawn on top.

    Parameters
    ----------
    filename : str
        Image file name inside the competition ``train`` directory. The part
        before ``'.jpg'`` is used as the ``StudyInstanceUID`` to look up rows in
        the global ``df_train_annotations`` frame.

    Returns
    -------
    None
        The figure is shown as a side effect. Nothing is drawn (and the image
        is not read) when the study has no rows in ``df_train_annotations``.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read the image file.
    """
    study_instance_uid = filename.split('.jpg')[0]

    # Filter the annotation table once and reuse the result for labels and points.
    annotations = df_train_annotations.loc[df_train_annotations['StudyInstanceUID'] == study_instance_uid]
    if annotations.empty:
        return None

    image_path = f'../input/ranzcr-clip-catheter-line-classification/train/{filename}'
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f'Could not read image: {image_path}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    labels = annotations['label'].values.tolist()
    # The 'data' column stores each point list as text; literal_eval parses
    # Python literals only, unlike eval which would execute arbitrary code.
    lines = annotations['data'].apply(ast.literal_eval).values.tolist()

    # image.shape is (rows, cols, channels) while figsize is (width, height);
    # clamp to 1 inch so small images do not produce a zero-sized figure.
    height, width = image.shape[:2]
    plt.figure(figsize=(max(1, width // 300), max(1, height // 300)))
    plt.imshow(image)

    for line, label in zip(lines, labels):
        xs = [point[0] for point in line]
        ys = [point[-1] for point in line]
        plt.scatter(xs, ys, s=40, label=label)

    plt.tick_params(axis='x', labelsize=10)
    plt.tick_params(axis='y', labelsize=10)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0, prop={'size': 10})
    plt.show()
    return None

In [None]:
# Read the three competition CSV files in one pass; the order of the paths
# matches the order of the names on the left-hand side.
X_train, df_train_annotations, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ranzcr-clip-catheter-line-classification/train.csv",
        "../input/ranzcr-clip-catheter-line-classification/train_annotations.csv",
        "../input/ranzcr-clip-catheter-line-classification/sample_submission.csv",
    )
)

### Number of images in the training set

In [None]:
# One row per training image, so the row count is the image count.
len(X_train)

### Target columns

In [None]:
# The label columns are selected by position: columns 1 through 11 of train.csv.
# NOTE(review): presumably column 0 is the image ID and the last column is PatientID — confirm.
target_cols = X_train.columns[1:12]
target_cols

## Class distribution in the Training Set

In [None]:
# Number of positive rows per target class, most frequent first.
# `target_cols` is the label-column index built in the "Target columns" cell.
target_counts = X_train[target_cols].sum(axis=0).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(y=target_counts.index.values, x=target_counts.values, ax=ax)
ax.set_xlabel("# of images", fontsize=15)
ax.set_ylabel("Target class", fontsize=15);

### Now let's take a look at how many images contain at least one positive label for each of CVC, NGT, ETT and Swan Ganz Catheter Present

In [None]:
# Label columns grouped by device category.
cvc_target = ["CVC - Normal", "CVC - Borderline", "CVC - Abnormal"]
ngt_target = ["NGT - Normal", "NGT - Incompletely Imaged", "NGT - Borderline", "NGT - Abnormal"]
ett_target = ["ETT - Normal", "ETT - Borderline", "ETT - Abnormal"]
sgcp_target = ["Swan Ganz Catheter Present"]
shorten_tar = ["CVC", "NGT", "ETT", "Swan Ganz Catheter Present"]

# Rows with at least one positive label in the category. The vectorised
# comparison replaces a row-wise apply and reuses the column lists above.
cvc_counts = X_train[X_train[cvc_target].eq(1).any(axis=1)]
ngt_counts = X_train[X_train[ngt_target].eq(1).any(axis=1)]
ett_counts = X_train[X_train[ett_target].eq(1).any(axis=1)]
sgcp_counts = X_train[X_train[sgcp_target].eq(1).any(axis=1)]

# Use the actual number of rows rather than a hard-coded dataset size.
n_images = len(X_train)
category_pct = [
    len(subset) * 100 / n_images
    for subset in (cvc_counts, ngt_counts, ett_counts, sgcp_counts)
]

plt.figure(figsize=(10, 7))
sns.barplot(y=category_pct, x=shorten_tar, palette="ch:.25")
plt.xlabel("Category", fontsize=20)
plt.ylabel("% of training set", fontsize=20);

## How many images contain more than one tube of each category ("CVC", "NGT", "ETT" and "Swan Ganz Catheter Present")?

# CVC (Central Venous Catheter)

### Number of images containing 0, 1, 2 and 3 CVC tubes

In [None]:
# Positive CVC labels per image, then how many images have each count.
cvc_per_image = X_train["CVC - Normal"] + X_train["CVC - Abnormal"] + X_train["CVC - Borderline"]
# Keep the real per-image count as the index so the x tick labels stay correct
# even if some count between 0 and the maximum never occurs.
vals = cvc_per_image.value_counts().sort_index()

plt.figure(figsize=(15, 5))
sns.barplot(x=vals.index.values, y=vals.values, palette="Reds")
plt.ylabel("# of images", fontsize=15)
plt.xlabel("# of CVC tubes", fontsize=15);

### An image with all the three CVC categories True

In [None]:
# Select rows where all three CVC labels are positive and remember the
# StudyInstanceUID of the first such row for plotting.
all_cvc_positive = (X_train[["CVC - Normal", "CVC - Borderline", "CVC - Abnormal"]] == 1).all(axis=1)
a = X_train[all_cvc_positive]
cvc3_ex = a["StudyInstanceUID"].iloc[0]
a.head(1)

In [None]:
# Plot the example selected in the previous cell; the image file name is the
# study ID followed by ".jpg".
visualize_annotations(f'{cvc3_ex}.jpg')

# NGT (Nasogastric tube)

### Number of images containing different number of NGT tubes

In [None]:
# Positive NGT labels per image, then how many images have each count.
ngt_per_image = (
    X_train["NGT - Normal"]
    + X_train["NGT - Abnormal"]
    + X_train["NGT - Borderline"]
    + X_train["NGT - Incompletely Imaged"]
)
# Keep the real per-image count as the index so the x tick labels stay correct
# even if some count between 0 and the maximum never occurs.
vals = ngt_per_image.value_counts().sort_index()

plt.figure(figsize=(10, 5))
sns.barplot(x=vals.index.values, y=vals.values, palette="Greens")
plt.ylabel("# of images", fontsize=15)
plt.xlabel("# of NGT tubes", fontsize=15);

### Interesting: there are 4 columns corresponding to NGT placement, but no image has more than 2 NGT tubes, and most of the images (21775) don't have an NGT at all. Which NGT labels appear together?

In [None]:
# Per-image total of the four NGT labels; preview the NGT columns of the first
# few images where exactly two of them are positive.
ngt_label_total = (
    X_train["NGT - Normal"]
    + X_train["NGT - Abnormal"]
    + X_train["NGT - Borderline"]
    + X_train["NGT - Incompletely Imaged"]
)
X_train.loc[ngt_label_total == 2, ngt_target].head()

### Ah, judging from these first few rows, there seems to be no obvious pattern in which NGT labels appear together

# ETT (Endotracheal tube)

## How many ETT tubes can a training image contain?

In [None]:
# Positive ETT labels per image, then how many images have each count.
ett_per_image = X_train["ETT - Normal"] + X_train["ETT - Abnormal"] + X_train["ETT - Borderline"]
# Keep the real per-image count as the index so the x tick labels stay correct
# even if some count between 0 and the maximum never occurs.
vals = ett_per_image.value_counts().sort_index()

plt.figure(figsize=(10, 5))
sns.barplot(x=vals.index.values, y=vals.values, palette="Blues")
plt.ylabel("# of images", fontsize=15)
plt.xlabel("# of ETT tubes", fontsize=15);

### The majority of the images (21626) do not contain an ETT tube. The remaining images contain only one ETT tube. Indeed, only one ETT tube can be placed in a patient.

## How to split training data into k fold CV?

In [None]:
# Split by PatientID so that all images of one patient land in the same fold
# (GroupKFold never puts one group into two different test sets).
group_kfold = GroupKFold(n_splits=5)

# Only the held-out part of each split is kept. The explicit .copy() makes each
# fold an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
folds = [
    X_train.iloc[test_index].copy()
    for _, test_index in group_kfold.split(X_train, X_train[X_train.columns[1:12]], X_train["PatientID"])
]

In [None]:
# Tag every row of each held-out frame with its 1-based fold name.
for i in range(len(folds)):
    folds[i]['fold'] = f'fold{i + 1}'

### How many images are there in each of the folds?

In [None]:
# Derive labels and sizes from `folds` itself instead of hard-coding five entries.
fold_names = [f"fold{i + 1}" for i in range(len(folds))]
fold_sizes = [len(fold) for fold in folds]

plt.figure(figsize=(10, 5))
sns.barplot(y=fold_names, x=fold_sizes, palette="Purples")
plt.xlabel("# of images", fontsize=15)
plt.ylabel("Fold", fontsize=15);

### Number of positive target classes in each of the folds

In [None]:
# All folds stacked back into one frame; each row carries its 'fold' tag.
df = pd.concat(folds)

# Reshape to long form in one step (one row per image/label pair) and keep only
# the positive pairs. This replaces growing a DataFrame with pd.concat inside a
# nested loop. Row order is label-major, then fold order, same as that loop.
long_labels = df.melt(
    id_vars="fold",
    value_vars=list(target_cols),
    var_name="placement",
    value_name="is_positive",
)
d = long_labels.loc[long_labels["is_positive"] == 1, ["placement", "fold"]]

plt.figure(figsize=(15, 5))
ax = sns.countplot(x="placement", hue="fold", data=d)
# Rotate the existing tick labels in place rather than re-setting them.
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

### Future work
* Look at the correlation of classes in each of the folds. I think this can lead to overfitting issues if the correlations differ between folds.

## Don't forget to upvote if you find this notebook helpful :)