In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme so the matplotlib figures created below share one style.
sns.set()

# Analyzing the Train dataset

In [None]:
# Root folder of the competition data (kept with its trailing slash because
# later cells append file names to it directly).
path = '/kaggle/input/ranzcr-clip-catheter-line-classification/'

# Read the training table and preview its first row.
train_csv_file = os.path.join(path, 'train.csv')
train = pd.read_csv(train_csv_file)
train.head(1)

In [None]:
# Report the table dimensions, then the number of distinct PatientID values.
print('Shape of Training File: ', train.shape)
distinct_patients = pd.unique(train['PatientID'])
print('Unique Patients: ', len(distinct_patients))

In [None]:
# Label columns of `train` that are inspected throughout the notebook.
classes_to_predict = [
    'ETT - Abnormal',
    'ETT - Borderline',
    'ETT - Normal',
    'NGT - Abnormal',
    'NGT - Borderline',
    'NGT - Incompletely Imaged',
    'NGT - Normal',
    'CVC - Abnormal',
    'CVC - Borderline',
    'CVC - Normal',
    'Swan Ganz Catheter Present',
]

# For every label print: name | number of rows flagged 1 | share of all rows.
for class_ in classes_to_predict:
    is_positive = train[class_] == 1
    # int(...) keeps the count a plain Python int so it prints and divides
    # exactly like len() of the filtered frame.
    number_of_positives = int(is_positive.sum())
    print(class_, '|', number_of_positives, '|', number_of_positives / len(train))

We see that most of the positive labels correspond to normal insertions. However, the percentages add up to more than 100%, so a patient may have more than one catheter inserted at the same time. Let's verify it.

In [None]:
# Index the rows by study UID and remove the patient identifier, then store the
# row-wise total of the remaining columns in 'sum'.
#
# The cell is written so it can be re-run safely: the original in-place version
# raised KeyError on a second execution (the index/column was already gone) and
# would have added an existing 'sum' column into the new total.
if train.index.name != 'StudyInstanceUID':
    train = train.set_index('StudyInstanceUID')
train = train.drop(columns='PatientID', errors='ignore')

# Leave out a 'sum' column produced by a previous run so it is never summed into itself.
label_columns = [column for column in train.columns if column != 'sum']
train['sum'] = train[label_columns].sum(axis=1)

In [None]:
# Show the last four rows whose 'sum' is greater than one.
multi_label_rows = train.loc[train['sum'] > 1]
multi_label_rows.tail(4)

In fact, we have patients with more than one insertion and even more than one insertion of the same type, such as a Normal CVC and a Borderline CVC. Let's verify how this is distributed.

In [None]:
# Histogram of the per-row 'sum' column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 9))
ax.set_title('Number of Catheters Inserted Distribution')
train['sum'].hist(ax=ax)

So, this CSV contains only the identifier and the class labels for each of the images in the `train/` directory. As we saw, some images have more than one positive (1) label, and the 'Normal' labels are the majority, which makes the dataset imbalanced, as could be expected for this kind of problem.

# Analyzing the train annotations

In [None]:
# Root folder of the competition data, restated so this section runs on its own.
path = '/kaggle/input/ranzcr-clip-catheter-line-classification/'

# Read the annotations table and preview its first row.
annotations_csv_file = os.path.join(path, 'train_annotations.csv')
train_annotations = pd.read_csv(annotations_csv_file)
train_annotations.head(1)

This seems to be the annotation with the segmentation of the tube inside each image. This could be useful at some point, so let's see whether all of the training data has this kind of annotation. As the file appears to be in long format, let's also verify whether there is more than one row per ID.

In [None]:
# Number of annotation rows recorded for each StudyInstanceUID.
rows_per_study = train_annotations['StudyInstanceUID'].value_counts()
rows_per_study

As expected, we indeed have more than one row per id.

In [None]:
# Distinct study UIDs that appear in the annotations table.
annotations_ids = train_annotations['StudyInstanceUID'].unique()

# reset_index() exposes StudyInstanceUID as a regular column; build it once and reuse it.
train_study_ids = train.reset_index()['StudyInstanceUID']
print('Total number of training images: ', len(train_study_ids.unique()))

# Training study UIDs that also occur among the annotated ones.
annotated_study_ids = train_study_ids[train_study_ids.isin(annotations_ids)]
print('With annotations: ', len(annotated_study_ids.unique()))

So, fewer than one third of the training examples have annotations.

## Visual inspection of the training set

In [None]:
# Display one example image for every label in `classes_to_predict`.
SAMPLE_SEED = 42  # fixed seed so the same example images are drawn on every run

for class_ in classes_to_predict:
    positives = train[train[class_] == 1]
    if positives.empty:
        # Nothing can be sampled or shown for a label without positive rows.
        print('No positive examples for', class_)
        continue

    # Draw up to three ids (never more than exist, which would make sample() raise);
    # only the first one is displayed.
    ids = positives.sample(min(3, len(positives)), random_state=SAMPLE_SEED).index

    img_path = path + 'train/' + ids[0] + '.jpg'
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports an unreadable or missing file by returning None
        # instead of raising, so fail here with the offending path.
        raise FileNotFoundError('Could not read image: ' + img_path)

    fig, ax = plt.subplots(figsize=(15, 9))
    # OpenCV returns channels in BGR order while matplotlib interprets them as RGB.
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    # grid(None) toggles the current grid state; False hides it unconditionally.
    ax.grid(False)
    ax.set_title(class_)