In [None]:
import numpy as np
import pandas as pd
import cv2
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import json
from sklearn import preprocessing

# Directories holding the competition images; both end with a trailing slash
# because file names are appended to them directly further down.
IMAGE_DIR_TRAIN, IMAGE_DIR_TEST = (
    '/kaggle/input/ranzcr-clip-catheter-line-classification/train/',
    '/kaggle/input/ranzcr-clip-catheter-line-classification/test/',
)

## About this Notebook

Several good EDA notebooks have already been published to help understand the competition data. Here are some nice examples:

* [Detailed resource notebook & EDA for beginners](https://www.kaggle.com/bipinkrishnan/detailed-resource-notebook-eda-for-beginners) by [Bipin Krishnan P](https://www.kaggle.com/bipinkrishnan). A good intro to understand basic concepts about the competition.
* [RANZCR - Exploratory Data Analysis](https://www.kaggle.com/ihelon/ranzcr-exploratory-data-analysis) by [Yaroslav Isaienkov](https://www.kaggle.com/ihelon). Good annotation examples.
* [RANZCR-CLiP : One Stop For All EDA Needs](https://www.kaggle.com/foolofatook/ranzcr-clip-one-stop-for-all-eda-needs) by [Aayush Jain](https://www.kaggle.com/foolofatook). Nice EDA. See category overlap section.

However, we think there are a couple of additional aspects of the data that may be of interest for the competition.

## EDA

In [None]:
# Build an inventory of every image in the train and test directories:
# one record per file with its UID (file name without extension), the
# matplotlib colour used for its split, the file name and the image shape.
#
# NOTE: the 'set' column deliberately stores a colour name ('steelblue' for
# train, 'blue' for test) rather than a split name, because the image-size
# scatter plot passes this column straight to `c=`.
images = []
unreadable = []

for image_dir, color in ((IMAGE_DIR_TRAIN, 'steelblue'), (IMAGE_DIR_TEST, 'blue')):
    for filename in tqdm(os.listdir(image_dir)):
        file = os.path.join(image_dir, filename)
        img = cv2.imread(file)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising
            # (corrupt file, non-image file, unreadable path). Skip the file
            # so one bad entry does not abort the whole scan.
            unreadable.append(file)
            continue
        UID = os.path.splitext(filename)[0]
        images.append([UID, color, filename, img.shape[0], img.shape[1], img.shape[2]])

if unreadable:
    print(f'Skipped {len(unreadable)} file(s) that could not be read as images: {unreadable[:5]}')

df_files = pd.DataFrame(images, columns=['UID','set','filename','h', 'w', 'c'])

In [None]:
# Two encoders fitted on lists of equal length. LabelEncoder keeps its classes
# in sorted order, and both lists below are already sorted, so the k-th label
# and the k-th code share the same integer id. Chaining
# `ce.inverse_transform(le.transform(labels))` therefore turns a label into
# its code string.
# NOTE(review): the codes look like float strings in [0, 1]; presumably they
# are meant to be read by matplotlib as grayscale levels when used as a
# scatter colour -- confirm.
le = preprocessing.LabelEncoder().fit([
    'CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal',
    'ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal',
    'NGT - Abnormal', 'NGT - Borderline', 'NGT - Incompletely Imaged', 'NGT - Normal',
    'Swan Ganz Catheter Present',
])

ce = preprocessing.LabelEncoder().fit([
    '.01', '.02', '.03',             # CVC
    '.101', '.102', '.103',          # ETT
    '.501', '.502', '.503', '.504',  # NGT
    '.911',                          # Swan Ganz
])

# Label table of the competition, with its columns renamed to short codes.
df_train = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train.csv')
df_train.columns = [
    'UID',
    'ETTA', 'ETTB', 'ETTN',
    'NGTA', 'NGTB', 'NGTI', 'NGTN',
    'CVCA', 'CVCB', 'CVCN',
    'SGCP',
    'PID',
]

# Annotation table: one row per annotated line, with the points stored in
# the `data` column.
df_annotations = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train_annotations.csv')
df_annotations.columns = ['UID', 'label', 'data']

# Attach the image metadata (set colour, file name, h, w, c) to each
# annotation; the inner join drops annotations with no matching image file.
df_annotations = df_annotations.join(
    df_files.set_index('UID'),
    on='UID',
    how='inner',
)

# Translate each label into its colour code. This overwrites the `c` column
# brought in from df_files by the join above.
df_annotations['c'] = ce.inverse_transform(le.transform(df_annotations['label']))

In [None]:
# Explode every annotation into one row per point and normalise the
# coordinates by the image size, so that annotations from images of
# different resolutions can be drawn on the same [0, 1] x [0, 1] canvas.
#
# The per-annotation frames are collected in a list and concatenated once:
# growing a DataFrame with `DataFrame.append` inside the loop copies all
# accumulated rows on every iteration (quadratic) and `append` no longer
# exists in pandas >= 2.0.
point_frames = []
for ann in tqdm(df_annotations.itertuples(index=False), total=len(df_annotations)):
    # `data` is a JSON-encoded list of two-element points
    # (presumably pixel coordinates -- they are divided by w and h below).
    row_df = pd.DataFrame(json.loads(ann.data), columns=['x1', 'y1'])
    row_df['c'] = ann.c
    row_df['label'] = ann.label
    row_df['x'] = row_df['x1'] / ann.w
    row_df['y'] = row_df['y1'] / ann.h
    point_frames.append(row_df)

if point_frames:
    # Like the original append, this keeps each frame's own 0..k-1 index.
    df = pd.concat(point_frames)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still exposes the expected columns.
    df = pd.DataFrame(columns=['x1', 'y1', 'c', 'label', 'x', 'y'])

### Image Sizes


In [None]:
# Count the images for every (height, width, set) combination; after the
# groupby the `filename` column holds that count and drives the marker size,
# while `set` (a colour name) drives the marker colour.
size_counts = (
    df_files[['h', 'w', 'set', 'filename']]
    .groupby(by=['h', 'w', 'set'], as_index=False)
    .count()
)
size_counts.plot.scatter(x='w', y='h', s='filename', c='set', alpha=0.2, figsize=(20, 10))
plt.show()

### Annotations

We want to show where, on average, the annotations fall on an idealized chest X-ray. Below is a reference X-ray with some relevant anatomical structures labelled.

In [None]:
# Display the annotated reference chest X-ray that the scatter plots below
# are meant to be compared against.
img = plt.imread("../input/ranzcr-jpgs/Mediastinal_structures_on_chest_X-ray_annotated-1536x1245.jpg")
plt.figure(figsize=(16, 16))
plt.gca().imshow(img)
plt.show()

(Image credit: https://www.radiologia2cero.com/dispositivos-en-la-rx-de-torax/)

Now, use your imagination and picture each of the following plots superimposed on the previous image.

#### All annotations

In [None]:
# Every annotation point in normalised coordinates, coloured by its `c` code.
ax = df.plot.scatter(x='x', y='y', c='c', s=100, alpha=.1, figsize=(16, 16))
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
# Flip the y axis so the origin is at the top-left, as in image coordinates.
ax.invert_yaxis()
plt.show()

#### CVC annotations

In [None]:
# Annotation points of the three CVC classes only.
cvc_points = df[df['label'].isin(['CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal'])]
ax = cvc_points.plot.scatter(x='x', y='y', s=100, alpha=.1, figsize=(16, 16))
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
# Flip the y axis so the origin is at the top-left, as in image coordinates.
ax.invert_yaxis()
plt.show()

#### ETT annotations

In [None]:
# Annotation points of the three ETT classes only.
ett_points = df[df['label'].isin(['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal'])]
ax = ett_points.plot.scatter(x='x', y='y', s=100, alpha=.1, figsize=(16, 16))
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
# Flip the y axis so the origin is at the top-left, as in image coordinates.
ax.invert_yaxis()
plt.show()

#### NGT annotations

In [None]:
# Annotation points of the four NGT classes only.
ngt_points = df[df['label'].isin([
    'NGT - Abnormal',
    'NGT - Borderline',
    'NGT - Incompletely Imaged',
    'NGT - Normal',
])]
ax = ngt_points.plot.scatter(x='x', y='y', s=100, alpha=.1, figsize=(16, 16))
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
# Flip the y axis so the origin is at the top-left, as in image coordinates.
ax.invert_yaxis()
plt.show()

#### Swan Ganz catheter annotations

In [None]:
# Annotation points of the single Swan Ganz class only.
swan_ganz_points = df[df['label'].isin(['Swan Ganz Catheter Present'])]
ax = swan_ganz_points.plot.scatter(x='x', y='y', s=100, alpha=.1, figsize=(16, 16))
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
# Flip the y axis so the origin is at the top-left, as in image coordinates.
ax.invert_yaxis()
plt.show()