In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import warnings
import os
import pydicom
import cv2

## Explaining some confusing aspects of the data
Every image in train_image data is assigned to one study in train_study data (StudyInstanceUID column). But one study can contain multiple images (more info in the **STUDY DATA** section).  

Columns 'boxes' and 'label' in train_image are somewhat redundant:
* boxes contains location and size of the boxes
* label contains location (xmin, ymin, xmax, ymax) of the boxes and confidence (which for the training data is always 1)

So we don't need 'boxes' column, we can just use the info from 'label' column.

### The 'label' column in train_image data
The `label` column has the following structure:  
a) for image with no boxes: `none 1 0 0 1 1`  
b) for image with boxes: `opacity 1 <xmin> <ymin> <xmax> <ymax>`
* if there is more than one box, there are multiple labels like this in a row, for example:  
`opacity 1 789.29 582.43 1815.94 2499.73 opacity 1 2245.91 591.21 3340.57 2352.75`
  

### What should the submission file look like?
For images it is consistent with the label column!<br/>
a) for image with no boxes: `id_image,none 1 0 0 1 1`  
b) for image with boxes: `id_image,opacity <confidence> <xmin> <ymin> <xmax> <ymax>`  
And for studies: `id_study,<class> <confidence> 0 0 1 1`

Example from the Kaggle Evaluation tab:
```
Id,PredictionString
2b95d54e4be65_study,negative 1 0 0 1 1
2b95d54e4be66_study,typical 1 0 0 1 1
2b95d54e4be67_study,indeterminate 1 0 0 1 1 atypical 1 0 0 1 1
2b95d54e4be68_image,none 1 0 0 1 1
2b95d54e4be69_image,opacity 0.5 100 100 200 200 opacity 0.7 10 10 20 20
```

Also, if I understand correctly, every study in train and test data is assigned to exactly one class (the classes are mutually exclusive) and it's best if you predict that class. But if you are not sure about your prediction, you can also try multilabel classification, for example: `2b95d54e4be65_study,negative 0.7 0 0 1 1 indeterminate 0.6 0 0 1 1`.

## Let's look at the data!

In [None]:
# Load the image-level annotations and the study-level labels of the training set.
image_csv = '../input/siim-covid19-detection/train_image_level.csv'
study_csv = '../input/siim-covid19-detection/train_study_level.csv'
train_image = pd.read_csv(image_csv)
train_study = pd.read_csv(study_csv)
print('shape of train_image:', train_image.shape)
# Trailing expression: the first rows are rendered as the cell output.
train_image.head()

In [None]:
# Report the size of the study-level table, then preview its first rows.
study_shape = train_study.shape
print('shape of train_study:', study_shape)
train_study.head()

In [None]:
# Root directories of the training and test DICOM files.
# Image paths in the following cells are built relative to TRAIN_DIR.
TRAIN_DIR = '../input/siim-covid19-detection/train'
TEST_DIR = '../input/siim-covid19-detection/test'

# IMAGE DATA

This is how you can get to an image:

In [None]:
# Locate one DICOM file on disk, starting from the study of the first annotation row.
# Layout walked here: TRAIN_DIR/<StudyInstanceUID>/<sub-directory>/<file>
from os import walk, listdir

image_dir = train_image['StudyInstanceUID'][0]
study_dir = '/'.join([TRAIN_DIR, image_dir])
# NOTE(review): the first sub-directory and the first file are taken in the order
# the OS lists them; presumably this is the image of row 0 - verify for studies
# that contain more than one image.
series_dir = '/'.join([study_dir, listdir(study_dir)[0]])
# next(walk(...)) yields (dirpath, dirnames, filenames) for series_dir itself.
first_file = next(walk(series_dir))[2][0]
path_to_img = '/'.join([series_dir, first_file])

The image files are in DICOM (.dcm) format. This format is widely used for medical images because it stores information about the patient together with the image itself, which helps to avoid mixing up patients' data. DICOM files can be handled with the pydicom package (see more: https://pydicom.github.io/pydicom/stable/index.html )

In [None]:
# Parse the DICOM file and expose its pixel data as a numpy array.
# NOTE(review): pixel_array is used as-is; confirm whether any display
# transformation stored in the DICOM header needs to be applied first.
data = pydicom.dcmread(path_to_img)
img = data.pixel_array
n_rows, n_cols = img.shape[0], img.shape[1]
print('The image has {} x {} voxels'.format(n_rows, n_cols))

In [None]:
# Show the raw pixel values together with the array dimensions.
img_shape = img.shape
print(img, img_shape)

In [None]:
# Render the raw pixel array in grayscale.
# matplotlib.pyplot is already imported as `plt` in the notebook's first cell,
# so it is not re-imported here.
fig, ax = plt.subplots()
ax.imshow(img, cmap='gray')
# A title lets the figure stand on its own when the notebook is skimmed.
ax.set_title('Raw DICOM pixel data')
plt.show()

In [None]:
# Histogram equalization can help with increasing the contrast.
from skimage import exposure

equ_img = exposure.equalize_hist(img)
fig, ax = plt.subplots()
ax.imshow(equ_img, cmap='gray')
plt.show()

In [None]:
# Trailing expression: show the values of the equalized array as the cell output.
equ_img

### Let's see the boxes...

In [None]:
# Draw every annotated box of the first training row onto a copy of the image.
# A label is a flat, whitespace-separated sequence of 6-token records:
#   <class> <confidence> <xmin> <ymin> <xmax> <ymax>
BOX_FIELDS = 6
label_tokens = train_image.label[0].split()
# Kept so that the following cell, which refers to box1 / box2, still works.
box1 = label_tokens[:BOX_FIELDS]
box2 = label_tokens[BOX_FIELDS:]

# cv2.rectangle draws into the array it is given, so work on a copy: `img`
# keeps the untouched pixel data and the cell can be re-run safely.
img_boxes = img.copy()
for start in range(0, len(label_tokens), BOX_FIELDS):
    # Raises ValueError on a truncated record instead of drawing a wrong box.
    name, _confidence, xmin, ymin, xmax, ymax = label_tokens[start:start + BOX_FIELDS]
    if name == 'none':
        # 'none 1 0 0 1 1' is the placeholder for an image without boxes.
        continue
    top_left = (int(float(xmin)), int(float(ymin)))
    bottom_right = (int(float(xmax)), int(float(ymax)))
    img_boxes = cv2.rectangle(img_boxes, top_left, bottom_right,
                              color=(0, 0, 0), thickness=15)

plt.imshow(img_boxes, cmap='gray')
plt.show()

In [None]:
# now on image with equalize_hist
# Tokens 2..5 of each box are xmin, ymin, xmax, ymax; they are truncated to
# integer pixel coordinates before drawing.
for box in (box1, box2):
    corner_min = (int(float(box[2])), int(float(box[3])))
    corner_max = (int(float(box[4])), int(float(box[5])))
    equ_img = cv2.rectangle(equ_img, corner_min, corner_max,
                            color=(0, 0, 0), thickness=15)
plt.imshow(equ_img, cmap='gray')
plt.show()

In [None]:
# Collect the height (Rows), width (Columns) and height/width ratio of the
# training images, one entry per sub-directory of every study.
#   shape1 -> heights, shape2 -> widths, ratios -> height / width
shape1, shape2, ratios = [], [], []
# .unique() visits each study once in a reproducible order.
for study_id in train_image.StudyInstanceUID.unique():
    study_dir = os.path.join(TRAIN_DIR, study_id)
    for series_name in os.listdir(study_dir):
        series_dir = os.path.join(study_dir, series_name)
        # Only the first file of each sub-directory is inspected.
        first_file = next(os.walk(series_dir))[2][0]
        # Rows / Columns live in the header, so skip reading the pixel data;
        # this avoids loading every full image just to learn its size.
        # Local names are used so the `data` / `path_to_img` variables of the
        # earlier cells are not overwritten.
        header = pydicom.dcmread(os.path.join(series_dir, first_file),
                                 stop_before_pixels=True)
        height = header.Rows
        width = header.Columns
        shape1.append(height)
        shape2.append(width)
        ratios.append(height / width)

In [None]:
# One 20-bin histogram per collected quantity, each shown as its own figure.
histograms = (
    (shape1, 'Height of the images'),
    (shape2, 'Width of the images'),
    (ratios, 'Height to width ratio of the images'),
)
for values, title in histograms:
    plt.hist(values, bins=20)
    plt.title(title)
    plt.show()

# STUDY DATA

In [None]:
# the studies in train_study_level.csv file aren't multilabeled:
# sum the label columns (every column except the first) per study and report
# the smallest and largest number of labels found on a single study.
labels_sum = train_study.iloc[:, 1:].sum(axis=1)
for caption, reducer in (('Min number of labels:', min),
                         ('Max number of labels:', max)):
    print(caption, reducer(labels_sum))

In [None]:
# Bar chart of the column sums of the label columns (every column except the first).
label_counts = train_study.iloc[:, 1:].sum()
label_counts.plot(kind='bar', title='Frequency of the labels', figsize=(10, 6),
                  grid=True, rot=0, width=2/3)
plt.show()

For most of the studies (5822 out of 6334) there is only 1 image but for some there are more (up to 9 images).  
Here is an explanation from Competition Host:
> Most of the studies only have 1 image.  
In some cases, however, there are studies with more than 1 image. In these cases, patients were imaged more than once on the same date/time (same StudyInstanceUID). In some cases, there is motion artifact, so the tech re-took the image. In other cases, different image processing is applied (the images look almost identical, but there is subtle change in contrast). In other cases, there are coverage, image penetration, or other technique issues, presumably resulting in the technologist needing to retake radiographs.  
(...) We are addressing this issue currently regarding the duplicates and test set. We'll let you know when this process is completed.

[source](https://www.kaggle.com/c/siim-covid19-detection/discussion/240250#1322940)

In [None]:
# Number of image rows per StudyInstanceUID.
images_per_study = train_image.StudyInstanceUID.value_counts()
# Bin edges at half-integers centre each bar on an integer count (1..9);
# the y axis is logarithmic.
images_per_study.plot(
    kind='hist',
    logy=True,
    bins=np.arange(1, 11) - 0.5,
    xticks=range(1, 10),
    title='Number of images per study',
    figsize=(10, 6),
    rwidth=0.9,
)
plt.show()
# Same distribution as a table: how many studies have 1, 2, ... images.
print(images_per_study.value_counts())

## Boxes
All of the images marked as "Negative for Pneumonia" don't have any boxes. But there are also some images from other categories with no boxes.

In [None]:
# An image counts as "without boxes" when its `boxes` entry is missing (NaN).
has_no_boxes = train_image.boxes.isna()
n_nan = has_no_boxes.sum()
n_boxes = len(has_no_boxes) - n_nan
print(f'There are {n_boxes} images with boxes and {n_nan} images without any boxes.')

In [None]:
print('Labels of images with no boxes')
# Studies that contain at least one image whose `boxes` entry is missing.
studies_without_boxes = set(train_image.loc[train_image.boxes.isna(), 'StudyInstanceUID'])
# train_study.id is matched on the part before the first underscore.
study_uid = train_study.id.str.split('_').str[0]
# Column sums of the label columns for the selected studies (cell output).
train_study[study_uid.isin(studies_without_boxes)].iloc[:, 1:].sum()

In [None]:
print('Labels of images with boxes')
# Studies that contain at least one image whose `boxes` entry is present.
studies_with_boxes = set(train_image.loc[~train_image.boxes.isna(), 'StudyInstanceUID'])
# train_study.id is matched on the part before the first underscore.
study_uid = train_study.id.str.split('_').str[0]
# Column sums of the label columns for the selected studies (cell output).
train_study[study_uid.isin(studies_with_boxes)].iloc[:, 1:].sum()