In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pydicom as dicom
import matplotlib
import matplotlib.pylab as plt
from pydicom.pixel_data_handlers.util import apply_voi_lut
import glob
import cv2

In [None]:
class color:
    '''ANSI escape sequences used to style text printed by the notebook.

    Prefix a string with one of the attributes and append ``color.END`` to
    reset the formatting, e.g. ``color.BOLD + 'text' + color.END``.
    '''
    PURPLE = '\033[1;35;48m'
    CYAN = '\033[1;36;48m'
    BLUE = '\033[1;34;48m'
    GREEN = '\033[1;32;48m'
    YELLOW = '\033[1;33;48m'
    RED = '\033[1;31;48m'
    BLACK = '\033[1;30;48m'
    UNDERLINE = '\033[4;37;48m'
    MAGENTA = "\033[35m"
    WHITE = "\033[97m"
    # Single definition of BOLD: an earlier '\033[1;37;48m' assignment was
    # always overwritten by this one, so it has been removed as dead code.
    BOLD = '\033[1m' + '\033[93m'
    END = '\033[0m'
    
    
plt.rcParams['figure.figsize'] = [9, 7]
pd.options.display.max_columns = None

In [None]:
# !conda install -c conda-forge gdcm -y      # just commented out to save commit time

In [None]:
def get_process_csv():
    '''Load the image-level and study-level training CSVs and join them.

    In both tables the ``id`` column is cut at the first underscore so that
    only the leading part remains. The image table is then inner-joined to
    the study table (``StudyInstanceUID`` against the study ``id``), the join
    key is dropped, the two id columns are renamed to ``image_id`` and
    ``study_id``, and the first four columns are re-ordered positionally so
    that the former fourth column comes second.

    Returns:
        tuple: ``(train_image, train_study, train)`` - the two cleaned source
        frames followed by the merged frame.
    '''

    def _read_with_bare_ids(csv_path):
        # Read one CSV and keep only the part of 'id' before the first '_'.
        frame = pd.read_csv(csv_path)
        frame['id'] = frame['id'].str.split('_', expand = True)[0]
        return frame

    train_image = _read_with_bare_ids('../input/siim-covid19-detection/train_image_level.csv')
    train_study = _read_with_bare_ids('../input/siim-covid19-detection/train_study_level.csv')

    merged = train_image.merge(train_study, left_on='StudyInstanceUID', right_on='id')
    merged = merged.drop(columns='StudyInstanceUID')
    merged = merged.rename(columns = {"id_x" : "image_id", "id_y" : "study_id"})

    # Positional re-ordering: columns 0, 3, 1, 2 first, the remainder untouched.
    names = merged.columns.tolist()
    ordered = [names[0], names[3], names[1], names[2]] + names[4:]

    return train_image, train_study, merged[ordered]

In [None]:
train_image, train_study, train = get_process_csv()
train_len = len(train)
train.info()

In [None]:
train.describe(include = 'all').T

In [None]:
train_images_path = '../input/siim-covid19-detection/train'
image_files_path = glob.glob(f'{train_images_path}/**/*.dcm', recursive=True)  # extract images from the directory

In [None]:
def add_image_path_col(train, image_paths=None):
    '''Add the on-disk path of every image to the dataframe.

    Each path is keyed by its file name up to the first dot, and that key is
    matched against the ``image_id`` column. Rows without a matching file get
    NaN.

    Args:
        train: DataFrame with an ``image_id`` column. It is modified in place.
        image_paths: Iterable of image file paths. Defaults to the
            module-level ``image_files_path`` list.

    Returns:
        The same ``train`` object, with ``image_path`` as its third column
        (or with the existing ``image_path`` column refreshed).
    '''
    if image_paths is None:
        image_paths = image_files_path

    # One dictionary lookup per row instead of one full-frame scan per file.
    # ``.at`` must not be used here: it only accepts a single scalar label.
    path_by_id = {os.path.basename(path).split('.')[0]: path for path in image_paths}
    paths = train['image_id'].map(path_by_id)

    if 'image_path' in train.columns:
        # Re-running the cell refreshes the column instead of raising.
        train['image_path'] = paths
    else:
        train.insert(2, 'image_path', paths)

    return train

In [None]:
train = add_image_path_col(train)
train.head()

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    '''Read a DICOM file and return its pixels as an 8-bit numpy array.

    Args:
        path: Path of the ``.dcm`` file.
        voi_lut: If True, pass the raw pixels through ``apply_voi_lut``.
        fix_monochrome: If True and the file's PhotometricInterpretation is
            ``MONOCHROME1``, invert the intensities.

    Returns:
        A ``uint8`` array rescaled to the range 0-255, or the list ``[1]`` as
        a failure sentinel when the pixel data cannot be decoded
        (``RuntimeError``).
    '''
    # dcmread is the reader used elsewhere in this notebook; read_file is its
    # deprecated alias.
    dcm = dicom.dcmread(path)

    try:
        # Decoding happens when pixel_array is accessed, so both the VOI-LUT
        # and the raw branch are protected by the same handler.
        pixels = dcm.pixel_array
        data = apply_voi_lut(pixels, dcm) if voi_lut else pixels
    except RuntimeError:
        print('An error occured while de-compressing the file from dicom to array')
        return [1]

    if fix_monochrome and dcm.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        # A constant image has peak == 0; skipping the division keeps it all
        # zeros instead of producing NaNs that are then cast to uint8.
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data

In [None]:
meta_data_cols = ['SpecificCharacterSet', 'SOPClassUID', 'SOPInstanceUID', 'StudyDate', 'StudyTime', 'AccessionNumber',
            'Modality', 'PatientName', 'PatientID', 'PatientSex', 'BodyPartExamined', 'PhotometricInterpretation']

def add_meta_cols(train, meta_cols, image_paths=None):
    '''Add metadata read from the dcm files to the dataframe.

    For every file whose path occurs in ``train['image_path']`` the requested
    elements are read and stored as strings (``str(dataset.get(name))``, so a
    missing element becomes the string ``'None'``). Rows whose path is not
    among ``image_paths`` get NaN in every metadata column.

    Args:
        train: DataFrame with an ``image_path`` column. It is modified in place.
        meta_cols: Names of the DICOM elements to extract, one column each.
        image_paths: Iterable of dcm file paths. Defaults to the module-level
            ``image_files_path`` list.

    Returns:
        The same ``train`` object with one column per entry of ``meta_cols``.
    '''
    if image_paths is None:
        image_paths = image_files_path

    wanted = set(train['image_path'].dropna())
    meta_by_path = {}
    for image_path in image_paths:
        if image_path not in wanted:
            continue  # no row refers to this file, so there is nothing to fill
        # Only header elements are needed; skipping the pixel data avoids
        # loading every image. NOTE(review): elements stored after the pixel
        # data would not be available with this flag - confirm none are needed.
        image_data = dicom.dcmread(image_path, stop_before_pixels=True)
        meta_by_path[image_path] = {col: str(image_data.get(col)) for col in meta_cols}

    # One vectorised lookup per column; ``.at`` is not usable here because it
    # only accepts a single scalar label.
    for col in meta_cols:
        values = {path: meta[col] for path, meta in meta_by_path.items()}
        train[col] = train['image_path'].map(values)

    return train

In [None]:
# The metadata extraction is slow, so its result was saved once and is re-loaded here.
# train = add_meta_cols(train, meta_data_cols)  # will take around 20 min to run this code :(
# train.to_csv('./train_v1.csv', index=False)
train = pd.read_csv('../input/train-v1/train_v1.csv')
# Use the ``columns`` keyword: passing the axis positionally is rejected by pandas 2.x.
train = train.drop(columns='boxes')  # remove the boxes column from dataset
train.head()

In [None]:
def plot_bbox(path, label, part):
    '''Plot the dcm image with its bounding boxes.

    Args:
        path: Path of the dcm file to display.
        label: Label string of the image. Labels containing ``'none'`` are
            drawn without boxes; otherwise the string is read in groups of
            six tokens ``opacity <conf> <xmin> <ymin> <xmax> <ymax>``.
            NOTE(review): corner semantics follow the competition's data
            description - confirm against it.
        part: Text used as the figure title.

    Returns:
        None. Prints a message and draws nothing if the image could not be
        decoded.
    '''
    thickness = 5
    img = dicom2array(path)

    # dicom2array reports a decoding failure with a non-array sentinel;
    # checking the type avoids mistaking a one-row image for a failure.
    if not isinstance(img, np.ndarray):
        print('Cannot display this image, try with different one.')
        return

    if 'none' not in label:
        tokens = label.split()
        # Walk the label in fixed groups of six so that a coordinate that
        # happens to be '1' is never confused with the confidence token.
        for start in range(0, len(tokens) - 5, 6):
            if tokens[start] != 'opacity':
                continue
            xmin, ymin, xmax, ymax = (int(round(float(value)))
                                      for value in tokens[start + 2:start + 6])
            # Two integer corner points: a 4-sequence would be read by OpenCV
            # as (x, y, width, height) and float entries are rejected.
            cv2.rectangle(img, (xmin, ymin), (xmax, ymax),
                          color = [255, 0, 0], thickness = thickness)

    plt.figure(figsize = (10, 8))
    plt.title(part)
    plt.imshow(img, cmap = 'bone')

In [None]:
# Show one randomly chosen training image with its boxes. The row is picked by
# position from the frame's current length, so this works regardless of the
# frame's index labels and does not rely on a length measured on another frame.
index = np.random.randint(len(train))
sample_row = train.iloc[index]
plot_bbox(sample_row['image_path'], sample_row['label'], sample_row['BodyPartExamined'])

# EDA

## How many images are there in the dataset?

In [None]:
print(color.BOLD + 'Images in the dataset:' + color.END, train['image_id'].nunique())

## How many studies does the dataset contain?

In [None]:
print(color.BOLD + 'study Level in the dataset:' + color.END, train['study_id'].nunique())

## How are images distributed across studies?

In [None]:
# Index: number of images in a study; value: number of studies of that size.
a = train['study_id'].value_counts().value_counts().sort_index()
plot = a.plot(kind = 'bar')
plt.xticks(rotation = 0)
plt.xlabel('Number of images in the study', fontsize = 12)
plt.ylabel('Number of study', fontsize = 12)

for bar in plot.patches:
    plot.annotate(bar.get_height(), 
               (bar.get_x() + bar.get_width() / 2, 
                bar.get_height()), ha='center', va='center',
               size=14, xytext=(0, 8),
               textcoords='offset points')

total_images = len(train)
for index in a.index:
    # a[index] is a number of studies; the images they hold are index * a[index].
    images_in_group = index * a[index]
    print(f'Studies with {index} images: {a[index]}.',
          f'It has {round(images_in_group/total_images * 100, 2)}% images of the dataset')

print(f'\n\nWe can see that about 92% of images in the dataset belong to unqiue studies. Most of the studies consist of just a single image.')
print(f'{color.BOLD}A study at maximum has 9 images.\nA study at minimum has 1 image{color.END}.\n')

## How many bounding boxes do the images contain, and how are they distributed?

In [None]:
# Index: number of 'opacity' boxes in an image; value: number of such images.
a = train['label'].str.count('opacity').value_counts().sort_index()

plot = a.plot(kind = 'bar')
plt.title('How many images consist of how many boxes')
plt.xlabel('Number of boxes in the image', fontsize = 12)
plt.ylabel('Number of images', fontsize = 12)
plt.xticks(rotation = 0)

for bar in plot.patches:
    plot.annotate(bar.get_height(), 
               (bar.get_x() + bar.get_width() / 2, 
                bar.get_height()), ha='center', va='center',
               size=14, xytext=(0, 8),
               textcoords='offset points')

# Percentages are taken over the frame that is actually being analysed.
total_images = len(train)
for index in a.index:
    print(f'{round(a[index]/total_images * 100, 2)}% of images consist of {index} bounding boxes\n')
print('\n')

### Oops! I expected an image to contain at most 3 boxes, but there is an image that contains 8 boxes.

### Since only 2 examples have more than 4 boxes, we'll drop these 2 examples from the dataset.

In [None]:
# Remove the images that carry more than 4 'opacity' boxes.
ind_to_drop = train[train['label'].str.count('opacity') > 4].index
train = train.drop(ind_to_drop)
# Keep the row count used for percentages in later cells in sync with the frame.
train_len = len(train)
print(train.shape)

In [None]:
# code to check whether the label column consist of two types of labels: 
# 1. label with 'none'
# 2. label with 'opacity'

# train[(train['label'].str.count('opacity') < 1) & (train['label'].str.count('none') != 1 )]

## How is abnormality distributed in the dataset?

In [None]:
study_label = ['Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']
# Number of rows flagged with each study-level label.
sum_of_abnorm = {col: train[col].sum(axis = 0) for col in study_label}


plot = pd.Series(sum_of_abnorm).plot(kind = 'barh')

# Percentages are relative to the current number of rows, not to a length
# measured before rows were dropped.
total_images = len(train)
for bar in plot.patches:
    plot.annotate(str(bar.get_width()) +' ('+ str(round(bar.get_width()/total_images*100, 2))+')' , 
               (bar.get_width() + 120, 
                bar.get_y() + bar.get_height()/2), ha='center', va='center',
                   size=14, xytext=(30, 0),
                   textcoords='offset points')

### 1. The above graph shows that around 47% of images consist of Typical Appearance. 
### 2. Around 27% of images do not show any abnormality.
### 3. The dataset is imbalanced!

## Does each image consist of only one label/abnormality?

In [None]:
# Count, for every row, how many of the study-level label columns are set.
# The label columns are cast to object dtype; the later cell that calls
# subdf.describe() reads the 'freq' statistic from this frame.
subdf = train.loc[:, study_label].astype('object')
# Placeholder column; being 0 it does not change the row-wise sum computed next.
subdf['sum'] = 0
subdf['sum'] = subdf.sum(axis = 1).astype('object')
# Rows whose label count differs from 1.
# NOTE(review): the condition also matches rows with zero labels, while the
# printed message speaks of "more than 1 label" - confirm which is intended.
a = len(subdf[subdf['sum'] != 1.0])
print(f'{a} number of times there are more than 1 label for an image\n')
subdf['sum'].plot()
plt.yticks([0, 1, 2])
plt.ylabel('No. of labels for an image', fontsize = 12)
plt.xlabel('Image index', fontsize = 12);

### Each image is labelled with exactly one class. Hence it is a multi-class classification problem at the image level.

In [None]:
a = subdf.describe().T
a['percent'] = a['freq']/a['count'] * 100
print(a)

## Does the absence of boxes in an image mean there is no abnormality?

In [None]:
# Cross-tabulate "image has a bounding box" against the four study-level labels.
label_cols = ['Negative for Pneumonia', 'Typical Appearance',
              'Indeterminate Appearance', 'Atypical Appearance']

# A label containing 'none' means the image has no bounding box.
no_box = train['label'].str.contains('none', regex = False)
bbox_state = no_box.map({True: 'Absent', False: 'Present'})

# Vectorised replacement for a row-by-row accumulation; reindex guarantees
# both rows exist (with zeros) even if one of the two groups is empty.
a = (train.groupby(bbox_state)[label_cols].sum()
          .reindex(['Absent', 'Present'], fill_value = 0)
          .rename_axis('BBox'))

plot = a.plot(kind = 'bar')
plt.xticks(rotation = 0)
plt.ylabel('No. of images')

for bar in plot.patches:
    plot.annotate(bar.get_height(), 
               (bar.get_x() + bar.get_width() / 2, 
                bar.get_height()), ha='center', va='center',
                   size=14, xytext=(0, 8),
                   textcoords='offset points')

### From the above plot:
### 1. The absence of bounding boxes does not necessarily mean that there is no abnormality: the plot shows that images without bounding boxes still carry abnormality labels.
### 2. If there is at least one bounding box in the image, then we can say that it contains some abnormality.

## Does each study consist of only one label?

In [None]:
# Keep only the study id and the four study-level label columns.
labels = ['study_id'] + study_label 
subdf = train[labels]
# info() prints its report itself and returns None, so it is not wrapped in print().
subdf.info()

In [None]:
# For every study, the number of its images that carry each label.
per_study = subdf.groupby('study_id')[study_label].sum()

# Build the table in one step so the columns are aligned on the union of all
# observed counts. Filling an initially empty frame column by column would
# align every later column to the first column's index and drop other counts.
df = pd.DataFrame({col: per_study[col].value_counts() for col in study_label})
df = df.fillna(0).astype(int).sort_index()

# Axes are shared when the figure is created so all four panels use one scale.
fig, axes = plt.subplots(2, 2, figsize = (20, 15), sharex = True, sharey = True)

for col, ax in zip(study_label, axes.ravel()):
    plot = df[col].plot(kind = 'bar', ax = ax, rot =0, title = col)
    ax.set_xlabel(f'Number of times the study is positive for abnormality')
    ax.set_ylabel('Number of studies')

    for bar in plot.patches:
        plot.annotate(bar.get_height(), 
                   (bar.get_x() + bar.get_width() / 2, 
                    bar.get_height()), ha='center', va='center',
                       size=14, xytext=(0, 8),
                       textcoords='offset points')

### Most of the studies are highly imbalanced for abnormalities.

## What is Specific Character Set in dcm meta data? Is it useful for modelling?

### It turns out to be a kind of encoding used for the dcm data, so it is of no use for modelling. We'll drop the column from the df.

In [None]:
# train['StudyDate'].value_counts()

In [None]:
# train.groupby('StudyDate')[['image_id', 'study_id']].count().sort_values(by = 'image_id', ascending = False).T.plot(kind = 'bar', legend = None, rot = 0);

In [None]:
# l = ['study_id'] + study_label
# def func(df):
#     for col in df.columns:
#         if col == 'study_id':
#             df['study_id'] = df['study_id'].count()
#         else:
#             df[col] = df[col].sum()
#     return df
    
# train.groupby('StudyDate')[l].apply(lambda df: func(df)).sort_values(by = 'Negative for Pneumonia', ascending = False).tail(10)

In [None]:
# print(train['AccessionNumber'].nunique())
# print('Equal to number of studies')

In [None]:
# print(train['Modality'].value_counts(normalize = True))

# plot = train.groupby('Modality')[study_label].sum().plot(kind = 'bar', rot = 0)

# for bar in plot.patches:
#     plot.annotate(bar.get_height(), 
#                (bar.get_x() + bar.get_width() / 2, 
#                 bar.get_height()), ha='center', va='center',
#                    size=14, xytext=(0, 8),
#                    textcoords='offset points')

In [None]:
print(train['PatientName'].nunique())

In [None]:
# Distinct studies per patient. nunique() is required here: count() would
# count image rows, and a study can contain several images.
plot = train.groupby('PatientName')['study_id'].nunique().value_counts().sort_index().plot(kind = 'bar', rot = 0)

plt.xlabel('Number of studies a patient is part of')
plt.ylabel('Number of patients')
for bar in plot.patches:
    plot.annotate(bar.get_height(), 
               (bar.get_x() + bar.get_width() / 2, 
                bar.get_height()), ha='center', va='center',
                   size=14, xytext=(0, 8),
                   textcoords='offset points')

### There are patients who are part of 26 studies.

In [None]:
print(train['PatientID'].nunique())

In [None]:
print(train['PatientSex'].value_counts(normalize = True))

In [None]:
# Per sex, the number of image rows flagged with each study-level label.
plot = train.groupby('PatientSex')[study_label].sum().plot(kind = 'bar', rot = 0)
# Every row of the frame is one image, so the bars count images, not patients.
plt.ylabel('Number of images')

for bar in plot.patches:
    plot.annotate(bar.get_height(), 
               (bar.get_x() + bar.get_width() / 2, 
                bar.get_height()), ha='center', va='center',
                   size=14, xytext=(0, 8),
                   textcoords='offset points')

### Gender seems to be useful, since men tend to have a Typical Appearance of Covid-19 more often than women in the dataset.

In [None]:
def func(df):
    '''Summarise one group of rows as a single Series.

    The ``image_id`` column is reduced to its non-null count and reported
    under the key ``'No. of imgs'``; every other column is reduced to its sum
    and keeps its own name. Entries appear in the order of ``df.columns``.
    '''
    summary = {
        ('No. of imgs' if name == 'image_id' else name):
            (df[name].count() if name == 'image_id' else df[name].sum())
        for name in df.columns
    }
    return pd.Series(summary)

train.groupby('BodyPartExamined', dropna = False)[['image_id'] + study_label].apply(func)

### Since the rows with NaN in 'BodyPartExamined' match the TORAX rows, we'll fill the NaN values with TORAX.

In [None]:
# Assign the result back: an in-place fillna on a column selection may act on
# a temporary object and leave the frame unchanged.
train['BodyPartExamined'] = train['BodyPartExamined'].fillna('TORAX')

In [None]:
# Display up to five distinct images whose BodyPartExamined is 'SKULL'.
index_ = train[train['BodyPartExamined'] == 'SKULL'].index
# Sample without replacement so the same image is not shown twice, and never
# ask for more rows than are available.
index_ = np.random.choice(index_, size = min(5, len(index_)), replace = False)

for ind in index_:
    plot_bbox(train.loc[ind, 'image_path'], train.loc[ind, 'label'], train.loc[ind, 'BodyPartExamined'])

In [None]:
print(train['PhotometricInterpretation'].value_counts())   #normalize = True

In [None]:
# Per PhotometricInterpretation value: number of images and the sum of every
# study-level label. Named aggregation gives the same table as applying the
# summary helper defined in an earlier cell, without defining it a second time.
a = train.groupby('PhotometricInterpretation').agg(
    **{'No. of imgs': ('image_id', 'count')},
    **{col: (col, 'sum') for col in study_label},
)
a

In [None]:
plot = a.plot(kind = 'bar', rot = 0)
plt.ylabel('No. of images')

for bar in plot.patches:
    plot.annotate(bar.get_height(), 
               (bar.get_x() + bar.get_width() / 2, 
                bar.get_height()), ha='center', va='center',
                   size=14, xytext=(0, 8),
                   textcoords='offset points')

## We'll drop the columns which seem to be of no use for modelling.

In [None]:
# Drop the metadata columns that will not be used for modelling and save the result.
# The ``columns`` keyword is used because pandas 2.x rejects a positional axis.
train = train.drop(columns = ['SpecificCharacterSet', 'SOPClassUID', 'SOPInstanceUID', 'AccessionNumber', 'StudyDate', 'StudyTime',
                   'Modality', 'PatientID', 'PhotometricInterpretation'])
train.to_csv('./TRAIN.csv', index = False)
print('File saved.')

In [None]:
# info() prints its report itself and returns None, so it is not wrapped in print().
train.info()

### Thank you for going through it. If you have any improvements or corrections, please let me know.

### All the Best for the competition and if you found this notebook useful, upvotes will be appreciated.