# 0 IMPORT

In [None]:
import numpy as np
import pandas as pd

from fastai.basics import *
from fastai.callback.all import *
from fastai.vision.all import *
from fastai.medical.imaging import *
import pydicom

# 1 LOADING THE DATASET

In [None]:
# Input CSV locations.
train_image_path = "../input/siim-covid19-detection/train_image_level.csv"
sample_sub_path = "../input/siim-covid19-detection/sample_submission.csv"
train_study_path = "../input/siim-covid19-detection/train_study_level.csv"

# Each table is announced with a banner, loaded, and previewed (first 5 rows).

# Study-level table.
print("__study level csv__")
train_study_level = pd.read_csv(train_study_path)
print(train_study_level.head(5))

# Sample submission table.
print("__sample submission__")
sample_sub = pd.read_csv(sample_sub_path)
print(sample_sub.head(5))

# Image-level table.
print("__train level csv__")
train_image_level = pd.read_csv(train_image_path)
print(train_image_level.head(5))

# 2 WORKING ON DATAFRAMES

In [None]:
# The study-level table names its key column 'id'; call it 'StudyInstanceUID'
# so that it can be merged with the image-level table later on.
train_study_level = train_study_level.rename(columns={'id': 'StudyInstanceUID'})
train_study_level.head(1)

In [None]:
# Remove the trailing '_study' marker from 'StudyInstanceUID'.
#
# An end-anchored regex is used instead of str.strip('_study'): strip() treats its
# argument as a *set of characters* and trims any of '_', 's', 't', 'u', 'd', 'y'
# from BOTH ends of the string. An id that itself starts or ends with one of those
# characters (e.g. 'd') would be silently shortened and would then fail to match
# in the merge that follows.
train_study_level['StudyInstanceUID'] = train_study_level['StudyInstanceUID'].str.replace(
    r'_study$', '', regex=True
)
train_study_level[:5]

In [None]:
# Combine the image-level and study-level tables. No key is given, so pandas
# joins on the column name(s) the two frames share (inner join by default).
train_df = pd.merge(train_image_level, train_study_level)
train_df.head(1)

We then clean up the dataframe: we drop a few columns and collapse the four class-indicator columns into a single class-label column, `class_y`. We also drop `boxes`, since the bounding-box coordinates can be obtained from `label`.

In [None]:
# Remove the trailing '_image' marker from 'id'.
# str.strip('_image') must not be used here: it trims any of the characters
# '_', 'i', 'm', 'a', 'g', 'e' from BOTH ends, so an id that starts or ends with
# 'a' or 'e' would be corrupted. The end-anchored regex removes only the suffix.
train_df['id'] = train_df['id'].str.replace(r'_image$', '', regex=True)

# Collapse the four indicator columns into one label column, 'class_y'.
# The assignments run in the listed order, so if a row had more than one
# indicator set to 1 the later entry would win.
class_columns = {
    'Negative for Pneumonia': 'Negative',
    'Typical Appearance': 'Typical',
    'Indeterminate Appearance': 'Indeterminate',
    'Atypical Appearance': 'Atypical',
}
for indicator_column, class_name in class_columns.items():
    train_df.loc[train_df[indicator_column] == 1, 'class_y'] = class_name

# Drop the columns that are no longer needed: 'boxes', the four indicator
# columns now summarised by 'class_y', and the study key.
train_df.drop(columns=['boxes', *class_columns, 'StudyInstanceUID'], inplace=True)
train_df[:5]

In [None]:
# How many rows fall into each class label.
train_df['class_y'].value_counts()

In [None]:
# Number of boxes per row: split the 'label' string on single spaces and
# integer-divide the token count by 6.
num_of_boxes = [len(label_text.split(' ')) // 6 for label_text in train_df['label']]

In [None]:
# Attach the per-row box counts as a new column and preview the result.
train_df = train_df.assign(num_of_boxes=num_of_boxes)
train_df.head(5)

In [None]:
# Distribution of box counts across rows.
train_df['num_of_boxes'].value_counts()

Now we extract the coordinates of the bounding boxes from `label`.

In [None]:

# Build, for every row, a flat list of coordinate tokens taken from 'label'.
#
# 'label' is split on single spaces and consumed in groups of 6 tokens per box;
# tokens 2..5 of each group are kept (the first two tokens of every group are
# skipped). The tokens stay strings, exactly as they appear in 'label'.
#
# This handles any number of boxes. A fixed if-ladder for 1..5 boxes would, for
# any other count, silently reuse the previous row's coordinates (or raise
# NameError on the first row). A local name is used for the per-row count so the
# notebook-level list `num_of_boxes` is not overwritten by a scalar.
label_COORD = []
for label_text, box_count in zip(train_df['label'], train_df['num_of_boxes']):
    tokens = label_text.split(' ')
    coord = []
    for box_index in range(box_count):
        first = 6 * box_index + 2  # skip the two leading tokens of this box
        coord.extend(tokens[first:first + 4])
    label_COORD.append(coord)
     

In [None]:
# Store the extracted coordinate lists, then discard the raw 'label' strings.
train_df['label_COORD'] = label_COORD
train_df.drop(columns=['label'], inplace=True)
train_df.head(5)

Rename the column `id` to `SOPInstanceUID` so that the dataframe can be merged with the DICOM metadata.

In [None]:
# Give the image id column the name used as the merge key with the DICOM metadata.
train_df = train_df.rename(columns={'id': 'SOPInstanceUID'})
train_df.head(1)

# 3 LOADING THE META DATA

We then load the DICOM metadata that we obtained in [this notebook](https://www.kaggle.com/slimshadymm/visualizing-dicoms).

In [None]:
# Load the pickled DICOM metadata table.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files from a source you trust.
dicom_pickle_path = '../input/visualizing-dicoms/dicoms_df.pkl'
dicom_df = pd.read_pickle(dicom_pickle_path)
dicom_df.head(5)

In [None]:
# Join the DICOM metadata with the cleaned training table on the image id
# (inner join by default, so only ids present in both frames are kept).
dicom_merge = dicom_df.merge(train_df, on='SOPInstanceUID')
dicom_merge.head(1)

Save the dataframe as a `.csv` file. Before that, we check that the file path `fname` corresponds to the correct `SOPInstanceUID`.

In [None]:
# Spot check: file path stored for the row labelled 100.
dicom_merge.loc[100, 'fname']

In [None]:
# Spot check: image id stored for the same row (label 100).
dicom_merge.loc[100, 'SOPInstanceUID']

In [None]:
# Write the merged table to disk without the index column.
# NOTE(review): 'label_COORD' holds Python lists, which CSV stores as their
# string representation; they need parsing when the file is read back.
dicom_merge.to_csv(path_or_buf='dicom_merge.csv', index=False)