# Prepare DICOM Images for ML

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pydicom
import glob
import datetime

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Collect the path of every .dicom file in the training directory into a list
mydicoms = list(glob.iglob("/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/*.dicom"))

### Let's look at the contents of one of the DICOMs (the second one in the list):

In [None]:
# Parse one sample file for exploration: index 1 is the second path in mydicoms.
dcm1 = pydicom.dcmread(mydicoms[1])

In [None]:
# Show the parsed dataset as the cell output so its attributes can be inspected.
dcm1

From the attributes listed above, these are the ones I want to extract:
* Patient's Sex
* Patient's Age
* Patient's Weight
* Patient's Size

In [None]:
# Patient's Sex: value of the element stored under tag (0010, 0040) in dcm1.
# Direct indexing is used, so this cell fails if the tag is absent.
dcm1[(0x0010, 0x0040)].value

In [None]:
# Patient's Age: value of the element stored under tag (0010, 1010) in dcm1.
# Direct indexing is used, so this cell fails if the tag is absent.
dcm1[(0x0010, 0x1010)].value

In [None]:
# Rows: value of the element stored under tag (0028, 0010) in dcm1.
dcm1[(0x0028, 0x0010)].value

In [None]:
# Cols: value of the element stored under tag (0028, 0011) in dcm1.
# Indexed directly, like the other tag lookups in this notebook, so a missing
# tag raises KeyError here instead of an AttributeError on a None result.
dcm1[(0x0028, 0x0011)].value

In [None]:
# DICOM tag (group, element) under which the Patient's Sex element is stored.
sex_key = (0x0010, 0x0040)
def get_patients_sex(dcm):
    """Return the value stored under ``sex_key`` in ``dcm``, or None if the tag is absent."""
    if sex_key not in dcm:
        return None
    return dcm[sex_key].value

In [None]:
# DICOM tag (group, element) under which the Patient's Age element is stored.
age_key = (0x0010, 0x1010)
def get_patients_age(dcm):
    """Look up ``age_key`` in ``dcm``; give back its value, or None when the tag is missing."""
    return dcm[age_key].value if age_key in dcm else None

In [None]:
# DICOM tag (group, element) read by get_patients_weight.
weight_key = (0x0010, 0x1030)
def get_patients_weight(dcm):
    """Return the value of the ``weight_key`` element of ``dcm``, or None if it has no such tag."""
    weight_value = None
    if weight_key in dcm:
        weight_value = dcm[weight_key].value
    return weight_value

In [None]:
# DICOM tag (group, element) read by get_patients_size.
size_key = (0x0010, 0x1020)
def get_patients_size(dcm):
    """Fetch the value held under ``size_key`` in ``dcm``; None signals that the tag is not present."""
    if size_key in dcm:
        size_element = dcm[size_key]
        return size_element.value
    return None

## Now, let's create the dataframe that we want, and populate it in a loop with the attributes from all of our DICOMs:

In [None]:
# Build one row of patient attributes (sex, age, weight, size) per row of
# train.csv by reading the header of that row's DICOM file, then save the
# augmented table to all_data.csv in the working directory.
all_data = pd.read_csv('../input/vinbigdata-chest-xray-abnormalities-detection/train.csv')
sampled_data = all_data # .sample(frac=0.25)
n = len(sampled_data.index)
print('n:', n)
train_dir = '../input/vinbigdata-chest-xray-abnormalities-detection/train'
sexes = []
ages = []
weights = []
sizes = []
# image_id -> (sex, age, weight, size). If an image_id occurs on several rows
# (NOTE(review): presumably one row per annotation — confirm against the CSV),
# its DICOM header is parsed only once and the result reused.
attributes_by_image = {}
start_time = datetime.datetime.now()
for i, image_id in enumerate(sampled_data['image_id'], start=1):
    if image_id not in attributes_by_image:
        file_path = os.path.join(train_dir, image_id + '.dicom')
        # Only header attributes are needed, so skip reading the pixel data.
        dcm = pydicom.dcmread(file_path, stop_before_pixels=True)
        attributes_by_image[image_id] = (
            get_patients_sex(dcm),
            get_patients_age(dcm),
            get_patients_weight(dcm),
            get_patients_size(dcm),
        )
    sex, age, weight, size = attributes_by_image[image_id]
    sexes.append(sex)
    ages.append(age)
    weights.append(weight)
    sizes.append(size)
    # Progress report every 100 rows.
    if i % 100 == 0:
        print()
        fraction_done = i / n
        print('fraction_done:', fraction_done)
        elapsed_seconds = (datetime.datetime.now() - start_time).total_seconds()
        elapsed_minutes = int(elapsed_seconds) // 60
        print('elapsed_minutes:', elapsed_minutes)
        # Estimate the rate from elapsed seconds, not whole minutes: flooring
        # to minutes first overstates the rate (by up to 2x early on) and
        # gives no estimate at all during the first minute.
        if elapsed_seconds > 0:
            records_per_second = i / elapsed_seconds
            remaining_minutes = int((n - i) / records_per_second // 60)
            print('remaining_minutes:', remaining_minutes)
# The lists are in row order, so they line up with sampled_data's rows.
sampled_data['sex'] = sexes
sampled_data['age'] = ages
sampled_data['weight'] = weights
sampled_data['size'] = sizes

sampled_data.to_csv("all_data.csv")

In [None]:
# Preview the first few rows, which now include the sex/age/weight/size columns.
sampled_data.head()