# Problem

DICOM files contain valuable metadata to classify examples and evaluate your model.

This notebook shows how to extract this metadata from every DICOM file and collect it into a single pandas DataFrame.

Feel free to share your approach to this problem if you find your solution easier or more efficient :)

# Solution

In [None]:
import pathlib
import itertools

In [None]:
import pydicom
import pandas as pd

In [None]:
# Dataset root directory; it is searched recursively for .dcm files further down.
root = pathlib.Path('/kaggle/input/siim-covid19-detection')

In [None]:
# Build one record per DICOM data element for every .dcm file under `root`,
# then assemble all records into the `metadata` DataFrame in a single step.
records = []
# Sort the paths: glob order depends on the filesystem, and both the `dcm`
# left over after the loop and `metadata.tail(50)` depend on iteration order.
for file in sorted(root.glob('**/*.dcm')):
    print('.', end='')
    # read the header only; pixel data is not needed for metadata
    dcm = pydicom.dcmread(file, stop_before_pixels=True)
    # iterall() recurses through sequences and yields DataElement objects,
    # converting any lazily stored RawDataElement along the way
    elements = itertools.chain(dcm.iterall(), dcm.file_meta.iterall())
    for elem in elements:
        meta = {
            'VM': elem.VM,
            'VR': elem.VR,
            'tag': elem.tag,
            'name': elem.name,
            'keyword': elem.keyword,
            # SQ values are redundant: iterall() already yields the nested elements
            'data': elem.value if elem.VR != 'SQ' else None,
            'value': elem.repval,
            'filename': file.name,
        }
        records.append(meta)
# Keep the list and the DataFrame under different names so `metadata`
# always refers to the DataFrame.
metadata = pd.DataFrame(records)
print('\nDONE:', len(metadata))
metadata.tail(50)

In [None]:
# Show the dataset of the last file read by the loop above, so its elements
# can be compared with the corresponding rows at the end of `metadata`.
dcm

In [None]:
# Summarize the metadata DataFrame: column names, dtypes and non-null counts.
metadata.info()

# Good luck with the competition :)

**If you found this notebook useful, please consider leaving an upvote!** 