**Analysis of the image metadata for the [SIIM COVID-19 Detection](https://www.kaggle.com/c/siim-covid19-detection/overview) competition**.

**Conclusions**:
- Some fields are obfuscated and cannot be used as-is.
- Each field in the table below is tagged with one or more of the following potentials:
    - *ignored*: can be ignored, no potential for other tasks.
    - *technical*: can be used to improve image reading/parsing.
    - *detection*: can be integrated for the object detection task.
    - *classification*: can be integrated for the study classification task.

In [None]:
# Every key of the conclusions table, in the order it is reported.
# The empty string is kept as a key (presumably for elements whose keyword
# is empty -- verify against the 'keyword' column of the metadata table).
_FIELD_KEYWORDS = (
    '',
    'AccessionNumber',
    'BitsAllocated',
    'BitsStored',
    'BodyPartExamined',
    'CodeMeaning',
    'CodeValue',
    'CodingSchemeDesignator',
    'CodingSchemeVersion',
    'Columns',
    'DeidentificationMethod',
    'DeidentificationMethodCodeSequence',
    'FileMetaInformationGroupLength',
    'FileMetaInformationVersion',
    'HighBit',
    'ImageType',
    'ImagerPixelSpacing',
    'ImplementationClassUID',
    'ImplementationVersionName',
    'InstanceNumber',
    'MediaStorageSOPClassUID',
    'MediaStorageSOPInstanceUID',
    'Modality',
    'PatientID',
    'PatientName',
    'PatientSex',
    'PhotometricInterpretation',
    'PixelRepresentation',
    'Rows',
    'SOPClassUID',
    'SOPInstanceUID',
    'SamplesPerPixel',
    'SeriesInstanceUID',
    'SeriesNumber',
    'SpecificCharacterSet',
    'StudyDate',
    'StudyID',
    'StudyInstanceUID',
    'StudyTime',
    'TransferSyntaxUID',
)

# The few keys that are tagged with something other than 'ignored'.
_FIELD_POTENTIALS = {
    'PatientSex': ('classification', 'detection'),
    'PhotometricInterpretation': ('technical',),
}

# Map each key to its own set of potentials; anything without an explicit
# entry above is tagged 'ignored'.
fields = {
    keyword: set(_FIELD_POTENTIALS.get(keyword, ('ignored',)))
    for keyword in _FIELD_KEYWORDS
}

In [None]:
# Sanity check of the conclusions table: every field may only carry
# potentials taken from this closed vocabulary.
ALLOWED = {'ignored', 'technical', 'detection', 'classification'}
for name, tags in fields.items():
    # Any tag left after removing the allowed ones is a typo; the failing
    # field name is reported as the assertion message.
    assert not (tags - ALLOWED), name

# CONFS

In [None]:
# Root directory of the competition data; it is searched recursively for
# *.dcm files when the metadata table is built.
ROOT = '/kaggle/input/siim-covid19-detection'

# IMPORTS

In [None]:
import pathlib
import pydicom
import itertools
import pandas as pd
import plotly.express as px

# DATASETS

In [None]:
# Build a long-format table with one row per DICOM data element per file.
meta = []
for file in pathlib.Path(ROOT).glob('**/*.dcm'):
    print('.', end='')  # one dot per file as a crude progress indicator
    # Only the header is needed here, so stop before the pixel data.
    dcm = pydicom.dcmread(file, stop_before_pixels=True)
    # Walk the main dataset first, then the file meta header. Per the
    # original note, iterall() converts RawDataElement into DataElement.
    meta.extend(
        {
            'VM': elem.VM,
            'VR': elem.VR,
            'tag': elem.tag,
            'name': elem.name,
            'keyword': elem.keyword,
            # SQ values are redundant with the other elements, so drop them
            'data': None if elem.VR == 'SQ' else elem.value,
            'value': elem.repval,
            'filename': file.name,
        }
        for elem in itertools.chain(dcm.iterall(), dcm.file_meta.iterall())
    )
df = pd.DataFrame(meta)
print('DONE:', len(df))
df.head()

In [None]:
# Wide-format view: one row per file, one column per keyword, cells holding
# the 'value' column. Only the first occurrence of a keyword within a file is
# kept, because pivot() needs unique (filename, keyword) pairs.
wide = (
    df
    .drop_duplicates(subset=['filename', 'keyword'])
    .pivot(index='filename', columns='keyword', values='value')
)
wide.head()

# ANALYSIS

In [None]:
# Summary of the long-format table: columns, dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct file names present in the long-format table.
('#Filename', df['filename'].nunique())

In [None]:
# Distinct tags, names and keywords, shown side by side for comparison.
(
    '#Tag', df['tag'].nunique(),
    '#Name', df['name'].nunique(),
    '#Keyword', df['keyword'].nunique(),
)

## VM = Value Multiplicity

In [None]:
# Number of data elements (rows of df) observed for each VM value.
vm_counts = df['VM'].value_counts()
vm_counts

In [None]:
# Bar chart of the VM frequencies held in vm_counts.
px.bar(vm_counts)

## VR = Value Representation

https://pydicom.github.io/pydicom/stable/guides/element_value_types.html

In [None]:
# Number of data elements (rows of df) observed for each VR code.
vr_counts = df['VR'].value_counts()
vr_counts

In [None]:
# Bar chart of the VR frequencies held in vr_counts.
px.bar(vr_counts)

## Keywords

In [None]:
# Number of data elements (rows of df) observed for each keyword.
keyword_counts = df['keyword'].value_counts()
keyword_counts

In [None]:
# Bar chart of the keyword frequencies held in keyword_counts.
px.bar(keyword_counts)

## Associations

In [None]:
# For every keyword column of `wide`, plot how often each value occurs
# across files. dropna=False keeps a bar for the missing entries as well.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0;
# items() yields the same (column name, Series) pairs on older versions too.
for name, values in wide.items():
    fig = px.bar(values.value_counts(dropna=False), title=name)
    fig.show()