In [1]:
from functools import partial
from collections import defaultdict
import pydicom
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

np.warnings.filterwarnings('ignore')

In [2]:
labels = pd.read_csv('/home/ryan/cs/datasets/rsna/stage_1_train_labels.csv')
details = pd.read_csv('/home/ryan/cs/datasets/rsna/stage_1_detailed_class_info.csv')
# duplicates in details just have the same class so can be safely dropped
details = details.drop_duplicates('patientId').reset_index(drop=True)
labels_w_class = labels.merge(details, how='inner', on='patientId')

In [4]:
details.head()

Unnamed: 0,patientId,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,No Lung Opacity / Not Normal
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,No Lung Opacity / Not Normal
2,00322d4d-1c29-4943-afc9-b6754be640eb,No Lung Opacity / Not Normal
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,Normal
4,00436515-870c-4b36-a041-de91049b9ab4,Lung Opacity


In [3]:
# get lists of all train/test dicom filepaths
train_dcm_fps = glob.glob('/home/ryan/cs/datasets/rsna/stage_1_train_images/*.dcm')
test_dcm_fps = glob.glob('/home/ryan/cs/datasets/rsna/stage_1_test_images/*.dcm')

# read each file into a list (using stop_before_pixels to avoid reading the image for speed and memory savings)
train_dcms = [pydicom.read_file(x, stop_before_pixels=True) for x in train_dcm_fps]
test_dcms = [pydicom.read_file(x, stop_before_pixels=True) for x in test_dcm_fps]

In [4]:
def parse_dcm_metadata(dcm):
    unpacked_data = {}
    group_elem_to_keywords = {}
    # iterating here to force conversion from lazy RawDataElement to DataElement
    for d in dcm:
        pass
    # keys are pydicom.tag.BaseTag, values are pydicom.dataelem.DataElement
    for tag, elem in dcm.items():
        tag_group = tag.group
        tag_elem = tag.elem
        keyword = elem.keyword
        group_elem_to_keywords[(tag_group, tag_elem)] = keyword
        value = elem.value
        unpacked_data[keyword] = value
    return unpacked_data, group_elem_to_keywords

train_meta_dicts, tag_to_keyword_train = zip(*[parse_dcm_metadata(x) for x in train_dcms])
test_meta_dicts, tag_to_keyword_test = zip(*[parse_dcm_metadata(x) for x in test_dcms])

In [5]:
# join all the dicts
unified_tag_to_key_train = {k:v for dict_ in tag_to_keyword_train for k,v in dict_.items()}
unified_tag_to_key_test = {k:v for dict_ in tag_to_keyword_test for k,v in dict_.items()}

# quick check to make sure there are no different keys between test/train
assert len(set(unified_tag_to_key_test.keys()).symmetric_difference(set(unified_tag_to_key_train.keys()))) == 0

tag_to_key = {**unified_tag_to_key_test, **unified_tag_to_key_train}
tag_to_key

{(8, 5): 'SpecificCharacterSet',
 (8, 22): 'SOPClassUID',
 (8, 24): 'SOPInstanceUID',
 (8, 32): 'StudyDate',
 (8, 48): 'StudyTime',
 (8, 80): 'AccessionNumber',
 (8, 96): 'Modality',
 (8, 100): 'ConversionType',
 (8, 144): 'ReferringPhysicianName',
 (8, 4158): 'SeriesDescription',
 (16, 16): 'PatientName',
 (16, 32): 'PatientID',
 (16, 48): 'PatientBirthDate',
 (16, 64): 'PatientSex',
 (16, 4112): 'PatientAge',
 (24, 21): 'BodyPartExamined',
 (24, 20737): 'ViewPosition',
 (32, 13): 'StudyInstanceUID',
 (32, 14): 'SeriesInstanceUID',
 (32, 16): 'StudyID',
 (32, 17): 'SeriesNumber',
 (32, 19): 'InstanceNumber',
 (32, 32): 'PatientOrientation',
 (40, 2): 'SamplesPerPixel',
 (40, 4): 'PhotometricInterpretation',
 (40, 16): 'Rows',
 (40, 17): 'Columns',
 (40, 48): 'PixelSpacing',
 (40, 256): 'BitsAllocated',
 (40, 257): 'BitsStored',
 (40, 258): 'HighBit',
 (40, 259): 'PixelRepresentation',
 (40, 8464): 'LossyImageCompression',
 (40, 8468): 'LossyImageCompressionMethod'}

In [6]:
# using from_records here since some values in the dicts will be iterables and some are constants
train_df = pd.DataFrame.from_records(data=train_meta_dicts)
test_df = pd.DataFrame.from_records(data=test_meta_dicts)
train_df['dataset'] = 'train'
test_df['dataset'] = 'test'
df = pd.concat([train_df, test_df])

In [7]:
# separating PixelSpacing list to single values
df['PixelSpacing_x'] = df['PixelSpacing'].apply(lambda x: x[0])
df['PixelSpacing_y'] = df['PixelSpacing'].apply(lambda x: x[1])
df = df.drop(['PixelSpacing'], axis='columns')

# x and y are always the same
assert sum(df['PixelSpacing_x'] != df['PixelSpacing_y']) == 0

In [8]:
# ReferringPhysicianName appears to just be empty strings
assert sum(df['ReferringPhysicianName'] != '') == 0

# SeriesDescription appears to be 'view: {}'.format(ViewPosition)
set(df['SeriesDescription'].unique())

# so these two columns don't have any useful info and can be safely dropped

{'view: AP', 'view: PA'}

In [9]:
nunique_all = df.aggregate('nunique')
nunique_all

AccessionNumber                    1
BitsAllocated                      1
BitsStored                         1
BodyPartExamined                   1
Columns                            1
ConversionType                     1
HighBit                            1
InstanceNumber                     1
LossyImageCompression              1
LossyImageCompressionMethod        1
Modality                           1
PatientAge                        97
PatientBirthDate                   1
PatientID                      26684
PatientName                    26684
PatientOrientation                 1
PatientSex                         2
PhotometricInterpretation          1
PixelRepresentation                1
ReferringPhysicianName          8714
Rows                               1
SOPClassUID                        1
SOPInstanceUID                 26684
SamplesPerPixel                    1
SeriesDescription                  2
SeriesInstanceUID              26684
SeriesNumber                       1
S

In [10]:
# drop constant cols and other two from above
df = df.drop(nunique_all[nunique_all == 1].index.tolist() + ['ReferringPhysicianName', 'SeriesDescription'], axis='columns')

# now that we have a clean metadata dataframe we can merge back to our initial tabular data with target and class info
df = df.merge(labels_w_class, how='left', left_on='PatientID', right_on='patientId')

df['PatientAge'] = df['PatientAge'].astype(int)

In [11]:
# df now has multiple rows for some patients (those with multiple bounding boxes in label_w_class)
# so creating one with no duplicates for patients
df_deduped = df.drop_duplicates('PatientID', keep='first')

In [12]:
df.head()

Unnamed: 0,PatientAge,PatientID,PatientName,PatientSex,SOPInstanceUID,SeriesInstanceUID,StudyInstanceUID,ViewPosition,dataset,PixelSpacing_x,PixelSpacing_y,patientId,x,y,width,height,Target,class
0,38,5eaceedc-ba7d-49aa-91f4-2029dfddb831,5eaceedc-ba7d-49aa-91f4-2029dfddb831,F,1.2.276.0.7230010.3.1.4.8323329.17926.15178744...,1.2.276.0.7230010.3.1.3.8323329.17926.15178744...,1.2.276.0.7230010.3.1.2.8323329.17926.15178744...,AP,train,0.168,0.168,5eaceedc-ba7d-49aa-91f4-2029dfddb831,,,,,0.0,Normal
1,23,daeccecd-b911-4da9-b484-c03057e7e883,daeccecd-b911-4da9-b484-c03057e7e883,M,1.2.276.0.7230010.3.1.4.8323329.27941.15178744...,1.2.276.0.7230010.3.1.3.8323329.27941.15178744...,1.2.276.0.7230010.3.1.2.8323329.27941.15178744...,PA,train,0.143,0.143,daeccecd-b911-4da9-b484-c03057e7e883,,,,,0.0,No Lung Opacity / Not Normal
2,40,c57a1cc3-5122-4c6d-a022-743f96e9332f,c57a1cc3-5122-4c6d-a022-743f96e9332f,M,1.2.276.0.7230010.3.1.4.8323329.5685.151787431...,1.2.276.0.7230010.3.1.3.8323329.5685.151787431...,1.2.276.0.7230010.3.1.2.8323329.5685.151787431...,AP,train,0.139,0.139,c57a1cc3-5122-4c6d-a022-743f96e9332f,,,,,0.0,No Lung Opacity / Not Normal
3,79,eb14c72c-7645-4e79-bfc9-6eba7e2d6dbe,eb14c72c-7645-4e79-bfc9-6eba7e2d6dbe,M,1.2.276.0.7230010.3.1.4.8323329.2254.151787429...,1.2.276.0.7230010.3.1.3.8323329.2254.151787429...,1.2.276.0.7230010.3.1.2.8323329.2254.151787429...,PA,train,0.171,0.171,eb14c72c-7645-4e79-bfc9-6eba7e2d6dbe,,,,,0.0,No Lung Opacity / Not Normal
4,61,163b3240-2726-45f0-9054-9119bab7fd88,163b3240-2726-45f0-9054-9119bab7fd88,F,1.2.276.0.7230010.3.1.4.8323329.13360.15178743...,1.2.276.0.7230010.3.1.3.8323329.13360.15178743...,1.2.276.0.7230010.3.1.2.8323329.13360.15178743...,PA,train,0.171,0.171,163b3240-2726-45f0-9054-9119bab7fd88,,,,,0.0,Normal


In [13]:
df_features = df[['PatientAge', 'PatientSex', 'ViewPosition', 'dataset', 'PixelSpacing_x', 'x', 'y', 'width', 'height', 'Target', 'class']]

In [33]:
df_features.head()

Unnamed: 0,PatientAge,PatientSex,ViewPosition,dataset,PixelSpacing_x,x,y,width,height,Target,class
0,38,F,AP,train,0.168,,,,,0.0,Normal
1,23,M,PA,train,0.143,,,,,0.0,No Lung Opacity / Not Normal
2,40,M,AP,train,0.139,,,,,0.0,No Lung Opacity / Not Normal
3,79,M,PA,train,0.171,,,,,0.0,No Lung Opacity / Not Normal
4,61,F,PA,train,0.171,,,,,0.0,Normal


In [37]:
X = df_features[df_features.dataset=='train'][['PatientAge', 'PatientSex', 'ViewPosition', 'PixelSpacing_x']]
y = df_features[df_features.dataset=='train'][['Target']]

In [38]:
X.head()

Unnamed: 0,PatientAge,PatientSex,ViewPosition,PixelSpacing_x
0,38,F,AP,0.168
1,23,M,PA,0.143
2,40,M,AP,0.139
3,79,M,PA,0.171
4,61,F,PA,0.171


In [39]:
X.columns

Index(['PatientAge', 'PatientSex', 'ViewPosition', 'PixelSpacing_x'], dtype='object')

In [40]:
X_one_hot = pd.get_dummies(X)

In [41]:
X_one_hot.head()

Unnamed: 0,PatientAge,PixelSpacing_x,PatientSex_F,PatientSex_M,ViewPosition_AP,ViewPosition_PA
0,38,0.168,1,0,1,0
1,23,0.143,0,1,0,1
2,40,0.139,0,1,1,0
3,79,0.171,0,1,0,1
4,61,0.171,1,0,0,1


In [42]:
from xgboost import XGBRegressor

In [43]:
split = int(0.9 * len(X))
X_trn = X_one_hot[:split]
X_val = X_one_hot[split:]
y_trn = y[:split]
y_val = y[split:]

In [81]:
model = XGBRegressor(n_estimators=100, min_child_weight=1, objective='reg:logistic')

In [82]:
model.fit(X_trn, y_trn)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [83]:
model.predict(X_val)

array([0.51393807, 0.51393807, 0.5303191 , ..., 0.13734649, 0.49654263,
       0.11519517], dtype=float32)

In [84]:
print(y_val.shape)

(2899, 1)


In [85]:
print(type(model.predict(X_val)))

<class 'numpy.ndarray'>


In [86]:
print(np.mean(np.abs(y_val.values[:, 0] - model.predict(X_val))))

0.3482319433711117


In [50]:
model.predict(X_val)

array([0.5203189 , 0.5203189 , 0.520746  , ..., 0.1353971 , 0.49521974,
       0.12526909], dtype=float32)