This notebook tabulates the shapes of the DICOM images and inspects sample images for each shape.  
It shows that the margins at the edges of the images are not consistent across scans and need to be handled during pre-processing.

update:  
ver4: Tables that take a long time to build are now read from a Kaggle [dataset](https://www.kaggle.com/currypurin/osic-image-eda).  
ver7: added crop preprocessing

In [None]:
import os
import pickle
from pathlib import Path
import gc
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import pydicom
from pydicom.tag import Tag
import gc

pd.options.display.max_rows=200


In [None]:
# Competition data directory and the companion dataset directory.
INPUT = Path("../input/osic-pulmonary-fibrosis-progression/")
dataset_dir = Path("../input/osic-image-eda/")

train = pd.read_csv(INPUT.joinpath("train.csv"))

In [None]:
ls ../input/osic-image-eda/ 

# name and number of dicoms

In [None]:
def get_n_dicom_df(train):
    """Count the files of every patient and list their file numbers.

    Parameters
    ----------
    train : pandas.DataFrame
        Table with a ``Patient`` column; each unique value is looked up as a
        directory under ``INPUT / 'train'``.

    Returns
    -------
    pandas.DataFrame
        One row per unique patient (in order of first appearance) with columns
        ``Patient``, ``n_dicom`` (number of entries in the patient directory)
        and ``n_list`` (sorted list of the integer file numbers, i.e. the part
        of each file name before the first ``.``).

    Raises
    ------
    ValueError
        If a file name does not start with an integer.
    """
    columns = ['Patient', 'n_dicom', 'n_list']
    records = []
    for patient_id in train['Patient'].unique():
        patient_dir = INPUT / 'train' / patient_id
        # Path.name is separator-agnostic, unlike splitting str(path) on '/'.
        file_numbers = sorted(int(path.name.split('.')[0]) for path in patient_dir.glob("*"))
        records.append((patient_id, len(file_numbers), file_numbers))

    # Building the frame once avoids one concat per patient and, unlike
    # pd.concat([]), also works when `train` contains no patients.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Build the per-patient file summary, keep a CSV copy, and preview it.
n_dicom_df = get_n_dicom_df(train)
n_dicom_df.to_csv(path_or_buf='n_dicom_df.csv', index=False)
n_dicom_df.head(n=15)

The number of DICOM files differs from patient to patient. Also, the file numbering does not always start at 1.

In [None]:
# Distribution of the number of files per patient.
fig, ax = plt.subplots()
ax.hist(n_dicom_df['n_dicom'], bins=20)
ax.set_title('Number of dicom per patient');

In [None]:
n_dicom_df['n_dicom'].value_counts().head(10)

# image shape

In [None]:
shape_df_path = dataset_dir / "shape_df.csv"
if shape_df_path.is_file():
    shape_df = pd.read_csv(shape_df_path)
else:
    !conda install -c conda-forge gdcm -y
    gr = train.groupby('Patient')
    df_list = []
    for patient_id, group_df in tqdm(gr):
        height_list = []
        width_list = []
        shape_list = []
        tmp_df_list = []
        for dcm_path in (INPUT / 'train' / patient_id).glob("*"):
            try:
                dicom = pydicom.dcmread(dcm_path)
                tmp_df = pd.DataFrame({'Patient': [patient_id],
                                       'height': [dicom[Tag("Rows")].value],
                                       'width': [dicom[Tag("Columns")].value],
                                       'shape': [str(dicom.pixel_array.shape)]})
                tmp_df_list.append(tmp_df)
            except:
                print(dcm_path)
        if len(tmp_df_list) >= 1:
            df = pd.concat(tmp_df_list)
            df.drop_duplicates(inplace=True)
            df_list.append(df.reset_index(drop=True))
    shape_df = pd.concat(df_list)

In [None]:
# Write the shape table to the working directory, then display it.
shape_df.to_csv(path_or_buf='shape_df.csv', index=False)
shape_df

In [None]:
shape_df.groupby('Patient').count().max()

In [None]:
# Count the rows recorded for each image shape; the distinct shapes (the
# index of the counts) are kept in `shape_list`.
shape_counts = shape_df['shape'].value_counts()
shape_list = shape_counts.index
shape_counts

shapes with different height and width  
(752, 888) , (734, 888) , (843, 888) , (733, 888) , (1100, 888) ,(788, 888)  

# img

In [None]:
import matplotlib.pyplot as plt

def imshow_dcm(height, width, n_images=4):
    """Show randomly chosen DICOM slices for a given recorded image shape.

    For each panel a patient is drawn at random from the rows of ``shape_df``
    whose ``shape`` equals ``str((height, width))``, then one file is drawn at
    random from that patient's directory. The chosen file itself is not
    checked against the requested shape. Panels are drawn into the current
    figure, so create one with ``plt.figure`` beforehand.

    Parameters
    ----------
    height, width : int
        Image shape to look up in ``shape_df['shape']``.
    n_images : int, default 4
        Number of panels, laid out on a near-square grid (2 x 2 for 4).

    Raises
    ------
    ValueError
        If ``n_images`` is smaller than 1, or if no row of ``shape_df`` has
        the requested shape.
    """
    if n_images < 1:
        raise ValueError("n_images must be at least 1")

    shape_key = str((height, width))
    candidates = shape_df.loc[shape_df['shape'] == shape_key, 'Patient']
    if candidates.empty:
        raise ValueError(f"no patient in shape_df has shape {shape_key}")

    n_cols = int(np.ceil(np.sqrt(n_images)))
    n_rows = int(np.ceil(n_images / n_cols))
    for i in range(n_images):
        patient_id = np.random.choice(candidates)
        dcm_dir = INPUT / f'train/{patient_id}'
        dcm_path = np.random.choice(list(dcm_dir.glob("*")))
        dicom = pydicom.dcmread(dcm_path)
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(dicom.pixel_array, cmap=plt.cm.bone)
        # Path.parts is separator-agnostic; the title reads "<patient>/<file>".
        plt.title('/'.join(Path(dcm_path).parts[-2:]))
        

## 512 x 512

In [None]:
# Random sample of slices from patients recorded with shape (512, 512).
plt.figure(figsize=(16, 16))
imshow_dcm(height=512, width=512)

## (768, 768)

In [None]:
# Random sample of slices from patients recorded with shape (768, 768).
plt.figure(figsize=(16, 16))
imshow_dcm(height=768, width=768)


## (752, 888)

In [None]:
# Random sample of slices from patients recorded with shape (752, 888).
plt.figure(figsize=(16, 16))
imshow_dcm(height=752, width=888)


## (632, 632)

In [None]:
# Random sample of slices from patients recorded with shape (632, 632).
plt.figure(figsize=(16, 16))
imshow_dcm(height=632, width=632)

## (734, 888)

In [None]:
# Random sample of slices from patients recorded with shape (734, 888).
plt.figure(figsize=(16, 16))
imshow_dcm(height=734, width=888)

## (843, 888)

In [None]:
# Random sample of slices from patients recorded with shape (843, 888).
plt.figure(figsize=(16, 16))
imshow_dcm(height=843, width=888)

## (733, 888)

In [None]:
# Random sample of slices from patients recorded with shape (733, 888).
plt.figure(figsize=(16, 16))
imshow_dcm(height=733, width=888)

## (1100, 888)

In [None]:
# Random sample of slices from patients recorded with shape (1100, 888).
plt.figure(figsize=(16, 16))
imshow_dcm(height=1100, width=888)

## (1302, 1302)

In [None]:
# Random sample of slices from patients recorded with shape (1302, 1302).
plt.figure(figsize=(16, 16))
imshow_dcm(height=1302, width=1302)

## (788, 888)

In [None]:
# Random sample of slices from patients recorded with shape (788, 888).
plt.figure(figsize=(16, 16))
imshow_dcm(height=788, width=888)

The images whose height and width differ seem to have margins and need to be cropped. Let's try cropping them.

# Crop

In [None]:
# Example slice used to try out cropping. The path is built from INPUT so it
# follows the data directory configured at the top of the notebook.
dcm_path = INPUT / 'train' / 'ID00094637202205333947361' / '8.dcm'
dicom = pydicom.dcmread(dcm_path)
img = dicom.pixel_array
plt.imshow(img, cmap=plt.cm.bone)
plt.title(f"shape: {img.shape}");

In [None]:
# Margins are assumed to be filled with a single constant value, taken from
# the top-left pixel. Everything outside the bounding box of the remaining
# pixels is cropped away.

def crop_image(img: np.ndarray):
    """Crop the constant-valued border off a 2-D image.

    The value of ``img[0, 0]`` is treated as the margin value. The result is
    the smallest rectangle containing every pixel that differs from it, so
    rows/columns *inside* the picture are kept even if they happen to consist
    only of the margin value.

    Parameters
    ----------
    img : np.ndarray
        2-D image array.

    Returns
    -------
    np.ndarray
        A copy of the cropped region. If the image is empty, or every pixel
        equals the margin value, there is nothing to crop to and a copy of
        the whole image is returned.
    """
    if img.size == 0:
        return img.copy()

    mask = img != img[0, 0]
    rows = np.flatnonzero(mask.any(axis=1))
    cols = np.flatnonzero(mask.any(axis=0))
    if rows.size == 0:
        # Uniform image: keep it as is instead of returning a 0 x 0 array.
        return img.copy()

    # Slice by bounding box rather than by boolean row/column selection so
    # that interior rows/columns are never removed.
    return img[rows[0]:rows[-1] + 1, cols[0]:cols[-1] + 1].copy()

# Original slice next to its cropped version; the titles show the array shapes.
cropped_img = crop_image(img)
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(16, 8))

ax_before.imshow(img, cmap=plt.cm.bone)
ax_before.set_title(img.shape)

ax_after.imshow(cropped_img, cmap=plt.cm.bone)
ax_after.set_title(cropped_img.shape);

This process is based on code from this great notebook: https://www.kaggle.com/ratthachat/aptos-eye-preprocessing-in-diabetic-retinopathy.  
Let's crop the other shapes

In [None]:
# Attach each patient's file count and file-number list to the shape table.
# Dropping any previously merged columns first keeps this cell re-runnable:
# merging twice would otherwise produce `n_list_x` / `n_list_y` and break the
# later `shape_df['n_list']` look-ups.
shape_df = (
    shape_df
    .drop(columns=['n_dicom', 'n_list'], errors='ignore')
    .merge(n_dicom_df, on='Patient', how='left')
)

# Shapes whose height and width differ; these are the candidates for cropping.
NON_SQUARE_SHAPES = ["(752, 888)", "(734, 888)", "(843, 888)",
                     "(733, 888)", "(1100, 888)", "(788, 888)"]
crop_df = shape_df[shape_df["shape"].isin(NON_SQUARE_SHAPES)]
crop_df

In [None]:
# Display the first available slice of each patient next to its cropped version.

def get_image_array_from_dicom(patient_id, n):
    """Return the pixel array of ``INPUT / 'train' / patient_id / '<n>.dcm'``."""
    dcm_file_path = INPUT / f'train/{patient_id}/{n}.dcm'
    dicom = pydicom.dcmread(dcm_file_path)
    return dicom.pixel_array

for patient_id, n_list in zip(crop_df['Patient'], crop_df['n_list']):
    # File numbering does not always start at 1, so use the lowest number
    # that actually exists (n_list is sorted) instead of assuming 1.dcm.
    n = n_list[0]
    image = get_image_array_from_dicom(patient_id, n)
    plt.figure(figsize=(16, 8))
    plt.subplot(121)
    plt.imshow(image, cmap=plt.cm.bone)
    plt.title(image.shape)

    plt.subplot(122)
    # Only non-square images are cropped.
    if image.shape[0] != image.shape[1]:
        image = crop_image(image)
    plt.imshow(image, cmap=plt.cm.bone)
    plt.title(image.shape)
    plt.show()

# preprocess

First, check the spacing between consecutive images (slices) and the recorded slice thickness.

In [None]:
shape_df.head()

In [None]:
def get_image_position_diff(patient_id, n_list):
    """Return the distance between consecutive files of one patient.

    Parameters
    ----------
    patient_id : str
        Patient directory name under ``INPUT / 'train'``.
    n_list : list of int
        File numbers; ``<n>.dcm`` is read for each, in the given order.

    Returns
    -------
    list
        ``len(n_list) - 1`` values (empty for fewer than two files): the
        absolute difference of the third ``ImagePositionPatient`` component
        between file ``i`` and file ``i + 1``, or ``np.nan`` when either file
        lacks that attribute.
    """
    # Only header fields are needed, so the pixel data is not loaded.
    # `dcmread` is used instead of the deprecated `read_file` alias.
    dicoms = [pydicom.dcmread(INPUT / 'train' / patient_id / f'{n}.dcm', stop_before_pixels=True)
              for n in n_list]
    diff_list_ = []
    for current, following in zip(dicoms, dicoms[1:]):
        try:
            diff = np.abs(current.ImagePositionPatient[2] - following.ImagePositionPatient[2])
        except AttributeError:
            diff = np.nan
        diff_list_.append(diff)
    return diff_list_

def get_slicethickness_slope_intercept(patient_id, n_list):
    """Collect per-file header values for one patient.

    Parameters
    ----------
    patient_id : str
        Patient directory name under ``INPUT / 'train'``.
    n_list : list of int
        File numbers; ``<n>.dcm`` is read for each, in the given order.

    Returns
    -------
    tuple of four lists
        ``(thickness, pixel_spacing, slope, intercept)``, each parallel to
        ``n_list`` and holding the file's ``SliceThickness``,
        ``PixelSpacing``, ``RescaleSlope`` and ``RescaleIntercept``
        respectively, or ``np.nan`` where the attribute is missing.
    """
    thickness_list_ = []
    pixelspacing_list_ = []
    slope_list_ = []
    intercept_list_ = []
    for n in n_list:
        # Only header fields are needed, so the pixel data is not loaded.
        # `dcmread` is used instead of the deprecated `read_file` alias.
        dicom = pydicom.dcmread(INPUT / 'train' / patient_id / f'{n}.dcm', stop_before_pixels=True)
        # getattr with a default covers the same missing-attribute
        # (AttributeError) case as an explicit try/except.
        thickness_list_.append(getattr(dicom, 'SliceThickness', np.nan))
        pixelspacing_list_.append(getattr(dicom, 'PixelSpacing', np.nan))
        slope_list_.append(getattr(dicom, 'RescaleSlope', np.nan))
        intercept_list_.append(getattr(dicom, 'RescaleIntercept', np.nan))

    return thickness_list_, pixelspacing_list_, slope_list_, intercept_list_

In [None]:
!conda install -c conda-forge gdcm -y

In [None]:
# Gather slice spacing and header metadata for every row of shape_df.
# Rows are iterated directly (instead of shape_df.loc[i, ...]) so the loops do
# not depend on the index being 0..n-1, and the results are unpacked into
# names that do not clobber the loop variable.
patient_rows = list(zip(shape_df['Patient'], shape_df['n_list']))

diff_list = []
for patient_id, n_list in tqdm(patient_rows):
    diff_list.append(get_image_position_diff(patient_id, n_list))

thickness_list = []
pixelspacing_list = []
slope_list = []
intercept_list = []
for patient_id, n_list in tqdm(patient_rows):
    thickness, pixelspacing, slope, intercept = get_slicethickness_slope_intercept(patient_id, n_list)
    thickness_list.append(thickness)
    pixelspacing_list.append(pixelspacing)
    slope_list.append(slope)
    intercept_list.append(intercept)


In [None]:
# Store the collected per-row lists as new columns of shape_df.
new_columns = {
    'diff_list': diff_list,
    'thickness_list': thickness_list,
    'pixelspacing_list': pixelspacing_list,
    'slope_list': slope_list,
    'intercept_list': intercept_list,
}
for column_name, column_values in new_columns.items():
    shape_df[column_name] = column_values

## diff

In [None]:
# Per row: the standard deviation of the slice-to-slice distances and the
# number of distances that are NaN.
def _diff_std(diffs):
    return np.asarray(diffs).std()


def _diff_nan_count(diffs):
    return np.sum(pd.Series(diffs).isna())


shape_df['diff_list_std'] = shape_df['diff_list'].apply(_diff_std)
shape_df['diff_list_n_nan'] = shape_df['diff_list'].apply(_diff_nan_count)

summary_columns = ['Patient', 'diff_list', 'diff_list_std', 'diff_list_n_nan',
                   'thickness_list', 'pixelspacing_list']
shape_df[summary_columns]

## slope and intercept

In [None]:
shape_df[['Patient', 'slope_list', 'intercept_list']]

In [None]:
np.all(shape_df['slope_list'].apply(lambda x:np.all(x)))

In [None]:
# Serialise the enriched shape table to the working directory.
with open('shape_df_ver2.pickle', mode='wb') as pickle_file:
    pickle.dump(shape_df, pickle_file)

# Todo

* Create a feature from a dicom image.

# References:

1. https://www.kaggle.com/ratthachat/aptos-eye-preprocessing-in-diabetic-retinopathy
2. https://www.kaggle.com/jameschapman19/pytorch-tabular-qr-histogram