## Generating Features from pixel histograms

The preprocessing steps are guided by the tutorial at https://www.kaggle.com/gzuidhof/full-preprocessing-tutorial

In [None]:
!conda install -c conda-forge gdcm --y

In [None]:
import copy
import cv2
from skimage.segmentation import clear_border
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.filters import roberts, sobel
from scipy import ndimage as ndi
import os
#import gdcm
from tqdm import tqdm
from skimage import measure, morphology
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import pydicom
import pydicom
from scipy.stats import kurtosis
import seaborn as sns
import scipy
pydicom.config.image_handlers = ['gdcm_handler']
#pydicom.config.image_handlers = ['pillow_handler']
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from torch.utils.data import Dataset
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Subset
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange
from time import time
import warnings
from scipy.ndimage.interpolation import zoom
from enum import Enum
from torchvision import transforms
from skimage.measure import label, regionprops
from skimage.segmentation import clear_border
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the three competition tables from the Kaggle input directory
# (paths are relative to the notebook's working directory).
train=pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
test=pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
submission=pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

In [None]:
# Earliest recorded week per patient, broadcast to every row of that patient.
train['base_Weeks'] = train.groupby(['Patient'])['Weeks'].transform('min')

# One baseline row per patient: the measurement taken in the earliest week.
# If a patient has several rows for that week, the first one is kept.
base = (
    train[train.Weeks == train.base_Weeks]
    .rename(columns={'FVC': 'base_FVC', 'Percent': 'base_Percent'})
    .drop_duplicates(subset=['Patient', 'Weeks'], keep='first')
)

# Attach the baseline FVC / Percent to every row of the same patient.
baseline_columns = base[['Patient', 'base_FVC', 'base_Percent']]
train = train.merge(baseline_columns, on='Patient', how='left')

# Weeks elapsed since the patient's baseline measurement.
train['Week_passed'] = train['Weeks'] - train['base_Weeks']

In [None]:
# In the test table the single row per patient is the baseline measurement,
# so its columns are renamed to the baseline names used in `train`.
test = test.rename(columns={'Weeks': 'base_Weeks', 'FVC': 'base_FVC','Percent': 'base_Percent'})

# Re-read the sample submission; its `Patient_Week` column has the form
# "<patient id>_<week>".
submission = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")

# Split "<patient id>_<week>" once and reuse the pieces for both new columns.
patient_week_parts = submission['Patient_Week'].apply(lambda patient_week: patient_week.split('_'))
submission['Patient'] = patient_week_parts.apply(lambda parts: parts[0])
submission['Weeks'] = patient_week_parts.apply(lambda parts: parts[1]).astype(int)

# One row per requested (patient, week) pair, carrying the patient's baseline data.
requested_rows = submission.drop(columns = ["FVC", "Confidence"])
test = requested_rows.merge(test, on = 'Patient')

# Weeks elapsed since the patient's baseline measurement.
test['Week_passed'] = test['Weeks'] - test['base_Weeks']

# Keep the same columns, in the same order, as `train` minus the targets.
feature_columns = train.columns.drop(['FVC','Percent'])
test = test[feature_columns]

In [None]:
# Load the scans in given folder path
def load_scan(path):
    """Read every DICOM file in ``path`` and return the slices sorted by z position.

    Each returned slice has its ``SliceThickness`` attribute overwritten with
    the distance between the first two sorted slices (see below), so that all
    slices of a scan report the same thickness.

    Parameters
    ----------
    path : pathlib.Path
        Directory containing the DICOM files of one scan. Every entry
        returned by ``os.listdir(path)`` is read.

    Returns
    -------
    list
        The datasets returned by ``pydicom.dcmread``, sorted in ascending
        order of ``ImagePositionPatient[2]``.

    Raises
    ------
    IndexError
        If the directory contains no files.
    """
    # `dcmread` is the current name of the reader formerly exposed as `read_file`.
    slices = [pydicom.dcmread(path / s) for s in os.listdir(path)]
    if not slices:
        raise IndexError('no DICOM slices found in {}'.format(path))

    slices.sort(key=lambda x: float(x.ImagePositionPatient[2]))

    if len(slices) > 1:
        try:
            slice_thickness = np.abs(
                slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2]
            )
        except (AttributeError, IndexError, TypeError):
            # Fall back to SliceLocation when the position cannot be used.
            slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
    else:
        # A single slice has no neighbour to measure against; use the
        # thickness recorded in the file instead (handled just below).
        slice_thickness = 0

    # Identical positions give a zero distance; trust the stored value then.
    if slice_thickness == 0:
        slice_thickness = slices[0].SliceThickness

    for s in slices:
        s.SliceThickness = slice_thickness

    return slices

In [None]:
def get_pixels_hu(slices):
    """Stack the slices into one int16 volume and rescale it slice by slice.

    For every slice the stored pixel values are multiplied by that slice's
    ``RescaleSlope`` (skipped when the slope is 1) and shifted by its
    ``RescaleIntercept``, which converts them to Hounsfield units (HU).

    Parameters
    ----------
    slices : sequence
        Objects exposing ``pixel_array``, ``RescaleSlope`` and
        ``RescaleIntercept``.

    Returns
    -------
    numpy.ndarray
        int16 array with one leading entry per slice.
    """
    # Values should always be low enough (<32k) to fit in int16.
    volume = np.stack([np.array(dcm.pixel_array, dtype=np.int16) for dcm in slices])

    # Outside-of-scan pixels are stored as -2000; set them to 0.
    # The intercept is usually -1024, so they end up close to air.
    volume[volume == -2000] = 0

    for index, dcm in enumerate(slices):
        slope = dcm.RescaleSlope
        offset = dcm.RescaleIntercept

        if slope != 1:
            # Scale in float64; writing back into the int16 volume truncates.
            volume[index] = slope * volume[index].astype(np.float64)

        volume[index] += np.int16(offset)

    return np.array(volume, dtype=np.int16)

In [None]:
def resample(image, scan, new_spacing=(1, 1, 1)):
    """Resample ``image`` so its voxel spacing is approximately ``new_spacing``.

    The current spacing is read from the first slice of ``scan`` as
    ``[SliceThickness, PixelSpacing[0], PixelSpacing[1]]``. Because the output
    shape has to be integral, the spacing actually achieved can differ
    slightly from the requested one; the achieved spacing is returned.

    Parameters
    ----------
    image : numpy.ndarray
        3-D volume whose axes line up with the spacing vector above.
    scan : sequence
        Slices of the scan; only ``scan[0]`` is consulted.
    new_spacing : sequence of 3 numbers, optional
        Requested spacing per axis. Defaults to ``(1, 1, 1)``.

    Returns
    -------
    tuple
        ``(resampled_image, achieved_spacing)``.
    """
    # Determine current pixel spacing
    spacing = np.array(
        [scan[0].SliceThickness] + list(scan[0].PixelSpacing), dtype=np.float32
    )

    resize_factor = spacing / np.asarray(new_spacing)
    current_shape = np.array(image.shape)

    # Round to a whole number of voxels, then recompute the factor (and the
    # spacing) that this rounded shape really corresponds to.
    new_shape = np.round(current_shape * resize_factor)
    real_resize_factor = new_shape / current_shape
    achieved_spacing = spacing / real_resize_factor

    # `scipy.ndimage.zoom` is the public location of this function; the old
    # `scipy.ndimage.interpolation` namespace is deprecated.
    image = scipy.ndimage.zoom(image, real_resize_factor, mode='nearest')
    return image, achieved_spacing

In [None]:
def get_segmented_lungs(im, plot=False):
    """Segment the lungs from the given 2D slice.

    A binary lung mask is built in seven steps (threshold, border clearing,
    labelling, keeping the largest regions, erosion, closing, hole filling)
    and every pixel outside the mask is set to 0.

    Parameters
    ----------
    im : numpy.ndarray
        2-D slice. NOTE: it is modified **in place**; pixels outside the mask
        are overwritten with 0.
    plot : bool, optional
        When true, the result of each of the eight steps is drawn on its own
        subplot of a new 8x1 figure.

    Returns
    -------
    numpy.ndarray
        The same array object as ``im``, after masking.
    """
    plots = None
    if plot:
        _, plots = plt.subplots(8, 1, figsize=(5, 40))

    def show(step, img):
        # Draw one intermediate result; does nothing unless plotting was requested.
        if plot:
            plots[step].axis('off')
            plots[step].imshow(img, cmap=plt.cm.bone)

    # Step 1: Convert into a binary image.
    binary = im < -200
    show(0, binary)

    # Step 2: Remove the blobs connected to the border of the image.
    cleared = clear_border(binary)
    show(1, cleared)

    # Step 3: Label the image.
    label_image = label(cleared)
    show(2, label_image)

    # Step 4: Keep the labels with 2 largest areas. Regions tying with the
    # second-largest area are kept as well.
    regions = regionprops(label_image)
    areas = sorted(region.area for region in regions)
    if len(areas) > 2:
        second_largest = areas[-2]
        for region in regions:
            if region.area < second_largest:
                # Zero the whole region at once instead of pixel by pixel.
                label_image[tuple(region.coords.T)] = 0
    binary = label_image > 0
    show(3, binary)

    # Step 5: Erosion operation with a disk of radius 2. This operation is
    # to separate the lung nodules attached to the blood vessels.
    binary = binary_erosion(binary, disk(2))
    show(4, binary)

    # Step 6: Closure operation with a disk of radius 10. This operation is
    # to keep nodules attached to the lung wall.
    binary = binary_closing(binary, disk(10))
    show(5, binary)

    # Step 7: Fill in the small holes inside the binary mask of lungs.
    edges = roberts(binary)
    binary = ndi.binary_fill_holes(edges)
    show(6, binary)

    # Step 8: Superimpose the binary mask on the input image.
    im[~binary] = 0
    show(7, im)

    return im

In [None]:
import random
# Directories used by this notebook.
root_dir = Path('/kaggle/input/osic-pulmonary-fibrosis-progression')
ctscans_dir = root_dir / 'train'
cache_dir = Path('/kaggle/input/osic-cache/cache')
latent_dir = Path('/kaggle/working/latent')

# Unique patient ids of each table, in order of first appearance.
# Disabled experiments, kept for reference:
#index = np.argwhere(ids=='ID00011637202177653955184')
#ids = list(np.delete(ids, index))
#random.shuffle(ids)
ids = np.array(train.Patient.unique())
test_ids = test.Patient.unique()

# The first 90% of the (unshuffled) ids form the training part, the rest
# the validation part.
split_position = int(round(0.9 * len(ids), 0))
train_ids, val_ids = np.split(ids, [split_position])

In [None]:
def get_kurtosis_stats(ids):
    """Compute voxel-histogram statistics of the segmented lungs per patient.

    For every patient id the scan is loaded, converted with
    ``get_pixels_hu``, resampled to a spacing of ``[2, 2, 2]`` and segmented
    slice by slice. The statistics are taken over the voxels whose value is
    below -200 after segmentation.

    Exactly one entry is appended to each result list per id, so the lists
    always line up with ``ids``. If anything fails while loading or
    processing a patient, the error is printed and NaN is recorded in all
    five lists. A failure while drawing the diagnostic plots is printed but
    does not affect the recorded statistics.

    Parameters
    ----------
    ids : iterable of str
        Patient ids; each is used as a sub-directory name of ``ctscans_dir``.

    Returns
    -------
    tuple of lists
        ``(kurt, std, fvc, mean, median)`` where ``fvc`` holds the first
        ``base_FVC`` value found for the patient in ``train``.
    """
    kurt = []
    std = []
    fvc = []
    mean = []
    median = []
    columns = (kurt, std, fvc, mean, median)

    for i in ids:
        print(i)
        try:
            patient_path = ctscans_dir / i
            scan = load_scan(patient_path)
            image = get_pixels_hu(scan)
            image, _ = resample(image, scan, new_spacing=[2, 2, 2])
            image = np.asarray([get_segmented_lungs(ct_slice) for ct_slice in image])

            # Flatten once and select the voxels of interest once.
            voxels = image.ravel()
            lung_voxels = voxels[voxels < -200]

            kurt_i = kurtosis(lung_voxels)
            std_i = lung_voxels.std()
            fvc_i = train.base_FVC[train.Patient == i].values[0]
            mean_i = lung_voxels.mean()
            median_i = np.median(lung_voxels)
        except Exception as exc:
            # Best effort: keep going with the next patient, but record a
            # placeholder so the results stay aligned with `ids`.
            print('error', repr(exc))
            for column in columns:
                column.append(np.nan)
            continue

        print('Kurtosis: ', kurt_i)
        print('Standard Deviation: ', std_i)
        print('FVC: ', fvc_i)
        kurt.append(kurt_i)
        std.append(std_i)
        fvc.append(fvc_i)
        mean.append(mean_i)
        median.append(median_i)

        # Plotting happens after the values are stored and in its own guard,
        # so a plotting problem cannot add a second (NaN) entry for this id.
        try:
            ax = sns.kdeplot(voxels[(voxels < 0) & (voxels > -1200)], bw=0.5)
            ax.set(xlabel='HU', ylabel='% voxels', title='Histogram of voxel characteristics')
            plt.show()
            plt.imshow(image[round(image.shape[0] / 2), :, :])
            plt.show()
        except Exception as exc:
            print('plot error', repr(exc))

    return kurt, std, fvc, mean, median
    

In [None]:
def plot_ct_scan(scan):
    """Show every 5th slice of ``scan`` on a grid with 4 columns.

    Parameters
    ----------
    scan : numpy.ndarray
        Volume whose first axis indexes the slices; ``scan[i]`` is drawn as
        an image.
    """
    n_slices = scan.shape[0]
    n_rows = n_slices // 20 + 1
    # squeeze=False keeps `plots` two-dimensional even when there is a single
    # row (fewer than 20 slices), so the [row, col] indexing below is valid.
    _, plots = plt.subplots(n_rows, 4, figsize=(25, 25), squeeze=False)
    for i in range(0, n_slices, 5):
        # Slice i is the (i // 5)-th image shown; fill the grid row by row.
        row, col = divmod(i // 5, 4)
        axis = plots[row, col]
        axis.axis('off')
        axis.imshow(scan[i], cmap=plt.cm.bone)

In [None]:
def check(patient_index=8):
    """Run the full preprocessing pipeline on one patient for visual inspection.

    Parameters
    ----------
    patient_index : int, optional
        Position of the patient in ``ids``. Defaults to 8.

    Returns
    -------
    numpy.ndarray
        The scan after ``get_pixels_hu``, resampling to a spacing of
        ``[2, 2, 2]`` and slice-wise lung segmentation.
    """
    patient_path = ctscans_dir / ids[patient_index]
    scan = load_scan(patient_path)
    image = get_pixels_hu(scan)
    image, _ = resample(image, scan, new_spacing=[2, 2, 2])
    return np.asarray([get_segmented_lungs(ct_slice) for ct_slice in image])

plot_ct_scan(check())

In [None]:
# Compute the per-patient statistics for every training patient:
# k = kurtosis, s = standard deviation, f = base FVC, m = mean, me = median.
k,s,f,m,me=get_kurtosis_stats(ids)

In [None]:
# Base FVC (y) against the kurtosis of each patient's voxel histogram (x).
plt.scatter(k, f)
plt.title('Scatter Plot of base FVC against pixel histogram kurtosis')
plt.xlabel('Kurtosis')
plt.ylabel('Base FVC')

In [None]:
# Base FVC (y) against the standard deviation of each patient's voxel histogram (x).
plt.scatter(s, f)
plt.title('Scatter Plot of base FVC against pixel histogram standard deviation')
plt.xlabel('Standard Deviation')
plt.ylabel('Base FVC')

In [None]:
# One row per patient (the first row of each patient in `train`), in the
# same order in which the statistics were computed.
pixel_stats = train.copy().drop_duplicates(subset=['Patient'])

# Attach each list of per-patient statistics as a column, by position.
for stat_name, stat_values in (
    ('kurtosis', k),
    ('std', s),
    ('mean', m),
    ('median', me),
):
    pixel_stats[stat_name] = np.array(stat_values)

In [None]:
# Copy the per-patient statistics onto every row of that patient
# (left join, so all rows of `train` are kept).
train=train.merge(pixel_stats[['Patient','kurtosis','std','mean','median']],how='left',on='Patient')

In [None]:
# Write the enriched training table to the working directory.
# The DataFrame index is written too (no `index=False` is passed).
train.to_csv('train_pixel_stats.csv')