# Model Using Extracted Histogram features

This kernel uses features generated from the pixel histograms. I preprocessed the features for the train set here: https://www.kaggle.com/jameschapman19/histogram-features.

Since in test submission we need to generate all features 'on-the-fly', all of the relevant code to generate these features is reproduced in this kernel too.

In [None]:
import cv2
import copy
from pathlib import Path
from skimage.segmentation import clear_border
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.measure import label, regionprops
from skimage.segmentation import clear_border
from skimage.filters import roberts, sobel
from scipy import ndimage as ndi
from skimage import measure, morphology
from scipy.stats import kurtosis
import seaborn as sns
import scipy
import os
from tqdm import tqdm
from skimage import measure, morphology
from tqdm.notebook import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
from torch import optim
import random
from torchvision import models
import torch.multiprocessing as mp
from torch.nn import functional as F
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pydicom.pixel_data_handlers.gdcm_handler as gdcm_handler 
import pydicom
from torch.utils.data import DataLoader, Dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def seed_all(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch, and puts cuDNN into
    deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    # The train/validation patient split is produced with ``random.shuffle``,
    # so the standard-library generator has to be seeded as well; otherwise
    # the split differs on every run.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_all()

In [None]:
def calc_metric_loss(pred_fvc, sigma, true_fvc):
    """Return the per-sample negated competition metric (lower is better).

    For every sample the value is
    ``sqrt(2) * delta / sigma_clipped + log(sqrt(2) * sigma_clipped)`` with
    ``sigma_clipped = max(sigma, 70)`` and
    ``delta = min(|pred_fvc - true_fvc|, 1000)``.

    Args:
        pred_fvc: Array of predicted FVC values.
        sigma: Array of predicted confidences, same shape as ``pred_fvc``.
            It is NOT modified.
        true_fvc: Array of true FVC values; reshaped to ``pred_fvc``'s shape.

    Returns:
        NumPy array with one value per sample.
    """
    pred_fvc = np.asarray(pred_fvc)
    true_fvc = np.reshape(true_fvc, pred_fvc.shape)
    # Clip into fresh arrays instead of assigning into ``sigma``: callers pass
    # views that share memory with their tensors, and an in-place clip would
    # silently change the values they plot afterwards.
    sigma_clipped = np.maximum(sigma, 70)
    delta = np.minimum(np.abs(pred_fvc - true_fvc), 1000)
    return np.sqrt(2) * delta / sigma_clipped + np.log(np.sqrt(2) * sigma_clipped)

## Model
Tried to align the model here with the main public kernels for comparison purposes

In [None]:
class main_model(nn.Module):
    """Three-layer fully connected network that outputs ``n_outputs`` values.

    The loss helpers below treat the output columns as quantile predictions:
    column 1 is used as the FVC estimate and ``column 2 - column 0`` as the
    confidence (sigma).
    """

    def __init__(self, n_additional_features, n_outputs):
        """Build the layers.

        Args:
            n_additional_features: Number of input features per sample.
            n_outputs: Number of output values per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(n_additional_features, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, n_outputs)

    def forward(self, additional_features):
        """Map a ``(batch, n_additional_features)`` tensor to ``(batch, n_outputs)``."""
        out = additional_features
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out

    def metric_loss(self, pred_fvc, true_fvc):
        """Per-sample negated competition metric, implemented in PyTorch.

        Args:
            pred_fvc: ``(batch, 3)`` tensor of model outputs.
            true_fvc: Tensor of true FVC values with ``batch`` elements.

        Returns:
            ``(batch,)`` tensor; lower is better.
        """
        # A plain Python constant avoids creating a tensor on a module-level
        # ``device`` that may not match the device the inputs live on.
        sqrt2 = 2.0 ** 0.5
        fvc = pred_fvc[:, 1]
        sigma = pred_fvc[:, 2] - pred_fvc[:, 0]
        true_fvc = torch.reshape(true_fvc, fvc.shape)
        sigma_clipped = torch.clamp(sigma, min=70)
        delta = torch.clamp(torch.abs(fvc - true_fvc), max=1000)
        return sqrt2 * delta / sigma_clipped + torch.log(sqrt2 * sigma_clipped)

    def fvc_loss(self, pred_fvc, true_fvc):
        """Per-sample absolute error of the FVC estimate (output column 1).

        Args:
            pred_fvc: ``(batch, 3)`` tensor of model outputs.
            true_fvc: Tensor of true FVC values with ``batch`` elements.

        Returns:
            ``(batch,)`` tensor of absolute errors.
        """
        fvc = pred_fvc[:, 1]
        true_fvc = torch.reshape(true_fvc, fvc.shape)
        # Compare like with like: subtracting the (batch,) target from the
        # full (batch, 3) prediction either fails or broadcasts incorrectly.
        return torch.abs(fvc - true_fvc)

    def quantile_loss(self, preds, target, quantiles):
        """Quantile loss summed over quantiles and averaged over the batch.

        Args:
            preds: ``(batch, len(quantiles))`` tensor, one column per quantile.
            target: ``(batch,)`` tensor of true values; must not require grad.
            quantiles: Iterable of quantile levels, one per column of ``preds``.

        Returns:
            Scalar tensor.
        """
        assert not target.requires_grad
        assert preds.size(0) == target.size(0)
        losses = []
        for i, q in enumerate(quantiles):
            errors = target - preds[:, i]
            # max((q - 1) * e, q * e) weights under-prediction by q and
            # over-prediction by (1 - q).
            losses.append(torch.max((q - 1) * errors, q * errors).unsqueeze(1))

        loss = torch.mean(torch.sum(torch.cat(losses, dim=1), dim=1))
        return loss

## Image processing steps (for test data)

All credit to https://www.kaggle.com/gzuidhof/full-preprocessing-tutorial and https://www.kaggle.com/arnavkj95/candidate-generation-and-luna16-preprocessing

### Load the scans and use the first two slice positions to estimate the slice thickness for the whole scan

### Convert pixels to hounsfield units

### Resample the images so that all scans have the same size (i.e. higher resolution images downsampled)

### Segment the lungs

### Generate histograms and statistics

In [None]:
def load_scan(path):
    """Load every DICOM file in a folder, sorted by z position.

    The slice thickness is estimated from the z distance between the first
    two sorted slices and written to ``SliceThickness`` on every slice.

    Args:
        path: ``pathlib.Path`` of a folder that contains only DICOM files
            (file names are joined to it with ``/``).

    Returns:
        List of pydicom datasets ordered by ``ImagePositionPatient[2]``.

    Raises:
        IndexError: If the folder contains no files.
    """
    # ``dcmread`` is the current name of the reader; ``read_file`` is a
    # deprecated alias that newer pydicom releases no longer provide.
    slices = [pydicom.dcmread(path / s) for s in os.listdir(path)]
    slices.sort(key=lambda x: float(x.ImagePositionPatient[2]))

    if len(slices) < 2:
        # A single slice has no neighbour to measure against, so fall back to
        # the thickness stored in its header.
        slice_thickness = slices[0].SliceThickness
    else:
        try:
            slice_thickness = np.abs(
                slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2]
            )
        except Exception:
            # Best-effort fallback kept from the original code, but without
            # also swallowing KeyboardInterrupt/SystemExit as a bare except does.
            slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
        if slice_thickness == 0:
            # Two slices at the same position: use the header value instead.
            slice_thickness = slices[0].SliceThickness

    for s in slices:
        s.SliceThickness = slice_thickness

    return slices

def get_pixels_hu(slices):
    """Stack the slices' pixel data and convert it to Hounsfield units (HU).

    Args:
        slices: Sequence of objects exposing ``pixel_array``,
            ``RescaleSlope`` and ``RescaleIntercept``.

    Returns:
        ``int16`` array with one entry along axis 0 per slice.
    """
    # Work in int16 throughout; values are expected to be small enough (<32k).
    volume = np.stack(
        [np.array(dcm.pixel_array, dtype=np.int16) for dcm in slices]
    ).astype(np.int16)

    # Pixels outside the scan are stored as -2000; set them to 0 so they end
    # up at the intercept value after rescaling.
    volume[volume == -2000] = 0

    # Apply each slice's own linear rescale: HU = slope * raw + intercept.
    for idx, dcm in enumerate(slices):
        intercept = dcm.RescaleIntercept
        slope = dcm.RescaleSlope

        if slope != 1:
            # Multiply in float64; assigning back casts the result to int16.
            volume[idx] = slope * volume[idx].astype(np.float64)

        volume[idx] += np.int16(intercept)

    return volume.astype(np.int16)

def resample(image, scan, new_spacing=(1, 1, 1)):
    """Resample a volume to (approximately) the requested voxel spacing.

    Args:
        image: 3D array ordered (slice, row, column).
        scan: Sequence whose first element exposes ``SliceThickness`` and
            ``PixelSpacing``; these give the current spacing.
        new_spacing: Target spacing per axis, in the same order as ``image``.

    Returns:
        Tuple ``(resampled_image, actual_spacing)``. The actual spacing can
        differ slightly from ``new_spacing`` because the output shape is
        rounded to whole voxels.
    """
    # Current spacing as (slice thickness, PixelSpacing[0], PixelSpacing[1]).
    spacing = np.array([scan[0].SliceThickness] + list(scan[0].PixelSpacing), dtype=np.float32)
    original_shape = np.array(image.shape)

    resize_factor = spacing / np.asarray(new_spacing)
    new_shape = np.round(original_shape * resize_factor)
    # Recompute the factor from the rounded shape so factor and spacing agree
    # with the shape that is actually produced.
    real_resize_factor = new_shape / original_shape
    new_spacing = spacing / real_resize_factor

    # ``scipy.ndimage.zoom`` is the supported entry point; the
    # ``scipy.ndimage.interpolation`` namespace is deprecated.
    image = ndi.zoom(image, real_resize_factor, mode='nearest')
    return image, new_spacing

def get_segmented_lungs(im, plot=False):
    """Segment the lungs from the given 2D slice.

    Pixels outside the computed lung mask are set to 0 directly in ``im``
    (the input array is modified in place) and the same array is returned.

    Args:
        im: 2D array of pixel values for one slice.
        plot: When equal to ``True``, draw the result of each of the eight
            steps in a column of subplots.

    Returns:
        ``im`` with everything outside the lung mask zeroed.
    """
    # Kept as an equality test so non-boolean arguments behave as before.
    show_steps = plot == True
    if show_steps:
        _, axes = plt.subplots(8, 1, figsize=(5, 40))

    def _show(step, picture):
        """Draw one intermediate result when plotting is enabled."""
        if show_steps:
            axes[step].axis('off')
            axes[step].imshow(picture, cmap=plt.cm.bone)

    # Step 1: convert into a binary image.
    binary = im < -200
    _show(0, binary)

    # Step 2: remove the blobs connected to the border of the image.
    cleared = clear_border(binary)
    _show(1, cleared)

    # Step 3: label the connected regions.
    label_image = label(cleared)
    _show(2, label_image)

    # Step 4: keep the labels with the two largest areas.
    regions = regionprops(label_image)
    areas = sorted(region.area for region in regions)
    if len(areas) > 2:
        second_largest = areas[-2]
        for region in regions:
            if region.area < second_largest:
                pixels = region.coords
                label_image[pixels[:, 0], pixels[:, 1]] = 0
    binary = label_image > 0
    _show(3, binary)

    # Step 5: erosion with a disk of radius 2, to separate the lung nodules
    # attached to the blood vessels.
    binary = binary_erosion(binary, disk(2))
    _show(4, binary)

    # Step 6: closing with a disk of radius 10, to keep nodules attached to
    # the lung wall.
    binary = binary_closing(binary, disk(10))
    _show(5, binary)

    # Step 7: fill in the small holes inside the binary mask of the lungs.
    edges = roberts(binary)
    binary = ndi.binary_fill_holes(edges)
    _show(6, binary)

    # Step 8: superimpose the binary mask on the input image.
    outside_mask = binary == 0
    im[outside_mask] = 0
    _show(7, im)

    return im

def get_kurtosis_stats(ids, ctscans_dir):
    """Compute histogram statistics of the segmented lungs for each patient.

    For every ID the scan is loaded, converted to HU, resampled with
    ``new_spacing=[2,2,2]`` and segmented slice by slice. The statistics are
    taken over the voxels below -200. A density plot and the middle slice
    are shown for each patient.

    Args:
        ids: Iterable of patient IDs (folder names inside ``ctscans_dir``).
        ctscans_dir: ``pathlib.Path`` of the folder holding one sub-folder
            per patient.

    Returns:
        Tuple ``(kurt, std, mean, median)`` of lists, one entry per ID, in
        the order of ``ids``.
    """
    kurt, std, mean, median = [], [], [], []
    for patient_id in ids:
        print(patient_id)
        scan = load_scan(ctscans_dir / patient_id)
        image = get_pixels_hu(scan)
        image, _ = resample(image, scan, new_spacing=[2,2,2])
        image = np.asarray([get_segmented_lungs(ct_slice) for ct_slice in image])

        # Flatten once and reuse the selection for all four statistics.
        voxels = image.ravel()
        lung_voxels = voxels[voxels < -200]

        kurt_i = kurtosis(lung_voxels)
        std_i = lung_voxels.std()
        mean_i = lung_voxels.mean()
        median_i = np.median(lung_voxels)
        print('Kurtosis: ', kurt_i)
        print('Standard Deviation: ', std_i)

        kurt.append(kurt_i)
        std.append(std_i)
        mean.append(mean_i)
        median.append(median_i)

        # Density of the voxels strictly between -1200 and 0.
        ax = sns.kdeplot(voxels[(voxels < 0) & (voxels > -1200)], bw=0.5)
        ax.set(xlabel='HU', ylabel='% voxels', title='Histogram of voxel characteristics')
        plt.show()
        # Middle slice of the segmented volume as a visual sanity check.
        plt.imshow(image[round(image.shape[0]/2), :, :])
        plt.show()
    return kurt, std, mean, median

## Helper Function to plot training losses

In [None]:
def plot_training_loss(train, val, title='loss'):
    """Plot per-epoch train and validation curves on a log scale and save them.

    Args:
        train: Sequence of per-epoch training values.
        val: Sequence of per-epoch validation values.
        title: ``'loss'`` for the training-loss figure; any other value
            produces the metric figure.

    The figure is saved as ``training_loss`` for the loss curve and as
    ``training_metric`` otherwise, so plotting both no longer overwrites the
    first file with the second.
    """
    is_loss = title == 'loss'
    plt.figure()
    plt.plot(train, label='Train')
    plt.plot(val, label='Val')
    plt.title('Model Training Loss' if is_loss else 'Model Metric Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.legend()
    plt.savefig('training_loss' if is_loss else 'training_metric')

## Load and preprocess Data

In [None]:
# Load the competition tables: training measurements, the test baseline rows
# and the sample submission.
train=pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
test=pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
submission=pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

### Prepare Training Data (Tabular)

In [None]:
# Each patient's baseline is the row with their earliest week.
train['base_Weeks'] = train.groupby(['Patient'])['Weeks'].transform('min')

# Take the baseline rows, rename their measurements and keep one row per
# (Patient, Weeks) pair.
base = (
    train[train.Weeks == train.base_Weeks]
    .rename(columns={'FVC': 'base_FVC', 'Percent': 'base_Percent'})
    .drop_duplicates(subset=['Patient', 'Weeks'], keep='first')
)

# Attach the baseline measurements to every row of the same patient.
train = train.merge(base[['Patient', 'base_FVC', 'base_Percent']], on='Patient', how='left')

# Weeks elapsed since the patient's baseline measurement.
train['Week_passed'] = train['Weeks'] - train['base_Weeks']

### Prepare Test Data (tabular)

In [None]:
# The Weeks/FVC/Percent columns of the test file are treated as each
# patient's baseline values.
test = test.rename(columns={'Weeks': 'base_Weeks', 'FVC': 'base_FVC', 'Percent': 'base_Percent'})

# The sample submission lists every Patient_Week row to be predicted.
submission = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")

# Patient_Week has the form '<patient id>_<week>': split it into its parts.
patient_week_parts = submission['Patient_Week'].str.split('_')
submission['Patient'] = patient_week_parts.str[0]
submission['Weeks'] = patient_week_parts.str[1].astype(int)

# One row per requested (patient, week), carrying the patient's baseline data.
test = submission.drop(columns=["FVC", "Confidence"]).merge(test, on='Patient')

test['Week_passed'] = test['Weeks'] - test['base_Weeks']

# Keep the same columns as train, minus the targets that test does not have.
feature_columns = train.columns.drop(['FVC', 'Percent'])
test = test[feature_columns]

### OH Encode Sex and Smoking
With thanks to https://www.kaggle.com/ulrich07/osic-keras-starter-with-custom-metrics

In [None]:
COLS = ['Sex', 'SmokingStatus']
for col in COLS:
    # One 0/1 indicator column per category seen in train, named after the
    # category; test gets the same columns so both frames stay aligned.
    for category in train[col].unique():
        train[category] = train[col].eq(category).astype(int)
        test[category] = test[col].eq(category).astype(int)
    # The original categorical column is no longer needed.
    train = train.drop(columns=col)
    test = test.drop(columns=col)

### Rescale based on train data

### Add in the preprocessed histogram features

In [None]:
# Precomputed per-patient histogram statistics for the training scans.
pixel_stats = pd.read_csv('../input/osic-histogram-features/train_pixel_stats.csv')
histogram_columns = ['Patient', 'kurtosis', 'std', 'mean', 'median']
# Left merge: every train row is kept even if its patient has no statistics.
train = train.merge(pixel_stats[histogram_columns], on='Patient', how='left')

@alexj21 pointed out I hadn't rescaled the histogram features. A bit ugly but I've done it here and inside the Dataset for the test. 

In [None]:
from sklearn import preprocessing

robust_scaler = preprocessing.RobustScaler()
# Scale every feature except the identifier, the targets and the raw week columns.
scaled_columns = train.columns.difference(['Patient','FVC','Percent','Weeks','base_Weeks'])
train[scaled_columns] = robust_scaler.fit_transform(train[scaled_columns])

### Create the dataset

In [None]:
class OSIC(Dataset):
    """Tabular dataset of per-visit rows for a set of patients.

    In train mode the rows are taken from ``df`` as-is. In test mode the
    histogram features are computed on the fly from the CT scans, merged in
    by patient and scaled with the module-level ``robust_scaler``.

    Attributes are set in ``__init__``:
        df: The rows of the selected patients.
        data: Feature matrix (all columns except FVC, Patient, Percent, in
            sorted column order) with NaNs replaced by the column mean.
        patients: Patient ID of every row.
        fvc: Target FVC of every row (train mode only).
    """

    def __init__(self, patient_ids, df, train=True):
        """Select the rows of ``patient_ids`` and build the feature matrix.

        Args:
            patient_ids: Iterable of patient IDs to include.
            df: DataFrame with a ``Patient`` column plus the feature columns
                (and ``FVC`` in train mode).
            train: ``True`` for labelled data; ``False`` to compute the
                histogram features from the test CT scans.
        """
        root_dir = Path('/kaggle/input/osic-pulmonary-fibrosis-progression')
        self.df = df[df.Patient.isin(patient_ids)]
        if not train:
            ct_scans_dir = root_dir / 'test'
            k, s, m, me = get_kurtosis_stats(patient_ids, ct_scans_dir)
            # The statistics come back in the order of ``patient_ids``, so
            # pair them with those IDs explicitly instead of relying on the
            # row order of the frame matching that order.
            pixel_stats = pd.DataFrame({
                'Patient': list(patient_ids),
                'kurtosis': np.array(k),
                'std': np.array(s),
                'mean': np.array(m),
                'median': np.array(me),
            }).drop_duplicates(subset=['Patient'])
            self.df = self.df.merge(pixel_stats, how='left', on='Patient')
            scaled_cols = self.df.columns.difference(['Patient','FVC','Percent','Weeks','base_Weeks'])
            self.df[scaled_cols] = robust_scaler.transform(self.df[scaled_cols])

        feature_cols = self.df.columns.difference(['FVC','Patient','Percent'])
        # Copy so the NaN imputation below never writes into ``self.df``.
        self.data = self.df[feature_cols].to_numpy(copy=True)
        # Replace missing values with the mean of their column.
        col_mean = np.nanmean(self.data, axis=0)
        inds = np.where(np.isnan(self.data))
        self.data[inds] = np.take(col_mean, inds[1])
        self.patients = self.df['Patient'].values
        self.train = train
        if self.train:
            self.fvc = self.df['FVC'].values

    def __len__(self):
        """Number of rows (patient visits) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'data': features}`` plus ``'fvc'`` in train mode."""
        if self.train:
            data = {'fvc': self.fvc[idx],
                    'data': self.data[idx]}
        else:
            data = {'data': self.data[idx]}
        return data

## Set some training parameters

In [None]:
# Upper bound on the number of training epochs.
epochs=300
# Batch size and worker count passed to every DataLoader.
batch_size=64
num_workers=3
# Quantile levels, one per model output column, passed to quantile_loss.
quantiles = (0.2, 0.5, 0.8)

# Split training data into train and val by patient (90:10)
'ID00011637202177653955184' has no images we can load, so this patient is dropped.

We shuffle the patient IDs before splitting, so the split is by patient rather than by row.

In [None]:
# Drop the patient whose scans cannot be loaded, then shuffle the rest.
excluded_patient = 'ID00011637202177653955184'
ids = [pid for pid in train.Patient.unique() if pid != excluded_patient]
random.shuffle(ids)
ids = np.array(ids)

# The first 90% of the shuffled patients train the model; the rest validate it.
split_at = int(round(0.9 * len(ids), 0))
train_ids, val_ids = np.split(ids, [split_at])

train_dataset = OSIC(train_ids, train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

val_dataset = OSIC(val_ids, train)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

In [None]:
# One input per feature column, three outputs.
n_features = train_dataset.data.shape[1]
model = main_model(n_features, 3).to(device)

n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Number of parameters:')
print(n_trainable)

## Train Model

In [None]:
# Train with the quantile loss, track the metric for reporting, and stop
# early once the validation loss has not improved for `patience` epochs.
optimizer = optim.Adam(model.parameters())
epoch_train_metric = []
epoch_val_metric = []
epoch_train_loss = []
epoch_val_loss = []
epoch = 0
min_val_loss = 1e+100
early_stop = False
patience = 10
# Initialised up front so the early-stopping bookkeeping never touches an
# undefined name (e.g. if the very first validation loss is NaN).
epochs_no_improve = 0
best_model = copy.deepcopy(model.state_dict())

while epoch < epochs and not early_stop:
    epoch += 1
    train_loss = 0
    train_metric = 0
    model.train()
    for batch_idx, data in enumerate(train_dataloader):
        optimizer.zero_grad()
        fvc = data['fvc'].to(device)
        model_output = model(data['data'].float().to(device))
        quantile_loss = model.quantile_loss(model_output, fvc, quantiles)
        # The metric is only reported, never optimised, so no graph is needed.
        with torch.no_grad():
            metric_loss = model.metric_loss(model_output, fvc).mean()
        loss = quantile_loss
        loss.backward()
        train_loss += loss.item()
        train_metric += metric_loss.item()
        optimizer.step()
    print('====> Epoch: {} Average train quantile loss: {:.4f}'.format(
                        epoch, train_loss / len(train_dataloader)))
    print('====> Epoch: {} Average train metric: {:.4f}'.format(
                        epoch, train_metric / len(train_dataloader)))

    val_loss = 0
    val_metric = 0
    model.eval()
    with torch.no_grad():
        for batch_idx, data in enumerate(val_dataloader):
            fvc = data['fvc'].to(device)
            model_output = model(data['data'].float().to(device))
            quantile_loss = model.quantile_loss(model_output, fvc, quantiles)
            metric_loss = model.metric_loss(model_output, fvc).mean()
            loss = quantile_loss
            val_loss += loss.item()
            val_metric += metric_loss.item()
        print('====> Epoch: {} Average val quantile loss: {:.4f}'.format(
                        epoch, val_loss / len(val_dataloader)))
        print('====> Epoch: {} Average val metric: {:.4f}'.format(
                        epoch, val_metric / len(val_dataloader)))

    epoch_train_loss.append(train_loss / len(train_dataloader))
    epoch_val_loss.append(val_loss / len(val_dataloader))
    epoch_train_metric.append(train_metric / len(train_dataloader))
    epoch_val_metric.append(val_metric / len(val_dataloader))

    if val_loss < min_val_loss:
        # New best epoch: remember its weights and reset the patience counter.
        min_val_loss = val_loss
        best_model = copy.deepcopy(model.state_dict())
        print('Min loss %0.2f' % min_val_loss)
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        # Check early stopping condition
        if epochs_no_improve == patience:
            print('Early stopping!')
            early_stop = True

# Always finish with the weights of the best validation epoch, including
# when the loop ran out of epochs without triggering early stopping.
model.load_state_dict(best_model)



## Plot training curves

In [None]:
# Per-epoch quantile loss on the train and validation sets.
plot_training_loss(epoch_train_loss, epoch_val_loss, title='loss')

This is possibly a learning-rate problem, or something to do with the different scaling of Percent and FVC.

In [None]:
# Per-epoch metric on the train and validation sets.
plot_training_loss(train=epoch_train_metric, val=epoch_val_metric, title='metric')

## Test Data

In [None]:
# Reload the sample submission so `submission` holds only the file's own columns.
submission=pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

In [None]:
# Build the test dataset (this computes the histogram features from the scans)
# and iterate it in order so predictions line up with the rows of `test`.
test_ids = test.Patient.unique()
test_dataset = OSIC(test_ids, test, train=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

model.eval()
fvc_batches, sigma_batches = [], []
with torch.no_grad():
    for batch in test_dataloader:
        quantile_preds = model(batch['data'].float().to(device))
        # Middle output is the FVC estimate; the spread between the outer
        # outputs is the confidence.
        fvc_batches.append(quantile_preds[:, 1])
        sigma_batches.append(quantile_preds[:, 2] - quantile_preds[:, 0])

fvc_pred = torch.cat(fvc_batches, dim=0)
sigma_pred = torch.cat(sigma_batches, dim=0)
test['FVC'] = fvc_pred.cpu().numpy()
test['Confidence'] = sigma_pred.cpu().numpy()

In [None]:
# Rebuild the submission key '<patient id>_<week>' for every predicted row.
test['Patient_Week'] = test["Patient"] + '_' + test['Weeks'].astype(str)

In [None]:
# Attach the predictions to the submission rows via the Patient_Week key.
predictions = test[['Patient_Week', 'FVC', 'Confidence']]
submission = submission[['Patient_Week']].merge(predictions, on='Patient_Week')

In [None]:
# Write the submission file without the index, with floats at one decimal place.
submission.to_csv('submission.csv', index=False, float_format='%.1f')

# Post-Match Analysis

## Test Predictions

In [None]:
# Predicted confidence against predicted FVC for the test rows.
plt.scatter(x=submission['FVC'], y=submission['Confidence'])
plt.xlabel('FVC')
plt.ylabel('Confidence')
plt.title('Test')

## Train

In [None]:
# Re-create the loader without shuffling so predictions line up with
# train_dataset.fvc.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
fvc_pred, sigma_pred = [], []
with torch.no_grad():
    for batch in train_dataloader:
        quantile_preds = model(batch['data'].float().to(device))
        fvc_pred.append(quantile_preds[:, 1])
        sigma_pred.append(quantile_preds[:, 2] - quantile_preds[:, 0])
fvc_pred_train = torch.cat(fvc_pred, dim=0)
sigma_pred_train = torch.cat(sigma_pred, dim=0)

train_metric_values = calc_metric_loss(
    fvc_pred_train.cpu().numpy(),
    sigma_pred_train.cpu().numpy(),
    train_dataset.fvc,
)
print('train metric', train_metric_values.mean())

plt.scatter(fvc_pred_train.cpu().numpy(), sigma_pred_train.cpu().numpy())
plt.xlabel('FVC')
plt.ylabel('Confidence')
plt.title('Train')


## Val

In [None]:
# Re-create the loader without shuffling so predictions line up with
# val_dataset.fvc.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
fvc_pred, sigma_pred = [], []
with torch.no_grad():
    for batch in val_dataloader:
        quantile_preds = model(batch['data'].float().to(device))
        fvc_pred.append(quantile_preds[:, 1])
        sigma_pred.append(quantile_preds[:, 2] - quantile_preds[:, 0])
fvc_pred_val = torch.cat(fvc_pred, dim=0)
sigma_pred_val = torch.cat(sigma_pred, dim=0)

val_metric_values = calc_metric_loss(
    fvc_pred_val.cpu().numpy(),
    sigma_pred_val.cpu().numpy(),
    val_dataset.fvc,
)
print('val metric', val_metric_values.mean())

plt.scatter(fvc_pred_val.cpu().numpy(), sigma_pred_val.cpu().numpy())
plt.xlabel('FVC')
plt.ylabel('Confidence')
plt.title('Val')

## All

In [None]:
# Overlay the FVC predictions of the three splits.
fvc_by_split = (
    (submission['FVC'], 'test'),
    (fvc_pred_train.cpu().numpy(), 'train'),
    (fvc_pred_val.cpu().numpy(), 'val'),
)
for values, split_name in fvc_by_split:
    plt.hist(values, alpha=0.5, label=split_name)
plt.legend()
plt.title('Histogram of FVC predictions')

In [None]:
# Overlay the confidence predictions of the three splits.
confidence_by_split = (
    (submission['Confidence'], 'test'),
    (sigma_pred_train.cpu().numpy(), 'train'),
    (sigma_pred_val.cpu().numpy(), 'val'),
)
for values, split_name in confidence_by_split:
    plt.hist(values, alpha=0.5, label=split_name)
plt.legend()
plt.title('Histogram of Confidence predictions')