This notebook is based on @ulrich07's https://www.kaggle.com/ulrich07/osic-multiple-quantile-regression-starter, re-implemented in PyTorch. If you are not familiar with TensorFlow, refer to this notebook.
By the way, PyTorch is a bit slower than TensorFlow.

In [None]:
import numpy as np
import pandas as pd
import pydicom
import os
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob
import sys
import cv2

from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
#import gdcm
import warnings
warnings.filterwarnings("ignore")

In [None]:
sys.path.append('../input/efficientnet-pytorch/EfficientNet-PyTorch-master')
sys.path.append('../input/pretrainedmodels/pretrainedmodels-0.7.4/')
sys.path.append('../input/segmentation-models-pytorch/')
import segmentation_models_pytorch as smp

In [None]:
import torch.nn as nn
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

from scipy.ndimage import zoom
from scipy.stats import kurtosis
from scipy.stats import skew

In [None]:
def seed_everything(seed=2020):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU), exports the seed as
    PYTHONHASHSEED, and, when CUDA is available, seeds all GPUs and puts
    cuDNN into deterministic, non-benchmarking mode.

    Parameters: seed (int)
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if not torch.cuda.is_available():
        return

    # Seed the current GPU and then every visible GPU.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Benchmark mode is switched off and deterministic mode on, so cuDNN
    # stays enabled but is asked for reproducible kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = True
    
seed_everything(42)

In [None]:
# Competition data root and the compute device used below.
ROOT = "../input/osic-pulmonary-fibrosis-progression"
device = torch.device('cuda')

# Training table. keep=False removes *every* row of a duplicated
# (Patient, Weeks) pair rather than keeping one of them.
tr = pd.read_csv(f"{ROOT}/train.csv")
tr.drop_duplicates(keep=False, inplace=True, subset=['Patient','Weeks'])
# Precomputed CT features (presumably one row per patient -- verify against the CSV).
feature_ct = pd.read_csv('../input/2020osic/CT_21feature.csv')

# Default (inner) merge: training rows whose patient has no CT feature row are dropped.
tr = tr.merge(feature_ct, on='Patient')

# Test-set table; it is referred to as `chunk` for the rest of the notebook.
chunk = pd.read_csv(f"{ROOT}/test.csv")

In [None]:
def load_scan(path,resize_type='no'):
    """
    Load every DICOM file of a folder into a list ordered by InstanceNumber.

    Each returned slice has its SliceThickness overwritten with one common
    value estimated from the first and last slice (ImagePositionPatient z,
    else SliceLocation, else the stored SliceThickness of the first slice).

    Parameters:
        path (str): folder holding the DICOM files of one scan.
        resize_type (str): when 'resize', PixelSpacing of every slice is
            rescaled by Rows/512 so that it describes a 512-pixel-wide image.

    Returns: slices (list of pydicom datasets)
    """
    slices = [pydicom.read_file(os.path.join(path, s)) for s in os.listdir(path)]
    slices.sort(key=lambda x: int(x.InstanceNumber))

    num_slices = len(slices)
    # NOTE(review): the span is divided by the slice count, not count - 1;
    # kept as-is so features stay comparable with previously computed ones.
    try:
        first_z = slices[0].ImagePositionPatient[2]
        last_z = slices[-1].ImagePositionPatient[2]
        slice_thickness = abs(last_z - first_z) / num_slices
    except Exception:
        # Fall back when the position tag is missing or unusable.
        try:
            slice_thickness = abs(slices[-1].SliceLocation - slices[0].SliceLocation) / num_slices
        except Exception:
            slice_thickness = slices[0].SliceThickness

    for s in slices:
        s.SliceThickness = slice_thickness
        if resize_type == 'resize':
            # PixelSpacing is a two-element sequence; scale each element
            # (multiplying the sequence itself by a float raises TypeError).
            scale = s.Rows / 512
            s.PixelSpacing = [float(spacing) * scale for spacing in s.PixelSpacing]
    return slices

def transform_to_hu(slices):
    """
    Convert the raw pixel data of DICOM slices to Hounsfield units.

    Parameters: slices (list of DICOM datasets exposing pixel_array,
                RescaleSlope and RescaleIntercept)
    Returns: int16 numpy array, one 2-D image per slice
    """
    volume = np.stack([dcm.pixel_array for dcm in slices]).astype(np.int16)

    # Raw values at or below -1000 are treated as outside-of-scan padding
    # and reset to 0 before the rescale is applied.
    volume[volume <= -1000] = 0

    for idx, dcm in enumerate(slices):
        intercept = dcm.RescaleIntercept
        slope = dcm.RescaleSlope

        if slope != 1:
            # Scale in float64; writing back into the int16 volume casts.
            volume[idx] = slope * volume[idx].astype(np.float64)

        volume[idx] += np.int16(intercept)

    return np.array(volume, dtype=np.int16)

In [None]:
class Test_Generate(Dataset):
    """Dataset that turns DICOM slices into normalised 1x512x512 float32 images.

    Each item is min-max scaled per slice, quantised to 8 bits, resized to
    512x512 when the slice has another size, and rescaled to [0, 1].
    """
    def __init__(self,imgs_dicom):
        # imgs_dicom: sequence of DICOM datasets (pixel_array, Rows, Columns are read).
        self.imgs_dicom = imgs_dicom

    def __getitem__(self,index):
        metainf = self.imgs_dicom[index]
        slice_img = metainf.pixel_array
        lowest, highest = slice_img.min(), slice_img.max()
        if highest > lowest:
            slice_img = (slice_img-lowest)/(highest-lowest)
        else:
            # Constant slice: avoid 0/0 (NaN and an undefined uint8 cast) and emit an all-zero image.
            slice_img = np.zeros(slice_img.shape, dtype=np.float64)
        slice_img = (slice_img*255).astype(np.uint8)

        if metainf.Rows!=512 or metainf.Columns!=512:
            slice_img = cv2.resize(slice_img,(512,512))

        # Add the channel axis and go back to float32 in [0, 1].
        slice_img = slice_img[None,:,:]
        slice_img = (slice_img/255).astype(np.float32)
        return slice_img

    def __len__(self):
        return len(self.imgs_dicom)

In [None]:
def crop_image(img: np.ndarray):
    """Drop rows and columns that consist only of the background value.

    The background value is taken from the top-left pixel; every row and
    column containing at least one different value is kept.
    """
    background = img[0, 0]
    foreground = img != background
    keep_rows = foreground.any(axis=1)
    keep_cols = foreground.any(axis=0)
    return img[np.ix_(keep_rows, keep_cols)]

def resize_image(img: np.ndarray,reshape=(512,512)):
    """Resize a 2-D image with cv2.resize.

    Parameters:
        img: image array.
        reshape: target size passed to cv2.resize as its dsize argument
            (OpenCV reads it as (width, height)). Defaults to (512, 512).
            Previously this argument was ignored and 512x512 was always used.
    Returns: resized image
    """
    return cv2.resize(img, tuple(reshape))

def preprocess_img(img,resize_type):
    """Apply the requested per-slice transform and return an int64 array.

    Parameters:
        img: iterable of 2-D images.
        resize_type: 'resize' resizes each slice, 'crop' crops each slice,
            anything else leaves the slices untouched.
    Returns: numpy array of dtype int64
    """
    if resize_type == 'resize':
        processed = list(map(resize_image, img))
    elif resize_type == 'crop':
        processed = list(map(crop_image, img))
    else:
        processed = img

    return np.array(processed, dtype=np.int64)


def caculate_lung_volume(patient_scans,patient_masks):
    """
    Sum the volume of all non-zero mask voxels.

    Parameters: patient_scans (list of DICOM slices providing PixelSpacing
                and SliceThickness), patient_masks (one mask per slice)
    Returns: accumulated voxel volume multiplied by 0.001 (float)
    """
    total = 0
    for idx, mask in enumerate(patient_masks):
        scan = patient_scans[idx]
        spacing = scan.PixelSpacing
        thickness = scan.SliceThickness
        # voxel count * in-plane pixel area * slice thickness
        total += np.count_nonzero(mask)*spacing[0]*spacing[1]*thickness

    return total*0.001

def caculate_histgram_statistical(patient_images,patient_masks,thresh = [-600,-250]):
    """
    Histogram statistics of the masked CT values.

    Statistics are computed twice: over all non-zero masked values of the
    volume, and over the single slice whose mask has the largest sum
    (the 'mid*' entries).

    Parameters: patient_images (array of slices), patient_masks (array of
                the same shape), thresh (lower, upper) bounds used for midHAA
    Returns: dict with Mean, Median, Skew, Kurthosis, HAA, their mid*
             counterparts, and pro_0 ... pro_900 (fraction of values inside
             each open 100-wide interval (-r-100, -r))
    """
    masked = patient_masks*patient_images
    # Values that are exactly 0 after masking are discarded.
    lung_values = masked[masked != 0]
    in_haa_band = (lung_values > -1000) & (lung_values < 0)

    # Slice with the largest mask area.
    mask_area = np.sum(patient_masks,axis=(1,2))
    mid_index = np.argsort(mask_area)[-1]
    mid_slice = masked[mid_index]
    mid_values = mid_slice[mid_slice != 0]
    in_mid_band = (mid_values > thresh[0]) & (mid_values < thresh[1])

    stats = {
        'Mean': np.mean(lung_values),
        'Median': np.median(lung_values),
        'Skew': skew(lung_values),
        'Kurthosis': kurtosis(lung_values),
        'HAA': len(lung_values[in_haa_band])/len(lung_values),
        'midMean': np.mean(mid_values),
        'midSkew': skew(mid_values),
        'midKurthosis': kurtosis(mid_values),
        'midMedian': np.median(mid_values),
        'midHAA': len(mid_values[in_mid_band])/len(mid_values),
    }

    # Share of values in each 100-wide band; both bounds are exclusive.
    for r in range(0,1000,100):
        in_bin = (lung_values > -r-100) & (lung_values < -r)
        stats[f'pro_{r}'] = len(lung_values[in_bin])/len(lung_values)
    return stats

In [None]:
# Re-binds `device`, now naming the first CUDA GPU explicitly.
device =  torch.device('cuda:0')
checkpoint = '../input/2020osic/best_lung_Unet_densenet121_my.pth'
# U-Net with a densenet121 encoder, one input channel and one sigmoid output
# channel. No pretrained encoder weights are requested (encoder_weights=None);
# all weights come from the checkpoint loaded on the next line.
Model_M = smp.Unet('densenet121', classes=1, in_channels=1,activation='sigmoid',encoder_weights=None).to(device)
Model_M.load_state_dict(torch.load(checkpoint))
#Model_M.eval()

def Unet_mask(model_m: nn.Module,input_data: DataLoader):
    """Run a segmentation model over a loader and return binary masks.

    Parameters:
        model_m: model returning per-pixel scores of shape (batch, 1, H, W).
        input_data: DataLoader yielding image batches. Previously the
            function ignored this argument and read the global `test_loader`.
    Returns: integer numpy array (num_images, H, W) with 1 where the model
        output is above 0.5 and 0 elsewhere.
    Raises: ValueError (from np.concatenate) when the loader yields no batch.
    """
    model_m.eval()
    # Run on whatever device the model lives on instead of a module-level global.
    try:
        run_device = next(model_m.parameters()).device
    except StopIteration:
        run_device = torch.device('cpu')

    outs = []
    with torch.no_grad():
        for image in input_data:
            out = model_m(image.to(run_device))
            out = out.cpu().numpy()
            out = np.where(out>0.5,1,0)
            # Drop the channel axis.
            outs.append(np.squeeze(out,axis=1))

    return np.concatenate(outs)

In [None]:
ct_root_path = '../input/osic-pulmonary-fibrosis-progression/test/'

# One row per test patient: lung volume plus the histogram statistics.
lung_stat_pd = pd.DataFrame(columns=['Patient','Volume',
                                     'Mean','Median','Skew','Kurthosis','HAA',
                                     'midMean','midMedian','midSkew','midKurthosis','midHAA']
                                    + [f'pro_{r}' for r in range(0,1000,100)])
haa_thresh = [-600,-250]
for i,p in enumerate(tqdm(pd.unique(chunk['Patient']))):

    lung_stat_pd.loc[i,'Patient'] = p

    # Segment the lungs slice by slice with the U-Net.
    patient_scans = load_scan(ct_root_path + p)
    test_db = Test_Generate(patient_scans)
    test_loader = DataLoader(test_db, batch_size=8, shuffle=False, num_workers=4)
    masks = Unet_mask(Model_M,test_loader)

    # Hounsfield images, brought to the 512x512 grid of the masks when needed.
    patient_images = transform_to_hu(patient_scans)
    if patient_images[0].shape!=(512,512):
        patient_images = preprocess_img(patient_images,'resize')

    # NOTE(review): the scans are loaded without 'resize', so PixelSpacing is
    # not rescaled even when the masks were produced on resized slices -- confirm
    # this matches how the training features were generated.
    lung_stat_pd.loc[i,'Volume'] = caculate_lung_volume(patient_scans,masks)

    statistical_characteristic = caculate_histgram_statistical(patient_images,masks,haa_thresh)

    # Every column after 'Patient' and 'Volume' is a key of the statistics dict.
    for stat_name in lung_stat_pd.columns[2:]:
        lung_stat_pd.loc[i,stat_name] = statistical_characteristic[stat_name]

lung_stat_pd.head()

In [None]:
# Attach the CT statistics to the test rows; with no `on=` given, merge
# joins on the columns the two frames share.
chunk = chunk.merge(lung_stat_pd)

print("add infos")
# Split the 'Patient_Week' id of the sample submission on '_' into the
# patient id (first part) and the integer week (last part).
sub = pd.read_csv(f"{ROOT}/sample_submission.csv")
sub['Patient'] = sub['Patient_Week'].apply(lambda x:x.split('_')[0])
sub['Weeks'] = sub['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub =  sub[['Patient','Weeks','Confidence','Patient_Week']]
# Copy every column of `chunk` except its own Weeks onto each submission row.
sub = sub.merge(chunk.drop('Weeks', axis=1), on="Patient")
sub.head()

In [None]:
# Tag each table with its origin before stacking them into one frame.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
sub['WHERE'] = 'test'
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The row order (tr, chunk, sub) and the original indices are preserved.
data = pd.concat([tr, chunk, sub])

In [None]:
# Earliest week per patient. Rows tagged 'test' are blanked first so that
# they do not take part in the per-patient minimum.
data['min_week'] = data['Weeks']
data.loc[data.WHERE=='test','min_week'] = np.nan
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

In [None]:
data.head()

In [None]:
# Per-patient FVC at the earliest week: rows where Weeks equals min_week,
# first such row per patient, FVC renamed to min_FVC.
base = (
    data
    .loc[data.Weeks == data.min_week][['Patient','FVC']]
    .rename({'FVC': 'min_FVC'}, axis=1)
    .groupby('Patient')
    .first()
    .reset_index()
)

In [None]:
# Broadcast min_FVC to every row of the patient and express each visit as
# the number of weeks since the patient's earliest week.
data = data.merge(base, on='Patient', how='left')
data['base_week'] = data['Weeks'] - data['min_week']
del base

# Same construction for Percent: value at the earliest week, as min_Percent.
base = (
    data
    .loc[data.Weeks == data.min_week][['Patient','Percent']]
    .rename({'Percent': 'min_Percent'}, axis=1)
    .groupby('Patient')
    .first()
    .reset_index()
)
data = data.merge(base, on='Patient', how='left')
del base

data.head()

In [None]:
"""
data['age'] = (data['Age'] - data['Age'].min() ) / ( data['Age'].max() - data['Age'].min() )
#data['BASE'] = (data['min_FVC'] - data['min_FVC'].min() ) / ( data['min_FVC'].max() - data['min_FVC'].min())
data['week'] = data['base_week']# - data['base_week'].min() ) / ( data['base_week'].max() - data['base_week'].min() )
data['percent'] = (data['min_Percent'] - data['min_Percent'].min() ) / ( data['min_Percent'].max() - data['min_Percent'].min())

data['volume'] = (data['Volume'] - tr['Volume'].min() ) / ( tr['Volume'].max() - tr['Volume'].min())
data['mean'] = (data['Mean'] - tr['Mean'].min()) / (tr['Mean'].max() - tr['Mean'].min())
data['skew'] = (data['Skew'] - tr['Skew'].min())/(tr['Skew'].max() - tr['Skew'].min())
data['median'] = (data['Median'] - tr['Median'].min()) / (tr['Median'].max() - tr['Median'].min())
data['kurthosis'] = (data['Kurthosis'] - tr['Kurthosis'].min())/(tr['Kurthosis'].max() - tr['Kurthosis'].min())
data['haa'] = (data['HAA'] - tr['HAA'].min())/(tr['HAA'].max() - tr['HAA'].min())


data['midmedian'] = (data['midMedian'] - tr['midMedian'].min() ) / ( tr['midMedian'].max() - tr['midMedian'].min())
data['midmean'] = (data['midMean'] - tr['midMean'].min()) / (tr['midMean'].max() - tr['midMean'].min())
data['midskew'] = (data['midSkew'] - tr['midSkew'].min())/(tr['midSkew'].max() - tr['midSkew'].min())
data['midkurthosis'] = (data['midKurthosis'] - tr['midKurthosis'].min())/(tr['midKurthosis'].max() - tr['midKurthosis'].min())
data['midhaa'] = (data['midHAA'] - tr['midHAA'].min())/(tr['midHAA'].max() - tr['midHAA'].min())

data['pro_0'] = (data['pro_0']-tr['pro_0'].min())/(tr['pro_0'].max() - tr['pro_0'].min())
data['pro_100'] = (data['pro_100']-tr['pro_100'].min())/(tr['pro_100'].max() - tr['pro_100'].min())
data['pro_200'] = (data['pro_200']-tr['pro_200'].min())/(tr['pro_200'].max() - tr['pro_200'].min())
data['pro_300'] = (data['pro_300']-tr['pro_300'].min())/(tr['pro_300'].max() - tr['pro_300'].min())
data['pro_400'] = (data['pro_400']-tr['pro_400'].min())/(tr['pro_400'].max() - tr['pro_400'].min())
data['pro_500'] = (data['pro_500']-tr['pro_500'].min())/(tr['pro_500'].max() - tr['pro_500'].min())
data['pro_600'] = (data['pro_600']-tr['pro_600'].min())/(tr['pro_600'].max() - tr['pro_600'].min())
data['pro_700'] = (data['pro_700']-tr['pro_700'].min())/(tr['pro_700'].max() - tr['pro_700'].min())
data['pro_800'] = (data['pro_800']-tr['pro_800'].min())/(tr['pro_800'].max() - tr['pro_800'].min())
data['pro_900'] = (data['pro_900']-tr['pro_900'].min())/(tr['pro_900'].max() - tr['pro_900'].min())

data['res_fvc'] = data['min_FVC']-data['FVC']


data.loc[:,"Sex"] = pd.factorize(data.Sex)[0]
data.loc[:,"SmokingStatus"] = pd.factorize(data.SmokingStatus)[0]
data['Sex'] = (data['Sex'] - data['Sex'].min() ) / ( data['Sex'].max() - data['Sex'].min() )
data['SmokingStatus'] = (data['SmokingStatus'] - data['SmokingStatus'].min() ) / ( data['SmokingStatus'].max() - data['SmokingStatus'].min())
"""                                                                            
"""
data['age'] = data['Age']
data['BASE'] = data['min_FVC']#-data['FVC']
data['res_FVC'] = data['min_FVC']-data['FVC']
data['week'] = data['base_week']# - data['base_week'].min()
data['percent'] = data['Percent']

data.head()

#FE += ['age','percent','week','BASE','volume']
#FE += ['age','percent','week','BASE','volume','mean','skew','kurthosis']
#FE += ['Age','Percent','base_week','min_FVC','Volume','Mean','Skew','Kurthosis']
"""

In [None]:

# Min-max scaling uses the statistics of the combined frame (train, val and test rows).
data['age'] = (data['Age'] - data['Age'].min() ) / ( data['Age'].max() - data['Age'].min() )
# Weeks since baseline are used unscaled.
data['week'] = data['base_week']
data['percent'] = (data['min_Percent'] - data['min_Percent'].min() ) / ( data['min_Percent'].max() - data['min_Percent'].min())

# (scaled column, raw column) pairs; the pro_* columns are overwritten in place.
for scaled_name, raw_name in [('volume','Volume'),('mean','Mean'),('skew','Skew'),
                              ('median','Median'),('kurthosis','Kurthosis'),('haa','HAA'),
                              ('midmedian','midMedian'),('midmean','midMean'),('midskew','midSkew'),
                              ('midkurthosis','midKurthosis'),('midhaa','midHAA')] \
                             + [(f'pro_{r}', f'pro_{r}') for r in range(0,1000,100)]:
    data[scaled_name] = (data[raw_name] - data[raw_name].min()) / (data[raw_name].max() - data[raw_name].min())

# Regression target: drop in FVC relative to the patient's baseline measurement.
data['res_fvc'] = data['min_FVC']-data['FVC']

# Categorical columns: integer codes from factorize, then the same min-max scaling.
for cat_name in ('Sex', 'SmokingStatus'):
    data.loc[:,cat_name] = pd.factorize(data[cat_name])[0]
    data[cat_name] = (data[cat_name] - data[cat_name].min() ) / ( data[cat_name].max() - data[cat_name].min() )
                                                                                  

"""
data['age'] = data['Age']
data['BASE'] = data['min_FVC']#-data['FVC']
data['res_FVC'] = data['min_FVC']-data['FVC']
data['week'] = data['base_week']# - data['base_week'].min()
data['percent'] = data['Percent']

data.head()

#FE += ['age','percent','week','BASE','volume']
#FE += ['age','percent','week','BASE','volume','mean','skew','kurthosis']
#FE += ['Age','Percent','base_week','min_FVC','Volume','Mean','Skew','Kurthosis']
"""

Start building the lung volume features

In [None]:

FE = ['percent','SmokingStatus','Sex','age','week','volume','mean','skew','kurthosis','haa','median',
     'midmean','midmedian','midskew','midkurthosis','midhaa',
     'pro_0','pro_100','pro_200','pro_300','pro_400','pro_500',
     'pro_600','pro_700','pro_800','pro_900']

#FE = ['percent','age','week','volume','haa','pro_0','pro_700','pro_800','pro_900']

In [None]:
# Split the combined frame back by origin. `test` and `sub` select the same
# rows (WHERE == 'test'); rows tagged 'val' are not kept.
tr = data[data.WHERE=='train']
test = data[data.WHERE=='test']
sub = data.loc[data.WHERE=='test']
del data
#tr = tr.merge(feature_ct, on='Patient')
#data = data.merge(lung_stat_pd, on='Patient', how='left')
tr.head()

### BASELINE NN 

In [None]:
import torch.nn.functional as F

class MishFunction(torch.autograd.Function):
    """Mish activation, x * tanh(softplus(x)), with a hand-written backward pass.

    Only the input is saved in forward; backward recomputes the intermediate
    values from it.
    """
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * torch.tanh(F.softplus(x))   # x * tanh(ln(1 + exp(x)))

    @staticmethod
    def backward(ctx, grad_output):
        # saved_tensors is the supported accessor; saved_variables is deprecated.
        (x,) = ctx.saved_tensors
        sigmoid = torch.sigmoid(x)
        tanh_sp = torch.tanh(F.softplus(x))
        # d/dx [x * tanh(sp(x))] = tanh(sp) + x * sigmoid(x) * (1 - tanh(sp)^2)
        return grad_output * (tanh_sp + x * sigmoid * (1 - tanh_sp * tanh_sp))

class Mish(nn.Module):
    """Module wrapper that applies MishFunction to its input."""
    def forward(self, x):
        activated = MishFunction.apply(x)
        return activated

def to_Mish(model):
    """Recursively replace every nn.ReLU child of `model` with Mish, in place."""
    for name, module in model.named_children():
        if not isinstance(module, nn.ReLU):
            # Not a ReLU: descend into its own children.
            to_Mish(module)
            continue
        setattr(model, name, Mish())

In [None]:
C1, C2 = torch.tensor(70,dtype=torch.float),torch.tensor(1000,dtype=torch.float)
C1, C2 = C1.to(device),C2.to(device)

def score_np(fvc_true, y_pred):
    """NumPy version of the metric for three-column quantile predictions.

    Column 1 of y_pred is the FVC estimate; column 2 minus column 0 is used
    as sigma. Sigma is floored at 70 and the absolute error capped at 1000.
    Returns the mean of sqrt(2)*error/sigma + log(sqrt(2)*sigma).
    """
    lower, centre, upper = y_pred[:,0], y_pred[:,1], y_pred[:,2]
    spread = np.maximum(upper - lower, 70)
    error = np.minimum(np.abs(fvc_true - centre), 1000)
    root2 = np.sqrt(2)
    return np.mean((error / spread)*root2 + np.log(spread* root2))

#=============================#
def score(y_true, y_pred):
    """Torch version of the metric for three-column quantile predictions.

    Parameters:
        y_true: tensor whose column 0 holds the target.
        y_pred: tensor with three columns; column 1 is the estimate and
            column 2 minus column 0 is used as sigma.
    Returns: scalar tensor, mean of sqrt(2)*error/sigma + log(sqrt(2)*sigma)
        with sigma floored at 70 and the absolute error capped at 1000.
    """
    y_true = y_true.to(torch.float)
    y_pred = y_pred.to(torch.float)

    # Build the clipping constants on the prediction's device so the function
    # no longer depends on module-level tensors pinned to one device.
    sigma_floor = torch.tensor(70, dtype=torch.float, device=y_pred.device)
    delta_cap = torch.tensor(1000, dtype=torch.float, device=y_pred.device)

    sigma = y_pred[:,2] - y_pred[:,0]
    fvc_pred = y_pred[:,1]
    sigma_clip = torch.max(sigma, sigma_floor)
    delta = torch.abs(y_true[:,0] - fvc_pred)
    delta = torch.min(delta, delta_cap)
    sq2 = torch.sqrt(torch.tensor(2, dtype=torch.float, device=y_pred.device))
    metric = (delta / sigma_clip)*sq2 + torch.log(sigma_clip* sq2)
    return torch.mean(metric)

#============================#
def qloss(y_true, y_pred):
    """Mean pinball loss over the quantiles 0.25, 0.50 and 0.75.

    The three columns of y_pred are matched with the three quantiles;
    y_true is broadcast against them.
    """
    quantiles = torch.tensor(np.array([[0.25, 0.50, 0.75]]), dtype=torch.float32).to(y_true.device)
    residual = y_true - y_pred
    pinball = torch.max(quantiles*residual, (quantiles-1)*residual)
    return torch.mean(pinball)


#=============================#
def mloss(_lambda):
    """Return a loss mixing qloss and score: _lambda*qloss + (1-_lambda)*score."""
    def loss(y_true, y_pred):
        pinball_part = qloss(y_true, y_pred)
        metric_part = score(y_true, y_pred)
        return _lambda * pinball_part + (1 - _lambda)*metric_part
    return loss
#=================
"""
class make_model(nn.Module):
    def __init__(self, in_ch, out_ch=3):
        super(make_model, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(in_ch, 160),
            Mish(),
            #nn.BatchNorm1d(160),
            nn.Dropout(0.3),
        )
        self.fc2 = nn.Sequential(
            nn.Linear(160, 128),
            Mish(),
            #nn.BatchNorm1d(128),
            nn.Dropout(0.25),
        )
        #self.fc3_p1 = nn.Linear(128, out_ch)
        self.fc3_p1 = nn.Sequential(
            nn.Linear(128, out_ch),
            Mish()
        )
        
        self.fc3_p2 = nn.Sequential(
            nn.Linear(128, out_ch),
            Mish()
        )
        #self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x1 = self.fc3_p1(x)
        x2 = self.fc3_p2(x)
        x = x1 + torch.cumsum(x2,dim=1)
        return x
"""

class GaussianNoise(nn.Module):
    """Additive Gaussian noise, active only in training mode.

    Args:
        sigma (float, optional): relative standard deviation. The sampled
            noise is multiplied by sigma times the input, so the same sigma
            works regardless of the scale of the input.
        is_relative_detach (bool, optional): when True the input is detached
            before it is used as the noise scale, so the scale is treated as
            a constant. When False gradients also flow through the scale.
    """
    def __init__(self, sigma=0.1, is_relative_detach=True):
        super().__init__()
        self.is_relative_detach = is_relative_detach
        self.sigma = sigma
        # Registered as a buffer, so it appears in the module's state_dict.
        self.register_buffer('noise', torch.tensor(0))

    def forward(self, x):
        # Pass-through in eval mode or when noise is disabled.
        if not self.training or self.sigma == 0:
            return x
        reference = x.detach() if self.is_relative_detach else x
        scale = self.sigma * reference
        sampled_noise = self.noise.expand(*x.size()).float().normal_() * scale
        return x + sampled_noise
    
class Model(nn.Module):
    """Three-stage MLP; each stage is BatchNorm -> Dropout -> weight-normed Linear.

    ReLU follows the first two linear layers; the last one is returned raw.
    """
    def __init__(self, num_features, num_targets, hidden_size=512):
        super().__init__()
        # Stage 1: input features -> hidden
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        # Stage 2: hidden -> hidden
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.5)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        # Stage 3: hidden -> targets
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.5)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        hidden = F.relu(self.dense1(self.dropout1(self.batch_norm1(x))))
        hidden = F.relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        return self.dense3(self.dropout3(self.batch_norm3(hidden)))
    
class make_big_model(nn.Module):
    """MLP: Gaussian input noise, then 256 -> 256 -> 128 Mish layers and a linear head.

    Dropout(0.5) follows the second and third hidden layers. The head has
    `out_ch` outputs (3 by default).
    """
    def __init__(self, in_ch, out_ch=3):
        super().__init__()
        self.gasus_nosie = GaussianNoise()
        self.fc1 = nn.Sequential(nn.Linear(in_ch, 256), Mish())
        self.fc2 = nn.Sequential(nn.Linear(256, 256), Mish(), nn.Dropout(0.5))
        self.fc3_p1 = nn.Sequential(nn.Linear(256, 128), Mish(), nn.Dropout(0.5))
        self.fc3_p2 = nn.Linear(128, out_ch)

    def forward(self, x):
        # Stages are applied strictly in this order.
        for stage in (self.gasus_nosie, self.fc1, self.fc2, self.fc3_p1, self.fc3_p2):
            x = stage(x)
        return x
    
class make_model(nn.Module):
    """Small MLP: in_ch -> 160 -> 128 with Mish, then a linear head of `out_ch` outputs."""
    def __init__(self, in_ch, out_ch=3):
        super().__init__()
        self.fc1 = nn.Sequential(nn.Linear(in_ch, 160), Mish())
        self.fc2 = nn.Sequential(nn.Linear(160, 128), Mish())
        # Registered (so it is part of the parameters and state_dict) but not
        # used in forward.
        self.fc3_p1 = nn.Sequential(nn.Linear(128, 64), Mish())
        self.fc3_p2 = nn.Linear(128, out_ch)

    def forward(self, x):
        hidden = self.fc2(self.fc1(x))
        return self.fc3_p2(hidden)
    

In [None]:
# Folds are assigned per patient, not per row, so all visits of one patient
# end up in the same fold. KFold is used without shuffling.
NFOLD = 5
kf = KFold(n_splits=NFOLD)
pd_patient = pd.DataFrame({"Patient":tr["Patient"].unique()})

# Each patient gets the index of the split in which it is a validation patient.
for idx, (tr_idx, val_idx) in enumerate(kf.split(pd_patient)):
    pd_patient.loc[val_idx,"fold"] = idx
pd_patient.head()

In [None]:
tr = tr.merge(pd_patient, on='Patient', how='left')
tr.head()

#tr['fold'] = -1
#for i in range(len(pd_patient)):
#    tr.fold[tr.Patient==pd_patient.loc[i,"Patient"]] = pd_patient.loc[i,"fold"]
#tr.head()

In [None]:

batch = 128

In [None]:
class Data_Generate(Dataset):
    """Dataset over an indexable feature container with optional labels.

    With labels, items are (features, label) where the label gets a leading
    axis of length 1; without labels, items are the features alone.
    """
    def __init__(self,data,label=None):
        self.label = label
        self.data = data

    def __getitem__(self,index):
        features = self.data[index]
        if self.label is None:
            return features
        # Prepend an axis so a scalar label becomes a length-1 array.
        target = self.label[index][None,]
        return features,target

    def __len__(self):
        return len(self.data)

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf instead of the np.Inf alias, which was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation result; sets `early_stop` once patience is exhausted.

        Args:
            val_loss (float): monitored value, lower is better.
            model (nn.Module): model whose state_dict is saved on improvement.
        """
        # Negated so that a larger score means a better result.
        score = -val_loss

        if self.best_score is None:
            # First call: always counts as the best result so far.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # No sufficient improvement.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
FE

In [None]:

%%time
cnt = 0
EPOCHS = 1000
# Training objective: 0.3 * pinball loss + 0.7 * metric (see mloss).
criterion = mloss(0.3)
#for tr_idx, val_idx in kf.split(z):
for fold in range(NFOLD):
    list_train_loss,list_val_loss,list_train_score,list_val_score = [],[],[],[]
    val_out = []
    print(f"FOLD {fold+1}")
    #==================load data kfold==========================#
    tr_z = tr[FE][tr.fold!=fold].values.astype(np.float32)
    tr_y = tr.res_fvc[tr.fold!=fold].values.astype(np.float32)
    val_z = tr[FE][tr.fold==fold].values.astype(np.float32)
    val_y = tr.res_fvc[tr.fold==fold].values.astype(np.float32)
    base_tr = tr['min_FVC'][tr.fold!=fold].values.astype(np.float32)
    # Fixed: the validation baseline must use the validation rows (== fold);
    # it previously reused the training mask.
    base_val = tr['min_FVC'][tr.fold==fold].values.astype(np.float32)
    train_db = Data_Generate(tr_z,tr_y)
    train_loader = DataLoader(train_db, batch_size=batch, shuffle=True, num_workers=4)
    val_db = Data_Generate(val_z,val_y)
    val_loader = DataLoader(val_db, batch_size=batch, shuffle=False, num_workers=4)

    #==================prepare model==========================#
    tr_num_batch = len(train_loader)
    val_num_batch = len(val_loader)
    net = make_big_model(len(FE),3).to(device)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.001 ,weight_decay=5e-4)
    # `verbose` is left at its default; the argument is deprecated in recent PyTorch releases.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20, min_lr=1e-8)
    # The best checkpoint of each fold is written to its own file.
    early_stopping = EarlyStopping(patience=80,path=f'Osic-NN-fold_{fold}.pth',verbose=False)

    for epoch in tqdm(range(EPOCHS)):
        train_loss,train_score,val_loss,val_score = 0,0,0,0
        #==================train ==========================#
        net.train()
        for idx, sample in enumerate(train_loader):
            feature, label = sample

            feature, label = feature.to(device), label.to(device)
            out = net(feature)
            loss = criterion(label, out)
            score_ = score(label ,out)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Per-epoch values are unweighted means over batches.
            train_loss += loss.item()/tr_num_batch
            train_score += score_.item()/tr_num_batch
        list_train_loss.append(train_loss)
        list_train_score.append(train_score)
        #==================val ==========================#
        net.eval()
        for idx, sample in enumerate(val_loader):
            feature, label = sample
            feature, label = feature.to(device), label.to(device)
            with torch.no_grad():
                out = net(feature)
            #val_out.append(out.cpu().numpy())
            loss = criterion(label, out)
            score_ = score(label, out)
            val_loss += loss.item()/val_num_batch
            val_score += score_.item()/val_num_batch
        list_val_loss.append(val_loss)
        list_val_score.append(val_score)
        # Early stopping (and checkpointing) follows the validation metric,
        # while the LR scheduler follows the validation loss.
        early_stopping(val_score, net)
        if early_stopping.early_stop:
            print("Early stopping")
            break
        scheduler.step(val_loss)

    print(f"train loss: {min(list_train_loss)}  train score: {min(list_train_score)}\n \
          val loss: {min(list_val_loss)} val score: {min(list_val_score)}\n \
          final lr: {optimizer.param_groups[0]['lr']}"
         )
    

In [None]:
def score_np(fvc_true, fvc_pred, sigma):
    """Return the mean clipped Laplace-style score of FVC predictions.

    ``sigma`` is floored at 70 and the absolute error is capped at 1000.
    Each sample then contributes
    ``sqrt(2) * error / sigma + log(sqrt(2) * sigma)`` and the mean over all
    samples is returned (lower is better).
    """
    root_two = np.sqrt(2)
    floored_sigma = np.maximum(sigma, 70)
    capped_error = np.minimum(np.abs(fvc_true - fvc_pred), 1000)
    per_sample = (capped_error / floored_sigma) * root_two + np.log(floored_sigma * root_two)
    return np.mean(per_sample)


# Out-of-fold evaluation of the saved NN checkpoints: every fold's model
# predicts the rows it did not train on, and the pooled predictions are scored.
scores, sigma, mean_FVC, true_y = [], [], [], []

#tr = new_tr
# Float placeholders: the columns receive float predictions below, so starting
# from an integer -1 would force a dtype upcast on the first assignment.
tr['pred_FVC'] = -1.0
tr['Confidence'] = -1
tr['pred_Confidence'] = np.nan  # rows not covered by any fold stay NaN
preds = []
for fold in range(NFOLD):
    net = make_big_model(in_ch=len(FE)).to(device)
    net.load_state_dict(torch.load(f"Osic-NN-fold_{fold}.pth"))
    net.eval()

    in_fold = tr.fold == fold
    tr_z = tr.loc[in_fold, FE].values.astype(np.float32)
    tr_y = tr.loc[in_fold, 'FVC'].values.astype(np.float32)
    base_val = tr.loc[in_fold, 'min_FVC'].values.astype(np.float32)
    valid_db = Data_Generate(tr_z, tr_y)
    valid_loader = DataLoader(valid_db, batch_size=batch, shuffle=False, num_workers=4)

    pred = []
    for feature, _ in valid_loader:  # labels are not needed for inference
        with torch.no_grad():
            out = net(feature.to(device))
        pred.append(out.cpu().numpy())
    pred = np.concatenate(pred)# + base_val[:,None]
    preds.append(pred)
    # Output column 1 is subtracted from the baseline FVC to get the point
    # prediction; the spread between columns 2 and 0 is used as the confidence.
    tr.loc[in_fold, 'pred_FVC'] = base_val - pred[:, 1]
    tr.loc[in_fold, 'pred_Confidence'] = abs(pred[:, 2] - pred[:, 0])


order_tr = tr.groupby(tr['Patient']).apply(lambda x:x.sort_values("Weeks",ascending=False))
p_id = pd.unique(tr.Patient)

# Read whole columns at once instead of materialising every row with tr.iloc[i].
sigma = tr['pred_Confidence'].tolist()
mean_FVC = tr['pred_FVC'].tolist()
true_y = tr['FVC'].tolist()

score_np(np.array(true_y),np.array(mean_FVC),np.array(sigma))
#np.array(scores).mean()

In [None]:
# Score every patient separately: once over all visits and once over the
# last three rows of that patient's slice.
all_mean, last_mean = [], []
for i in tr.Patient.unique():
    patient_rows = tr[tr.Patient == i]
    sigma = patient_rows.pred_Confidence
    mean_FVC = patient_rows.pred_FVC
    true_y = patient_rows.FVC
    y_arr, mu_arr, sig_arr = np.array(true_y), np.array(mean_FVC), np.array(sigma)
    all_mean.append(score_np(y_arr, mu_arr, sig_arr))
    last_mean.append(score_np(y_arr[-3:], mu_arr[-3:], sig_arr[-3:]))

print(np.mean(all_mean),np.mean(last_mean))

In [None]:
# Keep a copy of `tr` under a second name; `tr` itself is re-read from
# train.csv in a later cell, while the blending cell still reads `tr_1`.
tr_1 = tr.copy()

In [None]:
"""
scores,sigma,mean_FVC,true_y = [],[],[],[]
for i in p_id:
    last3_index = order_tr.loc[i,'Weeks'].index[:3]
    sigma.append(order_tr.iloc[last3_index].pred_Confidence)
    mean_FVC.append(order_tr.iloc[last3_index].pred_FVC)
    true_y.append(order_tr.iloc[last3_index].FVC)
    
    scores.append(score_np(np.array(true_y),np.array(mean_FVC),np.array(sigma)))

np.array(scores).mean()
"""

In [None]:
# Bare expression: shows `test` as the cell output (presumably defined in an earlier cell).
test

In [None]:
# For each test patient, store in `week` the offset of every submission week
# from that patient's week in test.csv.
test = pd.read_csv(f'{ROOT}/test.csv')
for patient in test.Patient:
    sub_rows = sub.Patient == patient
    reference_week = test.loc[test.Patient == patient, 'Weeks'].values
    sub.loc[sub_rows, 'week'] = sub.loc[sub_rows, 'Weeks'].values - reference_week

sub

In [None]:
# Feature matrix and baseline FVC for the submission rows, plus a loader
# that feeds the features to the network in their original order.
ze = sub[FE].values.astype(np.float32)
base_eval = sub['min_FVC'].values.astype(np.float32)
pe = np.zeros((len(ze), 3))

test_db = Data_Generate(ze)
test_loader = DataLoader(dataset=test_db, batch_size=batch, num_workers=4, shuffle=False)

In [None]:
print("predict test...")
# Average the per-fold test predictions.  Fold 2 is left out of the ensemble,
# exactly as before (the reason is not visible in this notebook section).
# The average divides by the number of folds actually used rather than a
# hard-coded 4, so it stays correct if NFOLD changes.
folds_used = [k for k in range(NFOLD) if k != 2]
base_fvc = sub.FVC.values.astype(np.float32)  # loop-invariant baseline FVC
pe = 0
for k in folds_used:
    net = make_big_model(len(FE)).to(device)
    net.load_state_dict(torch.load(f"Osic-NN-fold_{k}.pth"))
    net.eval()

    pred = []
    for features in test_loader:
        with torch.no_grad():
            out = net(features.to(device))
        pred.append(out.cpu().numpy())
    # The network output is subtracted from the baseline FVC.
    pred = base_fvc[:, None] - np.concatenate(pred)
    pe += pred / len(folds_used)

In [None]:
# Column 1 of the averaged prediction is the FVC estimate; the absolute
# spread between columns 2 and 0 is stored as the confidence.
lower_col, fvc_col, upper_col = pe[:, 0], pe[:, 1], pe[:, 2]
sub['FVC1'] = fvc_col
sub['Confidence1'] = np.abs(upper_col - lower_col)

In [None]:
# Rename the NN prediction columns to the submission names and write the CSV.
subm = sub[['Patient_Week','FVC1','Confidence1']].rename(
    columns={'FVC1':'FVC','Confidence1':'Confidence'}
)
subm.to_csv("submission.csv", columns=["Patient_Week","FVC","Confidence"], index=False)

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
import joblib

In [None]:
# Build one combined table of train / test ("val") / submission ("test") rows
# and attach each patient's baseline week, FVC and Percent.
ROOT = "../input/osic-pulmonary-fibrosis-progression"

tr = pd.read_csv(f"{ROOT}/train.csv")
tr.drop_duplicates(keep=False, inplace=True, subset=['Patient','Weeks'])
feature_ct = pd.read_csv('../input/2020osic/CT_21feature.csv')

tr = tr.merge(feature_ct, on='Patient')
chunk = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
sub = pd.read_csv(f"{ROOT}/sample_submission.csv")
sub['Patient'] = sub['Patient_Week'].apply(lambda x:x.split('_')[0])
sub['Weeks'] = sub['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub =  sub[['Patient','Weeks','Confidence','Patient_Week']]
sub = sub.merge(chunk.drop('Weeks', axis=1), on="Patient")

tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
sub['WHERE'] = 'test'

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([tr, chunk, sub])
# Baseline week = earliest week per patient, ignoring the submission rows.
data['min_week'] = data['Weeks']
data.loc[data.WHERE=='test','min_week'] = np.nan
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

# Baseline FVC: value on the first row recorded at the baseline week.
base = (
    data
    .loc[data.Weeks == data.min_week][['Patient','FVC']]
    .rename({'FVC': 'min_FVC'}, axis=1)
    .groupby('Patient')
    .first()
    .reset_index()
)

data = data.merge(base, on='Patient', how='left')
data['base_week'] = data['Weeks'] - data['min_week']
del base

# Baseline Percent, obtained the same way.
base = (
    data
    .loc[data.Weeks == data.min_week][['Patient','Percent']]
    .rename({'Percent': 'min_Percent'}, axis=1)
    .groupby('Patient')
    .first()
    .reset_index()
)
data = data.merge(base, on='Patient', how='left')
del base

# Integer-encode the categoricals in order of first appearance.  Plain column
# assignment lets the column take the integer dtype of the codes
# (`.loc[:, col] = ...` may keep the old object dtype on pandas >= 2.0).
data["Sex"] = pd.factorize(data.Sex)[0]
data["SmokingStatus"] = pd.factorize(data.SmokingStatus)[0]

# Explicit copy: later cells add columns to train_df.
train_df = data[data.WHERE=='train'].copy()

train_df.head()

In [None]:
# Empty per-patient table; the loop further down in this cell fills one row per
# patient: the patient id, the three values returned by model_slope, and 'slp'.
train_reg = pd.DataFrame(columns=['Patient', 'q2_Slope', 'q5_Slope','q8_Slope','slp'])

def model_slope(x):
    """Summarise the slopes between the first row and every later row.

    Each row of ``x`` is ``(value, offset)``.  For every row after the first
    the slope ``(value - first_value) / offset`` is computed.  Returns a tuple
    of: the mean of all slopes, the mean of the larger half and the mean of
    the remaining (smaller) half, where the halves are split at
    ``len(slopes) // 2`` after sorting in descending order.
    """
    baseline = x[0][0]
    slope_1 = []
    for row in x[1:]:
        slope_1.append((row[0] - baseline) / row[1])

    ranked = sorted(slope_1, reverse=True)
    half = len(slope_1) // 2
    upper_half, lower_half = ranked[:half], ranked[half:]
    return np.mean(slope_1), np.mean(upper_half), np.mean(lower_half)
    
# One row per patient: slope summaries from model_slope plus the ordinary
# least-squares slope of FVC against Weeks ('slp').
for idx, i in enumerate(pd.unique(train_df['Patient'])):
    # Select the patient's rows once.  A dedicated name is used so the global
    # `sub` (the submission frame) is not overwritten by this loop.
    patient_rows = train_df.loc[train_df.Patient == i, :]

    FVC_Weeks = patient_rows[['FVC', 'base_week']].values
    slope_m, slope_f, slope_l = model_slope(FVC_Weeks)

    fvc = patient_rows.FVC.values
    weeks = patient_rows.Weeks.values
    # Design matrix [weeks, 1] -> solution is (slope, intercept).
    c = np.vstack([weeks, np.ones(len(weeks))]).T
    # rcond=None selects NumPy's current default cut-off and avoids the
    # FutureWarning older NumPy versions emit when it is omitted.
    a, _ = np.linalg.lstsq(c, fvc, rcond=None)[0]

    train_reg.loc[idx] = [i, slope_m, slope_f, slope_l, a]
train_reg.head()

In [None]:
# Keep, for every patient, the first row recorded at the baseline week and
# join it onto the per-patient slope table.  The selection is vectorised:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the frame on every iteration.
baseline_rows = train_df[train_df.Weeks == train_df.min_week]
final_df = baseline_rows.drop_duplicates(subset='Patient', keep='first')

# The old loop failed on a patient without a baseline row; keep that loud
# instead of silently dropping the patient in the inner merge below.
patients_without_baseline = set(train_df.Patient) - set(final_df.Patient)
if patients_without_baseline:
    raise ValueError(f"no baseline-week row for patients: {sorted(patients_without_baseline)}")

train_reg = train_reg.merge(final_df)
train_reg = train_reg.drop(['min_week','base_week'],axis=1)
train_reg.head()

In [None]:
# Re-encode the two categorical columns as integer codes, then min-max scale
# SmokingStatus to [0, 1].
for cat_col in ("Sex", "SmokingStatus"):
    train_reg.loc[:, cat_col] = pd.factorize(train_reg[cat_col])[0]

smoking = train_reg['SmokingStatus']
train_reg['SmokingStatus'] = (smoking - smoking.min()) / (smoking.max() - smoking.min())

train_reg.head()

In [None]:
# Assign each patient row to one of NFOLD consecutive (unshuffled) folds.
NFOLD = 5
kf = KFold(n_splits=NFOLD)
train_reg['fold'] = -1
for fold_no, split in enumerate(kf.split(train_reg)):
    held_out = split[1]  # indices of the validation part of this split
    train_reg.loc[held_out, "fold"] = fold_no
train_reg.head()

In [None]:
# Join the CT feature table (on the columns both frames share) and keep an
# un-normalised backup for scaling the test set later.
train_reg = pd.merge(train_reg, feature_ct)
train_reg_bp = train_reg.copy()
train_reg.head()

In [None]:
def _min_max_scale(series):
    """Rescale a column to [0, 1] using its own minimum and maximum."""
    return (series - series.min()) / (series.max() - series.min())


# (target column, source column) pairs, processed in this order.  The pro_*,
# Sex and SmokingStatus columns are scaled in place.
_SCALED_COLUMNS = [
    ('age', 'Age'),
    ('BASE', 'min_FVC'),
    ('percent', 'min_Percent'),
    ('volume', 'Volume'),
    ('mean', 'Mean'),
    ('skew', 'Skew'),
    ('median', 'Median'),
    ('kurthosis', 'Kurthosis'),
    ('haa', 'HAA'),
    ('midmedian', 'midMedian'),
    ('midmean', 'midMean'),
    ('midskew', 'midSkew'),
    ('midkurthosis', 'midKurthosis'),
    ('midhaa', 'midHAA'),
]
_SCALED_COLUMNS += [(f'pro_{level}', f'pro_{level}') for level in range(0, 1000, 100)]
_SCALED_COLUMNS += [('Sex', 'Sex'), ('SmokingStatus', 'SmokingStatus')]

for target_col, source_col in _SCALED_COLUMNS:
    train_reg[target_col] = _min_max_scale(train_reg[source_col])

In [None]:
"""
######################xgboost##############################
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

FE = ['SmokingStatus','Sex','age','BASE','volume','mean','skew','kurthosis','haa','median',
     'midmean','midmedian','midskew','midkurthosis','midhaa',
     'pro_0','pro_100','pro_200','pro_300','pro_400','pro_500',
     'pro_600','pro_700','pro_800','pro_900']

pred_xgboost = []
train_reg['pre_s_xgb'] = -1
m,last3 = [],[]
paras={
    'booster':'gbtree',
 
    'objective':'reg:squarederror',
    'gamma':0.05,#树的叶子节点下一个区分的最小损失，越大算法模型越保守
    'lambda':8,#L2正则项权重
    'subsample':0.8,#采样训练数据，设置为0.5
    'colsample_bytree':0.7,#构建树时的采样比率
    'min_child_weight':1,#节点的最少特征数
    'eta':0.05,#类似学习率
    'seed':0,
    'nthread':4,#cpu线程数
    'eval_metric':'mae'
}

for n in range(NFOLD):
        
    print(f"FOLD {n+1}")
    #==================load data kfold==========================#
    tr_x,val_x = train_reg[train_reg.fold!=n][FE].values,train_reg[train_reg.fold==n][FE].values
    tr_y,val_y = train_reg[train_reg.fold!=n]['slp'].values,train_reg[train_reg.fold==n]['slp'].values

 
    my_model = XGBRegressor(**paras,n_estimators=1000)
    dtrain = xgb.DMatrix(tr_x, label=tr_y)
    dval = xgb.DMatrix(val_x, label=val_y)
    evallist = [(dval, 'eval'), (dtrain, 'train')]
    num_round = 10
    my_model.fit(tr_x, tr_y, early_stopping_rounds=10, 
             eval_set=[(val_x, val_y)], verbose=False)
    y_pred = my_model.predict(val_x)
    pred_xgboost.append(y_pred)
    print('The rmse of prediction is:', mean_absolute_error(val_y, y_pred))
    index = train_reg[train_reg.fold==n].index
    
    train_reg.loc[index,['pre_s_xgb']] = my_model.predict(val_x)
    for p in train_reg[train_reg.fold==n].Patient:
        
        percent_true = train_df.Percent.values[train_df.Patient == p]
        fvc_true = train_df.FVC.values[train_df.Patient == p]
        weeks_true = train_df.Weeks.values[train_df.Patient == p]
        a = train_reg[train_reg.Patient==p].pre_s_xgb.values
        
        fvc = a * (weeks_true - weeks_true[0]) + fvc_true[0]
        percent = percent_true[0] - a * abs(weeks_true - weeks_true[0])
        train_df.loc[train_df.Patient==p,'img_fvc_xgb'] = fvc
        train_df.loc[train_df.Patient==p,'img_con_xgb'] = percent
        m.append(score_np(fvc_true, fvc,percent))
        last3.append(score_np(fvc_true[-3:], fvc[-3:],percent[-3:]))
    print(f'all_mean:{(np.mean(m))},last3_mean:{np.mean(last3)}')
"""

In [None]:
# Per-fold random forest that predicts each patient's FVC slope ('slp') from
# the normalised baseline features, followed by an out-of-fold evaluation of
# the FVC trajectory implied by the predicted slope.
FE = ['percent','SmokingStatus','Sex','age','BASE','volume','mean','skew','kurthosis','haa','median',
     'midmean','midmedian','midskew','midkurthosis','midhaa',
     'pro_0','pro_100','pro_200','pro_300','pro_400','pro_500',
     'pro_600','pro_700','pro_800','pro_900']

# Float placeholder: the column receives float predictions below, so an
# integer -1 would force a dtype upcast on assignment.
train_reg['pre_s'] = -1.0
m,last3 = [],[]
for n in range(NFOLD):
    tr_x,val_x = train_reg[train_reg.fold!=n][FE].values,train_reg[train_reg.fold==n][FE].values
    tr_y,val_y = train_reg[train_reg.fold!=n]['slp'].values,train_reg[train_reg.fold==n]['slp'].values

    randomforest = RandomForestRegressor(n_estimators = 500, oob_score = True, n_jobs = -1,
                                    max_features = "sqrt", min_samples_leaf = 5, max_samples = 0.8)

    randomforest.fit(tr_x, tr_y)

    joblib.dump(randomforest, f'./all_rf_{n}.pkl')

    index = train_reg[train_reg.fold==n].index

    train_reg.loc[index,['pre_s']] = randomforest.predict(val_x)

    # Mean absolute error of the predicted slope on the held-out fold.  This
    # is NOT the forest's out-of-bag score, so the log line now says so.
    s = np.mean(abs(train_reg.loc[index,['pre_s']].values-train_reg.loc[index,['slp']].values))
    print(f"{n}th model's validation MAE (slope) : ", s)

    for p in train_reg[train_reg.fold==n].Patient:

        percent_true = train_df.Percent.values[train_df.Patient == p]
        fvc_true = train_df.FVC.values[train_df.Patient == p]
        weeks_true = train_df.Weeks.values[train_df.Patient == p]
        a = train_reg[train_reg.Patient==p].pre_s.values

        # Straight line through the patient's first visit with the predicted slope.
        fvc = a * (weeks_true - weeks_true[0]) + fvc_true[0]
        # Confidence heuristic: first Percent value minus slope * elapsed weeks.
        percent = percent_true[0] - a * abs(weeks_true - weeks_true[0])
        train_df.loc[train_df.Patient==p,'img_fvc'] = fvc
        train_df.loc[train_df.Patient==p,'img_con'] = percent
        m.append(score_np(fvc_true, fvc,percent))
        last3.append(score_np(fvc_true[-3:], fvc[-3:],percent[-3:]))
    # m / last3 accumulate across folds, so this is a running mean.
    print(f'all_mean:{(np.mean(m))},last3_mean:{np.mean(last3)}')

In [None]:
# Test metadata joined with `lung_stat_pd` on the columns they share.
test = pd.read_csv(f"{ROOT}/test.csv")
test_df = pd.merge(test, lung_stat_pd)
test_df.head()

In [None]:
# Scale the test features with the TRAINING min/max taken from the
# un-normalised backup frame.
train_reg = train_reg_bp


def _scale_with_train_range(test_col, train_col):
    """Min-max scale test_df[test_col] using the range of train_reg[train_col]."""
    lo = train_reg[train_col].min()
    hi = train_reg[train_col].max()
    return (test_df[test_col] - lo) / (hi - lo)


# (target column, test source column, training column that defines the range).
# Every range now comes from train_reg; the midKurthosis minimum was
# previously read from `data` by mistake.
_TEST_SCALING = [
    ('age', 'Age', 'Age'),
    ('BASE', 'FVC', 'min_FVC'),
    ('percent', 'Percent', 'min_Percent'),
    ('volume', 'Volume', 'Volume'),
    ('mean', 'Mean', 'Mean'),
    ('skew', 'Skew', 'Skew'),
    ('median', 'Median', 'Median'),
    ('kurthosis', 'Kurthosis', 'Kurthosis'),
    ('haa', 'HAA', 'HAA'),
    ('midmedian', 'midMedian', 'midMedian'),
    ('midmean', 'midMean', 'midMean'),
    ('midskew', 'midSkew', 'midSkew'),
    ('midkurthosis', 'midKurthosis', 'midKurthosis'),
    ('midhaa', 'midHAA', 'midHAA'),
]
_TEST_SCALING += [(f'pro_{level}',) * 3 for level in range(0, 1000, 100)]

for target_col, test_col, train_col in _TEST_SCALING:
    test_df[target_col] = _scale_with_train_range(test_col, train_col)


# Categoricals: reuse the TRAINING label order instead of re-factorising the
# test frame, whose first-appearance order need not match.  The codes are then
# divided by (number of labels - 1), which is what min-max scaling did to the
# training codes.  The backup frame cannot supply this range for
# SmokingStatus because that column was already scaled to [0, 1] there.
# NOTE(review): assumes `tr` still holds the raw string labels from train.csv.
for cat_col in ('Sex', 'SmokingStatus'):
    train_labels = pd.factorize(tr[cat_col])[1]
    codes = pd.Categorical(test_df[cat_col], categories=train_labels).codes
    if (codes < 0).any():
        raise ValueError(f"{cat_col}: test set contains labels not present in `tr`")
    test_df[cat_col] = codes / (len(train_labels) - 1)

In [None]:
# Build the random-forest submission: average the slope predicted by the
# saved per-fold forests and extrapolate each test patient's FVC linearly.
sub = pd.read_csv(f"{ROOT}/sample_submission.csv")
sub['Patient'] = sub['Patient_Week'].apply(lambda x:x.split('_')[0])
sub['Weeks'] = sub['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub =  sub[['Patient','Weeks','Confidence','Patient_Week']]
sub = sub.merge(chunk.drop('Weeks', axis=1), on="Patient")
sub.head()

FE = ['percent','SmokingStatus','Sex','age','BASE','volume','mean','skew','kurthosis','haa','median',
     'midmean','midmedian','midskew','midkurthosis','midhaa',
     'pro_0','pro_100','pro_200','pro_300','pro_400','pro_500',
     'pro_600','pro_700','pro_800','pro_900']

Y = ['slp']

# The forests were fitted on plain arrays, so predict on plain arrays too;
# the feature matrix does not change between folds and is built once.
val_x = test_df[FE].values
pred_y = []
for n in range(NFOLD):
    randomforest = joblib.load( f'./all_rf_{n}.pkl')
    pred_y.append(randomforest.predict(val_x))

pred_y = np.array(pred_y).mean(0)
test_df.loc[:,'preslp'] = pred_y

# These columns receive float predictions below.
sub[['FVC', 'Confidence']] = sub[['FVC', 'Confidence']].astype(float)

for p in test_df.Patient:
    in_test = test_df.Patient == p
    in_sub = sub.Patient == p

    a = test_df.loc[in_test, 'preslp'].values
    weeks_base = test_df.loc[in_test, 'Weeks'].values
    weeks_true = sub.loc[in_sub, 'Weeks'].values
    percent_true = sub.loc[in_sub, 'Percent'].values
    fvc_true = sub.loc[in_sub, 'FVC'].values

    # Straight line through the test.csv measurement with the predicted slope.
    fvc = a * (weeks_true - weeks_base[0]) + fvc_true[0]
    # Same confidence heuristic as the training cell: start from the patient's
    # Percent value.  The week number was used here before, leaving
    # percent_true unused.
    percent = percent_true[0] - a * abs(weeks_true - weeks_base[0])

    sub.loc[in_sub,'FVC'] = fvc
    sub.loc[in_sub,'Confidence'] = percent

sub.head()

subs = sub[["Patient_Week","FVC","Confidence"]]

In [None]:
# Blend the random-forest trajectory (weight l) with the NN prediction
# (weight 1 - l) and score the blend per patient.
l = 0.
m,last3 = [],[]
# Iterate over unique patients: looping over the raw Patient column scored
# every patient once per visit, which weighted the averages by visit count.
for i in train_df.Patient.unique():
    rf_rows = train_df[train_df.Patient==i]
    nn_rows = tr_1[tr_1.Patient==i]

    true_fvc = nn_rows.FVC
    # NOTE(review): these are pandas Series, so the sums align on index
    # labels; assumes train_df and tr_1 share the same row index - confirm.
    pred_fvc = rf_rows.img_fvc*l + (1-l)*nn_rows.pred_FVC
    pred_sig = rf_rows.img_con*l + (1-l)*nn_rows.pred_Confidence
    m.append(score_np(true_fvc, pred_fvc,pred_sig))
    last3.append(score_np(true_fvc[-3:], pred_fvc[-3:],pred_sig[-3:]))
print(f'all_mean:{(np.mean(m))},last3_mean:{np.mean(last3)}')