# XY's solution

## Features
1. Basic features like age,sex,..
2. Features generated by using lungmask (https://github.com/JoHof/lungmask/) as disclosed in https://www.kaggle.com/c/osic-pulmonary-fibrosis-progression/discussion/164883
3. Segmented images using method as described in https://www.kaggle.com/aadhavvignesh/lung-segmentation-by-marker-controlled-watershed

## Models to predict FVC
1. Custom Neural Network to predict slant of FVC decay
2. 3 LGBMs to predict slant of FVC decay ( differs in calculation of slant )
3. LGBM to predict FVC directly

## Procedure to predict Confidence
1. The out-of-fold (OOF) FVC prediction is calculated.
2. The OOF prediction error is calculated.
3. For that prediction error, the optimal confidence value (the one that gives the best score) is calculated.
4. A fitter is trained to predict that optimal confidence value from the features.
5. That confidence fitter is used to predict the test confidence from the test features.

## Custom Neural Network
- Inputs are features, original images, and segmented images.
- Sequences of images are encoded using part of the U-Net (R231): the bottom conv layer of the U-Net, which has the lowest-resolution output.
- The encoded values are fed to an LSTM.
- The final output is calculated from the table features and the LSTM outputs.

## How this notebook works
- Precalculated features are loaded (Datasets lungmasks5, classicsegmentedjpg )
- Pretrained custom neural network's weights are loaded (Datasets osicnn0927segonlyseed0, 1, 2, 3, 4); please ignore the name "segonly" — the model actually uses both the original images and the segmented images.
- The other predictors are trained to calculate parameters
- To generate submission file, first calculate features, then prediction by using above models.

## Two submission files
- One submission file is generated by the above models; the other is generated by an ensemble of the above models using 5 different fold splits.


---



## Config

In [None]:
import numpy as np

# Global notebook configuration; read by the feature builder, dataset and model code below.
CNF={
    'lstm_mem':64,
    'mixup':True,
    'lung_area_th':0.2, # slices whose lung area is below this fraction of the patient's max are dropped
    'calcAmethod':'simpleLinearRegress',# alternatives: 'huber', 'last3' (computes trueA from the last 3 measurements)
    'jpgPath':"../input/classicsegmentedjpg/classic_segmented/classic_segmented",
    'isTrain':False,
    'verbose':True,
    'DBG':True,
    'num_sequence':12,
    'LUNG_MODEL':'R231',
    'batch_size':3,  
    'mskDfPath':'../input/lungmasks5/LTRC_feat_df.csv',
    'trnPath':'../input/osic-pulmonary-fibrosis-progression/train',
    'mskjpgSizePklPath':'../input/lungmasks/cropped_orig_size_dict.pkl',
    'useLungVolume':True,
    'maxEpoch':20,
    'EvalOnlyLast3':True,
    'aug':False, # even when False, aspect-ratio handling is still applied
    'kl_weight':0.01,
    'EMA':True,
    
    
    # -------------------------------------------
    
    # One list of model prefixes per seed (0..4); ensembleWeight holds the matching per-model weights.
    'prefixes':[[f"osicnn0927segonlyseed{_seed}",f"A0LGBMv1seed{_seed}",f"ARLGBMv1seed{_seed}",f"LGBMv1seed{_seed}",f"weekLGBMv1seed{_seed}"] for _seed in range(5)],
    'ensembleWeight':[[3,1,2,1,2] for _ in range(5)],
    
    # -------------------------------------------
    
    
    'n_tta':1,
    'useLGBM':True,
    'quality':75, # jpg quality
 
}

# Normalise each weight vector so that it sums to 1.
CNF['ensembleWeight']=[ np.array(_ws)/np.sum(_ws) for _ws in CNF['ensembleWeight'] ]

# Every prefix group needs its own weight vector.
assert len(CNF['prefixes'])==len(CNF['ensembleWeight'])

# Per-model lungmask U-Net settings: 'pth' is the checkpoint path,
# 'nc' is presumably the number of output classes — TODO confirm.
LUNG_CNF={
    'R231':{'pth':'../input/pthunetlungmask/unet_r231-d5d2fc3d.pth','nc':3},
    'COVID19':{'pth':'../input/pthunetlungmask/unet_r231covid-0de78a7e.pth','nc':3},
    'LTRC':{'pth':'../input/pthunetlungmask/unet_ltrclobes-3a07043d.pth','nc':6}
}

!cp {CNF['jpgPath']+'/../../createClassicSegmented.py'} .


In [None]:
# %%capture
# !conda install -c conda-forge -y gdcm

In [None]:
%%capture
%cd ../input/python3gdcm
!dpkg -i build_1-1_amd64.deb
!apt-get install -f
!cp /usr/local/lib/gdcm.py /opt/conda/lib/python3.7/site-packages/.
!cp /usr/local/lib/gdcmswig.py /opt/conda/lib/python3.7/site-packages/.
!cp /usr/local/lib/_gdcmswig.so /opt/conda/lib/python3.7/site-packages/.
!cp /usr/local/lib/libgdcm* /opt/conda/lib/python3.7/site-packages/.
!ldconfig

In [None]:
%cd -

In [None]:
# !pip install pydicom==2.0

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import os,sys,random,gc
import seaborn as sns

import cv2
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import pydicom
from pydicom.pixel_data_handlers.util import apply_modality_lut
import pandas as pd
from osicutil import UNet
from scipy.stats import median_absolute_deviation as mad
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold,GroupKFold

from osicutil import ModelEMA
from osicutil import LinearARD,ELBOLoss,get_ard_reg,_get_dropped_params_cnt,_get_params_cnt,get_dropped_params_ratio # ARD
from osicutil import calcScore
from osicutil import _save,_load

def seed_everything(seed: int):
    """Seed every random number generator used by the notebook.

    Sets ``PYTHONHASHSEED``, seeds ``random``, NumPy and PyTorch (CPU and the
    current CUDA device) and switches cuDNN to deterministic mode.

    Args:
        seed: The integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

class HorizontalDisplay:
    """Render several rich-display objects side by side in a notebook.

    Each wrapped object must expose ``_repr_html_``; its HTML is placed in a
    left-floating ``<div>``.
    """

    def __init__(self, *args):
        self.args = args

    def _repr_html_(self):
        """Return the HTML of all wrapped objects, one floating div each."""
        cell = '<div style="float: left; padding: 10px;">{0}</div>'
        rendered = [cell.format(item._repr_html_()) for item in self.args]
        return "\n".join(rendered)


## create lung mask features

In [None]:
def ceateVerticalVolumeFeatDf(path,prefixXorY):
    """Build per-patient MAD features from the pickled "vertical" volumes in *path*.

    Every ``<PatientID>.pkl`` file in *path* is loaded with ``_load`` and turned
    into one row holding, for classes 1 and 2, the median absolute deviation of
    row ``cls_id``, of row ``cls_id + 6`` and of their ratio.
    (Per the original notes the rows are presumably 6 masks, 6 masked images
    and 1 lung area, 13 in total — TODO confirm against the producer.)

    Args:
        path: Directory containing the ``.pkl`` files.
        prefixXorY: Tag inserted into the feature names (e.g. ``'X'`` or ``'Y'``).

    Returns:
        ``(feature_df, feature_column_names)``; ``feature_df`` has a ``Patient``
        column and every row is indexed ``-1``.
    """
    print(f"{path}")

    eps = 1e-6
    feat_df = pd.DataFrame()

    pkl_names = [name for name in os.listdir(path) if name.endswith('.pkl')]

    for pkl_name in pkl_names:

        volume = _load(f"{path}/{pkl_name}")
        row = {'Patient': pkl_name[:-4]}

        # Only classes 1 and 2 are used; 3, 4, 5 did not seem to be useful.
        for cls_id in (1, 2):

            # NOTE(review): the crop is re-applied on every iteration, so class 2
            # is computed on a volume that has been trimmed twice. Kept as-is
            # because the downstream models were fitted on these exact features.
            volume = volume[:, 50:-50]

            mask_profile = volume[cls_id]
            image_profile = volume[cls_id + 6]

            # mask
            row[f"vert{prefixXorY}_msk_cls{cls_id}_mad"] = mad(mask_profile, axis=-1)
            # mask * image
            row[f"vert{prefixXorY}_mskdimage_cls{cls_id}_mad"] = mad(image_profile, axis=-1)
            # ratio (eps avoids division by zero)
            row[f"vert{prefixXorY}_ratio_cls{cls_id}_mad"] = mad(image_profile / (mask_profile + eps), axis=-1)

        # index=-1 so that the rows cannot be mistaken for positional indices
        feat_df = pd.concat([feat_df, pd.DataFrame(row, index=[-1])])

    return feat_df, feat_df.columns.drop('Patient').tolist()


## Confidence fitter

In [None]:
class ConfidenceFitterOptimizedLinear:
    """Linear confidence model whose weights are tuned directly on the score.

    A ``LinearRegression`` on the absolute prediction error provides the
    starting point; Nelder-Mead then searches for the linear coefficients
    passed to ``calcScore`` as the third argument.

    Attributes set by ``__init__``: ``reg`` (the initial regression) and
    ``res`` (the ``scipy.optimize`` result; ``res.x`` is ``[coef..., bias]``).
    """

    def __init__(self,feat,allFVC,allPredFVC):
        from scipy.optimize import minimize
        from sklearn.linear_model import LinearRegression

        abs_error = abs(allFVC-allPredFVC)
        self.reg = LinearRegression().fit(feat, abs_error)

        # Start the search from the least-squares solution: [coef..., intercept].
        start = np.concatenate([self.reg.coef_, [self.reg.intercept_] ])

        def confidence_loss_func(param):
            weights, bias = param[:-1], param[-1]
            return calcScore(allFVC, allPredFVC, feat@weights+bias)

        self.res = minimize(confidence_loss_func, start, method='Nelder-Mead', tol=1e-9)

    def predict(self,feat):
        """Return the predicted confidence for each row of *feat*."""
        coef = self.res.x
        return feat@coef[:-1]+coef[-1]


## Add Feature

In [None]:
from sklearn.linear_model import HuberRegressor
import glob


def addFeatureDf(df,mskDfPath,dicomPath,is_test=False): 
    """Add tabular, lung-mask and vertical-profile features to *df*.

    Args:
        df: Table with at least Patient, Weeks, FVC, Percent, Age, Sex and
            SmokingStatus columns. NOTE: the 'Week0' and 'Week0is0' columns
            are written into the passed frame in place before it is merged.
        mskDfPath: CSV with per-slice lung-mask statistics (read when
            CNF['useLungVolume'] is set); its parent directory is also used to
            locate the training "vertical" pickles.
        dicomPath: Not used in this function body.
        is_test: When True, the regression targets are not computed and the
            vertical features are read from ./test_verticalX / ./test_verticalY.

    Returns:
        (df, feat_cols): the enriched frame and the list of feature columns.
    """
    
    feat_cols=['Age','Female', 'Male', 'Currently smokes', 'Ex-smoker', 'Never smoked','cat_coef','FVC0','Week0is0']
    
    df['Week0'] = df['Weeks'] # ----------------- append Week0,FVC0,rWeeks(week-week0)
    df['Week0'] = df.groupby('Patient')['Week0'].transform('min')
    df['Week0is0'] = (df['Week0']==0).astype('int')
    
    # Baseline row per patient (first row measured at Week0): FVC0 and cat_coef = FVC0/Percent.
    base = df.loc[df.Weeks == df.Week0]
    base = base[['Patient', 'FVC','Percent']].copy()
    base.columns = ['Patient', 'FVC0','Percent']
    base['nb'] = 1
    base['nb'] = base.groupby('Patient')['nb'].transform('cumsum')
    base['cat_coef'] = base['FVC0'].values/base['Percent'].values
    base = base[base.nb == 1]
    base.drop(['nb','Percent'], axis=1, inplace=True)
    df = df.merge(base, on='Patient', how='left')

    df['rWeeks'] = df['Weeks'] - df['Week0'] 
    
    df = pd.concat([df, pd.get_dummies(df.Sex), pd.get_dummies(df.SmokingStatus)], axis=1).drop(columns=['Sex','SmokingStatus']) #  one-hot enc
    for col in ['Female', 'Male', 'Currently smokes', 'Ex-smoker', 'Never smoked']: # if there is no such values in df, create and set 0
        df[col] = 0 if col not in df.columns else df[col]
        df[col] = df[col].astype('int')
    
    df.Age = ( df.Age - 30 ) / 30 
    
    if CNF['useLungVolume']: # -------------------- Create Lung Volume Features
        
        _df = pd.read_csv(mskDfPath)
        if not is_test:
            _df = _df.drop(index=_df.loc[(_df.Patient=='ID00078637202199415319443')&(_df.file_id>509)].index).reset_index(drop=True) # remove duplicated sequence 
        
        # drop non lung image
        _df['lung_area'] = _df['lung_area']*_df['dx']*_df['dy']
        _lung_area_df = _df[['Patient','file_id','lung_area']]
        #_df = _df.drop(index=_df.loc[(_df.lung_area<400)].index).reset_index(drop=True) 
        
        for p in _lung_area_df.Patient.unique(): # ----------------------------------------------------  Drop small area
            _th = _lung_area_df[_lung_area_df.Patient==p].lung_area.max()*CNF['lung_area_th']
            _df = _df.drop(index=_df.loc[(_df.Patient==p)&(_df.lung_area<_th)].index).reset_index(drop=True) # new th for new mask
        
        
        # Class ids are taken from the last character of the area_cls* column names.
        cls_ids = [ int(col[-1]) for col in _df.columns.tolist() if col.startswith('area_cls')]
        
        for cls_id in cls_ids:
            _df[f"area_cls{cls_id}"]*=_df['dx']*_df['dy']
            _df[f"mskd_img_sum_cls{cls_id}"]*=_df['dx']*_df['dy']
            _df[f"mskd_ratio_cls{cls_id}"]=_df[f"mskd_img_sum_cls{cls_id}"]/_df[f"area_cls{cls_id}"]

        mask_feat_cols = [ f"area_cls{cls_id}" for cls_id in cls_ids ]+[ f"mskd_img_sum_cls{cls_id}" for cls_id in cls_ids ]+[ f"mskd_ratio_cls{cls_id}" for cls_id in cls_ids ] 
        
        mask_feat = _df.groupby('Patient')[mask_feat_cols].agg(['median','mean',mad]).reset_index()
        
        mask_feat.columns = ["_".join(pair) if '' not in pair else pair[0] for pair in mask_feat.columns ] # flatten the MultiIndex columns
        
        df=df.merge(mask_feat,how='left',on='Patient')
        # Original note: "the last column is r231_lung_area".
        # NOTE(review): with the aggregation above the last column looks like a
        # mskd_ratio_* feature instead — confirm that dropping it via [1:-1] is intended.
        feat_cols+=mask_feat.columns[1:-1].tolist()
        
        # dicom info 
        aux_df = _df[['Patient','file_id']].groupby('Patient')['file_id'].apply(max).reset_index().rename(columns={'file_id':'max_file_id'})
        df=df.merge(aux_df ,how='left',on='Patient')
        feat_cols+=['max_file_id']
        
        
    if not is_test: # ------------- Create Reg. Coef. 'A' ; FVC = A*(week-week0)+FVC0
    
        df['trueA']=np.nan # slant model fvc = trueA*(week-week0)+fvc0 where trueA is solved as A of y=Ax+B
        df['trueB']=np.nan
        
        df['trueAR']=np.nan # Ratio model fvc = fvc0*trueAR*(week-week0)+fvc0
        df['trueA0']=np.nan # week0 model fvc = trueA0*(week-week0)+fvc0 ; always passes through ( week0, fvc0 )
        
        print(CNF['calcAmethod'])
        
        for i, p in enumerate(df.Patient.unique()):
        
            if CNF['calcAmethod']=='last3':
                
                # Fit through the origin on the rows flagged is_last3 only.
                sub = df.loc[(df.Patient == p)&(df.is_last3 == 1), :] 
                fvc = sub.FVC.values - sub.FVC0
                c = np.vstack([sub.rWeeks.values]).T
                df.loc[df.Patient == p, 'trueA'] = np.linalg.lstsq(c, fvc, rcond=None)[0]

            elif CNF['calcAmethod']=='simpleLinearRegress':
                
                # Ordinary least squares FVC = trueA*Weeks + trueB over all rows of the patient.
                sub = df.loc[(df.Patient == p), :] 
                fvc = sub.FVC.values 
                c = np.vstack([sub.Weeks.values,np.ones(len(sub.Weeks.values))]).T
                df.loc[df.Patient == p, 'trueA'],df.loc[df.Patient == p, 'trueB'] = np.linalg.lstsq(c, fvc, rcond=None)[0]
                
                # NOTE(review): 'w0' is assigned three times in this branch; only the last assignment survives.
                df.loc[df.Patient == p, 'w0'] = 1/np.sqrt(np.mean(np.abs(sub.FVC.values -  df.loc[df.Patient == p,'trueA'].values*sub.Weeks.values-df.loc[df.Patient == p, 'trueB'].values  )))
                
                
                # AR Retio model
                c = np.vstack([sub.rWeeks.values]).T
                df.loc[df.Patient == p, 'trueAR'] = np.linalg.lstsq(c, (fvc-sub.FVC0.values)/sub.FVC0.values, rcond=None)[0]
                
                df.loc[df.Patient == p, 'w0'] = 1/np.sqrt(np.mean(np.abs(sub.FVC.values -  df.loc[df.Patient == p,'FVC0'].values*df.loc[df.Patient == p,'trueAR'].values*sub.Weeks.values )))
                
                # A0  model
                c = np.vstack([sub.rWeeks.values]).T
                df.loc[df.Patient == p, 'trueA0'] = np.linalg.lstsq(c, (fvc-sub.FVC0.values), rcond=None)[0]
                
                df.loc[df.Patient == p, 'w0'] = 1/np.sqrt(np.mean(np.abs(sub.FVC.values -  df.loc[df.Patient == p,'trueA'].values*sub.Weeks.values  )))
                
                
            elif CNF['calcAmethod']=='huber':
                
                # Robust linear fit of FVC against Weeks.
                sub = df.loc[(df.Patient == p), :] 
                linear = HuberRegressor().fit(sub.Weeks.values[:,np.newaxis],sub.FVC.values )
                
                df.loc[df.Patient == p, 'trueA'] = linear.coef_
                df.loc[df.Patient == p, 'trueB'] = linear.intercept_
                
            else:
                # Fallback: fit through the origin on (rWeeks, FVC - FVC0) using all rows.
                sub = df.loc[(df.Patient == p), :] 
                fvc = sub.FVC.values - sub.FVC0
                c = np.vstack([sub.rWeeks.values]).T
                df.loc[df.Patient == p, 'trueA'] = np.linalg.lstsq(c, fvc, rcond=None)[0]
    

    for XorY in ['X','Y']:# ------------ vertical features
        
        if is_test:
            vert_df,_vertical_feat_cols = ceateVerticalVolumeFeatDf(f"./test_vertical{XorY}",prefixXorY=XorY)            
        else:
            # Training pickles live next to the mask CSV: <dir of mskDfPath>/vertical{X|Y}/vertical{X|Y}
            vert_df,_vertical_feat_cols = ceateVerticalVolumeFeatDf(f"{'/'.join(mskDfPath.split('/')[:-1])}/vertical{XorY}/vertical{XorY}",prefixXorY=XorY)
        
        df=df.merge(vert_df ,how='left',on='Patient')
        feat_cols+=_vertical_feat_cols

    return df,feat_cols


In [None]:
def markLast3(df):
    """Flag the last three rows of every patient in an ``is_last3`` column.

    Relies on the rows of each patient already being ordered by week (the
    original author notes this was verified for the training table).
    The frame is modified in place and also returned; the temporary columns
    ``nb`` and ``max_nb`` are created and removed again.
    """
    # 1-based position of each row within its patient, and the patient's row count.
    df['nb'] = 1
    df['nb'] = df.groupby('Patient')['nb'].cumsum()
    df['max_nb'] = df.groupby('Patient')['nb'].transform('max')

    rows_after = df['max_nb'] - df['nb']
    df['is_last3'] = (rows_after < 3).astype(int)

    df.drop(columns=['nb', 'max_nb'], inplace=True)
    return df

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

## OsicDataset for NN

In [None]:
# 患者ごとに　画像スタック、特徴量、ラベル（A）を返す

from createClassicSegmented import separate_lungs

class OsicDataset4NN(Dataset):
    """Per-patient dataset returning an image stack, table features and the label.

    Each item is a tuple of float tensors ``(images, tables, labels)`` where
    ``images`` stacks the DICOM slices and the matching "classic segmented"
    JPG slices of one patient. Segmented JPGs that do not exist yet are
    generated in ``__init__``.
    """
    
    def __init__(self,table_df,mskDfPath,target_col,is_train=True,num_slice=12,dbg=False,tta_id=0,image_path='train',transform=None,mixup=False):
        """Index the slices of every patient and create missing segmented JPGs.

        Args:
            table_df: Table with a 'Patient' column, the feature columns and target_col.
            mskDfPath: CSV of per-slice lung-mask statistics used to pick valid slices.
            target_col: Name of the label column popped from the table in __getitem__.
            is_train: If True a random slice is drawn from each slice group.
            num_slice: Number of slice groups per patient.
            dbg: If True only three hard-coded patients are indexed.
            tta_id: Index of the slice taken from each group when not training.
            image_path: 'train' or 'test' (sub-directory of the competition data).
            transform: Albumentations-style callable; required.
            mixup: If True collate_fn appends one mixed sample per batch.
        """
        
        # tta_id is the index of the slice used within each group; e.g. 100 slices split into 10 groups gives 0-9
        
        assert (image_path=='train')|(image_path=='test')
        assert transform!=None
        
        self.table_df = table_df.copy() ; self.dbg = dbg ; self.num_slice = num_slice 
        self.is_train = is_train ; self.tta_id = tta_id ; self.image_path = image_path 
        self.transform = transform
        self.mixup = mixup
        self.target_col = target_col
        
        if 'cat_coef' in self.table_df.columns:
            self.table_df['cat_coef']=self.table_df['cat_coef']/50
        
        self.p2fids = self.mkPatient2Files(mskDfPath,num_slice=self.num_slice) # dict of valid lung slices per patient
        self.dcmlist=[]
        self.curryMask={}
        
        if self.dbg:
            allDir = ["ID00419637202311204720264","ID00132637202222178761324","ID00009637202177434476278"]
        else:
            
            allDir = self.table_df.Patient.unique()
        
        for Patient in allDir:
            
            paths=[]
            jpgpaths=[]
            for cnt,fids in enumerate(self.p2fids[Patient]):

                paths.append( [ f"../input/osic-pulmonary-fibrosis-progression/{self.image_path}/{Patient}/{fid}.dcm" for fid in fids ] )
                
                #
                # Note that CNF is a global variable
                #
                
                jpgpaths.append( [ f"{CNF['jpgPath']}/{Patient}/{fid}.jpg" for fid in fids ] )
                
                dcm = pydicom.read_file(f"../input/osic-pulmonary-fibrosis-progression/{self.image_path}/{Patient}/{fids[0]}.dcm")
                
                # "curry" mask: pixels that differ from the corner pixel, OR-ed over the first slice of every group
                edge_pixel_value = dcm.pixel_array[0, 0]
                if cnt==0:
                   self.curryMask[Patient]=(dcm.pixel_array != edge_pixel_value)
                else:
                   self.curryMask[Patient]|=(dcm.pixel_array != edge_pixel_value)
    
            self.dcmlist.append({'Patient':Patient,'paths':paths,'jpgpaths':jpgpaths})
        
        #
        # Create the jpg if it does not exist (only the first file of each group is checked)
        #

        for dcm_info in self.dcmlist:
            
            for _jpgpaths in dcm_info['jpgpaths']:
                
                jpgpath = _jpgpaths[0]
                file_id = int(jpgpath.split('/')[-1][:-4])
                p = dcm_info['Patient']

                if os.path.isfile(jpgpath)==False: 
                    
                    _dir = '/'.join( jpgpath.split('/')[:-1] )
                    os.makedirs(f"{_dir}", exist_ok=True)

                    dcm = pydicom.read_file(f"../input/osic-pulmonary-fibrosis-progression/{self.image_path}/{p}/{file_id}.dcm")
                    
                    _img,_,_ = self.getNormalized4LungMaskImageFromDCM(dcm,self.curryMask[p],out_size=512)

                    # Segment, then map [-1500, 600] to [0, 255] before writing the JPG.
                    _img = separate_lungs(_img) 
                    _img = (_img-(-1500))/(600-(-1500))
                    _img = ((255*_img)).clip(0,255).astype('uint8')

                    cv2.imwrite(jpgpath,_img,[cv2.IMWRITE_JPEG_QUALITY, CNF['quality']])
    
    def __len__(self):
        """Number of indexed patients."""
        return len(self.dcmlist)

    def seq_aug(self,images):
        """Apply self.transform once to the whole DCM + JPG stack of one patient.

        ``images[0]`` holds the DCM slices and ``images[1]`` the JPG slices;
        the result has the same two-part layout.
        """
        
        # For now, transform both the DCM and the JPG images
        
        #DCM
        _arg ={};_arg['image']=images[0,0]
        for _i in range(1,CNF['num_sequence']):
            _arg[f"image{_i-1}"]=images[0,_i]
        
        #JPG
        # NOTE(review): the JPG slices are passed under the keys
        # image{num_sequence-1} .. image{2*num_sequence-2}; confirm the transform
        # registers these keys as additional targets.
        for cnt,_i in enumerate(range(CNF['num_sequence'],2*CNF['num_sequence'])):
            _arg[f"image{_i-1}"]=images[1,cnt]
        
        _imgs = self.transform(**_arg)
        
        # Order: DCM first, then JPG
        transformed_images =  np.stack( [ _imgs['image'] ] + [_imgs[f"image{_i-1}"] for _i in range(1,2*CNF['num_sequence']) ] )
        transformed_images = np.stack([transformed_images[:CNF['num_sequence']],transformed_images[CNF['num_sequence']:]  ])
        
        return transformed_images
        
    def __getitem__(self, idx):
        """Return (images, table features, labels) for one index or a list of indices."""
        
        idx = [idx] if type(idx)==int else idx
        idx = idx.tolist() if torch.is_tensor(idx) else idx
        # augmentaion 
        
        images = np.stack([ self.seq_aug(self.readImages(self.dcmlist[_idx]))  for _idx in idx ])
        
        Patients = [ self.dcmlist[_idx]['Patient'] for _idx in idx]
        tables = pd.concat([ self.table_df[self.table_df.Patient==p] for p in Patients ])
        tables.drop(columns=['Patient'],inplace=True)
        labels = tables.pop(self.target_col)
        
        return torch.FloatTensor(images),torch.FloatTensor(tables.values.astype('float')),torch.FloatTensor(labels.values.astype('float'))
    
    def _window_image4LungMask(self,img, img_min, img_max,out_size=256):
        """Clip *img* to [img_min, img_max] (in place), normalise, and transform or resize it."""
   
        img[img<img_min] = img_min
        img[img>img_max] = img_max
        
        # LungMask-specific
        if out_size!=512: # out_size==512, i.e. for segmentation, skips the normalisation
            img = np.divide( ( img + 1024 ), 1624 ) # with a clip to [-1024, 600] this normalises to 0-1
        
        if self.transform:            # ---- a different augmentation for every single image
             img = self.transform(image=img)['image']
        else:
            img = cv2.resize(img, (out_size, out_size))
            
        if out_size==512: # cast to int when used for segmentation
            img = np.int16(img)

        return img

    def getNormalized4LungMaskImageFromDCM(self,dcm,curry_mask,out_size=256):
        """Convert a DICOM dataset into a cropped, windowed image.

        Returns:
            (image as float array, cropped image shape, dxyz) where dxyz is the
            pixel spacing (x, y) rescaled by the crop/out_size ratio plus the
            slice thickness.
        """
        
        img = apply_modality_lut(dcm.pixel_array, dcm)
        
        # Handle patients with abnormal value offsets
        if dcm.PatientID in ["ID00026637202179561894768","ID00128637202219474716089"]:
            img=img+1024  
        elif dcm.PatientID in ["ID00132637202222178761324"]:
            img=img+4096-2048
        else:
            pass
        
        # Crop to the rows/columns where the curry mask has any set pixel
        img = img[np.ix_(curry_mask.any(1),curry_mask.any(0))]
        
        croppedImageSize = img.shape
        
        img = self._window_image4LungMask(img,-1024,600,out_size=out_size)
        
        dxyz = np.array([float(dcm.PixelSpacing[0]),float(dcm.PixelSpacing[1]),float(dcm.SliceThickness)])
        dxyz[0]*=(croppedImageSize[0]/out_size)
        dxyz[1]*=(croppedImageSize[1]/out_size)
        

        return img.astype('float'), croppedImageSize, dxyz

    def readImages(self,dcmInfo):
        """Load one DCM slice and its JPG counterpart from every slice group of a patient.

        Returns an array stacking [DCM images, JPG images].
        """
        
        Patient = dcmInfo['Patient']

        imgs=[]
        jpgimgs=[]
        
        if self.is_train:
            
            # Random choice for training, while still preserving the slice order
            seq_path = [random.choice(paths) for paths in dcmInfo['paths']]
        
        else:
            seq_path = [paths[self.tta_id%len(paths)] for paths in dcmInfo['paths']]
        
        for path in seq_path:
            
            jpgfname = ( path.split('/')[-1][:-3]+'jpg' )
            
            jpgpath = f"{CNF['jpgPath']}/{Patient}/{jpgfname}"
            
            dcm = pydicom.read_file(path)
            
            img, _, _ = self.getNormalized4LungMaskImageFromDCM(dcm,self.curryMask[Patient])
            
            img = np.stack(img)
            jpgimg = np.stack(cv2.resize(cv2.imread(jpgpath, cv2.IMREAD_GRAYSCALE), (256, 256)))
            
            imgs.append(img)
            jpgimgs.append(jpgimg)
            
        imgs = np.stack([imgs,jpgimgs])
            
        return imgs
    
    # Remove lung slices with a small area using the precomputed LungMask dataframe
    def mkPatient2Files(self,mskDfPath,num_slice=12):
        """Map each patient to num_slice groups of usable slice file ids.

        Slices with lung_area*dx*dy below 400 are dropped. Patients with fewer
        than num_slice slices get single-id groups sampled with linspace (ids
        may repeat); otherwise the sorted ids are split into num_slice
        contiguous groups via KFold. For image_path=='test' each group is
        reduced to its middle element.
        """

        _dict={}
        
        _df = pd.read_csv(mskDfPath)
        _df['lung_area'] = _df['lung_area']*_df['dx']*_df['dy']
        _df = _df.drop(index=_df.loc[(_df.lung_area<400)].index).reset_index(drop=True) 
        
        # Keep only one copy of the sequence that is stored twice
        if self.is_train:
            _df = _df.drop(index=_df.loc[(_df.Patient=='ID00078637202199415319443')&(_df.file_id>509)].index).reset_index(drop=True) # remove duplicated sequence 
        
        
        for p in _df.Patient.unique():

            file_ids = np.sort(_df.loc[_df.Patient==p,'file_id'].values)
            
            if len(file_ids)<num_slice:
                _dict[p] = [ [fid] for fid in file_ids[np.linspace(0,len(file_ids)-1,num=num_slice).round(0).astype('int')] ]
            
            else:
                _dict[p] = [ file_ids[vid] for tid,vid in KFold(n_splits=num_slice).split(file_ids) ] # split file_ids into num_slice groups with KFold
        
        # At submission time many slices cannot be segmented, so only 12 slices are segmented into JPGs
        if self.image_path=='test': 
            
            print("submit mode : dcm is gonna be reducedto 12")
            
            for p in _df.Patient.unique():
                _dict[p] =  [ np.array([_paths[int(len(_paths)/2)]]) for _paths in _dict[p] ]

        return _dict

    def collate_fn(self,batch):
        """Stack a list of items into batch tensors, optionally adding one mixup sample.

        With mixup enabled, the average of the last two items is appended, so
        the batch must contain at least two items.
        """
        
        if self.mixup:
            
            # Append a mixup sample at the end
            images, tables, labels = list(zip(*batch))
            
            mix_image = (images[-1]+images[-2])/2.0
            mix_table = (tables[-1]+tables[-2])/2.0
            mix_label = (labels[-1]+labels[-2])/2.0
            
            images = torch.stack(images+(mix_image,)).squeeze(1)
            tables = torch.stack(tables+(mix_table,)).squeeze(1)
            labels = torch.stack(labels+(mix_label,)).squeeze(1)
            
        else:
            
            images, tables, labels = list(zip(*batch))
            images = torch.stack(images).squeeze(1)
            tables = torch.stack(tables).squeeze(1)
            labels = torch.stack(labels).squeeze(1)

        return images, tables, labels    
    

## Custom Neural Network

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

class OSICNetR231(nn.Module):
    """Regressor combining U-Net (R231) encoder features of CT slices with table features.

    The down path of the lungmask U-Net encodes every DCM / segmented slice,
    the pooled encodings go through a bidirectional LSTM, and the per-slice
    mean of the LSTM output is concatenated with the table features before a
    batch-normalised linear head produces one value per sample.

    Args:
        num_sequence: Number of slices per image source; the head expects
            ``2 * num_sequence`` image features.
        num_table_ft: Number of table features.
        table_only: If True the head only uses the table features.
        initialize_with_r231: If True the encoder starts from the R231 checkpoint.
    """
    
    def __init__(self,num_sequence,num_table_ft,table_only=False,initialize_with_r231=True):
    
        super(OSICNetR231, self).__init__()
        
        self.dual = True
        
        self.table_only = table_only
        if table_only:
            print(f"TABLE ONLY MODE")

        self.base = UNet(n_classes=3, padding=True, depth=5, up_mode='upsample', batch_norm=True, residual=False)
        if initialize_with_r231:
            # map_location='cpu' lets the checkpoint load on CPU-only hosts even if it
            # was saved from CUDA tensors; load_state_dict copies into the module's
            # own parameters, so the final device of the model is unaffected.
            self.base.load_state_dict(torch.load('../input/pthunetlungmask/unet_r231-d5d2fc3d.pth', map_location='cpu'))
        
        # Only the encoder (down path) is used; drop the decoder and the classifier.
        del self.base.softmax, self.base.last, self.base.up_path
        
        # Input size 2*1024: average-pooled and max-pooled encoder features are concatenated.
        # NOTE(review): batch_first is left at its default (False) while forward()
        # feeds a tensor whose first dimension is the batch of patients — confirm
        # this is intended; it is kept unchanged to stay compatible with the
        # pretrained weights.
        self.lstm = nn.LSTM(2*1024, CNF['lstm_mem'], 1,bidirectional=True) 
        
        self.dropout = nn.Dropout(p=0.5)
        
        head_in = num_table_ft if self.table_only else 2*num_sequence+num_table_ft
        self.linear = nn.Linear(head_in, 1)
        self.bn0    = nn.BatchNorm1d(head_in)

            
    def forward(self, dual_images, table_ft):
        """Predict one value per sample.

        Args:
            dual_images: 5-D tensor indexed as [sample, source, slice, H, W];
                source 0 is the DCM image and source 1 the classic-segmented image.
            table_ft: 2-D tensor of table features, one row per sample.

        Returns:
            1-D tensor with one prediction per sample.
        """
        
        # Interleave the two sources slice by slice (dcm0, seg0, dcm1, seg1, ...)
        # for however many slices were passed, instead of a hard-coded 12.
        num_slices = dual_images.shape[2]
        images = torch.stack(
            [dual_images[:, src, s] for s in range(num_slices) for src in (0, 1)],
            dim=1,
        )
        images = images.unsqueeze(2)  # add the channel dimension expected by the encoder

        seq=[]
        for x in images:  # one sample at a time; its slices form the encoder batch

            last = len(self.base.down_path) - 1
            for i, down in enumerate(self.base.down_path):

                x = down(x)

                if i != last:
                    x = F.avg_pool2d(x, 2)

            # Per the original note each feature map is [1024, 16, 16] at this point
            # (for 256x256 inputs); pool it to one average and one max value per channel.
            x = torch.cat( ( F.avg_pool2d(x, (16,16)).flatten(1),F.max_pool2d(x, (16,16)).flatten(1) ),dim=-1 )

            seq.append(x)

        lstm_input = torch.stack(seq)
        
        lstm_out,_ = self.lstm(lstm_input)
        image_ft = torch.mean(lstm_out,-1) # mean of the LSTM features for every slice
        
        if self.table_only:
        
            ft = self.bn0(table_ft)
        
        else:
            
            ft = torch.cat([image_ft,table_ft],1)
            ft = self.bn0(ft)

        output=self.linear(ft).squeeze(1)
        
        
        return output


## FVCFitter using LGBM ( includes Weeks as feature )

In [None]:
import lightgbm as lgb
class FVCFitterWeekLGBM():
    """LightGBM regressor that predicts FVC directly from the table features plus 'Weeks'.

    ``fit`` stores the trained model with ``_save`` and releases it;
    ``predict`` reloads it with ``_load`` from the same path.
    """

    def __init__(self,CNF,fold_id,prefix="osicLGBMweek"):
        """Keep the config, fold id and file prefix and build an untrained L1 regressor."""
        self.target_col = 'FVC'
        print(f"Estimation mid target {self.target_col}")

        self.CNF = CNF
        self.fold_id = fold_id
        self.prefix = prefix
        self.reg = lgb.LGBMRegressor(objective='l1', num_leaves=5, importance_type='gain')

    def _model_path(self):
        """Location of the pickled model for this prefix and fold."""
        return f"{self.prefix}/{self.prefix}_model_fold{self.fold_id}.pkl"

    # train
    def fit(self,trn_df,val_df,target):
        """Train on trn_df with early stopping on val_df, then save and drop the model.

        The ``target`` argument is not used; the label column is always ``self.target_col``.
        """
        seed_everything(2020)

        inputs = self.CNF['feat_cols']+['Weeks']
        keep = ['Patient']+self.CNF['feat_cols']+[self.target_col]+['Weeks']

        val_df = val_df[keep].reset_index(drop=True)
        trn_df = trn_df[keep].reset_index(drop=True)

        self.reg.fit(
            trn_df[inputs],
            trn_df[self.target_col],
            eval_set=(val_df[inputs],val_df[self.target_col]),
            early_stopping_rounds=100,
            verbose=False,
        )

        _save(self.reg,self._model_path())

        del self.reg

    def predict(self,tst_df,image_path):
        """Reload the saved model and return the predicted FVC for every row of tst_df.

        The ``image_path`` argument is not used.
        """
        self.reg = _load(self._model_path())

        seed_everything(2020)

        inputs = self.CNF['feat_cols']+['Weeks']

        frame = tst_df[['Patient']+inputs].reset_index(drop=True).copy()
        frame[self.target_col] = np.nan # dummy label column

        result = frame[['Patient']].copy()
        result['_predFVC'] = self.reg.predict(frame[inputs])

        gc.collect()

        return result['_predFVC'].values

## FVCFitter using LGBM

In [None]:
import lightgbm as lgb
class FVCFitterLGBM():
    """LightGBM regressor that predicts a per-patient decay coefficient.

    The target is chosen from the prefix: 'trueA0' if it contains 'A0'/'a0',
    'trueAR' if it contains 'AR'/'ar', otherwise 'trueA'. A prefix containing
    'w0_' enables sample weighting with the 'w0' column. ``predict`` converts
    the predicted coefficient back into FVC values.
    """

    def __init__(self,CNF,fold_id,prefix="osicLGBM"):
        """Derive the target / weight columns from *prefix* and build an untrained L1 regressor."""
        if 'A0' in prefix or 'a0' in prefix:
            target = 'trueA0'
        elif 'AR' in prefix or 'ar' in prefix:
            target = 'trueAR'
        else:
            target = 'trueA'
        self.target_col = target

        self.weight_col = 'w0' if 'w0_' in prefix else None

        assert self.target_col in ['trueA','trueAR','trueA0']

        print(f"Estimation mid target {self.target_col}")

        self.CNF = CNF
        self.fold_id = fold_id
        self.prefix = prefix
        self.reg = lgb.LGBMRegressor(objective='l1', num_leaves=5, importance_type='gain')

    def _model_path(self):
        """Location of the pickled model for this prefix and fold."""
        return f"{self.prefix}/{self.prefix}_model_fold{self.fold_id}.pkl"

    # train
    def fit(self,trn_df,val_df,target):
        """Train on one row per patient with early stopping, then save and drop the model.

        The coefficient does not depend on the week, so duplicated rows are
        removed first. The ``target`` argument is not used.
        """
        seed_everything(2020)

        feats = self.CNF['feat_cols']
        base_cols = ['Patient']+feats+[self.target_col]

        val_df = val_df[base_cols].drop_duplicates().reset_index(drop=True)

        extra = {}
        if self.weight_col is not None:
            trn_df = trn_df[base_cols+[self.weight_col]].drop_duplicates().reset_index(drop=True)
            extra['sample_weight'] = trn_df[self.weight_col]
        else:
            trn_df = trn_df[base_cols].drop_duplicates().reset_index(drop=True)

        self.reg.fit(
            trn_df[feats],
            trn_df[self.target_col],
            eval_set=(val_df[feats],val_df[self.target_col]),
            early_stopping_rounds=100,
            verbose=False,
            **extra,
        )

        _save(self.reg,self._model_path())

        del self.reg

    def predict(self,tst_df,image_path):
        """Reload the saved model and return the predicted FVC for the rows of tst_df.

        The coefficient is predicted once per distinct feature row and merged
        back on 'Patient'. The ``image_path`` argument is not used.
        """
        self.reg = _load(self._model_path())

        seed_everything(2020)

        feats = self.CNF['feat_cols']

        per_patient = tst_df[['Patient']+feats].drop_duplicates().reset_index(drop=True).copy()
        per_patient[self.target_col] = np.nan # dummy label column

        coef_df = per_patient[['Patient']].copy()
        coef_df['predA'] = self.reg.predict(per_patient[feats])

        gc.collect()

        merged = tst_df.merge(coef_df,on='Patient').copy()

        elapsed = merged['Weeks']-merged['Week0']
        slope = merged['predA']
        if self.target_col=='trueAR':
            # Ratio model: the coefficient is relative to the baseline FVC.
            slope = merged['FVC0']*slope

        # 'trueA' and 'trueA0' share the same equation
        return ( merged['FVC0'] + slope*elapsed ).values


## FVCFitter using custom Neural Network

In [None]:
class FVCFitterNN01():
    """FVC fitter backed by the custom neural network ``OSICNetR231``.

    The network estimates a per-patient slope-type target (the "A" of the FVC
    decay) from table features and image sequences. Which target column is
    used ('trueA', 'trueA0' or 'trueAR') is chosen from markers contained in
    ``prefix``. ``predict`` converts the estimated slope into FVC values.
    """

    def __init__(self,CNF,fold_id,prefix="osicNN"):
        """Store the configuration and build the image augmentation pipelines.

        Args:
            CNF: configuration dict; this class reads 'feat_cols',
                'num_sequence', 'isTrain', 'mskDfPath', 'aug', 'mixup',
                'batch_size', 'EMA', 'verbose' and 'n_tta'.
            fold_id: fold index, used in the weight file name.
            prefix: model name; also the directory holding the weight files.
                'A0'/'a0' in the prefix selects target 'trueA0', 'AR'/'ar'
                selects 'trueAR', anything else selects 'trueA'.
        """
        if ('A0' in prefix) or ('a0' in prefix):
            self.target_col = 'trueA0'
        elif ('AR' in prefix) or ('ar' in prefix):
            self.target_col = 'trueAR'
        else:
            self.target_col = 'trueA'

        print(f"Estimation mid target {self.target_col}")

        self.CNF = CNF
        self.fold_id = fold_id
        self.prefix = prefix
        self.num_table_ft = len(CNF['feat_cols'])

        from albumentations import Compose, Resize, HorizontalFlip,RandomBrightnessContrast,ShiftScaleRotate,RandomResizedCrop,VerticalFlip

        def _extra_targets():
            # Every additional image of a sequence receives the same transform as 'image'.
            return { f"image{_i}":'image' for _i in range(0,CNF['num_sequence']-1) }

        def _square_crop():
            # If the aspect ratio is not 1, crop to aspect ratio 1 and resize to 256x256.
            return RandomResizedCrop(256, 256, scale=(1, 1), ratio=(1.0, 1.0), interpolation=1, always_apply=True, p=1.0)

        self.trnTransform = Compose([
            # Some scans are occasionally rotated by 180 degrees, so emulate that sometimes.
            Compose([HorizontalFlip(always_apply=False, p=1),VerticalFlip(always_apply=False, p=1)],p=0.1),
            ShiftScaleRotate(shift_limit=0.0625, scale_limit=(0.1,0.3), rotate_limit=10, interpolation=1, border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=None,p=1),
            _square_crop(),
        ],additional_targets = _extra_targets())

        self.valTransform = Compose([
            ShiftScaleRotate(shift_limit=0, scale_limit=(0.2,0.2), rotate_limit=0, interpolation=1, border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=None,p=1),
            _square_crop(),
        ],additional_targets = _extra_targets())

        self.noTransform = Compose([
            _square_crop(),
        ],additional_targets = _extra_targets())

    # train NN then save pth
    def fit(self,trn_df,val_df,target):
        """Train the network on ``trn_df`` and save the best weights as a .pth file.

        Training is skipped (returning True) when ``CNF['isTrain']`` is False.
        Weights are saved whenever the validation L1 loss improves, starting
        from the second epoch.

        Args:
            trn_df: training rows; needs 'Patient', the feature columns and
                the target column.
            val_df: validation rows with the same columns.
            target: not used here; accepted for interface compatibility with
                the other fitters.

        Returns:
            True when training is skipped, otherwise None.
        """
        if not self.CNF['isTrain']:
            print('isTrain is set False,skip training')
            return True

        print(f"preparing data")

        seed_everything(2020)

        # Estimate A, so we don't need Weeks
        keep_cols = ['Patient']+self.CNF['feat_cols']+[self.target_col]
        trn_df = trn_df[keep_cols].drop_duplicates().reset_index(drop=True)
        val_df = val_df[keep_cols].drop_duplicates().reset_index(drop=True)

        # When mixup is enabled it is applied inside collate_fn; the loader batch is
        # one smaller in that case (presumably collate_fn adds one mixed sample - confirm).
        trn_dset    = OsicDataset4NN(trn_df,self.CNF['mskDfPath'],target_col=self.target_col,is_train=True,
                                     num_slice=self.CNF['num_sequence'],dbg=False,image_path='train',
                                     transform=self.trnTransform if self.CNF['aug'] else self.noTransform,
                                     mixup=self.CNF['mixup'])
        trn_dloader = DataLoader(trn_dset, batch_size=self.CNF['batch_size']-int(self.CNF['mixup']),
                                 shuffle=True, num_workers=2,collate_fn=trn_dset.collate_fn,drop_last=True)

        val_dset    = OsicDataset4NN(val_df,self.CNF['mskDfPath'],target_col=self.target_col,is_train=False,
                                     num_slice=self.CNF['num_sequence'],dbg=False,image_path='train',tta_id=0,
                                     transform=self.valTransform if self.CNF['aug'] else self.noTransform,
                                     mixup=False)
        val_dloader = DataLoader(val_dset, batch_size=self.CNF['batch_size'],
                                 shuffle=False, num_workers=2,collate_fn=val_dset.collate_fn,drop_last=False,)

        model = OSICNetR231(num_sequence=self.CNF['num_sequence'],num_table_ft=self.num_table_ft,table_only=False).to(device)
        plist = [{'params': model.parameters(), 'lr': 0.001}]
        optimizer = optim.Adam(plist, lr=0.001)

        # L1 loss is used for both training and validation.
        criterion = torch.nn.L1Loss()
        val_criterion = torch.nn.L1Loss()

        if self.CNF['EMA']:
            ema = ModelEMA(model)

        trnLoss=[]
        valLoss=[]

        best_vloss=np.inf

        print(f"start training")

        # NOTE(review): the loop is capped at 2 epochs; the original notebook keeps
        # `range(self.CNF['maxEpoch'])` as a commented-out alternative, presumably
        # for full training runs.
        for ep in range(2):

            model.train()

            tloss=[]
            for images,tables,labels in trn_dloader:

                images = images.to(device); tables = tables.to(device); labels = labels.to(device)

                pred = model(images,tables)
                loss = criterion(pred,labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if self.CNF['EMA']:
                    ema.update(model)

                tloss.append(loss.detach().cpu().numpy())

            trnLoss.append(np.mean(tloss))

            model.eval()

            with torch.no_grad():

                vloss=[]
                for images,tables,labels in val_dloader:

                    images = images.to(device); tables = tables.to(device); labels = labels.to(device)

                    pred = model(images,tables)
                    loss = val_criterion(pred,labels)

                    vloss.append(loss.detach().cpu().numpy())

            if self.CNF['verbose']:
                print(f"EP{ep:02d} trnLoss:{np.mean(tloss):.4f} valLoss:{np.mean(vloss):.4f}")

            valLoss.append(np.mean(vloss))

            # The first epoch never produces a checkpoint.
            if ep>0 and valLoss[-1] < best_vloss:

                best_vloss = valLoss[-1]
                os.makedirs(f"{self.prefix}", exist_ok=True)

                torch.save(model.state_dict(), f"{self.prefix}/{self.prefix}_model_fold{self.fold_id}.pth")

        del model, plist, optimizer, criterion,trn_dset,val_dset,trn_dloader,val_dloader
        gc.collect()
        torch.cuda.empty_cache()

        print(f"FOLD{self.fold_id} {np.min(valLoss):.4f}@ep{np.argmin(valLoss)}")

        plt.plot(trnLoss,label='trn')
        plt.plot(valLoss,label='val')
        plt.xlabel('epoch')
        plt.ylabel('L1 loss')
        plt.title(f"Fold{self.fold_id} best_vloss {best_vloss:.4f}@ep{np.argmin(valLoss)}")
        plt.legend()
        plt.show()

    def predict(self,tst_df,image_path):
        """Predict FVC for every row of ``tst_df`` using this fold's saved weights.

        The slope is predicted once per unique (Patient, features) row,
        averaged over ``CNF['n_tta']`` test-time-augmentation passes, and then
        converted to FVC for every row of ``tst_df``.

        Args:
            tst_df: frame holding 'Patient', the feature columns, 'FVC0',
                'Weeks' and 'Week0'.
            image_path: image location forwarded to ``OsicDataset4NN``
                ('train' or 'test' in this notebook).

        Returns:
            numpy array of predicted FVC values, one per row of the merged frame.
        """
        print("start prediction")
        seed_everything(2020)

        model = OSICNetR231(num_sequence=self.CNF['num_sequence'],num_table_ft=self.num_table_ft,table_only=False).to(device)

        _df = tst_df[['Patient']+self.CNF['feat_cols']].drop_duplicates().reset_index(drop=True).copy()

        # Dummy target: the dataset is given target_col, so the column has to exist.
        _df[self.target_col] = np.nan

        pred_df = _df[['Patient']].copy()
        pred_df['predA'] = np.nan

        if self.CNF['isTrain']:
            weight_path = f"{self.prefix}/{self.prefix}_model_fold{self.fold_id}.pth"
        else:
            # Pretrained weights come from an input dataset whose file names use the
            # capitalised markers. On AWS the original author loads
            # "../input/{prefix}/{prefix}_model_fold{fold_id}.pth" instead.
            capital_prefix = self.prefix.replace('a0','A0').replace('ar','AR')
            weight_path = f"../input/{self.prefix}/{capital_prefix}_model_fold{self.fold_id}.pth"

        # map_location lets weights saved on another device load on the current one.
        model.load_state_dict(torch.load(weight_path, map_location=device))

        model.eval()

        pred_over_tta=[]

        # Use the instance configuration (not the notebook-global CNF) so the fitter
        # honours the settings it was constructed with.
        eval_transform = self.valTransform if self.CNF['aug'] else self.noTransform

        for tta_id in range(self.CNF['n_tta']):

            tst_dset    = OsicDataset4NN(_df,self.CNF['mskDfPath'],target_col=self.target_col,is_train=False,
                                         num_slice=self.CNF['num_sequence'],dbg=False,image_path=image_path,
                                         tta_id=tta_id,transform=eval_transform,mixup=False)
            tst_dloader = DataLoader(tst_dset, batch_size=self.CNF['batch_size'],  shuffle=False, num_workers=2,
                                     collate_fn=tst_dset.collate_fn,drop_last=False)

            fold_pred=[]
            with torch.no_grad():

                for images,tables,_labels in tst_dloader:

                    images = images.to(device); tables = tables.to(device)

                    pred = model(images,tables)
                    fold_pred.append(pred.cpu().numpy())

            pred_over_tta.append( np.concatenate(fold_pred) )

            del tst_dset,tst_dloader
            gc.collect()

        pred_df['predA'] = np.mean(pred_over_tta,axis=0)

        del model
        gc.collect()
        torch.cuda.empty_cache()

        _tst_df = tst_df.merge(pred_df,on='Patient').copy()
        elapsed = _tst_df['Weeks']-_tst_df['Week0']

        if self.target_col == 'trueAR':
            # The 'trueAR' slope is scaled by FVC0, matching the LGBM fitter's
            # predict; the previous code applied the absolute-slope formula here.
            return ( _tst_df['FVC0'] + _tst_df['FVC0']*_tst_df['predA']*elapsed ).values

        # 'trueA','trueA0' use same eq
        return ( _tst_df['FVC0'] + _tst_df['predA']*elapsed ).values


# Load train and add features

In [None]:
# Read the training table and run it through markLast3 (presumably flags each
# patient's last three measurements; an 'is_last3' column is used later - confirm).
train = markLast3(pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv'))

# Drop duplicated (Patient, Weeks) measurements, keeping the last one
train = (
    train
    .drop_duplicates(subset=['Patient','Weeks'],keep='last')
    .reset_index(drop=True)
)

# Attach the precalculated features; feat_cols is the list of feature column names
train, feat_cols = addFeatureDf(train,CNF['mskDfPath'],CNF['trnPath'],is_test=False)

## Single Models Fit and Predict

In [None]:
# Fit every single model of every model group with 5 rotating folds and collect
# out-of-fold FVC predictions:
#   train['predFVC'+prefix] : OOF prediction of one model
#   train['predFVC']        : weighted ensemble of the group's models
# NOTE: `train` is reused as the loop variable and ends up bound to the last
# model group's frame; the untouched frame is kept in `train_orig`.
import warnings
warnings.simplefilter('ignore')

seed_everything(2020)

feature_importance_df = pd.DataFrame()

nMegaFold = 5

CNF['feat_cols']=feat_cols

train_orig = train.copy()

# One independent copy of the training frame per model group
trainModelGroup = [ train.copy() for _ in range(len(CNF['prefixes'])) ]


for prefixsModelGroup,weightsModelGroup,train in zip(CNF['prefixes'],CNF['ensembleWeight'],trainModelGroup):

    _isImportancePlot=False

    # Initialise the accumulators as floats: they are filled with float predictions
    # through .loc, and writing floats into an int column is an incompatible-dtype
    # assignment in recent pandas.
    for prefix in prefixsModelGroup+['']:
        train['predFVC'+prefix] = 0.0

    for prefix,ensemble_weight in zip(prefixsModelGroup,weightsModelGroup):

        if CNF['isTrain'] or ('LGBM' in prefix):

            # The fold split seed is encoded in the model prefix ("...seed<N>")
            random_seed = int(prefix.split('seed')[1])

            print(f"SEED:{random_seed}")

            unique_patients = train.Patient.unique()
            valPatSplit5 = [ unique_patients[val_patient] for trn_patient,val_patient in KFold(n_splits=5,shuffle=True,random_state=random_seed).split(unique_patients) ]

            os.makedirs(f"{prefix}", exist_ok=True)
            _save(valPatSplit5,f"{prefix}/valPatSplit5.pkl")

        else:

            # Pretrained models: reuse the split they were trained with
            valPatSplit5 = _load(f"../input/{prefix}/valPatSplit5.pkl")

        #
        # train or validate FVCfitter
        #

        for megaFold in range(nMegaFold):

            print(f"megaFold:{megaFold}")

            # Rotate the 5 patient groups: four for training, the fifth for validation
            patTrn = np.concatenate([ valPatSplit5[(_i+megaFold)%5] for _i in range(4) ])
            patVal = np.concatenate([ valPatSplit5[(_i+megaFold)%5] for _i in [4] ])

            dataTrn,dataVal = train[train.Patient.isin(patTrn)],train[train.Patient.isin(patVal)]


            # train fvc_fitter on dataTrn
            if 'LGBM' in prefix:
                if 'week' in prefix:
                    fvc_fitter = FVCFitterWeekLGBM(CNF,fold_id=megaFold,prefix=prefix)
                else:
                    fvc_fitter = FVCFitterLGBM(CNF,fold_id=megaFold,prefix=prefix)

            else:
                fvc_fitter = FVCFitterNN01(CNF,fold_id=megaFold,prefix=prefix)

            fvc_fitter.fit(trn_df=dataTrn, val_df=dataVal, target=dataTrn['FVC'])

            _pred = fvc_fitter.predict(dataVal.drop(columns='FVC'),image_path='train')

            train.loc[train.Patient.isin(patVal),'predFVC'] += ensemble_weight*_pred
            train.loc[train.Patient.isin(patVal),'predFVC'+prefix] += _pred

            if ('LGBM' in prefix) and ('week' not in prefix):
                # predict() left the loaded regressor in fvc_fitter.reg
                _isImportancePlot=True
                fold_importance_df = pd.DataFrame()
                fold_importance_df["Feature"] = feat_cols
                fold_importance_df["importance"] = fvc_fitter.reg.feature_importances_
                fold_importance_df["fold"] = f"{prefix}_fold{megaFold}"
                feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)


            del fvc_fitter
            gc.collect()

    if _isImportancePlot:
        plt.figure(figsize=(8, 20))
        sns.barplot(x="importance", y="Feature", data=feature_importance_df.sort_values(by="importance", ascending=False));

## Train and predict a preliminary confidence value, used only to calculate OOF scores

In [None]:
#
# Train and validate the preliminary confidence fitter.
# For every model (and the ensemble, prefix '') of every model group a linear
# confidence fitter is trained on the last-3 rows and scored.
#

score_dfs = []

for prefixsModelGroup,weightsModelGroup,train in zip(CNF['prefixes'],CNF['ensembleWeight'],trainModelGroup):

    scores = {}
    ensembleW = {}

    conf_feat = ['mskd_img_sum_cls4_mean','Male'] # provisional confidence features


    # '' (the ensemble) must always come last; the original author notes that
    # test-time code is expected to break otherwise.
    for prefix in prefixsModelGroup+['']:

        # Work on a copy so that adding columns does not write into a slice of `train`
        pred_df = train[['rWeeks','FVC','predFVC'+prefix,'is_last3']].copy()
        pred_df['dFVC'+prefix]=pred_df['predFVC'+prefix]-pred_df['FVC']
        err_df = pred_df.groupby('rWeeks')['dFVC'+prefix].apply(lambda x: np.sqrt(np.mean(x**2))).reset_index()
        err_df.columns = ['rWeeks', 'rmseFVC']

        train['dFVC'+prefix] = abs(train['predFVC'+prefix]-train['FVC'])

        # features correlated to FVC error
        if prefix=='':
            # Restrict to numeric/bool columns: the frame also holds string columns
            # such as 'Patient', which recent pandas refuses to correlate.
            _numeric = train.loc[train.is_last3==1].select_dtypes(include=['number','bool'])
            print(_numeric.corr()['dFVC'+prefix].sort_values(ascending=False)[:20])

        # features used for confidence fit
        confidence_fitter = ConfidenceFitterOptimizedLinear(train.loc[train.is_last3==1,conf_feat].values,train.loc[train.is_last3==1,'FVC'].values,train.loc[train.is_last3==1,'predFVC'+prefix].values)

        # predicted sigma for every row
        pred_df['sigma'+prefix] = confidence_fitter.predict(train[conf_feat])

        #
        # CV
        #

        if CNF['EvalOnlyLast3']:
            pred_df = pred_df[ pred_df.is_last3==1 ]

        scores[prefix] = calcScore(pred_df['FVC'].values, pred_df['predFVC'+prefix].values, pred_df['sigma'+prefix].values)

    _score_df = pd.DataFrame.from_dict(scores, orient='index').rename(index={'': 'Ensemble'}).reset_index()
    _score_df.columns=['model','score']
    # The ensemble row itself carries weight 0
    _score_df['weight']=np.concatenate([weightsModelGroup,np.array([0])])

    # reindex returns a new frame; keep it so the column order is applied
    _score_df = _score_df.reindex(columns=['model', 'weight', 'score'])

    score_dfs.append(_score_df)

display(HorizontalDisplay(*score_dfs))

## Last Confidence Fitter used to generate final confidence value

In [None]:
# Fold sanity check: every model inside a model group must use the same
# 5-fold patient split. The (shared) split of each group is collected in
# valPatSplit5ModelGroup.
from itertools import combinations

valPatSplit5ModelGroup = []

for prefixsModelGroup,weightsModelGroup,train in zip(CNF['prefixes'],CNF['ensembleWeight'],trainModelGroup):

    # Splits are stored locally for models trained in this run (and all LGBMs),
    # otherwise they come from the pretrained model's input dataset.
    all_splits=[]
    for prefix in prefixsModelGroup:
        path = f"{prefix}/valPatSplit5.pkl" if CNF['isTrain'] or ('LGBM' in prefix) else f"../input/{prefix}/valPatSplit5.pkl"
        all_splits.append(_load(path))

    for fold in range(5):
        # Set equality is symmetric, so unordered pairs are sufficient
        for _i,_j in combinations(range(len(all_splits)),2):
            if set(all_splits[_i][fold])!=set(all_splits[_j][fold]):
                # Raised explicitly so the check also runs under `python -O`
                raise AssertionError('invalid fold splits')

    valPatSplit5ModelGroup.append( all_splits[0] )

In [None]:
# find ideal confidence

import scipy as sp
from tqdm.notebook import tqdm
from functools import partial
import math

def loss_func(weight, row):
    """Return the negated Laplace-log-likelihood style score of one row.

    Args:
        weight: candidate confidence (sigma); values below 70 are treated as 70.
        row: mapping with the true 'FVC' and the predicted 'predFVC'.

    Returns:
        The value to minimise (smaller is better).
    """
    root2 = math.sqrt(2)
    sigma = max(weight, 70)
    # Absolute error is capped at 1000
    err = min(abs(row['FVC'] - row['predFVC']), 1000)
    return root2*err/sigma + np.log(root2*sigma)

# Build the "ground-truth" confidence: for every row, the confidence value that
# minimises loss_func given the OOF prediction error of that row.
# scipy.optimize is imported explicitly; a bare `import scipy` is not guaranteed
# to expose the submodule as an attribute.
from scipy.optimize import minimize

for train in trainModelGroup:

    results = []
    for _, row in train.iterrows():
        loss_partial = partial(loss_func, row=row)
        weight = [100]  # initial guess for the optimiser
        result = minimize(loss_partial, weight, method='SLSQP')
        results.append(result['x'][0])

    # loss_func treats everything below 70 as 70, so clip the optimum accordingly
    train['trueConfidence']=np.maximum(np.asarray(results),70)

### Check how good the CV score would be if the confidence values were predicted perfectly

In [None]:
# Score obtained when the confidence is estimated perfectly (trueConfidence)
for cnt,train in enumerate(trainModelGroup):
    possible_cv = calcScore(
        train['FVC'].values,
        train['predFVC'].values,
        train['trueConfidence'].values,
    )
    print(f"ModelGroup{cnt} Possible CV {possible_cv:.4f}")

In [None]:
from sklearn.linear_model import Ridge,Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.ensemble import RandomForestRegressor

class ConfidenceFitter():
    """Ridge or NuSVR regressor that estimates ``trueConfidence`` from features.

    The fitted regressor, the feature scaler and the feature list are pickled
    under ``{prefix}/`` by ``fit`` and loaded again by ``predict``.
    """

    def __init__(self,CNF,fold_id,model_name,prefix="confRidge",target_col='trueConfidence'):
        """Create an unfitted confidence regressor.

        Args:
            CNF: configuration dict (stored, not read by this class).
            fold_id: fold index used in the artifact file names.
            model_name: 'ridge' (Ridge, alpha=1.0) or 'svr' (NuSVR).
            prefix: directory and file-name prefix of the saved artifacts.
            target_col: name of the regression target column.

        Raises:
            ValueError: if ``model_name`` is neither 'ridge' nor 'svr'.
        """
        self.target_col = target_col
        self.scaler = StandardScaler()

        self.CNF = CNF
        self.fold_id = fold_id

        if model_name == 'ridge':
            self.reg = Ridge(alpha=1.0)
        elif model_name == 'svr':
            self.reg = NuSVR()
        else:
            raise ValueError(f"unknown model_name {model_name!r}; expected 'ridge' or 'svr'")

        self.prefix = prefix
        self.feat_cols = None

    def _artifact_path(self,kind):
        # Path of one saved artifact ('model', 'scaler' or 'feat') of this fold.
        return f"{self.prefix}/{self.prefix}_{kind}_fold{self.fold_id}.pkl"

    # train
    def fit(self,trn_df,val_df,feat_cols):
        """Fit scaler and regressor on ``trn_df`` and pickle them.

        Args:
            trn_df: training rows holding ``feat_cols`` and the target column.
            val_df: not used for fitting; accepted for interface symmetry.
            feat_cols: list of feature column names.
        """
        seed_everything(2020)
        self.feat_cols = feat_cols

        trn_df = trn_df.reset_index(drop=True)

        self.scaler.fit(trn_df[self.feat_cols])

        self.reg.fit(self.scaler.transform( trn_df[self.feat_cols] ),  trn_df[self.target_col])

        os.makedirs(f"{self.prefix}", exist_ok=True)
        _save(self.reg,self._artifact_path('model'))
        _save(self.scaler,self._artifact_path('scaler'))
        _save(self.feat_cols,self._artifact_path('feat'))

        # The fitted objects live on disk only; predict() reloads them
        del self.reg
        del self.scaler

    def predict(self,tst_df):
        """Load the saved artifacts and predict the confidence for ``tst_df``.

        Args:
            tst_df: frame holding the feature columns saved by ``fit``.

        Returns:
            numpy array of predicted confidence values, one per row.
        """
        tst_df= tst_df.reset_index(drop=True).copy()

        self.reg = _load(self._artifact_path('model'))
        self.scaler = _load(self._artifact_path('scaler'))
        self.feat_cols = _load(self._artifact_path('feat'))

        seed_everything(2020)

        scaled = self.scaler.transform(tst_df[self.feat_cols])
        pred = np.asarray(self.reg.predict(scaled))

        gc.collect()

        return pred
        
        

## Train last confidence fitter and predict confidence

In [None]:

# Train the final confidence fitters (Ridge + NuSVR) per model group and fold on
# the last-3 rows, and store their blended OOF prediction in train['predConfidence'].
conf_cols = ['mskd_img_sum_cls4_mean','Male','mskd_img_sum_cls2_median_absolute_deviation','Currently smokes','rWeeks']


for cnt,(valPatSplit5,train) in enumerate(zip(valPatSplit5ModelGroup,trainModelGroup)):

    confidence_fitter_prefix=f"lastconfModelGroup{cnt}"

    # Float initialisation: the column is filled with float predictions through .loc,
    # which is an incompatible-dtype assignment for an int column in recent pandas.
    train['predConfidence']=0.0

    for fold in range(5):

        # Same fold rotation as used for the FVC fitters
        patTrn = np.concatenate([ valPatSplit5[(_i+fold)%5] for _i in range(4) ])
        patVal = np.concatenate([ valPatSplit5[(_i+fold)%5] for _i in [4] ])

        is_val_row = (train.is_last3==1)&(train.Patient.isin(patVal))
        dataTrn,dataVal = train[(train.is_last3==1)&(train.Patient.isin(patTrn))],train[is_val_row]

        conf_fitterRidge = ConfidenceFitter(CNF,fold_id=fold,model_name='ridge',prefix=confidence_fitter_prefix+'RIDGE')
        conf_fitterRidge.fit(trn_df=dataTrn.drop(columns='FVC'), val_df=dataVal.drop(columns='FVC'),feat_cols=conf_cols)

        conf_fitterSVR = ConfidenceFitter(CNF,fold_id=fold,model_name='svr',prefix=confidence_fitter_prefix+'SVR')
        conf_fitterSVR.fit(trn_df=dataTrn.drop(columns='FVC'), val_df=dataVal.drop(columns='FVC'),feat_cols=conf_cols)

        _predRidge = conf_fitterRidge.predict(dataVal)
        _predSVR   = conf_fitterSVR.predict(dataVal)

        # Blend the two predictions in the squared domain (0.2 SVR / 0.8 Ridge)
        _pred = np.sqrt( 0.2*(_predSVR**2) + 0.8*(_predRidge**2))

        train.loc[is_val_row,'predConfidence'] = _pred

        del conf_fitterRidge,conf_fitterSVR
        gc.collect()


In [None]:
# Per model group: plot OOF predictions coloured by predicted confidence, plot
# predicted vs. ideal confidence, and compute the CV score on the last-3 rows.
model_group_scores=[]
for cnt,(valPatSplit5,train) in enumerate(zip(valPatSplit5ModelGroup,trainModelGroup)):

    last3 = train.loc[train.is_last3==1]

    plt.figure(figsize=(16,5))
    plt.subplot(121);plt.plot([500,7000],[500,7000],c='orange',alpha=0.5)
    sc=plt.scatter( last3['FVC'],  last3['predFVC'],c=last3['predConfidence'],cmap=cm.seismic,alpha=0.2 );plt.colorbar(sc)
    # The plotted column is the ensemble 'predFVC'; the label previously appended a
    # stale global `prefix` left over from an earlier cell.
    plt.xlabel('FVC');plt.ylabel('predFVC')
    plt.subplot(122);plt.scatter(last3['predConfidence'],last3['trueConfidence']);plt.xlabel('predConfidence');plt.ylabel('trueConfidence')
    plt.plot([0,600],[0,600],c='orange');plt.show()

    model_group_scores.append( calcScore(last3['FVC'].values, last3['predFVC'].values, last3['predConfidence'].values) )
    print(f"CV {model_group_scores[-1]}")

a=np.array(model_group_scores)

# CV

In [None]:
# Ensemble the model groups: FVC predictions are averaged, confidences are
# combined as a root-mean-square; the score is computed on the last-3 rows.
final_oof_df = pd.DataFrame({
    'predFVC': np.average([grp.predFVC for grp in trainModelGroup],0),
    'FVC': np.mean([grp.FVC for grp in trainModelGroup],0),
    'predConfidence': np.sqrt( np.average([grp.predConfidence**2 for grp in trainModelGroup],0)),
    'is_last3': np.mean([grp.is_last3 for grp in trainModelGroup],0),
})

_last3_mask = final_oof_df.is_last3==1
_final_cv = calcScore(
    final_oof_df.loc[_last3_mask,'FVC'].values,
    final_oof_df.loc[_last3_mask,'predFVC'].values,
    final_oof_df.loc[_last3_mask,'predConfidence'].values,
)
print(f"CV {_final_cv}")

# Submission

In [None]:
gc.collect()

if CNF['isTrain']==False:
    
    !cp {'/'.join(CNF['mskDfPath'].split('/')[:-1])}/createLungMaskFeat.py .
    !python createLungMaskFeat.py -path ../input/osic-pulmonary-fibrosis-progression/test -out LTRC_feat_test_df.csv -outvertical ./test_vertical
    
    # read test csv
    test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv') 
    

    # add Featreus
    test,_ = addFeatureDf(test,'./LTRC_feat_test_df.csv','../input/osic-pulmonary-fibrosis-progression/test',is_test=True)
    
    test = test.drop(columns=['Weeks','rWeeks'])

    # add Weeks from sub
    _sub  = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
    _sub['Patient'] = _sub['Patient_Week'].apply(lambda x:x.split('_')[0])
    _sub['Weeks']   = _sub['Patient_Week'].apply(lambda x:x.split('_')[1]).astype('int')
    _sub.drop(columns=['FVC','Confidence'],inplace=True)

    # update test
    test = _sub.merge(test,on='Patient')
    test['rWeeks'] = test['Weeks']-test['Week0']
    
    # make prediction
    test['predFVC']=0
    
    
    #
    # predict FVC and Confidence
    #
    
    test_orig = test.copy()
    testModelGroup = [ test.copy() for _ in range(len(CNF['prefixes'])) ]

    CNF['jpgPath']='./segmented'
    for prefixsModelGroup,weightsModelGroup,test in zip(CNF['prefixes'],CNF['ensembleWeight'],testModelGroup):
        
        for prefix,ensemble_weight in zip(prefixsModelGroup,weightsModelGroup):

            for megaFold in range(nMegaFold):

                # fitterをtest用に設定
                if 'LGBM' in prefix:
                    if 'week' in prefix:
                        fvc_fitter = FVCFitterWeekLGBM(CNF,fold_id=megaFold,prefix=prefix)
                    else:
                        fvc_fitter = FVCFitterLGBM(CNF,fold_id=megaFold,prefix=prefix)
            
                else:
                    fvc_fitter = FVCFitterNN01(CNF,fold_id=megaFold,prefix=prefix)

                fvc_fitter.CNF['mskDfPath'] = './LTRC_feat_test_df.csv'

                test['predFVC'] += ensemble_weight * ( fvc_fitter.predict( test.drop(columns='FVC'),image_path='test' ) )/5
                
                del fvc_fitter
                gc.collect()

    
    for cnt,(prefixsModelGroup,weightsModelGroup,test) in enumerate( zip(CNF['prefixes'],CNF['ensembleWeight'],testModelGroup) ):
        
        confidence_fitter_prefix=f"lastconfModelGroup{cnt}"
        
        test['predConfidence']=0
        for megaFold in range(nMegaFold):

            conf_fitterRidge = ConfidenceFitter(CNF,fold_id=fold,model_name='ridge',prefix=confidence_fitter_prefix+'RIDGE')
            conf_fitterSVR   = ConfidenceFitter(CNF,fold_id=fold,model_name='svr',prefix=confidence_fitter_prefix+'SVR')

            _predRidge = conf_fitterRidge.predict(test[['Patient']+conf_cols])
            _predSVR   = conf_fitterSVR.predict(test[['Patient']+conf_cols])

            _pred = np.sqrt( 0.2*(_predSVR**2) + 0.8*(_predRidge**2) )

            test['predConfidence'] += (_pred**2)/5

        test['predConfidence'] = np.sqrt(test['predConfidence']) 
        
    # モデルグループをアンサンブル
    
    _predFVC = np.mean([test.predFVC for test in testModelGroup],0)
    _predConfidence = np.sqrt( np.mean([test.predConfidence**2 for test in testModelGroup],0) )
    
    # ここで繰り返し変数的に使っていたtestを元に戻す
    
    test = test_orig
    
    test['predFVC'] = _predFVC
    test['predConfidence'] = _predConfidence 
    
        
    #
    # save submission file 
    #
    
    sub = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

    for k in sub.Patient_Week.values:

        p, w = k.split('_')
        w = int(w) 

        sub.loc[sub.Patient_Week == k, 'FVC'] = test.loc[ (test.Patient==p)&(test.Weeks==w), 'predFVC'].values
        sub.loc[sub.Patient_Week == k, 'Confidence'] = test.loc[ (test.Patient==p)&(test.Weeks==w), 'predConfidence'].values

    sub[["Patient_Week","FVC","Confidence"]].to_csv("submission.csv", index=False)
    
    # check submission file
    sub['Patient'] = sub['Patient_Week'].apply(lambda x: x.split('_')[0])
    sub['Weeks']   = sub['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
    subPatientList = sub.Patient.unique().tolist()

    if len(subPatientList)==5:

        for p in subPatientList:

            plt.scatter( sub.loc[sub.Patient==p,'Weeks'], sub.loc[sub.Patient==p,'FVC'])
            plt.show()

    
    

In [None]:
# Remove the intermediate test_verticalX / test_verticalY directories
# (presumably written by createLungMaskFeat.py via `-outvertical ./test_vertical` - confirm).
!rm -rf test_verticalX
!rm -rf test_verticalY

In [None]:
# Clean up the working directory: delete the per-model LGBM directories
# (they hold the pickled models and fold splits written during this run).
for cnt,prefixsModelGroup in enumerate( CNF['prefixes'] ):
    
    for prefix in prefixsModelGroup:

        if 'LGBM' in prefix:

            !rm -rf {prefix}
        
# Delete the ./segmented directory (CNF['jpgPath'] is pointed at it in the submission cell)
!rm -rf ./segmented

# Delete the saved confidence fitter artifacts (Ridge and SVR) of every model group
for cnt,prefixsModelGroup in enumerate( CNF['prefixes'] ):
        
    confidence_fitter_prefix=f"lastconfModelGroup{cnt}"
    
    !rm -rf {confidence_fitter_prefix+'RIDGE'}
    !rm -rf {confidence_fitter_prefix+'SVR'}