In [None]:
import numpy as np
import pandas as pd
import pydicom
import os
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

In [None]:
'''
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import random
'''

import torch
import torchvision

In [None]:
def seed_everything(seed=2020):
    """Seed every random number source this notebook touches.

    Exports ``PYTHONHASHSEED`` and seeds Python's ``random`` module, NumPy's
    global generator, and PyTorch's default and CUDA generators with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)


seed_everything(42)

In [None]:
# Directory the competition CSVs (train / test / sample_submission) are read from.
ROOT = "../input/osic-pulmonary-fibrosis-progression"
# Mini-batch size constant.
BATCH_SIZE=128

In [None]:
# Training measurements; any (Patient, Weeks) pair that occurs more than once
# is removed entirely (keep=False drops every copy, not just the extras).
tr = pd.read_csv(f"{ROOT}/train.csv")
tr.drop_duplicates(subset=['Patient', 'Weeks'], keep=False, inplace=True)
chunk = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
sub = pd.read_csv(f"{ROOT}/sample_submission.csv")
# 'Patient_Week' is split on '_': the first piece is the patient id,
# the last piece is the week number.
id_parts = sub['Patient_Week'].str.split('_')
sub['Patient'] = id_parts.str[0]
sub['Weeks'] = id_parts.str[-1].map(int)
print(sub.index.size)
# Keep the submission columns, then attach each patient's row from test.csv
# (without its own 'Weeks', which would clash with the submission week).
sub = sub[['Patient', 'Weeks', 'Confidence', 'Patient_Week']].merge(
    chunk.drop(columns='Weeks'), on="Patient"
)

In [None]:
sub[sub.Patient=='ID00419637202311204720264']

In [None]:
# Tag every frame with its role so the combined table can be split back later.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
sub['WHERE'] = 'test'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same three
# frames in the same order and, like append, keeps the original row labels.
data = pd.concat([tr, chunk, sub])

In [None]:
print(tr.shape, chunk.shape, sub.shape, data.shape)
print(tr.Patient.nunique(), chunk.Patient.nunique(), sub.Patient.nunique(), 
      data.Patient.nunique())
#

In [None]:
# Earliest week per patient. Rows tagged 'test' are blanked to NaN first so the
# group minimum below is taken over the remaining (train / val) rows only.
data['min_week'] = data['Weeks']
data.loc[data.WHERE=='test','min_week'] = np.nan
# transform('min') writes each patient's minimum back onto all of that patient's rows.
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

In [None]:
 data.loc[data.Weeks == data.min_week]

In [None]:
# Baseline FVC per patient: the FVC on the first row whose week equals that
# patient's earliest week. Only one row per patient is kept.
first_visits = data.loc[data.Weeks == data.min_week, ['Patient', 'FVC']]
base = (
    first_visits
    .rename(columns={'FVC': 'min_FVC'})
    .drop_duplicates(subset='Patient', keep='first')
)

In [None]:
base[base.Patient=='ID00419637202311204720264']

In [None]:
# Attach each patient's baseline FVC; the left join keeps every row of `data`.
data = data.merge(base, on='Patient', how='left')
# Weeks elapsed since the patient's earliest week.
data['base_week'] = data['Weeks'] - data['min_week']
#del base
# Relative change in FVC versus the previous row of the frame.
# NOTE(review): diff/shift run over the whole frame rather than per patient, so
# the first row of a patient is compared with the last row of the preceding
# one. The column is not added to FE in this notebook; confirm before using it.
data['diff_fvc_prev']=data['FVC'].diff(1)/data['FVC'].shift(1)

In [None]:
data[data.Patient=='ID00007637202177411956430']

In [None]:
# Categorical columns expanded into one 0/1 indicator column per observed value.
COLS = ['Sex', 'SmokingStatus']  # ,'Age'
FE = []
for cat_col in COLS:
    levels = data[cat_col].unique()
    for level in levels:
        # The indicator column is named after the category value itself.
        data[level] = data[cat_col].eq(level).astype(int)
    FE.extend(levels)
#=================

In [None]:
FE

In [None]:
#
def _min_max_scale(values):
    """Rescale a Series linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest)


# (scaled column, source column) pairs, created in this order.
for scaled_name, raw_name in (
    ('age', 'Age'),
    ('BASE', 'min_FVC'),
    ('week', 'base_week'),
    ('percent', 'Percent'),
):
    data[scaled_name] = _min_max_scale(data[raw_name])

FE += ['age','percent','week','BASE']

In [None]:
#data.rename_columns({'base_week':'diff_from'})

In [None]:
#data['FVC'].diff(1)/data['FVC'].shift(1)*100

In [None]:
data[FE]

In [None]:
data.head()
#pd.options.display.max_rows=70
#pd.options.display.max_columns=40

# Replace every remaining NaN in the combined frame with 0.
data=data.fillna(0)

In [None]:
tr[tr['Patient']=='ID00419637202311204720264'] 

In [None]:
# Split the engineered table back into its three parts using the WHERE tag.
tr, chunk, sub = (
    data.loc[data.WHERE == role] for role in ('train', 'val', 'test')
)
#del data

In [None]:
tr.shape, chunk.shape, sub.shape

In [None]:
#qloss_func(torch.tensor([1800]),torch.tensor([2000])),qloss()(torch.tensor([1800]),torch.tensor([2000]))

### BASELINE NN 

In [None]:
#np.cumsum([1, 2,3])  # => [a, a + b, a + b + c]

In [None]:
# NumPy views of the modelling data.
y = tr['FVC'].values  # target: FVC of every training row
z = tr[FE].values  # training features, one column per name in FE
ze = sub[FE].values  # features of the submission rows
nh = z.shape[1]  # number of feature columns
# Zero-initialised (rows, 3) buffers; presumably for test and out-of-fold
# predictions as in the commented-out Keras loop further down.
pe = np.zeros((ze.shape[0], 3))
pred = np.zeros((z.shape[0], 3))

In [None]:
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler

In [None]:
sub[FE][sub.index==0].values

In [None]:
class DatasetRetriever(Dataset):
    """Map-style dataset that pairs each feature row with its target.

    Parameters
    ----------
    train_arrays : object exposing ``.shape`` and integer indexing
        Feature rows; stored on the instance as ``image_ids``.
    targets : indexable or None
        One target per row. Ignored when ``test`` is true.
    df, transforms : optional
        Accepted for call compatibility; neither is used.
    test : bool
        When true, a tensor of ones with ``train_arrays.shape[0]`` entries
        stands in for the targets.
    """

    def __init__(self, train_arrays, targets, df=None, transforms=None, test=False):
        super().__init__()
        self.image_ids = train_arrays
        self.test = test
        self.targets = torch.ones(train_arrays.shape[0]) if test else targets

    def __getitem__(self, index: int):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.image_ids[index], self.targets[index]

    def __len__(self) -> int:
        """Number of feature rows."""
        return self.image_ids.shape[0]

In [None]:
# Number of cross-validation folds.
NFOLD = 5
# K-fold splitter created with default options apart from the fold count.
kf = KFold(n_splits=NFOLD)

In [None]:
#len(train_dataset)
#train_dataset[0][0].shape 

In [None]:
class osic_model(torch.nn.Module):
    """Two-layer perceptron: Linear(9 -> 32), ReLU, Linear(32 -> 3).

    NOTE(review): ``n_inputs`` is accepted but never read; the input width is
    fixed at 9 regardless of the value passed.
    """

    def __init__(self, n_inputs=32):
        super().__init__()
        hidden_width = 32
        self.layer1 = torch.nn.Linear(9, hidden_width)
        self.relu = torch.nn.ReLU(inplace=True)
        self.fc = torch.nn.Linear(hidden_width, 3)

    def forward(self, input):
        """Map a batch of 9-feature rows to 3 outputs per row."""
        return self.fc(self.relu(self.layer1(input)))

In [None]:
model=osic_model()
model

In [None]:
#tmp_x,tmp_y=next(iter(train_loader))
#train_dataset[0][0].shape

In [None]:
# AdamW over all parameters of `model`, learning rate 1e-2.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-2)
#criterion=

In [None]:
class qloss(torch.nn.Module):
    """Pinball (quantile) loss, summed over quantiles and averaged over rows.

    Parameters
    ----------
    quantiles : sequence of float, optional
        Quantile levels, one per prediction column. Defaults to
        ``(0.2, 0.5, 0.8)``.
    """

    def __init__(self, quantiles=(0.2, 0.5, 0.8)):
        super(qloss, self).__init__()
        # The quantile levels are fixed constants, not trainable weights. A
        # buffer still follows .to(device) and is saved in state_dict, but it
        # is not returned by .parameters() and never receives gradients.
        self.register_buffer('w', torch.tensor(quantiles, dtype=torch.float32))

    def forward(self, y_pred, y_true):
        """Return the scalar loss.

        ``y_true`` gains a trailing axis and is broadcast against ``y_pred``,
        so the expected shapes are ``(batch,)`` for the targets and
        ``(batch, n_quantiles)`` for the predictions.
        """
        e = y_true.unsqueeze(-1) - y_pred
        w = self.w.unsqueeze(0)
        # Element-wise pinball term: w*e when under-predicting, (w-1)*e otherwise.
        v = torch.max(w * e, (w - 1.) * e)
        return torch.mean(torch.sum(v, dim=-1))
        
        

def qloss_func(y_true, y_pred):
    """Pinball loss for the quantiles 0.2 / 0.5 / 0.8, averaged over all entries.

    Parameters
    ----------
    y_true : torch.Tensor
        Targets. When it has exactly one axis fewer than ``y_pred`` a trailing
        axis is added so each target is compared with all of its row's
        quantile predictions.
    y_pred : torch.Tensor
        Predictions whose last axis broadcasts against the three quantiles.

    Returns
    -------
    torch.Tensor
        Scalar mean of the element-wise pinball terms.
    """
    qs = [0.2, 0.50, 0.8]
    if y_true.dim() + 1 == y_pred.dim():
        y_true = y_true.unsqueeze(-1)
    e = y_true - y_pred
    q = torch.tensor(qs, device=e.device)
    # torch.max with two tensors is already the element-wise maximum; indexing
    # its result with [0] would keep only the first row of the batch.
    v = torch.max(q * e, (q - 1.) * e)
    return torch.mean(v)

In [None]:
def quantile_loss(preds, target, quantiles = [0.2, 0.50, 0.8]):
    """Pinball loss summed over quantiles and averaged over rows.

    Parameters
    ----------
    preds : torch.Tensor
        Column ``i`` is the prediction for ``quantiles[i]``.
    target : torch.Tensor
        One target per row of ``preds``.
    quantiles : sequence of float
        Quantile levels; the default list is only read, never modified.

    Returns
    -------
    torch.Tensor
        Scalar loss.

    Raises
    ------
    AssertionError
        If ``preds`` and ``target`` differ in length.
    """
    assert len(preds) == len(target)
    losses = []
    for i, q in enumerate(quantiles):
        errors = target - preds[:, i]
        # Pinball term for this quantile, kept as a column so the per-quantile
        # terms can be concatenated side by side.
        losses.append(torch.max((q - 1) * errors, q * errors).unsqueeze(1))
    loss = torch.mean(torch.sum(torch.cat(losses, dim=1), dim=1))
    return loss

In [None]:
#quantile_loss (model(torch.randn(4,9)),gt[:4])
#a=torch.randn(4,9)
#qloss() (model(a),gt[:4]) ,quantile_loss(model(a),gt[:4])

In [None]:
from tqdm.notebook import tqdm

In [None]:
#https://www.kaggle.com/havinath/eda-observations-visualizations-pytorch/output
def metric_loss(pred_fvc,true_fvc):
    """Mean of ``-sqrt(2) * delta / sigma - log(sqrt(2) * sigma)`` over the batch.

    Parameters
    ----------
    pred_fvc : torch.Tensor
        Column 1 is used as the point prediction; ``sigma`` is column 2 minus
        column 0, floored at 70.
    true_fvc : torch.Tensor
        Targets; reshaped to match column 1 of ``pred_fvc``.

    Returns
    -------
    torch.Tensor
        Scalar score. ``delta`` is the absolute error capped at 1000. The
        training loop in this notebook treats larger values as better.
    """
    # A plain Python constant works on whatever device and dtype the
    # predictions use, and avoids allocating a tensor on every call.
    sqrt2 = 2.0 ** 0.5
    sigma = pred_fvc[:, 2] - pred_fvc[:, 0]
    true_fvc = torch.reshape(true_fvc, pred_fvc[:, 1].shape)
    sigma_clipped = torch.clamp(sigma, min=70)
    delta = torch.clamp(torch.abs(pred_fvc[:, 1] - true_fvc), max=1000)
    metric = -sqrt2 * delta / sigma_clipped - torch.log(sqrt2 * sigma_clipped)
    return metric.mean()

In [None]:
y.shape

In [None]:
# K-fold training. One checkpoint per fold is written to '<fold>_bestmodel.pth'.
criterion = qloss()
#model.load_state_dict(torch.load('bestmodel.pth'))
for fold, (tr_idx, val_idx) in enumerate(tqdm(kf.split(z), total=kf.get_n_splits())):
    # Start every fold from a fresh network and optimizer. Reusing one model
    # across folds means later folds are validated on rows the weights were
    # already trained on, which inflates the validation score.
    model = osic_model()
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-2)

    train_dataset = DatasetRetriever(train_arrays=z[tr_idx], targets=y[tr_idx])
    valid_dataset = DatasetRetriever(train_arrays=z[val_idx], targets=y[val_idx])

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        sampler=RandomSampler(train_dataset),
        pin_memory=False,
        drop_last=True,
        num_workers=4,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=False,
        drop_last=False,
        num_workers=4,
    )
    print(f'fold{fold}',end='\r')

    # Best validation metric seen so far in this fold (larger is better).
    best_score = -100000
    for epoch in range(650):
        model.train()
        train_loss = 0.
        for features, gt in train_loader:
            out = model(features.float())
            loss = criterion(out, gt)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # drop_last=True can leave zero batches on a very small fold.
        train_loss /= max(len(train_loader), 1)

        model.eval()
        val_loss = 0.
        ll = 0.
        with torch.no_grad():
            for features, gt in valid_loader:
                out = model(features.float())
                val_loss += criterion(out, gt).item()
                ll += metric_loss(out, gt).item()
        # Averages of per-batch means over the validation batches.
        val_loss /= len(valid_loader)
        ll /= len(valid_loader)

        # NOTE(review): logging and checkpointing only happen every 200th
        # epoch (0, 200, 400, 600), so improvements in between are not saved.
        # This cadence is unchanged; confirm it is intended.
        if epoch % 200 == 0:
            print(f'epoch {epoch} train loss : {train_loss} val_loss : {val_loss}')
            print(f'll :  {epoch} {ll}')
            if ll > best_score:
                print('saving best weights at', ll)
                best_score = ll
                torch.save(model.state_dict(), f'{fold}_bestmodel.pth')

In [None]:
ze[0:2],sub[0:2]

In [None]:
sub = sub.reset_index(drop=True)
# Test-mode dataset: targets are placeholders, only the features matter.
test_dataset = DatasetRetriever(
    train_arrays=ze,
    targets=None,
    test=True,
)
# Predictions are written back to `sub` by position and averaged across folds
# row by row, so the loader must yield rows in their original order on every
# pass. A RandomSampler would shuffle them differently on each pass.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=128,
    sampler=SequentialSampler(test_dataset),
    pin_memory=False,
    drop_last=False,
    num_workers=4,
)

In [None]:
test_dataset[0],FE

In [None]:
%%time
'''
cnt = 0
EPOCHS = 650
for tr_idx, val_idx in kf.split(z):
    cnt += 1
    print(f"FOLD {cnt}")
    net = make_model(nh)
    net.fit(z[tr_idx], y[tr_idx], batch_size=BATCH_SIZE, epochs=EPOCHS, 
            validation_data=(z[val_idx], y[val_idx]), verbose=0) #
    print("train", net.evaluate(z[tr_idx], y[tr_idx], verbose=0, batch_size=BATCH_SIZE))
    print("val", net.evaluate(z[val_idx], y[val_idx], verbose=0, batch_size=BATCH_SIZE))
    print("predict val...")
    pred[val_idx] = net.predict(z[val_idx], batch_size=BATCH_SIZE, verbose=0)
    print("predict test...")
    pe += net.predict(ze, batch_size=BATCH_SIZE, verbose=0) / NFOLD
#==============
'''

In [None]:
# Average the test predictions of the per-fold checkpoints.
model_preds = []
model.eval()
# Inference only: skip autograd bookkeeping for every forward pass.
with torch.no_grad():
    # One checkpoint per cross-validation fold, so iterate over NFOLD rather
    # than a literal count.
    for fold in range(NFOLD):
        model.load_state_dict(torch.load(f'{fold}_bestmodel.pth'))

        test_preds = []
        for x_test, y_test in tqdm(test_loader):
            preds = model(x_test.float())
            test_preds.append(preds)
        # (n_test_rows, 3) predictions of this fold's model.
        model_preds.append(torch.cat(test_preds))

# Mean over the fold axis, converted to a NumPy array.
preds_numpy = torch.stack(model_preds, dim=0).mean(0).detach().numpy()


In [None]:
#preds_numpy=torch.cat(test_preds).detach().numpy()#

In [None]:
quantiles = (0.2, 0.5, 0.8)

In [None]:
# Per-row spread between the third and first prediction columns, and its mean.
unc = preds_numpy[:,2] - preds_numpy[:, 0]
sigma_mean = np.mean(unc)
print( sigma_mean)

In [None]:
#test_dataset[0]

In [None]:
#df = pd.DataFrame(data=preds_numpy, columns=list(quantiles))
# Submission table: the middle prediction column is the FVC estimate and the
# spread between the outer columns is the confidence.
df = pd.DataFrame({
    'Patient_Week': sub['Patient_Week'],
    'FVC': preds_numpy[:, 1],
    'Confidence': preds_numpy[:, 2] - preds_numpy[:, 0],
}).reset_index(drop=True)


In [None]:
# Rows whose FVC is already given in test.csv are overwritten with that
# measured value and a fixed confidence of 0.1.
otest = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
for patient, week, fvc in zip(otest.Patient, otest.Weeks, otest.FVC):
    known = df['Patient_Week'] == patient + '_' + str(week)
    df.loc[known, 'FVC'] = fvc
    df.loc[known, 'Confidence'] = 0.1

In [None]:
df.to_csv('submission.csv', index=False)

In [None]:
df.describe().T

In [None]:
#df[(df.Patient_Week.str.contains('ID00419637202311204720264')) & (df.Confidence==0.1)]
#df
df

Please let me know if you find any bugs.
Base kernel: https://www.kaggle.com/ulrich07/osic-multiple-quantile-regression-starter/output