In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
import pydicom # this one is to read the dicom files 
import scipy.ndimage
import matplotlib.pyplot as plt 
import sklearn
from sklearn.preprocessing import normalize
from tqdm.auto import tqdm 

import lightgbm as lgb
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F

from skimage import measure, morphology 

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
### CSV Functions
def csv_split (data, v, t):
    """Split visit records into train / validation / test sets by patient.

    Patients are ordered by first appearance in ``data``. The last ``v``
    patients form the validation set, the ``t`` patients immediately before
    them form the test set, and everything earlier is the training set.
    All rows of a patient always land in the same split.

    Args:
        data: DataFrame with at least ``Patient`` and ``Weeks`` columns.
            It is not modified.
        v: number of patients to hold out for validation (0 for none).
        t: number of patients to hold out for test (0 for none).

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames, each with a fresh
        0..n-1 index. A split with no patients is an empty frame that keeps
        the columns of ``data``.
    """
    # keep=False discards every row of a duplicated (Patient, Weeks) pair,
    # not just the repeats. The non-inplace form leaves the caller's frame alone.
    data = data.drop_duplicates(keep=False, subset=['Patient','Weeks'])

    # Patient ids to exclude outright (currently only the empty id).
    drop_patientID = ['']
    data = data[~data['Patient'].isin(drop_patientID)].reset_index(drop=True)

    unique_patient = data['Patient'].unique()
    n_patients = len(unique_patient)

    # Use explicit non-negative boundaries. The negative-slice form `[-v:]`
    # selects *every* patient when v == 0, which silently put all data in the
    # validation split and left the training split empty.
    val_start = max(n_patients - v, 0)
    test_start = max(n_patients - v - t, 0)

    def _rows_for(patient_ids):
        # Rows are gathered patient by patient so each split is grouped by
        # patient in first-appearance order.
        if len(patient_ids) == 0:
            return data.iloc[0:0].reset_index(drop=True)
        parts = [data.loc[data['Patient'] == pid] for pid in patient_ids]
        return pd.concat(parts).reset_index(drop=True)

    train = _rows_for(unique_patient[:test_start])
    valid = _rows_for(unique_patient[val_start:])
    test = _rows_for(unique_patient[test_start:val_start])

    return train, valid, test

def csv_preprocess (data):
    """Expand per-visit records into shuffled (base visit -> other visit) training pairs.

    For every patient, each visit is used as a "base" row and paired with every
    other visit of the same patient whose (shifted) week differs from the base
    week. The paired visit supplies the ``Week`` and ``actual_FVC`` columns.

    Side effect: the 'Healthy-FVC' column and the one-hot columns are written
    onto the caller's ``data`` frame before ``data`` is rebound locally.

    Args:
        data: DataFrame read with columns 'Patient', 'Weeks', 'FVC', 'Percent',
            'Age', 'Sex' and 'SmokingStatus' (these are the columns accessed below).

    Returns:
        DataFrame with columns Patient, base_Weeks, base_FVC, Age, Healthy-FVC,
        the five one-hot columns in FE1, Week and actual_FVC, in random row
        order with a fresh 0..n-1 index.
    """
    
    # Healthy FVC: FVC * 100 / Percent, rounded.
    data['Healthy-FVC']=round((data['FVC']*100)/data['Percent'])
    FE=[]
    FE.append('Healthy-FVC')
    
    # One-hot encode Sex and SmokingStatus; one 0/1 column is created per value
    # that actually occurs in `data`, named after that value.
    COLS = ['Sex','SmokingStatus']
    for col in COLS:
        for mod in data[col].unique():
            FE.append(mod)
            data[mod] = (data[col] == mod).astype(int)
    
    # From here on `data` is a new frame restricted to the modelling columns.
    data =  data[['Patient','Weeks','FVC','Age']+FE]
    
    # Full list of one-hot columns expected in the output, whether or not each
    # value occurred in this particular `data`.
    FE1=['Male','Female','Ex-smoker','Never smoked','Currently smokes']
    # Rename Weeks/FVC to base_Weeks/base_FVC: they describe the base visit of a pair.
    rename_col={'Weeks':'base_Weeks','FVC':'base_FVC'}
    data=data.rename(columns=rename_col)
    
    # Shift weeks by +12 (per the original note: week -12 becomes 0, week 133 becomes 145).
    data.base_Weeks+=12
    # Empty template that fixes the output column set, including the new
    # Week and actual_FVC fields.
    npData=pd.DataFrame(columns=['Patient','base_Weeks','base_FVC','Age','Healthy-FVC']+FE1+['Week','actual_FVC'])
    
    for pid in data['Patient'].unique():
        # All visits of this patient; weeks/fvc are re-indexed 0..k-1 so they can
        # be addressed positionally, while `index` keeps the labels in `data`.
        weeks=data.loc[data['Patient']==pid].base_Weeks
        fvc = data.loc[data['Patient']==pid].base_FVC
        index = data.loc[data['Patient']==pid].index
        weeks.reset_index(inplace = True, drop = True)
        fvc.reset_index(inplace = True, drop = True)
        for j in index:
            for k in range(len(weeks)):
                # Skip pairing a visit with itself (same week as the base row).
                if (weeks[k] == data.at[j,'base_Weeks']):
                    continue
                else:
                    # Append a copy of base row j, then fill in the paired visit's
                    # week and FVC on the row just appended.
                    # NOTE(review): one concat per pair makes this quadratic in the
                    # number of output rows.
                    npData=pd.concat([npData,data.loc[data.index==j]], sort=False)
                    npData.iloc[-1, npData.columns.get_loc('Week')]=weeks[k]
                    npData.iloc[-1, npData.columns.get_loc('actual_FVC')]=fvc[k]
    npData.reset_index(inplace = True, drop = True)
    # Columns absent from `data` (e.g. a one-hot value in FE1 that never occurred)
    # are NaN after the concat; they become 0 here, as does any other NaN.
    npData=npData.fillna(0)
    
    # Random shuffle; no random_state is passed, so the order differs between runs.
    npData=sklearn.utils.shuffle(npData)
    npData.reset_index(inplace = True, drop = True)
    
    return npData

In [None]:
###Evaluation Metric Function
def laplace_log_likelihood(actual_fvc, predicted_fvc, confidence, return_values = False):
    """
    Modified Laplace log-likelihood used to score FVC predictions.

    The confidence (sigma) is floored at 70 and the absolute error is capped
    at 1000 before the per-sample score
    ``-sqrt(2) * error / sigma - log(sqrt(2) * sigma)`` is computed.

    Returns the per-sample scores when ``return_values`` is true, otherwise
    their mean.
    """
    root_two = np.sqrt(2)
    sigma = np.maximum(confidence, 70)
    abs_error = np.minimum(np.abs(actual_fvc - predicted_fvc), 1000)
    per_sample = -(root_two * abs_error / sigma) - np.log(root_two * sigma)

    return per_sample if return_values else np.mean(per_sample)

In [None]:
def sigma_generator (data):
    """Attach the best-scoring integer sigma to every row of ``data``.

    For each row, every integer confidence in 70..999 is scored with
    ``laplace_log_likelihood`` against the row's 'actual_FVC' and 'Prediction',
    and the confidence with the highest score is stored in 'actual_sigma'.

    Args:
        data: DataFrame with 'actual_FVC' and 'Prediction' columns. It is
            modified in place (the 'actual_sigma' column is added/overwritten).

    Returns:
        The same ``data`` object, with a float 'actual_sigma' column. Rows whose
        scores are all NaN keep NaN.
    """
    confidence=np.arange(70,1000,1)
    FVC=data['actual_FVC'].values
    Pred=data['Prediction'].values

    best_sigma = np.full(len(FVC), np.nan)
    for j in range(len(FVC)):
        score=laplace_log_likelihood(FVC[j], Pred[j], confidence, return_values = True)
        if np.all(np.isnan(score)):
            # No usable score for this row; leave the sigma undefined.
            continue
        # argmax returns a single position (the first one on ties), avoiding the
        # array-to-int conversion that fails when several sigmas tie.
        best_sigma[j] = confidence[np.argmax(score)]

    # Whole-column assignment is positional, so it is correct for any index,
    # not only a default 0..n-1 one.
    data['actual_sigma'] = best_sigma
    return data

In [None]:
class DATA(nn.Module):
    """Fully connected regressor mapping 10 input features to one output value.

    Ten Linear layers are stored as ``layer1``, ``layer3``, ..., ``layer19``;
    each is followed by an activation stored under the next even number
    (ReLU everywhere, ELU after the final layer). The output of ``layer7``
    (before its activation) is added back to the output of ``layer14``.
    """

    # (in_features, out_features) of the Linear layers, in forward order.
    _LINEAR_SHAPES = (
        (10, 64), (64, 128), (128, 256), (256, 512),
        (512, 512), (512, 512), (512, 512),
        (512, 128), (128, 64), (64, 1),
    )

    def __init__(self):
        super().__init__()
        final = len(self._LINEAR_SHAPES) - 1
        for pos, (fan_in, fan_out) in enumerate(self._LINEAR_SHAPES):
            # Registration order and attribute names determine the state_dict
            # keys, so they follow the layer1..layer20 numbering exactly.
            self.add_module('layer%d' % (2 * pos + 1), nn.Linear(fan_in, fan_out))
            activation = nn.ELU() if pos == final else nn.ReLU()
            self.add_module('layer%d' % (2 * pos + 2), activation)

    def _apply_layers(self, x, first, last):
        """Run ``x`` through layer<first> .. layer<last> (inclusive) in order."""
        for number in range(first, last + 1):
            x = getattr(self, 'layer%d' % number)(x)
        return x

    def forward(self, x):
        # Everything up to and including the 256->512 projection.
        x = self._apply_layers(x, 1, 7)
        skip = x
        # Three 512-wide layers with their activations.
        x = self._apply_layers(x, 8, 14)
        # Residual connection around the 512-wide stack.
        x = x + skip
        # Narrowing head ending in the single ELU-activated output.
        return self._apply_layers(x, 15, 20)

class SIGMA(nn.Module):
    """Densely connected MLP mapping 10 input features to one non-negative value.

    Four two-layer blocks are chained; the raw input is concatenated back in
    front of every later block, and the last block sees the input together
    with the outputs of all three earlier blocks (10 + 118 + 502 + 118 = 748).
    The final ReLU keeps the output at or above zero.
    """

    def __init__(self):
        super().__init__()
        self.data_net1 = self._two_layer_block(10, 64, 118)
        self.data_net2 = self._two_layer_block(128, 256, 502)   # input: 10 + 118
        self.data_net3 = self._two_layer_block(512, 256, 118)   # input: 10 + 502
        self.data_net4 = self._two_layer_block(748, 64, 1)      # input: 10 + 118 + 502 + 118

    @staticmethod
    def _two_layer_block(n_in, n_hidden, n_out):
        """Linear -> ReLU -> Linear -> ReLU, with the Linear layers at indices 0 and 2."""
        return nn.Sequential(
            nn.Linear(n_in, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_out),
            nn.ReLU(),
        )

    def forward(self, data_i):
        first = self.data_net1(data_i)
        second = self.data_net2(torch.cat((data_i, first), dim=-1))
        third = self.data_net3(torch.cat((data_i, second), dim=-1))
        everything = torch.cat((data_i, first, second, third), dim=-1)
        return self.data_net4(everything)

In [None]:
######## Meta Data Model Trainer ##############################

def train_data_net(epochs, batch_size, npTrain,npValid, model, train_device ='cpu'):
    """Train the FVC regression network on tabular features, validating after each epoch.

    This function reads two names it does not define: ``optimizer`` (used for
    ``zero_grad()`` / ``step()``) and ``compute_loss(prediction, target)``.
    Both must exist at module level before it is called.

    Rows are consumed in the order given (no shuffling here) in consecutive
    slices of ``batch_size``. The reported "accuracy" is
    ``1 - |(prediction - target) / target|`` averaged over the batch.

    Args:
        epochs: number of passes over ``npTrain``.
        batch_size: rows per optimisation step.
        npTrain: DataFrame holding the ten feature columns and 'actual_FVC'.
        npValid: DataFrame with the same columns, used for validation only.
        model: the network to train; moved to CUDA when ``train_device == 'cuda'``.
        train_device: 'cpu' (default) or 'cuda'.

    Returns:
        None. ``model`` is updated in place; progress is shown via tqdm/print.
    """
    feature_cols = ['base_Weeks', 'base_FVC', 'Age', 'Male', 'Female', 'Ex-smoker',
                    'Never smoked', 'Currently smokes', 'Week', 'Healthy-FVC']
    x_train_values = npTrain[feature_cols].values
    y_train_values = npTrain['actual_FVC'].values
    x_valid_values = npValid[feature_cols].values
    y_valid_values = npValid['actual_FVC'].values

    use_cuda = train_device == 'cuda'
    if use_cuda:
        model.to(torch.device("cuda"))

    # Process-wide cuDNN switch; setting it once is enough.
    torch.backends.cudnn.benchmark = True

    def _load_batch(x_values, y_values, step):
        # Slice one batch and turn it into float tensors on the training device.
        start_i = step * batch_size
        end_i = start_i + batch_size
        xb = torch.tensor(x_values[start_i:end_i]).float()
        yb = torch.tensor(y_values[start_i:end_i]).float().unsqueeze(1)
        if use_cuda:
            xb, yb = xb.cuda(), yb.cuda()
        return xb, yb

    for epoch in range(epochs):

        ## Training
        n = len(x_train_values)
        model.train()
        Steps = (n-1)// batch_size +1
        pbar = tqdm(range(Steps), total= Steps)
        for i in pbar:
            xb_meta, Y_target = _load_batch(x_train_values, y_train_values, i)

            # Clear gradients first so nothing left over from earlier work
            # leaks into this step.
            optimizer.zero_grad()
            prediction = model(xb_meta)
            loss = compute_loss(prediction, Y_target)

            loss.backward()
            optimizer.step()
            with torch.no_grad():
                # Reduce to one number per batch; calling .item() on the
                # (batch, 1) tensor only works for batch_size == 1.
                accuracy = (1- ((prediction- Y_target)/Y_target).abs()).mean()

            s = ('Epochs: %5d/%d , Steps: %8d/%d , train_loss: %5.3f  ,trian_accuracy: %5.3f'%\
                  (epoch, epochs, i, Steps, loss.item(), accuracy.item()))
            pbar.set_description(s)

        ## Validation
        n = len(x_valid_values)
        if n == 0:
            # Nothing to validate on; avoid dividing by zero below.
            continue
        val_acc_total = 0.
        Steps = (n-1)// batch_size +1
        pbar = tqdm(range(Steps), total= Steps)
        model.eval()
        # No gradients are needed for validation, so skip graph construction.
        with torch.no_grad():
            for i in pbar:
                xb_meta, Y_target = _load_batch(x_valid_values, y_valid_values, i)
                prediction = model(xb_meta)
                loss = compute_loss(prediction, Y_target)
                per_sample_acc = 1- ((prediction- Y_target)/Y_target).abs()

                s = ('Epochs: %5d/%d , Steps: %8d/%d , val_loss: %5.3f  ,val_accuracy: %5.3f'%\
                      (epoch, epochs, i, Steps, loss.item(), per_sample_acc.mean().item()))
                pbar.set_description(s)

                # Sum per sample so that dividing by n gives a true mean for
                # any batch size (including a short final batch).
                val_acc_total += per_sample_acc.sum().item()
        avg_val_acc = (val_acc_total)/n
        print('Average Validation accuracy:', avg_val_acc)

        #PATH = 'Epoch'+'%s_'%epoch+'%s'%avg_val_acc + '.pth'
        #torch.save(model.state_dict(), './checkpoint_data_models'+os.sep+PATH)

######## Sigma Model Trainer ##############################

def train_sigma_net(epochs, batch_size, npTrain, npValid, model, train_device ='cpu'):
    """Train the sigma (confidence) network, validating after each epoch.

    The loss is the mean relative error ``|(prediction - target) / target|``
    over the batch, and the reported "accuracy" is one minus that quantity.
    The function reads a module-level ``optimizer`` it does not define; it must
    exist before the call.

    Rows are consumed in the order given (no shuffling here) in consecutive
    slices of ``batch_size``.

    Args:
        epochs: number of passes over ``npTrain``.
        batch_size: rows per optimisation step.
        npTrain: DataFrame holding the ten feature columns (including
            'Prediction') and the 'actual_sigma' target.
        npValid: DataFrame with the same columns, used for validation only.
        model: the network to train; moved to CUDA when ``train_device == 'cuda'``.
        train_device: 'cpu' (default) or 'cuda'.

    Returns:
        None. ``model`` is updated in place; progress is shown via tqdm/print.
    """
    feature_cols = ['base_Weeks','base_FVC','Age','Male','Female','Ex-smoker',
                    'Never smoked','Currently smokes','Healthy-FVC','Prediction']
    x_train_values = npTrain[feature_cols].values
    y_train_values = npTrain['actual_sigma'].values
    x_valid_values = npValid[feature_cols].values
    y_valid_values = npValid['actual_sigma'].values

    use_cuda = train_device == 'cuda'
    if use_cuda:
        model.to(torch.device("cuda"))

    # Process-wide cuDNN switch; setting it once is enough.
    torch.backends.cudnn.benchmark = True

    def _load_batch(x_values, y_values, step):
        # Slice one batch and turn it into float tensors on the training device.
        start_i = step * batch_size
        end_i = start_i + batch_size
        xb = torch.tensor(x_values[start_i:end_i]).float()
        yb = torch.tensor(y_values[start_i:end_i]).float().unsqueeze(1)
        if use_cuda:
            xb, yb = xb.cuda(), yb.cuda()
        return xb, yb

    for epoch in range(epochs):

        ## Training
        n = len(x_train_values)
        model.train()
        Steps = (n-1)// batch_size +1
        pbar = tqdm(range(Steps), total= Steps)
        for i in pbar:
            xb_meta, Y_target = _load_batch(x_train_values, y_train_values, i)

            # Clear gradients first so nothing left over from earlier work
            # leaks into this step.
            optimizer.zero_grad()
            prediction = model(xb_meta)
            relative_error = ((prediction- Y_target)/Y_target).abs()
            # backward() needs a scalar: the unreduced (batch, 1) tensor only
            # worked when batch_size was 1.
            loss = relative_error.mean()

            loss.backward()
            optimizer.step()
            accuracy = (1- relative_error.detach()).mean()

            s = ('Epochs: %5d/%d , train_loss: %5.3f  ,trian_accuracy: %5.3f'%\
                  (epoch, epochs,  loss.item(), accuracy.item()))
            pbar.set_description(s)

        ## Validation
        n = len(x_valid_values)
        if n == 0:
            # Nothing to validate on; avoid dividing by zero below.
            continue
        val_loss=0
        val_acc_total = 0.
        Steps = (n-1)// batch_size +1
        pbar = tqdm(range(Steps), total= Steps)
        model.eval()
        # No gradients are needed for validation, so skip graph construction.
        with torch.no_grad():
            for i in pbar:
                xb_meta, Y_target = _load_batch(x_valid_values, y_valid_values, i)
                prediction = model(xb_meta)
                relative_error = ((prediction- Y_target)/Y_target).abs()
                per_sample_acc = 1- relative_error

                s = ('Epochs: %5d/%d , val_loss: %5.3f  ,val_accuracy: %5.3f'%\
                      (epoch, epochs,  relative_error.mean().item(), per_sample_acc.mean().item()))
                pbar.set_description(s)

                # Sum per sample so that dividing by n gives true means for
                # any batch size (including a short final batch).
                val_loss += relative_error.sum().item()
                val_acc_total += per_sample_acc.sum().item()
        avg_loss = val_loss/n
        avg_val_acc = (val_acc_total)/n
        print('Average Validation accuracy:', avg_val_acc)
        print('Average Validation loss:', avg_loss)

        #PATH = 'Epoch'+'%s_'%epoch+'%s'%avg_val_acc + '.pth'
        #torch.save(model.state_dict(), './checkpoint_sigma_models'+os.sep+PATH)

In [None]:
######### ----------------- Making the Evaluation Data -------------------- #######
def make_eval_data(npEval, model):
    """Run the FVC model over ``npEval`` and store its outputs in a 'Prediction' column.

    Args:
        npEval: DataFrame containing the ten feature columns listed below.
            It is modified in place: 'Prediction' is added and the index is
            reset, with the previous index kept as an 'index' column.
        model: callable taking a float tensor of shape (rows, 10) and returning
            one value per row. It is invoked with autograd disabled; it should
            already be in eval mode.

    Returns:
        The same ``npEval`` object.
    """
    feature_cols = ['base_Weeks', 'base_FVC', 'Age', 'Male', 'Female', 'Ex-smoker',
                    'Never smoked', 'Currently smokes', 'Week', 'Healthy-FVC']
    # Force a numeric array so object-typed (but numeric) columns still convert.
    x_features = torch.tensor(npEval[feature_cols].to_numpy(dtype='float64')).float()

    predictions = []
    rows_per_call = 1024  # bounds peak memory on large frames
    # Inference only: skip autograd bookkeeping, and feed the model in batches
    # instead of one forward pass per row.
    with torch.no_grad():
        for start in range(0, x_features.shape[0], rows_per_call):
            batch_out = model(x_features[start:start + rows_per_call])
            predictions.extend(batch_out.reshape(-1).tolist())

    npEval['Prediction'] = predictions
    # Kept as in the original: the old index is preserved as an 'index' column.
    npEval.reset_index(inplace = True)
    return npEval



######### ----------------- Making the Sigma (Confidence) Evaluation Data -------------------- #######
def make_eval_sigma(npEval, model):
    """Run the sigma model over ``npEval`` and store its outputs in a 'confidence' column.

    Args:
        npEval: DataFrame containing the ten feature columns listed below
            (including 'Prediction'). It is modified in place: 'confidence'
            is added. The index is left untouched.
        model: callable taking a float tensor of shape (rows, 10) and returning
            one value per row. It is invoked with autograd disabled; it should
            already be in eval mode.

    Returns:
        The same ``npEval`` object.
    """
    feature_cols = ['base_Weeks','base_FVC','Age','Male','Female','Ex-smoker',
                    'Never smoked','Currently smokes','Healthy-FVC','Prediction']
    # Force a numeric array so object-typed (but numeric) columns still convert.
    x_features = torch.tensor(npEval[feature_cols].to_numpy(dtype='float64')).float()

    confidences = []
    rows_per_call = 1024  # bounds peak memory on large frames
    # Inference only: skip autograd bookkeeping, and feed the model in batches
    # instead of one forward pass per row.
    with torch.no_grad():
        for start in range(0, x_features.shape[0], rows_per_call):
            batch_out = model(x_features[start:start + rows_per_call])
            confidences.extend(batch_out.reshape(-1).tolist())

    npEval['confidence'] = confidences
    return npEval

In [None]:
# Build the inference frame: one row per (test patient, requested week).
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

# Patient_Week has the form "<patient id>_<week>". Splitting on the last
# underscore keeps ids intact even if they were to contain an underscore.
patient_week = submission['Patient_Week'].str.rsplit('_', n=1, expand=True)
submission['Patient'] = patient_week[0]
submission['Weeks'] = patient_week[1].astype(int)

# Week bias: the same +12 shift csv_preprocess applies to the training weeks.
WEEK_BIAS = 12
submission.Weeks += WEEK_BIAS

testdf = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
# The baseline visit week becomes the base_Weeks feature. csv_preprocess shifts
# base_Weeks by 12 as well, so it has to be shifted here too; otherwise the
# model sees base_Weeks on a different scale at inference than in training.
testdf['Weeks'] += WEEK_BIAS

# After the merge, *_x columns come from test.csv (the baseline visit) and *_y
# columns from the sample submission (the week to predict).
merge = (
    pd.merge(testdf, submission, on=['Patient'], how='left')
      .sort_values(['Weeks_y','Patient'])
      .reset_index(drop=True)
      .drop(['FVC_y'], axis=1)
      .rename(columns={'FVC_x':'base_FVC','Weeks_y':'Week','Weeks_x':'base_Weeks'})
)

# testdf: model inputs. submission: the frame that is eventually written out,
# in the same row order as testdf.
testdf = merge.loc[:, ['Patient','base_Weeks','base_FVC','Percent','Age','Sex','SmokingStatus','Week']]
submission = merge.loc[:, ['Patient_Week','base_FVC','Confidence']].rename(columns={'base_FVC':'FVC'})

In [None]:
# Derive the model features for the test rows (mirrors the feature set built
# by csv_preprocess for training).
data = testdf.copy()
data['Healthy-FVC'] = ((data['base_FVC']*100)/data['Percent']).round()

# One-hot columns are created for the full, fixed category list rather than
# for whatever values happen to occur in the test file, so every expected
# column exists (as 0) even when a category is absent.
ONE_HOT_SOURCES = {
    'Sex': ['Male','Female'],
    'SmokingStatus': ['Ex-smoker','Never smoked','Currently smokes'],
}
for col, categories in ONE_HOT_SOURCES.items():
    for mod in categories:
        data[mod] = (data[col] == mod).astype(int)

FE1 = ['Male','Female','Ex-smoker','Never smoked','Currently smokes']

# Missing values become 0, as before. No DataFrame.append is involved (it was
# removed in pandas 2.0), and the columns keep their numeric dtypes.
npData = data.fillna(0)

testdf = npData[['Patient','base_Weeks','base_FVC','Age','Healthy-FVC'] + FE1 + ['Week']].copy()
   

In [None]:
#print(testdf.head())

In [None]:
# Load the pretrained FVC network and predict FVC for every test row.
model_FVC = DATA()
# map_location='cpu' lets the checkpoint load on a CPU-only session even if it
# was saved from a GPU; make_eval_data builds CPU tensors, so the model has to
# live on the CPU anyway.
model_FVC.load_state_dict(
    torch.load('../input/metadatapreweights/metadata_checkpoint.pth', map_location='cpu')
)
model_FVC.eval()

# Input for the sigma stage: the test features plus a 'Prediction' column.
test_inp_sigma = make_eval_data(testdf.copy(), model_FVC)

#test_inp_sigma.head()

In [None]:
# model_sigma = SIGMA()
# model_sigma.load_state_dict(torch.load('../input/sigmapreweight/sigma_preweight.pth'))
# model_sigma.eval()


# test_confidence_df = make_eval_sigma(test_inp_sigma.copy(), model_sigma)

# #test_confidence_df.head()

In [None]:
# train = test_inp_sigma.copy()
# df = pd.DataFrame(columns=['Patient','base_Weeks','confidence'])
# tid=train.Patient.unique()
# i=0
# for pid in tid:
#     weeks=train.loc[train.Patient==pid].base_Weeks.unique()
#     for w in weeks:
#         temp=train[train.Patient==pid]
#         value=temp[temp.base_Weeks==w].Prediction.values
#         diff = (value - np.mean(value))
#         var = np.square(diff).sum()/len(value)
#         sd = np.sqrt(var)
#         c = sd*1.96/np.sqrt(len(value))
#         #print(6*c)
#         df.at[i,'Patient']=pid
#         df.at[i,'base_Weeks']=w
#         df.at[i,'confidence']=8*c
#         i=i+1

In [None]:
# merge = pd.merge(df, train, on=['Patient','base_Weeks'], how='left')
# result=merge[['Patient', 'base_Weeks', 'base_FVC', 'Age', 'Healthy-FVC',
#        'Male', 'Female', 'Ex-smoker', 'Never smoked', 'Currently smokes',
#        'Week', 'Prediction', 'confidence']]

In [None]:
# submission.loc[:, 'FVC']=result.Prediction
# submission.loc[:, 'Confidence']=result.confidence


#print(submission.tail())
#temp=temp.sort_values(by=['Week'], ascending = True)
#temp.reset_index(inplace=True , drop=True)

# temp_df = pd.DataFrame([])
# temp_df['Patient_Week'] = temp['Patient'].apply(lambda x: str(x)) +'_'+ temp['Week'].apply(lambda x: str(x))
# temp['Patient_Week'] = temp_df['Patient_Week'].values
# sample_submission_df = pd.DataFrame([])
# sample_submission_df['Patient_Week'] = temp['Patient_Week'].values
# sample_submission_df['FVC'] = temp['Prediction'].values
# sample_submission_df['Confidence'] = temp['confidence'].values
#print(sample_submission_df.tail(20))
# sample_submission_df = test_confidence_df[['Patient_Week','Prediction', 'confidence']]
# sample_submission_df.rename(columns ={'Prediction':'FVC','confidence':'Confidence'}, inplace=True) 
# sample_submission_df.head()

In [None]:
# # ### CSV Load, Read and Process
# path = "../input/osic-pulmonary-fibrosis-progression"

# data = pd.read_csv(f"{path}/train.csv")                        
# final_test  = pd.read_csv(f"{path}/test.csv")  

# valid, train, test = csv_split(data, v=0, t=0)
# print(train.shape)
# #print(valid.shape)
# #print(test.shape)
# npTrain=csv_preprocess(train)
# #npValid=csv_preprocess(valid)
# #npTest=csv_preprocess(test)
# print(npTrain.shape)
# #print(npValid.shape)
# #print(npTest.shape)

In [None]:
# model_FVC = DATA()
# model_FVC.load_state_dict(torch.load('../input/metadatapreweights/metadata_checkpoint.pth'))
# model_FVC.eval()
# # for the validation inputset of sigma 
# # ndf = pd.concat([npValid,npTest])
# # outdf = make_eval_data(df_test, model)
# # for the training inputset of sigma 
# #train_inp_sigma = make_eval_data(df_train, model)
# # for the final test inputset of sigma

# train_inp_sigma = make_eval_data(npTrain.copy(), model_FVC)

# train_inp_sigma.head()

In [None]:
# model_sigma = SIGMA()
# model_sigma.load_state_dict(torch.load('../input/sigmapreweight/sigma_preweight.pth'))
# model_sigma.eval()


# train_confidence_df = make_eval_sigma(train_inp_sigma.copy(), model_sigma)

# train_confidence_df.head()

In [None]:
# actual_FVC = train_inp_sigma.actual_FVC.values
# Prediction = train_inp_sigma.Prediction.values
# sigma = train_confidence_df.confidence.values
# #print(len(actual_FVC))
# #print(len(Prediction))
# #print(len(sigma))

In [None]:
# temp = sigma_generator(train_inp_sigma)

In [None]:
# actual_sigma=temp.actual_sigma.values

In [None]:
# score = laplace_log_likelihood(actual_FVC, Prediction, actual_sigma, return_values = True)

# print(np.mean(score))

In [None]:
# score = laplace_log_likelihood(actual_FVC, Prediction, 235, return_values = True)

# print(np.mean(score))

In [None]:
# score = laplace_log_likelihood(actual_FVC, Prediction, sigma, return_values = True)

# print(np.mean(score))

In [None]:
#===========================================================
def run_single_lightgbm(param, train_df, valid_df, test_df, categorical=False):
    """Fit one LightGBM regressor for 'actual_sigma' and predict on all three frames.

    Training runs for up to 10000 rounds with early stopping after 100 rounds
    without improvement, logging every 100 rounds. Predictions use the best
    iteration found.

    Args:
        param: LightGBM parameter dict passed to ``lgb.train``.
        train_df: DataFrame with the feature columns and 'actual_sigma'.
        valid_df: DataFrame with the same columns, used for early stopping.
        test_df: DataFrame with the feature columns (no target needed).
        categorical: when not equal to False, the five one-hot columns are
            declared as categorical features.

    Returns:
        Tuple ``(train_confidence, valid_confidence, test_confidence)`` of
        prediction arrays, one value per row of the corresponding frame.
    """
    feature_cols = ['base_FVC','Age','Healthy-FVC','Prediction','Male','Female',
                    'Ex-smoker', 'Never smoked', 'Currently smokes']
    one_hot_cols = ['Male','Female', 'Ex-smoker', 'Never smoked', 'Currently smokes']

    train_target=train_df[['actual_sigma']]
    valid_target=valid_df[['actual_sigma']]
    train_df = train_df[feature_cols]
    valid_df = valid_df[feature_cols]
    test_df = test_df[feature_cols]

    # Same test as the original (`== False`) so callers passing other values
    # keep getting the categorical branch.
    dataset_kwargs = {} if categorical == False else {'categorical_feature': one_hot_cols}
    trn_data = lgb.Dataset(train_df, label=train_target, **dataset_kwargs)
    val_data = lgb.Dataset(valid_df, label=valid_target, **dataset_kwargs)

    num_round = 10000

    train_kwargs = {'valid_sets': [trn_data, val_data]}
    if hasattr(lgb, 'log_evaluation'):
        # LightGBM 4 removed the verbose_eval / early_stopping_rounds keyword
        # arguments of lgb.train; callbacks are the supported equivalent.
        train_kwargs['callbacks'] = [
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ]
    else:
        # Older releases without the log_evaluation callback.
        train_kwargs['verbose_eval'] = 100
        train_kwargs['early_stopping_rounds'] = 100

    clf = lgb.train(param, trn_data, num_round, **train_kwargs)

    train_confidence = clf.predict(train_df, num_iteration=clf.best_iteration)
    valid_confidence = clf.predict(valid_df, num_iteration=clf.best_iteration)
    test_confidence = clf.predict(test_df, num_iteration=clf.best_iteration)

    # RMSE
    print("Valid RMSE score: {:<8.5f}".format( np.sqrt(mean_squared_error(valid_target, valid_confidence))))

    return train_confidence, valid_confidence, test_confidence


def run_kfold_lightgbm(param, train, valid, test, epochs=5, categorical=False):
    """Average the predictions of ``epochs`` repeated ``run_single_lightgbm`` fits.

    Every repetition is called with the same ``param``, ``train``, ``valid``
    and ``test``; no fold splitting is done here. Each run contributes
    ``prediction / epochs`` to the running averages.

    Returns:
        Tuple ``(train_confidence, valid_confidence, test_confidence)`` of
        averaged prediction arrays.
    """
    # One zero-initialised accumulator per frame, in (train, valid, test) order.
    averaged = [np.zeros(len(frame)) for frame in (train, valid, test)]

    valid_target = valid[['actual_sigma']]

    for fold_ in range(epochs):
        print("Epoch {}".format(fold_))
        run_outputs = run_single_lightgbm(param, train, valid, test,
                                          categorical=categorical)
        for running, fresh in zip(averaged, run_outputs):
            running += fresh/epochs

    train_confidence, valid_confidence, test_confidence = averaged

    # RMSE of the averaged validation predictions
    final_rmse = np.sqrt(mean_squared_error(valid_target, valid_confidence))
    print("Final Valid RMSE score: {:<8.5f}".format(final_rmse))

    print("=========================================================================================")

    return train_confidence, valid_confidence, test_confidence

In [None]:
# Inputs for the sigma stage: saved FVC predictions for the training and
# validation patients, plus the freshly predicted test rows.
sigma_stage_columns = [
    'Patient', 'base_Weeks', 'base_FVC', 'Age', 'Healthy-FVC',
    'Male', 'Female', 'Ex-smoker', 'Never smoked', 'Currently smokes',
    'Week', 'actual_FVC', 'Prediction',
]
train, valid = (
    pd.read_csv(csv_path)[sigma_stage_columns]
    for csv_path in ("../input/trainallpatient/outputtrain.csv",
                     "../input/validallpatient/outputvalid.csv")
)
test = test_inp_sigma.copy()


In [None]:
import scipy as sp
import scipy.optimize  # load the submodule explicitly; `import scipy` alone does not guarantee it
from functools import partial
import math

#row['FVC']=train['actual_FVC'].values
#row['Pred']=train['Prediction'].values

def loss_func(weight, row):
    """Negated Laplace log-likelihood of one row for a candidate sigma.

    Args:
        weight: candidate sigma, either a number or a one-element sequence
            (the optimiser passes a one-element array).
        row: mapping with 'actual_FVC' and 'Prediction'.

    Returns:
        A scalar to be minimised. Sigma is floored at 70 and the absolute
        error capped at 1000, as in ``laplace_log_likelihood``.
    """
    # Reduce to a plain float so the objective returns a scalar instead of a
    # one-element array.
    confidence = float(np.ravel(weight)[0])
    sigma_clipped = max(confidence, 70)
    diff = abs(row['actual_FVC'] - row['Prediction'])
    delta = min(diff, 1000)
    score = -math.sqrt(2)*delta/sigma_clipped - np.log(math.sqrt(2)*sigma_clipped)
    return -score


def _best_sigma_per_row(frame):
    """Numerically find, for every row of ``frame``, the sigma minimising ``loss_func``."""
    # NOTE(review): because of the floor at 70 the objective is flat for
    # sigma < 70, so values returned in that region are not unique. The search
    # is left unbounded, as before, so the fitted labels are unchanged.
    best = []
    for _, row in tqdm(frame.iterrows(), total=len(frame)):
        result = sp.optimize.minimize(partial(loss_func, row=row), [100], method='SLSQP')
        best.append(result['x'][0])
    return best


# Per-row target sigma for the LightGBM stage.
train['actual_sigma'] = _best_sigma_per_row(train)
valid['actual_sigma'] = _best_sigma_per_row(valid)

In [None]:
# LightGBM settings for the sigma regressor.
lgb_param = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    learning_rate=0.01,
    seed=0,
    max_depth=-1,
    verbosity=-1,
)

# Fit on train, early-stop on valid, and predict sigma for all three frames.
sigma_outputs = run_kfold_lightgbm(lgb_param, train, valid, test, epochs=5, categorical=False)
train_confidence, valid_confidence, test_confidence = sigma_outputs

In [None]:
# Attach the predicted sigma to each frame as a 'confidence' column.
for frame, predicted_sigma in ((train, train_confidence),
                               (valid, valid_confidence),
                               (test, test_confidence)):
    frame['confidence'] = predicted_sigma

In [None]:
# Per-row competition metric on the training frame, using the predicted sigma.
train_sigma = train['confidence'].values.astype(np.float64)
score = laplace_log_likelihood(
    train['actual_FVC'].values,
    train['Prediction'].values,
    train_sigma,
    return_values=True,
)
#print(np.mean(score))

In [None]:
# `submission` and `test` come from the same merged frame in the same row
# order, so the values are copied by position. Replacing the whole column
# (instead of `.loc[:, col] = ...`) avoids writing floats into the existing
# integer columns in place and cannot introduce NaN through index misalignment.
assert len(submission) == len(test), "submission and test must have one row per Patient_Week"
submission['FVC'] = test['Prediction'].to_numpy()
submission['Confidence'] = test['confidence'].to_numpy()

In [None]:
# Write the final predictions to submission.csv in the current working
# directory; index=False keeps the DataFrame index out of the file.
submission.to_csv('submission.csv',index=False)