# LGBM Quantile
## Nested Cross-Validation for Estimating Generalization Error
Given that the public leaderboard forms only a small proportion of the total leaderboard, we might expect a 'shakeup' where the best performing models on the public leaderboard aren't necessarily the best performing on the overall test set.

Cross-validation, where we train and validate on different subsets of the data, allows us to look for the best hyperparameters (i.e. those that perform the best on average over all of the subsets). However, it does not allow us to estimate the generalization to the test set. This is because we select the hyperparameters that perform best on the validation sets, so taking the cross-validation average will lead to an overly optimistic estimate.

To get around this problem, nested cross-validation has folds within folds. Each 'inner' fold estimates the best hyperparameters (or for neural networks the convergence), while each 'outer' fold takes the optimized model from the inner fold and estimates a test error on the test data for that fold.

Having set up a robust cross-validation method, we can compare different methods in a rigorous way (and a way that hopefully will translate to the private test set).

http://ethen8181.github.io/machine-learning/ab_tests/quantile_regression/quantile_regression.html

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Load the dependencies
import cv2
import copy
from pathlib import Path
from sklearn.model_selection import KFold
from skimage.segmentation import clear_border
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.measure import label, regionprops
from skimage.segmentation import clear_border
from skimage.filters import roberts, sobel
from scipy import ndimage as ndi
from skimage import measure, morphology
from scipy.stats import kurtosis
import seaborn as sns
import scipy
from lightgbm import LGBMRegressor
#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 
#building models
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from tqdm.notebook import tqdm
from sklearn.ensemble import GradientBoostingRegressor
import copy
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pydicom.pixel_data_handlers.gdcm_handler as gdcm_handler 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import pydicom
import os
import torch
from torch.utils.data import DataLoader, Dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def seed_all(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and
    PyTorch, and switches cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    # ``random`` is imported by the notebook, so seed it alongside the others.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_all()

In [None]:

def calc_metric_loss(pred_fvc,sigma,true_fvc):
    """Return the per-sample loss ``sqrt(2)*delta/sigma + log(sqrt(2)*sigma)``.

    ``sigma`` is floored at 70 and the absolute error ``delta`` is capped at
    1000 before the formula is applied. Lower values are better.

    The inputs are not modified: the clipping is done on copies, so the
    caller's ``sigma`` array keeps its raw predictions.

    Args:
        pred_fvc: Array-like of predicted FVC values.
        sigma: Array-like of predicted confidence values, same shape as
            ``pred_fvc``.
        true_fvc: Array-like of true FVC values; reshaped to
            ``pred_fvc``'s shape.

    Returns:
        Array of per-sample losses with the shape of ``pred_fvc``.
    """
    pred_fvc = np.asarray(pred_fvc)
    true_fvc = np.reshape(true_fvc, pred_fvc.shape)
    # np.maximum / np.minimum build new arrays instead of writing into the inputs.
    sigma_clipped = np.maximum(sigma, 70)
    delta = np.minimum(np.abs(pred_fvc - true_fvc), 1000)
    return np.sqrt(2) * delta / sigma_clipped + np.log(np.sqrt(2) * sigma_clipped)

def calc_fvc_loss(pred_fvc,true_fvc):
    """Return the element-wise absolute error between predicted and true FVC.

    ``true_fvc`` is reshaped to the shape of ``pred_fvc`` before subtracting.
    """
    target = np.reshape(true_fvc, pred_fvc.shape)
    return np.abs(pred_fvc - target)

In [None]:
def plot_training_loss(train, val,title='loss'):
    """Plot train and validation curves and save the figure as 'training_loss'.

    The figure title is 'Model Training Loss' when ``title`` equals 'loss',
    and 'Model Metric Loss' for any other value.
    """
    heading = 'Model Training Loss' if title == 'loss' else 'Model Metric Loss'
    plt.figure()
    for curve, name in ((train, 'Train'), (val, 'Val')):
        plt.plot(curve, label=name)
    plt.title(heading)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('training_loss')

## Helper functions for preprocessing image data into 'histogram' features

In [None]:
def load_scan(path):
    """Read every DICOM file in ``path`` and return the slices sorted by z position.

    Slices are ordered by ``ImagePositionPatient[2]``. The slice thickness is
    estimated from the distance between the first two sorted slices (falling
    back to ``SliceLocation``); if that distance is zero, or only one slice
    exists, the thickness stored in the first slice is used. The chosen value
    is written to ``SliceThickness`` on every slice.

    Args:
        path: ``pathlib.Path`` of the directory holding one scan's files.

    Returns:
        List of pydicom datasets, sorted by z position.

    Raises:
        ValueError: If the directory contains no files.
    """
    # dcmread is the current name of the deprecated pydicom.read_file.
    slices = [pydicom.dcmread(path / s) for s in os.listdir(path)]
    if not slices:
        raise ValueError(f"No DICOM slices found in {path}")
    slices.sort(key=lambda x: float(x.ImagePositionPatient[2]))

    if len(slices) > 1:
        try:
            slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
        except (AttributeError, IndexError, TypeError):
            slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
    else:
        # A single slice has no neighbour to measure against.
        slice_thickness = 0

    if slice_thickness == 0:
        slice_thickness = slices[0].SliceThickness
    for s in slices:
        s.SliceThickness = slice_thickness

    return slices

def get_pixels_hu(slices):
    """Stack the slices' pixel arrays into an int16 volume with rescaling applied.

    Pixels equal to -2000 are set to 0, then each slice is multiplied by its
    ``RescaleSlope`` (when the slope is not 1) and shifted by its
    ``RescaleIntercept``.

    Args:
        slices: Sequence of objects exposing ``pixel_array``,
            ``RescaleSlope`` and ``RescaleIntercept``.

    Returns:
        ``np.int16`` array with one leading entry per slice.
    """
    volume = np.stack([np.array(s.pixel_array, dtype=np.int16) for s in slices])

    # Out-of-scan pixels are marked with -2000; reset them to 0.
    volume[volume == -2000] = 0

    # Each ``plane`` is a view into ``volume``, so the updates happen in place.
    for plane, header in zip(volume, slices):
        offset = header.RescaleIntercept
        gain = header.RescaleSlope

        if gain != 1:
            # The float product is cast back to int16 on assignment.
            plane[...] = gain * plane.astype(np.float64)

        plane += np.int16(offset)

    return np.array(volume, dtype=np.int16)

def resample(image, scan, new_spacing=(1, 1, 1)):
    """Resample a volume towards a target voxel spacing.

    The current spacing is read from the first slice as
    ``[SliceThickness, *PixelSpacing]``. The output shape must be integral,
    so the spacing actually achieved can differ slightly from the request
    and is returned with the image.

    Args:
        image: Array whose axes match ``[SliceThickness, *PixelSpacing]``.
        scan: Sequence of slices; only ``scan[0]`` is read.
        new_spacing: Requested spacing per axis (any sequence of numbers).

    Returns:
        Tuple ``(resampled_image, actual_spacing)``.
    """
    spacing = np.array([scan[0].SliceThickness] + list(scan[0].PixelSpacing), dtype=np.float32)
    target_spacing = np.asarray(new_spacing, dtype=np.float64)
    original_shape = np.array(image.shape)

    resize_factor = spacing / target_spacing
    # Round to a whole number of voxels, then derive the factor really used.
    new_shape = np.round(original_shape * resize_factor)
    real_resize_factor = new_shape / original_shape
    actual_spacing = spacing / real_resize_factor

    # scipy.ndimage.zoom is the public entry point; the
    # scipy.ndimage.interpolation namespace is deprecated.
    image = ndi.zoom(image, real_resize_factor, mode='nearest')
    return image, actual_spacing

def get_segmented_lungs(im, plot=False):
    """Segment the lungs in a 2D slice and zero everything outside the mask.

    ``im`` is modified in place and also returned. When ``plot`` is true,
    each of the eight intermediate steps is drawn on its own subplot.
    """
    show = plot == True
    if show:
        f, plots = plt.subplots(8, 1, figsize=(5, 40))

    def _draw(step, picture):
        # Render one intermediate result on its subplot (no-op unless plotting).
        if show:
            plots[step].axis('off')
            plots[step].imshow(picture, cmap=plt.cm.bone)

    # Step 1: convert into a binary image.
    binary = im < -200
    _draw(0, binary)

    # Step 2: remove the blobs connected to the border of the image.
    cleared = clear_border(binary)
    _draw(1, cleared)

    # Step 3: label the image.
    label_image = label(cleared)
    _draw(2, label_image)

    # Step 4: keep the labels with the 2 largest areas.
    areas = sorted(r.area for r in regionprops(label_image))
    if len(areas) > 2:
        second_largest = areas[-2]
        for region in regionprops(label_image):
            if region.area < second_largest:
                pixels = region.coords
                label_image[pixels[:, 0], pixels[:, 1]] = 0
    binary = label_image > 0
    _draw(3, binary)

    # Step 5: erosion with a disk of radius 2, to separate the lung nodules
    # attached to the blood vessels.
    binary = binary_erosion(binary, disk(2))
    _draw(4, binary)

    # Step 6: closure with a disk of radius 10, to keep nodules attached to
    # the lung wall.
    binary = binary_closing(binary, disk(10))
    _draw(5, binary)

    # Step 7: fill in the small holes inside the binary mask of lungs.
    edges = roberts(binary)
    binary = ndi.binary_fill_holes(edges)
    _draw(6, binary)

    # Step 8: superimpose the binary mask on the input image.
    outside_mask = binary == 0
    im[outside_mask] = 0
    _draw(7, im)

    return im

def get_kurtosis_stats(ids,ctscans_dir):
    """Compute histogram statistics of segmented lung voxels for each patient.

    For every id the scan under ``ctscans_dir / id`` is loaded, rescaled,
    resampled to a spacing of [2,2,2] and lung-segmented. Kurtosis, standard
    deviation, mean and median are taken over voxels below -200. A density
    plot and the middle slice are shown for each patient.

    Returns:
        Four lists ``(kurt, std, mean, median)`` with one entry per id.
    """
    kurt, std, mean, median = [], [], [], []
    for patient in ids:
        print(patient)
        scan = load_scan(ctscans_dir / patient)
        volume = get_pixels_hu(scan)
        volume, new_spacing = resample(volume, scan, new_spacing=[2,2,2])
        volume = np.asarray([get_segmented_lungs(plane) for plane in volume])

        # Flatten once and reuse the thresholded selection for every statistic.
        voxels = volume.ravel()
        lung_voxels = voxels[voxels < -200]
        kurt_i = kurtosis(lung_voxels)
        std_i = lung_voxels.std()
        mean_i = lung_voxels.mean()
        median_i = np.median(lung_voxels)

        print('Kurtosis: ', kurt_i)
        print('Standard Deviation: ', std_i)
        kurt.append(kurt_i)
        std.append(std_i)
        mean.append(mean_i)
        median.append(median_i)

        ax = sns.kdeplot(voxels[(voxels < 0) & (voxels > -1200)], bw=0.5)
        ax.set(xlabel='HU', ylabel='% voxels',title='Histogram of voxel characteristics')
        plt.show()
        plt.imshow(volume[round(volume.shape[0]/2),:,:])
        plt.show()
    return kurt,std,mean,median

## Helper functions for preprocessing data

In [None]:
def load_and_prepare_data(add_pixel_stats=True):
    """Load the competition CSVs and build the train and test frames.

    Train rows gain ``base_Weeks`` (each patient's first week), the FVC and
    Percent recorded at that week (``base_FVC``, ``base_Percent``) and
    ``Week_passed``. The test frame has one row per ``Patient_Week`` of the
    sample submission, with the same columns as train minus ``FVC`` and
    ``Percent``.

    Args:
        add_pixel_stats: When true, merge the precomputed train pixel
            statistics and compute the test ones from the CT scans.

    Returns:
        Tuple ``(train, test)`` of DataFrames.
    """
    train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
    test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

    # Train: attach each patient's baseline measurement to all of their rows.
    train['base_Weeks'] = train.groupby(['Patient'])['Weeks'].transform('min')
    baseline = (
        train[train.Weeks == train.base_Weeks]
        .rename(columns={'FVC': 'base_FVC','Percent': 'base_Percent'})
        .drop_duplicates(subset=['Patient', 'Weeks'], keep='first')
    )
    train = train.merge(baseline[['Patient','base_FVC','base_Percent']], on='Patient', how='left')
    train['Week_passed'] = train['Weeks'] - train['base_Weeks']

    # Test: the single row per patient is treated as the baseline measurement.
    test = test.rename(columns={'Weeks': 'base_Weeks', 'FVC': 'base_FVC','Percent': 'base_Percent'})
    submission = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")
    # Patient_Week has the form "<patient id>_<week>".
    submission['Patient'] = submission['Patient_Week'].apply(lambda x:x.split('_')[0])
    submission['Weeks'] = submission['Patient_Week'].apply(lambda x:x.split('_')[1]).astype(int)
    test = submission.drop(columns = ["FVC", "Confidence"]).merge(test, on = 'Patient')
    test['Week_passed'] = test['Weeks'] - test['base_Weeks']
    test = test[train.columns.drop(['FVC','Percent'])]

    if not add_pixel_stats:
        return train, test

    stat_columns = ['Patient','kurtosis','std','mean','median']
    pixel_stats = pd.read_csv('../input/osic-histogram-features/train_pixel_stats.csv')
    train = train.merge(pixel_stats[stat_columns], how='left', on='Patient')

    test_ids = test.Patient.unique()
    ct_scans_dir = Path('/kaggle/input/osic-pulmonary-fibrosis-progression') / 'test'
    # One row per test patient, in the same order as test_ids.
    pixel_stats_test = test.drop_duplicates(subset=['Patient']).copy()
    k,s,m,me = get_kurtosis_stats(test_ids, ct_scans_dir)
    for column, values in (('kurtosis', k), ('std', s), ('mean', m), ('median', me)):
        pixel_stats_test[column] = np.array(values)
    test = test.merge(pixel_stats_test[stat_columns], how='left', on='Patient')
    return train, test

def OH_encode(train,test):
    """One-hot encode 'Sex' and 'SmokingStatus' in both frames, in place.

    One 0/1 column is added per category value seen in ``train`` (named
    after the value) and the original column is dropped. Both frames are
    modified in place and returned.

    Approach from https://www.kaggle.com/ulrich07/osic-keras-starter-with-custom-metrics
    """
    frames = (train, test)
    for column in ('Sex', 'SmokingStatus'):
        for level in train[column].unique():
            for frame in frames:
                frame[level] = (frame[column] == level).astype(int)
        for frame in frames:
            frame.drop(column, axis=1, inplace=True)
    return train, test

def Scale(train):
    """Fit a RobustScaler on the feature columns of ``train`` and return it.

    Every column except 'Patient', 'FVC', 'Percent', 'Weeks' and 'base_Weeks'
    is scaled; the scaled values are also written back into ``train``.
    """
    from sklearn import preprocessing
    feature_columns = train.columns.difference(['Patient','FVC','Percent','Weeks','base_Weeks'])
    robust_scaler = preprocessing.RobustScaler()
    scaled_values = robust_scaler.fit_transform(train.loc[:, feature_columns])
    train.loc[:, feature_columns] = scaled_values
    return robust_scaler

## Pytorch Dataset class
Neatened up since some of my other notebooks!

In [None]:
class OSIC(Dataset):
    """Tabular dataset holding the rows of ``df`` for a set of patients.

    Args:
        patient_ids: Patients whose rows are kept.
        df: Source DataFrame with a 'Patient' column (and 'FVC' when
            ``train`` is true).
        scaler: Fitted object with a ``transform`` method, applied to every
            column except 'Patient', 'FVC', 'Percent', 'Weeks' and
            'base_Weeks'. ``None`` skips scaling.
        train: Whether the rows are labelled; when true ``self.fvc`` is set
            and items include an 'fvc' entry.
        impute_vals: Per-column fill values for missing features. When
            given they are used as-is, so validation/test rows can be filled
            with training statistics; when ``None`` the column means of this
            dataset are used.

    Attributes:
        data: Feature matrix (all columns except 'FVC', 'Patient', 'Percent').
        impute_vals: The fill values that were applied.
        patients: Patient id of each row.
    """

    def __init__(self,patient_ids,df,scaler=None,train=True,impute_vals=None):
        # Copy after filtering so later assignments never touch the caller's frame.
        self.df = df.loc[df.Patient.isin(patient_ids), :].copy()

        scaled_cols = self.df.columns.difference(['Patient','FVC','Percent','Weeks','base_Weeks'])
        if scaler is not None:
            self.df.loc[:, scaled_cols] = scaler.transform(self.df.loc[:, scaled_cols])

        self.data = self.df.loc[:, self.df.columns.difference(['FVC','Patient','Percent'])].values

        # Honour caller-supplied fill values regardless of ``train``.
        if impute_vals is None:
            impute_vals = np.nanmean(self.data, axis=0)
        self.impute_vals = impute_vals
        inds = np.where(np.isnan(self.data))
        self.data[inds] = np.take(self.impute_vals, inds[1])

        self.patients = self.df['Patient'].values
        self.train = train
        if self.train:
            self.fvc = self.df['FVC'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if self.train:
            return {'fvc': self.fvc[idx],
                    'data': self.data[idx]}
        return {'data': self.data[idx]}

## Function to perform bayesian hyperparameter search for light gbm

https://www.kaggle.com/somang1418/tuning-hyperparameters-under-10-minutes-lgbm

In [None]:
def bayes_parameter_opt_lgb(X, y, alpha=0.5,init_round=15, opt_round=25, n_folds=3, random_seed=6,n_estimators=10000, output_process=False):
    """Search LightGBM quantile-regression hyperparameters with Bayesian optimisation.

    Each candidate is scored by ``lgb.cv``; the optimiser maximises the
    negated best mean L1 error.

    Args:
        X: Feature matrix.
        y: Targets.
        alpha: Quantile passed to the 'quantile' objective.
        init_round: Number of random exploration steps.
        opt_round: Number of Bayesian optimisation steps.
        n_folds: Number of CV folds.
        random_seed: Seed passed to ``lgb.cv``.
        n_estimators: Unused; kept for interface compatibility.
        output_process: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(best_target, best_params)``. ``best_params`` holds the raw
        values proposed by the optimiser, so integer-valued parameters are
        still floats.
    """
    # free_raw_data=False keeps the raw data available across lgb.cv calls.
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)

    def lgb_eval(learning_rate,num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf,min_sum_hessian_in_leaf,subsample,alpha=alpha):
        params = {'objective':'quantile','alpha':alpha,'boosting_type': 'gbdt'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        # Use the sampled max_bin (previously max_depth was assigned here).
        params['max_bin'] = int(round(max_bin))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)

        # NOTE(review): candidates are scored with MAE even when alpha != 0.5;
        # confirm this is the intended tuning target for the outer quantiles.
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, verbose_eval =200,metrics='mae')
        return -min(cv_result['l1-mean'])

    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.01, 1.0),
                                            'num_leaves': (24, 80),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 30),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf':(0,100),
                                            'subsample': (0.01, 1.0)}, random_state=200,verbose=0)

    # init_points: random exploration steps, which diversify the search.
    # n_iter: Bayesian optimisation steps; more steps make a good maximum likelier.
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # max() keeps the first entry on ties, like the earlier idxmax lookup.
    best = max(lgbBO.res, key=lambda result: result['target'])
    return best['target'], best['params']

## 'Inner' Fold training loop
Performs bayesian hyperparameter optimization on the inner train and validation set

In [None]:
def train_model(ids,train,quantiles):
    """Tune and fit one LightGBM quantile model per requested quantile.

    The patient ids are shuffled and split 90/10 into inner train and
    validation sets. For every quantile the hyperparameters are tuned on the
    inner train set, and a model is then fitted with early stopping on the
    validation set.

    Args:
        ids: Patient ids available for training; not modified.
        train: Training DataFrame.
        quantiles: Iterable of quantile levels.

    Returns:
        Tuple ``(models, scaler, impute_vals)``: a dict mapping each quantile
        to its fitted booster, the scaler fitted on the inner train rows, and
        the inner train set's imputation values.
    """
    # permutation returns a shuffled copy, leaving the caller's array in order.
    ids = np.random.permutation(ids)
    train_ids,val_ids=np.split(ids, [int(round(0.9 * len(ids), 0))])

    scaler=Scale(train.loc[train.Patient.isin(train_ids),:])
    train_dataset = OSIC(train_ids,train,scaler=scaler)
    val_dataset = OSIC(val_ids,train,scaler=scaler,impute_vals=train_dataset.impute_vals)

    num_round = 15000
    lgb_quantile_alphas = {}
    for quantile_alpha in quantiles:
        # For quantile regression, set the objective to 'quantile' and pass
        # the quantile level as alpha.
        trn_data = lgb.Dataset(train_dataset.data, train_dataset.fvc)
        val_data = lgb.Dataset(val_dataset.data, val_dataset.fvc)
        _, opt_params = bayes_parameter_opt_lgb(train_dataset.data, train_dataset.fvc, init_round=5, opt_round=10, n_folds=3, random_seed=6,n_estimators=10000,alpha=quantile_alpha)
        # The optimiser proposes floats; these parameters are rounded to integers.
        for int_param in ("num_leaves", 'max_depth', 'min_data_in_leaf', 'max_bin'):
            opt_params[int_param] = int(round(opt_params[int_param]))
        opt_params['objective']='quantile'
        opt_params['alpha']=quantile_alpha
        clf=lgb.train(opt_params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 250)
        lgb_quantile_alphas[quantile_alpha] = clf

    return lgb_quantile_alphas, scaler, train_dataset.impute_vals

## Model validation function
Takes the model optimised in the inner loop and applies it to 'out-of-fold' data

In [None]:
def val_model(lgb_quantile_alphas,val_dataset,quantiles):
    """Score the quantile models on a labelled dataset.

    The model for ``quantiles[1]`` gives the FVC prediction; the confidence
    is the prediction for ``quantiles[2]`` minus the one for ``quantiles[0]``.

    Returns:
        Tuple ``(mean absolute FVC error, mean metric loss)``.
    """
    features = val_dataset.data
    targets = val_dataset.fvc
    fvc_pred = lgb_quantile_alphas[quantiles[1]].predict(features)
    upper = lgb_quantile_alphas[quantiles[2]].predict(features)
    lower = lgb_quantile_alphas[quantiles[0]].predict(features)
    sigma_pred = upper - lower
    fvc_loss = calc_fvc_loss(fvc_pred, targets).mean()
    metric_loss = calc_metric_loss(fvc_pred, sigma_pred, targets).mean()
    return fvc_loss, metric_loss

## Nested CV
Performs the inner-outer loop k times. We can then average the results of each outer fold to get a generalization estimate

In [None]:
def nested_CV(train,test,quantiles,k_folds=5):
    """Run the outer cross-validation loop over patients.

    For each outer fold the models are tuned and fitted by ``train_model`` on
    the remaining patients and then scored on the held-out patients.

    Returns:
        Tuple ``(models, scalers, all_impute_vals, test_losses,
        test_metrics, fold_ids)`` with one entry per fold; ``fold_ids``
        holds each fold's held-out index array into the shuffled ids.
    """
    ids = train.Patient.unique()
    np.random.shuffle(ids)
    splitter = KFold(n_splits=k_folds)

    models, scalers, all_impute_vals = [], [], []
    test_losses, test_metrics, fold_ids = [], [], []

    for fold, (train_index, test_index) in enumerate(splitter.split(ids), start=1):
        print("Training fold ",fold)
        # Inner loop: tune and fit on this fold's training patients.
        model, scaler, impute_vals = train_model(ids[train_index], train, quantiles)
        # Outer loop: score on the held-out patients.
        held_out = OSIC(ids[test_index], train, scaler=scaler)
        loss, metric = val_model(model, held_out, quantiles)
        print("Validation fvc loss: ",loss)
        print("Validation metric: ",metric)

        models.append(model)
        scalers.append(scaler)
        all_impute_vals.append(impute_vals)
        test_losses.append(loss)
        test_metrics.append(metric)
        fold_ids.append(test_index)
    return models,scalers,all_impute_vals, test_losses, test_metrics, fold_ids

## Run the nested CV to obtain generalization estimates

In [None]:
# add_pixel_stats=False is faster because the test CT scans do not need to be
# preprocessed. The processed train data is available in the attached dataset.
train,test=load_and_prepare_data(add_pixel_stats=False)
# Replace 'Sex' and 'SmokingStatus' with one 0/1 column per category.
train,test=OH_encode(train,test)

In [None]:
# Nested CV using the 0.2 / 0.5 / 0.8 quantiles (lower, median, upper).
quantiles=(0.2,0.5,0.8)
all_models,all_scalers,all_impute_vals, test_losses, test_metrics,kf_splits=nested_CV(train,test,quantiles,k_folds=5)
# Mean and spread of the outer-fold scores.
print("expected generalization metric: ", np.array(test_metrics).mean(), " std: ", np.array(test_metrics).std())
print("expected generalization loss: ", np.array(test_losses).mean(), " std: ", np.array(test_losses).std())

In [None]:
# Same experiment with the 0.1 / 0.5 / 0.9 quantiles. This overwrites
# `quantiles` and the results of the previous run.
quantiles=(0.1,0.5,0.9)
all_models,all_scalers,all_impute_vals, test_losses, test_metrics,kf_splits=nested_CV(train,test,quantiles,k_folds=5)
# Mean and spread of the outer-fold scores.
print("expected generalization metric: ", np.array(test_metrics).mean(), " std: ", np.array(test_metrics).std())
print("expected generalization loss: ", np.array(test_losses).mean(), " std: ", np.array(test_losses).std())

# Test Data

Re-run the modelling procedure for the whole train data and make predictions on the test data

In [None]:
# Sample submission; used later as the template for the rows to submit.
submission=pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

In [None]:
# Re-run the training process on all of the training patients, using the
# `quantiles` left by the most recently executed cell.
all_train_ids=train.Patient.unique()
lgb_quantile_alphas, scaler, impute_vals=train_model(all_train_ids,train,quantiles)

# Build the unlabelled test dataset with the training scaler and fill values.
test_ids=test.Patient.unique()
test_dataset = OSIC(test_ids,test,scaler=scaler,impute_vals=impute_vals,train=False) 

# FVC comes from the middle quantile; confidence is upper minus lower.
fvc_pred = lgb_quantile_alphas[quantiles[1]].predict(test_dataset.data)
sigma_pred = lgb_quantile_alphas[quantiles[2]].predict(test_dataset.data)-lgb_quantile_alphas[quantiles[0]].predict(test_dataset.data)

test['FVC']=fvc_pred
test['Confidence']=sigma_pred

In [None]:
# Rebuild the "<patient>_<week>" key used by the submission file.
test['Patient_Week']=test["Patient"] + '_' + test['Weeks'].apply(str)

In [None]:
# Attach the predictions to the submission rows by Patient_Week.
submission=submission[['Patient_Week']].merge(test[['Patient_Week','FVC','Confidence']],on='Patient_Week')

In [None]:
# Write the submission file, with floats formatted to one decimal place.
submission.to_csv('submission.csv', index=False, float_format='%.1f')

# Post-Match Analysis

## Test Predictions

In [None]:
# Predicted FVC against predicted confidence for the test rows.
plt.scatter(submission['FVC'],submission['Confidence'])
plt.title('Test')
plt.xlabel('FVC')
plt.ylabel('Confidence')

## Train

In [None]:
# Dataset over all training patients, built with the final scaler.
train_dataset = OSIC(all_train_ids,train,scaler=scaler,impute_vals=impute_vals,train=True) 


# In-sample predictions: FVC from the middle quantile, confidence as upper minus lower.
fvc_pred_train = lgb_quantile_alphas[quantiles[1]].predict(train_dataset.data)
sigma_pred_train = lgb_quantile_alphas[quantiles[2]].predict(train_dataset.data)-lgb_quantile_alphas[quantiles[0]].predict(train_dataset.data)

# Mean metric loss on the training data.
print('train metric', calc_metric_loss(fvc_pred_train,sigma_pred_train,train_dataset.fvc).mean())

# Predicted FVC against predicted confidence for the training rows.
plt.scatter(fvc_pred_train,sigma_pred_train)
plt.title('Train')
plt.xlabel('FVC')
plt.ylabel('Confidence')


In [None]:
# True FVC (x axis) against predicted FVC (y axis) on the training rows.
plt.scatter(train_dataset.fvc,fvc_pred_train)
plt.title('Train: predicted FVC vs true FVC')
plt.xlabel('True FVC')
plt.ylabel('Predicted FVC')

## All

In [None]:
# Overlaid histograms of the FVC predictions for test and train.
plt.hist(submission['FVC'], alpha=0.5,label='test')
plt.hist(fvc_pred_train, alpha=0.5,label='train')
plt.legend()
plt.title('Histogram of FVC predictions')

In [None]:
# Overlaid histograms of the confidence predictions for test and train.
plt.hist(submission['Confidence'], alpha=0.5,label='test')
plt.hist(sigma_pred_train, alpha=0.5,label='train')
plt.legend()
plt.title('Histogram of Confidence predictions')