In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
sns.set(font_scale=1.4)

import matplotlib.pyplot as plt
import os
import sys
import gc
import random
import warnings
warnings.filterwarnings("ignore")

#def import transformations

from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures

from scipy.stats import normaltest as nt

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import log_loss
from sklearn.mixture import GaussianMixture

from scipy.stats import ks_2samp
from scipy.stats import skew

from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K

from tensorflow.keras import Model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, Callback, LearningRateScheduler, History

import tensorflow_addons as tfa

def root_mean_squared_error(y_true, y_pred):
    """Keras-backend RMSE: sqrt of the mean squared difference over all elements."""
    squared_residuals = K.square(y_pred - y_true)
    mean_squared_residual = K.mean(squared_residuals)
    return K.sqrt(mean_squared_residual)

In [None]:
def add_pca(train_df, test_df, cols, n_comp=20, fit_test = True, prefix='pca_', fit_test_first=False):
    """Append PCA component columns to both frames (modified in place and returned).

    Parameters
    ----------
    train_df, test_df : DataFrames that both contain ``cols``.
    cols : list of column names fed to the PCA.
    n_comp : number of principal components to keep.
    fit_test : if True the PCA is fitted using test rows as well (see
        ``fit_test_first``); if False it is fitted on the train rows only and
        then applied to both frames.
    prefix : column name prefix. Column names are ``prefix + '_pca_' + i``, so
        the default prefix yields names such as ``'pca__pca_0'``.
    fit_test_first : only used when ``fit_test`` is True. If True the PCA is
        fitted on the test rows alone and then applied to train+test; otherwise
        it is fitted on the concatenation of train and test.

    Returns
    -------
    (train_df, test_df, pca_titles) where ``pca_titles`` lists the new columns.
    """
    pca = PCA(n_components=n_comp, random_state=42)
    pca_titles = [prefix+'_pca_'+str(x) for x in range(n_comp)]

    # Work on NaN-free copies of the PCA inputs; each frame is imputed with its
    # own column means so the caller's feature columns are left untouched.
    temp_train = train_df[cols].copy()
    temp_test = test_df[cols].copy()
    temp_train = temp_train.fillna(temp_train.mean())
    temp_test = temp_test.fillna(temp_test.mean())

    n_train = len(train_df)

    if fit_test:
        pca_data = pd.concat([temp_train, temp_test], axis=0)

        if fit_test_first:
            # fit on the test rows only, then project every row
            pca.fit(pca_data[n_train:])
            transformed = pca.transform(pca_data)
        else:
            transformed = pca.fit_transform(pca_data)

        train_part = transformed[:n_train]
        test_part = transformed[n_train:]
    else:
        # Previously this path fitted nothing and left all-zero columns behind;
        # fit on train only and project both frames instead.
        train_part = pca.fit_transform(temp_train)
        test_part = pca.transform(temp_test)

    for i, title in enumerate(pca_titles):
        train_df[title] = train_part[:, i]
        test_df[title] = test_part[:, i]

    return train_df, test_df, pca_titles




def split_distributions(train_df, test_df, cols, target, n_comp=20, prefix='gm', add_labels=False):
    """Cluster one column with a Gaussian mixture and add the result as features.

    The mixture is fitted on the train and test values of ``cols`` stacked
    together. Both frames are modified in place and returned.

    Parameters
    ----------
    train_df, test_df : DataFrames containing ``cols`` (train also ``target``).
    cols : name of a single column (it is wrapped in a list internally).
    target : target column name, used only when ``add_labels`` is False.
    n_comp : number of mixture components.
    prefix : prefix for the generated column names.
    add_labels : if True, add one 0/1 indicator column per component named
        ``prefix + str(i)``. If False, add ``prefix + '_label'`` (component id)
        and ``'menc_' + prefix + '_label'`` (train-set mean of ``target`` per
        component).

    Returns
    -------
    (train_df, test_df, gm_titles) where ``gm_titles`` lists the new columns.
    """
    gm = GaussianMixture(n_components=n_comp,covariance_type='full', tol=0.001, reg_covar=1e-06, max_iter=100, n_init=1,
                     init_params='kmeans', weights_init=None, means_init=None, precisions_init=None,
                     random_state=42, warm_start=False, verbose=0, verbose_interval=10)

    n_train = len(train_df)

    gm_data = pd.concat([train_df[[cols]], test_df[[cols]]], axis=0).reset_index(drop=True)

    gm.fit(gm_data)

    # GaussianMixture exposes no `labels_` attribute, so the hard assignments
    # always come from predict().
    labels = np.asarray(gm.predict(gm_data))

    if add_labels:
        gm_titles = [prefix+str(x) for x in range(n_comp)]

        # Explicit one-hot against every component id: a component that
        # receives no rows still gets its (all-zero) column, keeping the width
        # equal to n_comp.
        onehot = (labels[:, None] == np.arange(n_comp)[None, :]).astype(int)

        for i, title in enumerate(gm_titles):
            train_df[title] = onehot[:n_train, i]
            test_df[title] = onehot[n_train:, i]

    else:
        label_col = prefix+'_label'
        menc_col = 'menc_'+prefix+'_label'

        train_df[label_col] = labels[:n_train]
        test_df[label_col] = labels[n_train:]

        # mean-encode the component id using train targets only
        means = train_df.groupby([label_col])[target].mean()

        train_df[menc_col] = train_df[label_col].map(means)
        test_df[menc_col] = test_df[label_col].map(means)

        gm_titles = [label_col, menc_col]

    return train_df, test_df, gm_titles


def assign_fold(df, label_column,fold_column, NFOLDS=5):
    """Write a stratified fold id (0..NFOLDS-1) into ``df[fold_column]``.

    Parameters
    ----------
    df : DataFrame to annotate (modified in place and returned).
    label_column : column whose values are used for stratification.
    fold_column : name of the column that receives the fold id.
    NFOLDS : number of folds.

    Returns
    -------
    The same DataFrame with ``fold_column`` set.
    """
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=NFOLDS)

    # split() yields positional indices, so collect the fold ids in a
    # positional array rather than going through label-based .loc; this also
    # honours `fold_column` instead of a hard-coded 'fold' column.
    fold_ids = np.zeros(len(df), dtype=int)

    for fold_number, (_, val_idx) in enumerate(skf.split(df, df[label_column])):
        fold_ids[val_idx] = fold_number

    df[fold_column] = fold_ids

    return df

def st_scale(train_df, test_df, cols):
    """Standardise ``cols`` in both frames using statistics fitted on train only.

    Both frames are modified in place and returned as ``(train_df, test_df)``.
    """
    scaler = StandardScaler().fit(train_df[cols])

    for frame in (train_df, test_df):
        frame[cols] = scaler.transform(frame[cols])

    return train_df, test_df

def seed_everything(seed=1234):
    """Seed Python's `random`, NumPy and TensorFlow, and export PYTHONHASHSEED."""
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

# Import Train and Test

In [None]:
# Competition input directory; the three CSVs below are read from it.
PATH = '/kaggle/input/tabular-playground-series-jan-2021/'
train, test, submission = (
    pd.read_csv(PATH+file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Name of the target column, and every train column whose name contains 'cont'.
TARGET='target'
FT_COLS = [column for column in train.columns if 'cont' in column]

print(train.shape)
train.head(10)

In [None]:
# Summary statistics for every feature plus the target, one row per column.
print('describe Train data')
train[[*FT_COLS, TARGET]].describe().transpose()

In [None]:
# Target histogram/KDE drawn on an explicitly created axes.
fig, axes = plt.subplots(figsize=(16, 5))
print('We can see there are some outliers in the target at low values')
sns.distplot(train[TARGET], color='Red', ax=axes)
axes.set_title('Target Distribution')

We can see that train and test data follow very similar distributions

In [None]:
print('plot Feature Distributions, KSTest values (Train vs Test)')

# Grid layout: three panels per row, with one spare row so every feature fits.
nc = 3
nr = len(FT_COLS)//nc + 1

fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(20, 5*nr))

for count, ft in enumerate(FT_COLS):
    panel = axes[divmod(count, nc)]

    # two-sample KS statistic between the train and test values of this feature
    ks_score = ks_2samp(train[ft], test[ft])[0]

    for frame, colour in ((train, 'Green'), (test, 'Red')):
        sns.kdeplot(frame[ft], ax=panel, color=colour)

    panel.legend(['Train', 'Test'], facecolor='White')
    panel.set_title(ft +' ks stat :' +str(np.round(ks_score, 3)))

plt.tight_layout()

There are some correlations between some of the features

In [None]:
# Pairwise correlations of the features and the target, shown as an annotated heatmap.
corrs = train[[*FT_COLS, TARGET]].corr()
sns.set(font_scale=1.2)
fig, axes = plt.subplots(figsize=(12, 12))
sns.heatmap(corrs, vmin=-1, vmax=1, cmap='seismic_r', annot=True, fmt='0.1f')
axes.set_title('Correlation Matrix')

PCA does not really help us to see any obvious separation

In [None]:
# Two-component PCA of the features (fitted on train+test), plotted against the target.
PCA_COMP=2
train, test, pca_titles = add_pca(train, test, FT_COLS, n_comp=PCA_COMP, fit_test=True, prefix='f')

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 7))
scatter_kw = dict(s=1, alpha=0.8)

# left panel: the two components against each other, coloured by target
axes[0].scatter(x=train[pca_titles[0]], y=train[pca_titles[1]], c=train[TARGET],
                cmap='seismic', vmax=9, vmin=6, **scatter_kw)
axes[0].set_title('PCA1 vs PCA2 (color=Target)')

# remaining panels: each component against the target
for panel, component, panel_title in zip(axes[1:], pca_titles[:2], ('PCA1 vs Target', 'PCA2 vs Target')):
    panel.scatter(x=train[component], y=train[TARGET], color='Green', **scatter_kw)
    panel.set_title(panel_title)

plt.tight_layout()

# Adding Features

It seemed like breaking out existing features into sub-distributions within additional feature columns improved the NN outcome

I added some further PCA, then split out the feature columns using SKLearn Gaussian Mixture.

The large number of additional features feels clumsy, but the outcome of testing seemed better with more features

In [None]:
print('Adding some further PCA')

# Six more PCA columns (names start with 'extra_pca_'), fitted on train+test.
PCA_COMP=6
train, test, pca_titles2 = add_pca(train, test, FT_COLS, n_comp=PCA_COMP, fit_test=True, prefix='extra')

# 2 x 3 grid: one panel per new component
nc = 3
nr = 2

fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(20, 5*nr))

for count, ft in enumerate(pca_titles2):
    panel = axes[divmod(count, nc)]
    ks_score = ks_2samp(train[ft], test[ft])[0]

    for frame, colour in ((train, 'Green'), (test, 'Red')):
        sns.kdeplot(frame[ft], ax=panel, color=colour)

    panel.legend(['Train', 'Test'], facecolor='White')
    panel.set_title(ft +' ks stat :' +str(np.round(ks_score, 3)))

plt.tight_layout()

Below are the original number of gaussian mixtures I aimed to split the main features into, based on looking at the distributions (see earlier graphs).

In practice I found that multiplying these counts by a much larger factor than I had anticipated led to improved results.

In [None]:
# Initial number of Gaussian-mixture components per original feature, chosen by
# eye from the feature distribution plots earlier in the notebook.
original_feature_dict = dict(
    cont1=5, cont2=10, cont3=8, cont4=8, cont5=5, cont6=6, cont7=4,
    cont8=3, cont9=7, cont10=3, cont11=4, cont12=2, cont13=3, cont14=5,
)

# Component counts for the extra PCA columns (these are NOT multiplied below).
pca_dict = dict(
    extra_pca_0=4, extra_pca_1=1, extra_pca_2=1,
    extra_pca_3=2, extra_pca_4=1, extra_pca_5=1,
)

# The hand-picked counts turned out too low; a global multiplier was the
# simplest thing to tune while testing.
MULTIPLIER = 3.0

original_feature_dict = dict(
    (feature_name, int(n_components*MULTIPLIER))
    for feature_name, n_components in original_feature_dict.items()
)
ft_dict = dict(original_feature_dict, **pca_dict)
ft_dict

Gaussian Mixture splits each original feature into a number of sub distributions

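Each sub-distribution gets 2 columns: a 0/1 label marking which rows belong to it, and a value column holding the original feature re-centred on the sub-distribution's train median and divided by its train standard deviation (NaN for rows outside the sub-distribution).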

In [None]:
mixture_title_cols = []
mixture_value_cols = []

# Pass 1: one 0/1 membership column per mixture component, for every original
# feature and every extra PCA column.
for f in FT_COLS+pca_titles2:
    train, test, titles = split_distributions(train, test, f, TARGET, n_comp=ft_dict[f],
                                              prefix=f+'_dim', add_labels=True)
    mixture_title_cols.extend(titles)

# Pass 2: one value column per component, holding the feature standardised with
# that component's train median/std for member rows and NaN everywhere else.
for f in FT_COLS+pca_titles2:

    for d in range(ft_dict[f]):
        label_col = f+'_dim'+str(d)
        value_col = label_col+'_dist'

        in_component = train[label_col]==1
        member_values = train.loc[in_component, f]
        t_median = member_values.median()
        t_std = member_values.std()

        train[value_col] = np.where(in_component, (train[f] - t_median)/t_std, np.nan)
        test[value_col] = np.where(test[label_col]==1, (test[f] - t_median)/t_std, np.nan)

        mixture_value_cols.append(value_col)

We can look at the 'normality' of the new distributions with SciPy's normality test (`scipy.stats.normaltest`).

In [None]:
# normaltest statistic of the non-NaN train values of every mixture value
# column, sorted ascending.
nt_data = pd.Series(
    {nd: nt(train[nd].dropna().to_numpy())[0] for nd in mixture_value_cols},
    index=mixture_value_cols,
    dtype=float,
).sort_values()

# Plot the column with the lowest statistic (green) and the three with the
# highest statistics (red).
fig, axes = plt.subplots(figsize=(10, 5))
plotted_cols = [nt_data.index[0], nt_data.index[-1], nt_data.index[-2], nt_data.index[-3]]
for column, colour in zip(plotted_cols, ('Green', 'Red', 'Red', 'Red')):
    sns.kdeplot(train[column], color=colour)
axes.legend(plotted_cols, facecolor='white')
axes.set_xlabel('Value')
axes.set_xlabel('Value')

In [None]:
# Sizes of the three feature groups that are fed to the network.
n_original, n_labels, n_values = len(FT_COLS), len(mixture_title_cols), len(mixture_value_cols)
print('Total Original Features', n_original)
print('Total Submixtures Labels', n_labels)
print('Total Submixtures Values', n_values)
print('Total Feature Columns', n_original+n_labels+n_values)

# Examine split of one feature

In [None]:
# Feature whose sub-distribution value columns are inspected below.
view_feature = 'cont8'

subdists = [x for x in train.columns if view_feature+'_dim' in x and 'dist' in x]

# one extra panel (top-left) is reserved for the original feature
nc=3
nr=len(subdists)//nc+1

fig,axes=plt.subplots(nrows=nr, ncols=nc, figsize=(20,5*nr), sharex=False)

sns.kdeplot(train[view_feature],ax=axes[0,0],color='Blue')
axes[0,0].set_title('Original Feature - Train')

for count1,ft in enumerate(subdists):
    count=count1+1
    panel = axes[count//nc, count%nc]

    # The value columns are NaN for every row outside the sub-distribution
    # (they are only filled in a later cell), so compare the member rows only;
    # NaNs would otherwise make the KS statistic NaN or meaningless.
    train_members = train[ft].dropna()
    test_members = test[ft].dropna()

    if len(train_members) > 0 and len(test_members) > 0:
        ks_score = ks_2samp(train_members, test_members)[0]
    else:
        # ks_2samp cannot be computed on an empty sample
        ks_score = np.nan

    sns.kdeplot(train_members,ax=panel,color='Green')
    sns.kdeplot(test_members,ax=panel,color='Red')

    panel.legend(['Train', 'Test'], facecolor='White')
    panel.set_title(ft +' ks stat :' +str(np.round(ks_score, 3)))
    panel.set_xlabel(None)
    panel.set_ylabel(None)

plt.tight_layout()

# Tidying of Features & Target

In [None]:
# Rows outside a sub-distribution carry NaN in its value column; replace those
# with a constant in both frames.
NAN_VALUE = 0.0

for frame in (train, test):
    frame[mixture_value_cols] = frame[mixture_value_cols].fillna(value=NAN_VALUE)

I am not sure if all the steps here are the best possible approach. I believe they improved my CV but did not have time to test more extensively.

In [None]:
# Scale the original feature columns.

# If True, standardise the original features (statistics from train) and clip.
SCALE = True

if SCALE:
    train, test = st_scale(train, test, FT_COLS)

    # clip to -2 to +2
    for f in FT_COLS:
        train[f] = np.clip(train[f], -2, 2)
        test[f] = np.clip(test[f], -2, 2)


# If True, divide every mixture value column by its largest absolute train
# value, so train lies in [-1, 1] (test uses the same divisor and may exceed it).
SCALE_DISTS = True

if SCALE_DISTS:
    for d in mixture_value_cols:
        TEMP_MAX = np.abs(train[d]).max()

        # A component with no usable train spread ends up as an all-zero column
        # after NaN filling; dividing by 0 would turn it into NaN and poison the
        # network inputs, so such columns are left unscaled.
        if not np.isfinite(TEMP_MAX) or TEMP_MAX == 0:
            continue

        train[d] = train[d] / TEMP_MAX
        test[d] = test[d] / TEMP_MAX

In [None]:
# Flag rows whose target is below 4 as outliers; the flag is later used to drop
# them from the training folds. Nothing so far suggests the outliers help training.
train['outlier_filter'] = (train[TARGET] < 4).to_numpy()
print('# outliers', train['outlier_filter'].sum())

In [None]:
# Centre the target around zero - this seemed to help the network converge faster.
#
# The uncentred target is kept in a separate column so that re-running this
# cell does not shift the target a second time or overwrite TARGET_MEAN with a
# value near zero (TARGET_MEAN is added back to the predictions at the end).
RAW_TARGET = TARGET + '_raw'
if RAW_TARGET not in train.columns:
    train[RAW_TARGET] = train[TARGET]

TARGET_MEAN = train[RAW_TARGET].mean()

train[TARGET] = train[RAW_TARGET] - TARGET_MEAN
sns.distplot(train[TARGET])

# Apply Folds to Data

Folds just selected based on distribution of target.

It seems like the original feature values are pretty close between the selected folds.

I did try Multilabel Strat with the new sub-mixture labels, but this did not seem to immediately improve results and had little time to explore further.

In [None]:
print('Distribution of Original Features by Fold (for CV)')

# Bin the target into quantile groups and stratify the folds on the bin id.
GROUP_LABEL = 'target_group'
NQUANTS=10000
train[GROUP_LABEL] = pd.qcut(train[TARGET], NQUANTS, labels=False)

NFOLDS=10
FOLD_COL='fold'

train = assign_fold(train, GROUP_LABEL, FOLD_COL, NFOLDS=NFOLDS)

# One panel per feature plus one for the target; one KDE curve per fold.
nc = 3
nr = len(FT_COLS)//nc + 1

fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(20, 5*nr))
fold_names = ['fold_'+str(qbin) for qbin in range(NFOLDS)]

for count, ft in enumerate(FT_COLS+[TARGET]):
    panel = axes[divmod(count, nc)]

    for qbin in range(NFOLDS):
        sns.kdeplot(train.loc[train[FOLD_COL]==qbin, ft], ax=panel)

    panel.legend(fold_names, facecolor='White', fontsize=10)
    panel.set_title(ft)
plt.tight_layout()

# Cross Validation Setup

In [None]:
# Baseline: RMSE obtained by predicting the target median for every row.
print('CV Benchmark - Random Guess of Median')
median_guess = np.full(train[TARGET].shape, train[TARGET].median())
cv_bm = np.sqrt(mse(train[TARGET], median_guess))
cv_bm

In [None]:
def run_training(model, train_df, test_df,sample_submission, fold_col, #input data and folds
             orig_features, mixture_val_cols,mixture_label_cols, target_col, #
             benchmark,
            outlier_col=None, nn=False, epochs=10,batch_size=32, verbose=False,
            dense=70, dout=0.15, dense_reg = 0.000001,act='elu'):
    """Cross-validated training of the Keras model over several random seeds.

    For every seed and every fold a fresh model is built with ``keras_model``,
    trained on the other folds and evaluated on the held-out fold. Weights are
    saved at the end of each of the last few epochs (at most 5); validation and
    test predictions are the average over those saved weight sets.

    Parameters
    ----------
    model : unused; a new model is built for every fold. Kept so existing
        callers do not break.
    train_df, test_df : feature frames. ``train_df`` must contain ``fold_col``
        and ``target_col``.
    sample_submission : only its length is used, to size the test predictions.
    fold_col : column holding the fold id of every train row.
    orig_features, mixture_val_cols, mixture_label_cols : column lists for the
        three model inputs, in that order.
    target_col : target column name.
    benchmark : unused.
    outlier_col : optional boolean column; rows where it is True are removed
        from the training part of every fold (validation rows are kept).
    nn : unused.
    epochs, batch_size, verbose : forwarded to ``model.fit``.
    dense, dout, dense_reg, act : forwarded to ``keras_model``.

    Returns
    -------
    (oof, test_predictions, fold_errors): out-of-fold predictions averaged over
    the seeds, test predictions averaged over folds, seeds and saved epochs,
    and the per-fold RMSE of the out-of-fold predictions.

    Raises
    ------
    ValueError if ``epochs`` is smaller than 1.

    Side effects: writes ``model_fold_<fold>_epoch_<epoch>.h5`` and
    ``model_final_<fold>.h5`` to the working directory and draws the loss
    curves on a new matplotlib figure.
    """
    if epochs < 1:
        raise ValueError('epochs must be at least 1')

    FOLD_VALUES = sorted(train_df[fold_col].unique())

    #set up templates to fill up with predictions
    oof = np.zeros((len(train_df),))
    test_predictions = np.zeros((len(sample_submission),))

    #set up axes to plot training
    fig,axes=plt.subplots(figsize=(15,6))
    axes.set_ylim(0.69,0.75)

    #random seeds to run - selected at random
    RANDOM_SEEDS = [0,42,100,1000]

    #epochs whose weights are kept and averaged for prediction; capped at the
    #number of epochs so that short runs still find their weight files
    back_count = min(5, epochs)
    epoch_list = list(range(epochs-back_count, epochs))

    #the test inputs do not change between folds, so extract them once
    test_inputs = [test_df[orig_features].values,
                   test_df[mixture_val_cols].values,
                   test_df[mixture_label_cols].values]

    class CustomModelCheckpoint(tf.keras.callbacks.Callback):
        """Saves the model weights at the end of every epoch in ``epoch_list``."""

        def __init__(self, fold):
            super().__init__()
            self.fold = fold

        def on_epoch_end(self, epoch, logs=None):
            if epoch in epoch_list:
                self.model.save_weights('model_fold_'+str(self.fold)+'_epoch_'+str(epoch)+'.h5', overwrite=True)

    for rs in RANDOM_SEEDS:
        seed_everything(seed=rs)
        for fold in FOLD_VALUES:

            print(' ---  ')
            print('running random seed', rs, 'fold', fold)
            if outlier_col:
                trn_idx = (train_df[fold_col]!=fold) & (~train_df[outlier_col]) #filter outlying target rows
            else:
                trn_idx = train_df[fold_col]!=fold

            val_idx = train_df[fold_col]==fold

            #original features, new features, feature labels (yn), target
            train_inputs = [train_df.loc[trn_idx,orig_features].values,
                            train_df.loc[trn_idx,mixture_val_cols].values,
                            train_df.loc[trn_idx,mixture_label_cols].values]
            y_train = train_df.loc[trn_idx, target_col].values

            #same for validation data
            val_inputs = [train_df.loc[val_idx,orig_features].values,
                          train_df.loc[val_idx,mixture_val_cols].values,
                          train_df.loc[val_idx,mixture_label_cols].values]
            y_val = train_df.loc[val_idx, target_col].values

            #drop the previous fold's Keras state BEFORE building the new model
            K.clear_session()

            model = keras_model(orig_features, mixture_val_cols,mixture_label_cols,dout=dout,
               dense=dense,act=act, dense_reg = dense_reg, descend_fraction = 0.9)

            cbk = CustomModelCheckpoint(fold)

            #fit keras model
            history = model.fit(train_inputs, y_train, epochs=epochs, batch_size=batch_size,shuffle=True,
                         validation_data=(val_inputs, y_val), verbose=verbose,
                                   callbacks=[cbk])

            #print outcomes
            print('Fold Last Epoch Train Error', history.history['root_mean_squared_error'][-1])
            print('Fold Last Epoch Valid Error', history.history['val_root_mean_squared_error'][-1])

            #save final weights
            model.save_weights('model_final_'+str(fold)+'.h5')

            #plot history
            sns.lineplot(x=range(epochs), y=history.history['loss'], color='Blue')
            sns.lineplot(x=range(epochs), y=history.history['val_loss'], color='Red')

            #predict validation and test data with the weights of each saved
            #epoch; the trained model is reused and only its weights swapped
            val_preds = np.zeros((len(y_val),))
            for e in epoch_list:
                model.load_weights('model_fold_'+str(fold)+'_epoch_'+str(e)+'.h5')

                val_preds += model.predict(val_inputs).mean(axis=1)
                test_predictions += model.predict(test_inputs).mean(axis=1)

            #average validation preds for fold error
            val_preds = val_preds / len(epoch_list)
            oof[val_idx]+=val_preds
            val_error = np.sqrt(mse(y_val, val_preds))
            print('Fold multi epoch weight prediction error', val_error)


    #divide oof (validation) by number of random seeds
    oof = oof / len(RANDOM_SEEDS)

    #total validation error (root of the MSE)
    total_val_error = np.sqrt(mse(train_df[target_col], oof))
    print('final OOF RMSE', total_val_error)

    #list of fold errors
    fold_errors = []
    for fold in train_df[fold_col].unique():
        val_idx = train_df[fold_col]==fold
        fold_errors+=[np.sqrt(mse(train_df.loc[val_idx, target_col].values, oof[val_idx]))]

    #divide test predictions by the number of folds actually trained, the
    #number of seeds and the number of epoch weight sets used
    test_predictions = test_predictions/(len(FOLD_VALUES) * len(RANDOM_SEEDS) * len(epoch_list))

    return oof, test_predictions, fold_errors

# Keras Model

In [None]:
def keras_model(ft_orig, mixture_values, mixture_labels, n_layer=3,bnorm=True,dout=0.2,
               dense=20,act='elu', dense_reg = 0.000001, descend_fraction = 0.9):
    """Build and compile the three-input dense network.

    Parameters
    ----------
    ft_orig, mixture_values, mixture_labels : column lists; only their lengths
        are used, to size the three input layers (in this order).
    n_layer : unused; the network always has the four hidden blocks below.
    bnorm : add batch normalisation after each hidden dense layer.
    dout : dropout rate after each hidden block.
    dense : width of the first dense layers; later blocks shrink by powers of
        ``descend_fraction``.
    act : activation of the hidden layers.
    dense_reg : L2 activity-regularisation factor of the hidden layers.
    descend_fraction : per-block width multiplier.

    Returns
    -------
    A compiled ``tf.keras.Model`` with inputs ``[orig, values, labels]`` and a
    single linear output, trained with the RMSE loss and the AdamW optimiser.
    """

    ## original features
    # shape must be a tuple: (n,) - a bare (n) is just an int
    input1 = L.Input(shape=(len(ft_orig),), name='input_orig')
    input1_do = L.Dropout(0.1)(input1)

    #dense layer linked to original features
    XA = L.Dense(dense, activation=act, activity_regularizer=tf.keras.regularizers.L2(dense_reg), name='dense_orig')(input1_do)
    if bnorm:
        XA = L.BatchNormalization(name='bn_1')(XA)
    XA1 = L.Dropout(dout, name='do_orig')(XA)



    ## new features - sub mixtures
    input2 = L.Input(shape=(len(mixture_values),), name='input_new')
    input3 = L.Input(shape=(len(mixture_labels),), name='input_new2')
    #note - labels and values were split into separate inputs to allow more testing;
    #they are concatenated straight away, so a single input would work as well
    all_input_combo = tf.keras.layers.Concatenate(axis=1, name='concat_i2')([input2, input3])
    input2_do = L.Dropout(0.2)(all_input_combo)

    #dense layer linked to new features
    XB = L.Dense(dense, activation=act, activity_regularizer=tf.keras.regularizers.L2(dense_reg), name='dense_new')(input2_do)
    if bnorm:
        XB = L.BatchNormalization(name='bn_2')(XB)
    XB1 = L.Dropout(dout, name='do_new')(XB)



    ##combine original with normal dists
    all_input_combo = tf.keras.layers.Concatenate(axis=1, name='concat_new')([XA1, XB1])


    ##rest of model - every block's output is concatenated with its own input,
    ##so later blocks see all earlier representations

    #layer 1
    X = L.Dense(int(dense), activation=act, activity_regularizer=tf.keras.regularizers.L2(dense_reg))(all_input_combo)
    if bnorm:
        X = L.BatchNormalization(name='bn_3')(X)
    X2 = L.Dropout(dout, name='do_3')(X)

    X2 = tf.keras.layers.concatenate([all_input_combo, X2],  axis=1)

    #layer 2
    X = L.Dense(int((descend_fraction**2)*dense), activation=act, activity_regularizer=tf.keras.regularizers.L2(dense_reg))(X2)
    if bnorm:
        X = L.BatchNormalization()(X)
    X3 = L.Dropout(dout)(X)

    X3 = tf.keras.layers.concatenate([X2, X3], axis=1)

    #layer 3
    X = L.Dense(int((descend_fraction**3)*dense), activation=act, activity_regularizer=tf.keras.regularizers.L2(dense_reg *2))(X3)
    if bnorm:
        X = L.BatchNormalization()(X)
    X4 = L.Dropout(dout)(X)

    X4 = tf.keras.layers.concatenate([X3, X4],axis=1)

    #layer 4
    X = L.Dense(int((descend_fraction**4)*dense), activation=act, activity_regularizer=tf.keras.regularizers.L2(dense_reg*4))(X4)
    if bnorm:
        X = L.BatchNormalization()(X)
    X5 = L.Dropout(dout)(X)

    X5 = tf.keras.layers.concatenate([X4, X5], axis=1)

    #final layers / output
    X = L.Dense(5, activation=act, activity_regularizer=tf.keras.regularizers.L2(0.0001))(X5)
    output1 = L.Dense(1, activation='linear')(X)

    #Note, the learning rate & weight decay were optimised at an early stage and not re-tuned since.
    model = tf.keras.Model(inputs=[input1, input2, input3], outputs=output1)
    model.compile(loss=root_mean_squared_error,
                  optimizer=tfa.optimizers.AdamW(learning_rate=0.000809028893821181, weight_decay=9.83479875802558E-06),
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return model
    

# Build one model only to print its summary; training builds its own models.
# The column lists are passed by keyword so values and labels land on the
# inputs that the training loop feeds them to (they were swapped positionally).
K.clear_session()
model = keras_model(FT_COLS,
                    mixture_values=mixture_value_cols,
                    mixture_labels=mixture_title_cols,
                    dense=70,
                    dout=0.15,
                    dense_reg = 0.000001,
                    act='elu',
                    descend_fraction = 0.9)
model.summary()

# Run Training & Predictions

Note: at the end of every fold, the target is re-predicted with the weights from last few epochs (see CV loop)

In [None]:
# Full cross-validated run: returns out-of-fold predictions, averaged test
# predictions and the per-fold errors.
oof, test_predictions, fold_errors = run_training(
    model=model,
    train_df=train,
    test_df=test,
    sample_submission=submission,
    fold_col='fold',
    orig_features=FT_COLS,
    mixture_val_cols=mixture_value_cols,
    mixture_label_cols=mixture_title_cols,
    target_col=TARGET,
    benchmark=cv_bm,
    outlier_col='outlier_filter',
    epochs=25,
    # lower batch sizes looked steadier, but limited testing showed no clearly
    # better end result
    batch_size=256,
    verbose=False,
    dense=70,
    dout=0.15,
    dense_reg=0.000001,
    act='elu',
)

# Save out of fold, test predictions

Note: as the target was centred at zero, need to make sure to re-add original mean

In [None]:
print('Save Out of Fold Predictions')
# Keep `oof` as the raw (centred) prediction array and put the de-centred
# values in a separate frame, so re-running this cell cannot add TARGET_MEAN
# twice.
oof_df = pd.DataFrame({'oof_prediction': oof + TARGET_MEAN}, index=train['id'])
oof_df.to_csv('oof_predictions.csv', index=True)
oof_df.head(10)

In [None]:
# Per-fold validation errors and their spread (population standard deviation).
print('fold errors', fold_errors)
print('fold error std', np.std(np.array(fold_errors)))

In [None]:
# Compare the (de-centred) train target with the (de-centred) test predictions.
sns.set(font_scale=1.4)
fig, axes = plt.subplots(figsize=(16, 5))
for values, colour in ((train[TARGET] + TARGET_MEAN, 'Green'), (test_predictions + TARGET_MEAN, 'Red')):
    sns.distplot(values, color=colour)
axes.set_title('Train Target Distribution & Test Predictions')
axes.set_xlim(4, 10)
axes.legend(['Train', 'Test Predicted'])

In [None]:
# The model predicts the centred target, so add the mean back before writing.
submission['target'] = TARGET_MEAN + test_predictions
submission.to_csv(path_or_buf='submission.csv', index=False)
submission.head()