## Updates:

1. Version 3. KFold CV -> StratifiedKFold CV. Increased number of folds (5 -> 10).
   CV: 7.8805
2. Version 4. Added RandomTreesEmbedding features as inputs to the neural net
   CV: 7.8781
3. Version 5. Changed RandomTreesEmbedding parameters
   CV: 7.8784
4. Version 6. Added count features for ['f1', 'f86']
   CV: 7.8779
5. Version 7. Added PCA features + clipping negative predictions

In [None]:
import numpy as np
import pandas as pd
import os
import random
import time
from datetime import datetime
import gc
import warnings

from tqdm.notebook import tqdm

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, QuantileTransformer

import tensorflow as tf
from keras import Sequential
from keras import backend as K
from keras.layers import Dense,Dropout,BatchNormalization,LeakyReLU,Activation
from keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau
import tensorflow_addons as tfa

warnings.filterwarnings("ignore")

In [None]:
# Single seed shared by every random-number source seeded below and by the
# scikit-learn estimators / CV splitter defined later in this notebook.
SEED = 0
# NOTE(review): PYTHONHASHSEED is presumably only read at interpreter start-up,
# so setting it here likely affects child processes only — confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
# Seed Python's, NumPy's and TensorFlow's global generators with the same value.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
def read_data():
    """Load the competition CSV files.

    Returns
    -------
    tuple
        ``(train, test, sub)`` data frames read, in that order, from the
        train, test and sample-submission CSV files.
    """
    csv_paths = (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
    train, test, sub = (pd.read_csv(csv_path) for csv_path in csv_paths)
    return train, test, sub

In [None]:
def preprocess_data(X_train,X_test):
    """Build the network's input matrices from the raw feature frames.

    Train and test rows are stacked so that every transformer below is fitted
    on both sets together. The output columns are, in order: the
    standard-scaled then quantile-transformed features, the dense one-hot
    random-trees embedding (fitted on the raw features), and 10 PCA
    components of the transformed features.

    Parameters
    ----------
    X_train, X_test
        Feature tables accepted by ``pd.concat``; they are stacked row-wise.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)`` arrays holding the first
        ``len(X_train)`` rows and the remaining rows of the stacked result.
    """
    n_train_rows = len(X_train)
    stacked = pd.concat([X_train, X_test], axis=0, copy=False)

    # Random-trees embedding: fitted and applied on the untransformed features.
    embedder = RandomTreesEmbedding(
        n_estimators=5, random_state=SEED, max_depth=1)
    embedder.fit(stacked)
    leaf_one_hot = embedder.transform(stacked)

    # Standardise, then map every feature onto a normal distribution.
    dense = StandardScaler().fit_transform(stacked)
    del stacked
    normaliser = QuantileTransformer(
        random_state=SEED, output_distribution='normal')
    dense = normaliser.fit_transform(dense)

    # PCA is fitted on the transformed features and appended as extra columns.
    reducer = PCA(n_components=10, random_state=SEED)
    reducer.fit(dense)

    features = np.hstack(
        [dense, leaf_one_hot.toarray(), reducer.transform(dense)])

    train_matrix = features[:n_train_rows, :]
    test_matrix = features[n_train_rows:, :]
    del features
    gc.collect()

    return train_matrix, test_matrix

In [None]:
def train_nn(X,X_test,y,folds):
    """Train one dense network per CV fold and average the test predictions.

    For every split produced by ``folds`` a fresh weight-normalised MLP is
    fitted with MSE loss. Training uses early stopping (patience 5),
    learning-rate reduction on plateau (factor 0.1, patience 3) and a
    checkpoint that keeps the weights with the lowest validation loss; those
    weights are reloaded before predicting.

    Parameters
    ----------
    X
        Training feature array; must support indexing with integer arrays.
    X_test
        Test feature array passed to ``model.predict``.
    y
        Targets aligned row-for-row with ``X``. They are indexed by position,
        so the index labels of a pandas Series do not matter.
    folds
        Splitter object exposing ``split(X, y)``.

    Returns
    -------
    tuple
        ``(oof, prediction)``: out-of-fold predictions for every row of ``X``
        and the test predictions averaged over the folds actually run.
    """
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []

    # split() yields positional indices, so the targets are indexed by
    # position rather than by label.
    y_values = np.asarray(y)
    n_folds = 0

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        n_folds += 1
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y_values[train_index], y_values[valid_index]

        # NOTE(review): ':' in a file name is presumably not portable to
        # Windows — confirm before running outside Linux.
        checkpoint_path = f'repeat:Fold:{fold_n}.hdf5'

        # Keep only the weights of the epoch with the lowest validation loss.
        cb_checkpt = ModelCheckpoint(checkpoint_path, monitor = 'val_loss', verbose = 0, save_best_only = True,
                                     save_weights_only = True, mode = 'min')

        # `min_delta` is the current name of the former `epsilon` argument.
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min')

        early_stop = EarlyStopping(monitor='val_loss', patience=5)

        model = Sequential()
        model.add(tfa.layers.WeightNormalization(Dense(256, activation='elu')))
        model.add(Dropout(0.1))
        model.add(tfa.layers.WeightNormalization(Dense(128, activation='elu')))
        model.add(Dropout(0.1))
        model.add(tfa.layers.WeightNormalization(Dense(64, activation='elu')))
        model.add(Dropout(0.1))
        model.add(Dense(16, activation='elu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mse')

        model.fit(X_train, y_train,
                  validation_data = (X_valid, y_valid),
                  epochs=200, verbose=2,callbacks = [early_stop,reduce_lr_loss,cb_checkpt])

        # Predict with the best checkpoint, not the last-epoch weights.
        model.load_weights(checkpoint_path)

        y_pred_valid = model.predict(X_valid)
        y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1,)
        fold_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
        scores.append(fold_rmse)
        print('RMSE: {}'.format(fold_rmse))

        del X_train,X_valid,y_train,y_valid
        gc.collect()

        prediction += y_pred.reshape(-1,)

    # Average over the folds this splitter produced instead of relying on a
    # module-level fold count.
    if n_folds:
        prediction /= n_folds
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    return oof,prediction

In [None]:
# Number of cross-validation folds.
N_FOLDS = 10
# Shuffled, seeded stratified splitter that is later passed to train_nn as `folds`.
kf = StratifiedKFold(n_splits = N_FOLDS, random_state = SEED, shuffle = True)

In [None]:
%%time

# Load the train, test and sample-submission tables.
train,test,sub = read_data()

In [None]:
# Feature tables: everything except the row id (and, for train, the target).
X_train = train.drop(columns=['id', 'loss'])
X_test = test.drop(columns=['id'])

# Regression target.
y = train['loss']

In [None]:
%%time

# Replace the raw feature frames with the arrays returned by preprocess_data.
X_train,X_test = preprocess_data(X_train,X_test)

In [None]:
# Show the (rows, columns) shape of the processed training matrix.
X_train.shape

In [None]:
%%time

# Cross-validated training: `oof` holds the out-of-fold predictions for the
# training rows, `preds` the fold-averaged predictions for the test rows.
oof, preds = train_nn(X_train,X_test,y,folds=kf)

In [None]:
# Replace every negative prediction with zero, leaving the raw arrays intact.

oof_postprocessed = oof.copy()
oof_postprocessed[oof_postprocessed < 0] = 0

preds_postprocessed = preds.copy()
preds_postprocessed[preds_postprocessed < 0] = 0

In [None]:
# Compare the out-of-fold RMSE against `y` before and after clipping
# negative predictions to zero.

print(f'Not clipped OOF RMSE: {np.sqrt(mean_squared_error(y,oof))}')
print(f'Clipped OOF RMSE: {np.sqrt(mean_squared_error(y,oof_postprocessed))}')

In [None]:
# Write the raw and the clipped out-of-fold predictions to CSV files.
pd.DataFrame(oof).to_csv('oof.csv', index = 0)
pd.DataFrame(oof_postprocessed).to_csv('oof_postprocessed.csv', index = 0)

# Put the clipped test predictions into the submission's 'loss' column and save it.
sub['loss'] = preds_postprocessed
sub.to_csv('sub_postprocessed.csv', index = 0)