### Prediction with DNN(PRELU)
I have used a deep neural network with a sequential architecture for the Ubiquant data.

The key features are:

PReLU activation

Initial learning rate of .001

Exponentially decaying learning rate (decay rate of 0.98 per 4000 steps)

6 layers each with around 200 neurons

All features are scaled.

In [None]:
import tensorflow as tf
import sklearn
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline
import gc

This dataset is large, with around 300 columns and 3 million rows. This version of the data is available in parquet format, which makes it fast to read. To access it, click on "Add data" and type "ubiquant" in the search box. Once you locate the dataset, just click on "Add".

In [None]:
# Load the training data from the parquet file of the attached dataset.
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

I found this function useful: it converts each numeric column to a smaller data type, which reduces memory usage and makes the data easier to work with.

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of a dataframe to reduce memory usage.

    Every plain NumPy integer or float column is converted to the narrowest
    dtype of the same family (signed int, unsigned int, float) whose range
    contains the column's minimum and maximum. A column is never widened.

    Columns of any other dtype (object, bool, datetime, pandas extension
    dtypes, ...) and empty columns are left untouched.

    Note: float targets are selected on value range only, so converting to
    float16/float32 can lose precision.

    Args:
        df: The dataframe to shrink. It is modified in place.

    Returns:
        The same dataframe object, with downcast columns.
    """
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': (np.float16, np.float32, np.float64),
    }

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        # Skip anything that is not a plain NumPy int/uint/float column.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue
        if series.empty:
            continue

        if col_type.kind == 'f':
            limits_of = np.finfo
            # NaN bounds (all-NaN column) fail every comparison below, so the
            # column keeps its dtype.
            lo, hi = float(series.min()), float(series.max())
        else:
            limits_of = np.iinfo
            # Python ints avoid fixed-width overflow when comparing with limits.
            lo, hi = int(series.min()), int(series.max())

        for candidate in candidates_by_kind[col_type.kind]:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the limit still fits.
            if limits.min <= lo and hi <= limits.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = series.astype(candidate)
                break

    return df

# Shrink the column dtypes of the training frame before any further processing.
train = reduce_mem_usage(train)

### TRAIN TEST SPLIT


I have tried to split the data according to investment_ids and time_ids in order to get a better representation of data.

In [None]:
# Split investment id in 2 lots (first half / second half of the unique ids)
investment_ids=train['investment_id'].unique()
half=len(investment_ids)//2
first_lot=investment_ids[:half]
second_lot=investment_ids[half:]

# Identify the lot in the Dataframe (vectorised membership test instead of a per-row lambda)
train['inv_lot']=np.where(train['investment_id'].isin(first_lot), 1, 2)

# Fix a point in Time_id to split. I have used the quantile method and arbitrarily selected the 75th percentile.
tile=train['time_id'].quantile(q=0.75, interpolation='lower')

# Creating various splits as per investment_id and time_id
train1=train[(train['inv_lot'] ==1) & (train['time_id']< (tile))]
train2=train[(train['inv_lot']==2) & (train['time_id']>= tile)]
train3=train[(train['inv_lot'] ==1) & (train['time_id']>= tile)]
train4=train[(train['inv_lot'] ==2) & (train['time_id']< tile)]
train5=train[(train['inv_lot'] ==2)]

# Take a look at the length in each dataframe
print(len(train1)/len(train),len(train2)/len(train),len(train3)/len(train),len(train4)/len(train),len(train5)/len(train) )

# Creation of test and train  sets of features
# Train: lot 1 before the split point plus all of lot 2.
# Test: lot 1 from the split point onwards only. train2 is a subset of train5,
# so putting it in the test set as well would leak validation rows into training.
train=pd.concat([train1,train5], axis=0)
test=train3
# Creation of test and train  sets of target variable
y_train=train[['target']]
y_test=test[['target']]

In [None]:
# Feature columns: every column whose name contains 'f_', followed by the investment id.
f_cols = []
for col in train.columns:
    if 'f_' in col:
        f_cols.append(col)
f_cols.append('investment_id')

# Feature matrices for the train and test splits.
X_train = train[f_cols]
X_test = test[f_cols]
print('The test set is {}% of the train set'.format((len(X_test) / len(train)) * 100))

In [None]:
# Drop the combined frame and the intermediate splits, then run the garbage collector.
del train, train1, train2, train3, train4, train5
gc.collect()

### SCALING

In [None]:
# Fit the scaler on the training features only.
scaler = StandardScaler().fit(X_train.values)


def scale_dataset(df):
    """Return ``df`` transformed by the fitted ``scaler``, keeping its index and column labels."""
    return pd.DataFrame(
        scaler.transform(df.values),
        index=df.index,
        columns=df.columns,
    )


# Replace the raw feature frames with their scaled versions, one at a time.
X_train = scale_dataset(X_train)
X_test = scale_dataset(X_test)

In [None]:
# Run the garbage collector after the scaling step replaced the feature frames.
gc.collect()

### MODEL 

Deep Neural Network(Sequential architecture)
I have created a network of 6 layers, excluding the input layer.

Dropout layers are added.

Adam optimizer is selected, with an exponentially decaying learning rate.

Loss is Mean Squared error and metrics is Root mean squared error.

In [None]:
# Learning-rate schedule: starts at 0.001 and decays exponentially with a rate of 0.98 per 4000 steps.
learning_sch = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001, decay_steps=4000, decay_rate=0.98)


# Function to create a sequential model
def build_seqmodel(input_shape, layer_params=((250, 'relu'), (200, 'relu'), (150, 'relu')), epochs_num=10, prelu=False):
    """Build and compile a fully connected regression network.

    Args:
        input_shape: Shape of one input sample (without the batch dimension).
        layer_params: Sequence of ``(units, activation)`` pairs, one per hidden
            layer. When ``prelu`` is true only ``units`` is used.
        epochs_num: Unused here; kept so existing callers keep working.
        prelu: If true, each hidden layer is Dense (he_normal init) -> PReLU ->
            Dropout(0.2) -> BatchNormalization. Otherwise each hidden layer is
            Dense with the given activation -> BatchNormalization.

    Returns:
        A compiled ``keras`` Sequential model with a single linear output unit,
        Adam optimizer driven by ``learning_sch``, MSE loss and RMSE metric.
    """
    model = keras.models.Sequential()
    # define layers
    model.add(keras.layers.Input(shape=input_shape))
    for units, activation in layer_params:
        if prelu:
            # PReLU is a layer of its own, so the Dense layer stays linear.
            model.add(keras.layers.Dense(units=units, kernel_initializer='he_normal'))
            model.add(keras.layers.PReLU())
            model.add(keras.layers.Dropout(.20))
            model.add(keras.layers.BatchNormalization())
        else:
            model.add(keras.layers.Dense(units=units, activation=activation))
            model.add(keras.layers.BatchNormalization())

    model.add(keras.layers.Dense(1))

    # Select optimizer, loss and metrics
    optimizer = keras.optimizers.Adam(learning_rate=learning_sch)
    loss = keras.losses.MeanSquaredError()
    # compile() documents `metrics` as a list, so wrap the single metric.
    metrics = [keras.metrics.RootMeanSquaredError()]
    model.compile(loss=loss, metrics=metrics, optimizer=optimizer)
    return model

In [None]:
# Specify the arguments for the model function
input_shape=X_train.shape[1:]
layer_params=[(250,'prelu'), (200, 'prelu'), (150,'prelu'),(200, 'prelu'),(200, 'prelu'),(200, 'prelu') ]

epochs_num=10

model=build_seqmodel(input_shape, layer_params=layer_params,   epochs_num=epochs_num, prelu=True)
# Create checkpoint to save best model and create early stopping criteria
check_point_best= keras.callbacks.ModelCheckpoint('keras_model.h5',save_best_only = True)
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Batching is defined on the datasets themselves (1024 rows per batch).
# NOTE: the value passed to shuffle() is the shuffle buffer size, not a random seed.
tf_train = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=2022).batch(1024, drop_remainder=True).prefetch(1)
# Validation data is neither shuffled nor truncated, so the metric covers every held-out row.
tf_val = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(1024).prefetch(1)

# batch_size / shuffle are not passed to fit(): they must not be specified for tf.data.Dataset inputs.
history=model.fit(tf_train, epochs=epochs_num, validation_data=tf_val, callbacks=[check_point_best, early_stopping])

In [None]:
# Plot every series recorded in the training history on one chart.
pd.DataFrame(history.history).plot(figsize=(12,6))
plt.grid(True)
plt.show()
# Close the figure to release it. No plt.clf() afterwards: calling it after
# close() would create a new, empty figure.
plt.close()

In [None]:
import ubiquant
# Reload the checkpoint written by ModelCheckpoint during training.
best_model = keras.models.load_model('keras_model.h5')
env = ubiquant.make_env()   
iter_test = env.iter_test()    
for (test_df, sample_prediction_df) in iter_test:
    # Same feature columns and same scaling as used for training.
    test_df=test_df[f_cols]
    test_df = scale_dataset(test_df)
    #test_df = tf.data.Dataset.from_tensor_slices((test_df)).batch(1024, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)
    # The model has one output unit, so predict() returns shape (n, 1);
    # flatten it to 1-D before storing it in a single column.
    sample_prediction_df['target'] = best_model.predict(test_df, verbose=0).ravel()
    env.predict(sample_prediction_df)

In [None]:
# Final garbage-collection pass once the prediction loop has finished.
gc.collect()

Thanks for going through my notebook!

Kindly upvote, if you liked it.