## Prediction with DNN

In [None]:
import tensorflow as tf
import sklearn
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Load the training data from the parquet file into a DataFrame.
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to a smaller fitting dtype.

    Signed-integer columns are narrowed to the first of int8/int16/int32
    whose range contains the column's minimum and maximum (bounds
    inclusive).  Floating-point columns are narrowed to float16 or float32
    on the same range test.  A column is never widened, and columns of any
    other dtype (object, bool, unsigned, datetime, extension dtypes, ...)
    are left untouched.

    Only the value *range* is checked, not precision: values stored in a
    narrower float type may be rounded.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object, for convenient reassignment.
    """
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed-int / float columns are safe to downcast;
        # e.g. bool columns would otherwise end up as floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            # Stop once the candidate is no narrower than the current dtype.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            # Comparisons with NaN (empty / all-NaN column) are False, so such
            # columns keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    return df

In [None]:
# Downcast the numeric columns of the training frame to reduce its memory footprint.
train = reduce_mem_usage(train)

In [None]:
#Function to generate histogram of target variable
def target_distriution(target, df):
    """Plot a 50-bin histogram of ``df[target]`` with mean and quartile markers.

    The mean is drawn as a light-green vertical line and the 25th and 75th
    percentiles as brown vertical lines.  The figure is shown and then closed.

    Args:
        target: Name of the column to plot.
        df: DataFrame containing that column.
    """
    plt.figure(figsize=(14, 6))

    df[target].plot.hist(bins=50)
    plt.axvline(df[target].mean(), color='lightgreen', linewidth=3, label='Mean')
    plt.axvline(np.percentile(df[target], 25), color='brown', linewidth=3, label='Quartiles')
    plt.axvline(np.percentile(df[target], 75), color='brown', linewidth=3)
    plt.xlabel(target, fontsize=14)
    plt.ylabel('Frequency', fontsize=14)
    plt.title('Histogram of ' + target, fontsize=14)
    plt.legend(fontsize=14)
    plt.show()

    # Closing the figure is enough to release it.  A plt.clf() after close()
    # would act on whatever figure is current then, creating a new empty one
    # if none is open, so it is intentionally not called.
    plt.close()

In [None]:
# Plot the distribution of the 'target' column over the training frame.
target='target'
df=train  # second name for the same object; no copy is made
target_distriution(target, df)

In [None]:
# Remove the temporary name `df`.
del df

#### Scatter Plot
We will look at how some of the features relate to the target variable.

In [None]:
# Take the six columns at positions 4-9 to plot against the target.
# NOTE(review): presumably these are the first feature columns — confirm against the frame's column order.
features=train.columns.to_list()[4:10]

In [None]:
# One scatter plot of target vs. feature for each selected feature.
for feature in features:
    # Open a fresh figure per feature: every plot is shown and closed inside
    # the loop, so a single figure created before the loop would only give
    # the requested size to the first plot.
    plt.figure(figsize=(6, 6))
    plt.scatter(train[feature], train['target'])
    plt.xlabel(feature, fontsize=14)
    plt.ylabel('Target', fontsize=14)
    plt.show()
    # close() releases the figure; no clf() afterwards, since that would act
    # on a new empty figure once this one is closed.
    plt.close()

#### Scatterplot of Time_id vs target variable
Let's also look at the target vs the time_id for a single investment_id.

In [None]:
# Distinct investment ids present in the training frame.
investment=train['investment_id'].unique()
#Pick an investment ID
investment1=investment[0]
investment2=investment[1]  # not referenced again in the cells shown here

In [None]:
# Keep only the rows of the first picked investment id.
df_investment1=train[train['investment_id']==investment1]

In [None]:
#Line plot
# Target over time_id for the single investment selected above.
plt.figure()
plt.plot(df_investment1['time_id'], df_investment1['target'])
plt.xlabel('Time_id', fontsize=14)
plt.ylabel('Target', fontsize=14)
plt.show()
# close() releases the figure; a clf() after it would act on a new empty
# figure once this one is closed, so it is not called.
plt.close()

In [None]:
# Drop the per-investment subset now that it has been plotted.
del df_investment1

#### Train Test Split
Instead of the usual train-test split, I have split the data by investment_id and by time_id, as shown below. This is done in order to preserve the structure related to investment_ids and time_ids.

In [None]:
# Split investment id in 2 lots
investment_ids = train['investment_id'].unique()
half = len(investment_ids) // 2
first_lot = investment_ids[:half]
# Slice the full id array, not the already halved first_lot; otherwise
# second_lot would hold only the second quarter of the ids.
second_lot = investment_ids[half:]

In [None]:
# Identify the lot in the Dataframe
# Vectorised membership test: 1 for ids in first_lot, 2 for all others.
# This avoids calling a Python lambda (and scanning first_lot) once per row.
train['inv_lot'] = np.where(train['investment_id'].isin(first_lot), 1, 2)

In [None]:
# Fix a point in Time_id to split. I have used the quantile method and arbitrarily selected the 75th percentile.
# `tile` is the cut-off used below: rows with time_id < tile vs. rows with time_id >= tile.
# NOTE(review): interpolation='lower' presumably makes `tile` an existing time_id rather than an interpolated value — confirm.
tile=train['time_id'].quantile(q=0.75, interpolation='lower')

In [None]:
# Build the (lot, period) subsets: four quadrants plus the whole of lot 2.
in_lot1 = train['inv_lot'] == 1
in_lot2 = train['inv_lot'] == 2
before_cut = train['time_id'] < tile
from_cut = train['time_id'] >= tile

train1 = train[in_lot1 & before_cut]   # lot 1, before the cut
train2 = train[in_lot2 & from_cut]     # lot 2, from the cut onwards
train3 = train[in_lot1 & from_cut]     # lot 1, from the cut onwards
train4 = train[in_lot2 & before_cut]   # lot 2, before the cut
train5 = train[in_lot2]                # lot 2, all time ids

# The masks are only needed to build the subsets.
del in_lot1, in_lot2, before_cut, from_cut

In [None]:
# Print each subset's share of the full frame, in the order train1 ... train5.
total_rows = len(train)
print(*(len(part) / total_rows for part in (train1, train2, train3, train4, train5)))

In [None]:
# Creation of test and train  sets of features
# Train on every row before the time cut (train1 + train4) and hold out every
# row from the cut onwards (train2 + train3).  train5 (all of lot 2) is not
# used for training because it contains the train2 rows, which would then be
# present in both the training and the test set.
X_train = pd.concat([train1, train4], axis=0)
X_test = pd.concat([train2, train3], axis=0)
# Creation of test and train  sets of target variable
y_train = X_train[['target']]
y_test = X_test[['target']]
# Remove columns that are not to be included in training data
X_train = X_train.drop(['row_id', 'target', 'inv_lot'], axis=1)
X_test = X_test.drop(['row_id', 'target', 'inv_lot'], axis=1)


# Share of the held-out rows relative to the full `train` frame.
print('The test set is {}% of the train set'.format((len(X_test)/len(train))*100))

In [None]:
# Drop the names of the full frame and of the intermediate subsets.
del train, train1, train2, train3, train4, train5

#### Scaling
We will create a function to scale the features. Note: don't forget to use the same function to scale the actual test data.

In [None]:
# Fit the scaler on the training features only; the same fitted object is
# reused by scale_dataset for every other frame.
scaler= StandardScaler()
scaler.fit(X_train.values)

In [None]:
def scale_dataset(df):
    """Transform ``df`` with the already fitted module-level ``scaler``.

    Returns a new DataFrame holding the transformed values under the same
    index and column labels as ``df``.
    """
    return pd.DataFrame(
        scaler.transform(df.values),
        index=df.index,
        columns=df.columns,
    )

In [None]:
# Replace both feature frames with their scaled versions (same fitted scaler for both).
X_train=scale_dataset(X_train)
X_test=scale_dataset(X_test)

In [None]:
# Show (rows, columns) of the scaled training features.
X_train.shape

In [None]:
# Show (rows, columns) of the scaled test features.
X_test.shape

#### Deep Neural Network(Sequential architecture)
I have created a network of 7 Dense layers, excluding the input layer: six hidden layers, each followed by batch normalization, and a single-unit output layer.

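The Adam optimizer is used. A helper function lr_ computes a reduced learning rate from the initial rate, the number of epochs and a decay rate; the value is computed once when the model is built and passed to the optimizer.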

The loss is mean squared error and the metric is root mean squared error.

In [None]:
# Function for decaying Learning rate
def lr_(init_lr, epoch_num, decay_rate):
    """Return ``init_lr`` scaled by ``1 / (1 + epoch_num * decay_rate)``."""
    shrink = 1 / (1 + epoch_num * decay_rate)
    return shrink * init_lr

In [None]:
# Function to create a sequential model
def build_seqmodel(input_shape, layer_params=((250, 'relu'), (200, 'relu'), (150, 'relu')), learning_rate=.001, epochs_num=5, decay_rate=1):
    """Build and compile a fully connected regression network.

    The model is an input layer, then one Dense layer followed by a
    BatchNormalization layer for every entry of ``layer_params``, and a
    final single-unit Dense output layer.  It is compiled with the Adam
    optimizer, mean-squared-error loss and a root-mean-squared-error metric.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        layer_params: Sequence of ``(units, activation)`` pairs, one per
            hidden Dense layer.
        learning_rate: Initial learning rate passed to ``lr_``.
        epochs_num: Epoch count passed to ``lr_``.
        decay_rate: Decay rate passed to ``lr_``.

    Returns:
        The compiled Keras model.

    Note:
        ``lr_`` is evaluated once here, so the optimizer receives a single
        fixed learning rate; this function does not change it per epoch.
    """
    model = keras.models.Sequential()
    # define layers
    model.add(keras.layers.Input(shape=input_shape))
    for params in layer_params:
        model.add(keras.layers.Dense(units=params[0], activation=params[1]))
        model.add(keras.layers.BatchNormalization())

    model.add(keras.layers.Dense(1))

    # Select optimizer, loss and metrics
    optimizer = keras.optimizers.Adam(learning_rate=lr_(learning_rate, epochs_num, decay_rate))
    loss = keras.losses.MeanSquaredError()
    # compile() documents `metrics` as a list, so wrap the single metric.
    metrics = [keras.metrics.RootMeanSquaredError()]
    model.compile(loss=loss, metrics=metrics, optimizer=optimizer)
    return model

In [None]:
# Hyper-parameters handed to build_seqmodel
input_shape = X_train.shape[1:]
layer_params = [
    (250, 'relu'),
    (200, 'relu'),
    (150, 'relu'),
    (200, 'relu'),
    (200, 'relu'),
    (200, 'relu'),
]
learning_rate = .01
epochs_num = 10
decay_rate = 1
model = build_seqmodel(
    input_shape,
    layer_params=layer_params,
    learning_rate=learning_rate,
    epochs_num=epochs_num,
    decay_rate=decay_rate,
)
# Callbacks: write the best model seen so far to disk, and stop once
# val_loss has not improved for 3 epochs.
check_point_best = keras.callbacks.ModelCheckpoint('keras_model.h5', save_best_only=True)
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(
    X_train,
    y_train,
    epochs=epochs_num,
    batch_size=128,
    validation_data=(X_test, y_test),
    callbacks=[check_point_best, early_stopping],
    shuffle=True,
)

In [None]:
# Drop the feature frames and the in-memory model; the checkpoint file is
# what gets loaded again for prediction.
del X_train, X_test, model

In [None]:
# Lets view the losses and metrics
# One line per key recorded in the training history.
pd.DataFrame(history.history).plot(figsize=(12,6))
plt.grid(True)
plt.show()
# close() releases the figure; a clf() after it would act on a new empty
# figure once this one is closed, so it is not called.
plt.close()

In [None]:
import ubiquant
# Reload the checkpoint written during training.
best_model = keras.models.load_model('keras_model.h5')
env = ubiquant.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # Mirror the training preprocessing: the scaler was fitted after
    # 'row_id', 'target' and 'inv_lot' had been dropped, so remove them here
    # whenever they are present.
    # NOTE(review): the remaining columns must match the training features in
    # number and order — confirm against the frames the API delivers.
    test_features = test_df.drop(columns=['row_id', 'target', 'inv_lot'], errors='ignore')
    test_features = scale_dataset(test_features)
    # The network ends in a single-unit layer, so flatten the predictions to
    # one value per row before storing them in the column.
    sample_prediction_df['target'] = best_model.predict(test_features).ravel()
    env.predict(sample_prediction_df)