# My ML Approach: Simple DNN...
The steps you are going to cover in this tutorial are as follows:

* Load Data.
* Define Keras Model.
* Compile Keras Model.
* Fit Keras Model.
* Evaluate Keras Model.
* Tie It All Together.
* Make Predictions.

# 1. Loading Libraries.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from sklearn.model_selection import StratifiedKFold
import gc

# Datatable Libraries...
import datatable as dt

# 2. Notebook Configuration.

In [None]:
%%time
# Notebook Configuration...

# Row cap handed to dt.fread(max_nrows=...) when the raw CSVs are read.
# None is passed through unchanged (presumably "no cap" — confirm against the datatable docs).
DATA_ROWS = None

# Display limits applied to pandas through 'display.max_rows' and 'display.max_columns'.
NROWS = 50
NCOLS = 15

# Folder holding the raw competition CSVs (train.csv, example_test.csv).
# NOTE: BASE_PATH is reassigned later in the notebook, before the pickled datasets are loaded.
BASE_PATH = '/kaggle/input/ubiquant-market-prediction/'

In [None]:
%%time
# I like to disable my Notebook Warnings.
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
# Configure notebook display settings to only use 2 decimal places, tables look nicer.
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', NCOLS) 
pd.set_option('display.max_rows', NROWS)

# 3. Read the Original Datasets, Build Memory Efficient Dataset...

In [None]:
%%script false --no-raise-error
%%time
# Read the CSVs using datatable
trn_data = dt.fread(BASE_PATH + 'train.csv', max_nrows = DATA_ROWS)
tst_data = dt.fread(BASE_PATH + 'example_test.csv', max_nrows = DATA_ROWS)

In [None]:
%%script false --no-raise-error
%%time
# Convert from a Datatable to Pandas Df.
trn_data = trn_data.to_pandas()
tst_data = tst_data.to_pandas()

In [None]:
%%script false --no-raise-error
%%time
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Rules applied per column:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned NumPy integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the column's
      min and max (bounds are inclusive, so e.g. 127 still fits ``int8``).
    * NumPy float columns are cast to ``float16`` or ``float32`` when the
      value range allows it, otherwise to ``float64``.
    * Every other dtype (bool, datetime, existing category, pandas extension
      dtypes, ...) is left untouched, because min/max range checks are either
      meaningless or would raise for them.

    NOTE: the float downcast only checks the value *range*; ``float16`` keeps
    roughly three significant decimal digits, so precision may be lost.

    Args:
        df: pandas DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types ordered from narrowest to widest.
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    small_floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a numeric range we can
        # safely compare against iinfo/finfo limits.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            # An all-NaN column fails every comparison and stays float64.
            target = np.float64
            for candidate in small_floats:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
        else:
            # An empty column yields NaN min/max, matches nothing, and is
            # therefore left with its current dtype.
            for candidate in (signed_ints if kind == 'i' else unsigned_ints):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-byte frame does not divide by zero.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(reduction))

    return df

In [None]:
%%script false --no-raise-error
%%time
trn_data = reduce_mem_usage(trn_data)

In [None]:
%%script false --no-raise-error
%%time
tst_data = reduce_mem_usage(tst_data)

In [None]:
%%script false --no-raise-error
%%time
trn_data.to_pickle('trn_data.pkl')

In [None]:
%%script false --no-raise-error
%%time
tst_data.to_pickle('tst_data.pkl')

# 4. Loading the Memory Efficient Datasets...

In [None]:
%%time
# Point BASE_PATH at the dataset holding the memory-reduced pickles, then load
# the train and test frames from it (train first, test second).
BASE_PATH = '../input/ubiquantmarketpredictionmemoryefficientdata/'
trn_data, tst_data = (
    pd.read_pickle(BASE_PATH + pickle_name)
    for pickle_name in ('trn_data.pkl', 'tst_data.pkl')
)

# 5. Exploring the Datasets

In [None]:
%%time
trn_data.head()

In [None]:
%%time
IGNORE = ['row_id', 'time_id', 'investment_id', 'target']
TARGET_FEATURE_NAME = 'target'

In [None]:
%%time
FEATURE_NAMES = [feat for feat in trn_data.columns if feat not in IGNORE]
NUMERIC_FEATURE_NAMES = FEATURE_NAMES

In [None]:
%%time
#X = trn_data[FEATURE_NAMES]
#y = trn_data[TARGET_FEATURE_NAME]
#X_test = tst_data[FEATURE_NAMES]
#inv_ids = trn_data['investment_id']

In [None]:
# import gc
# del trn_data
# del tst_data
# gc.collect()

In [None]:
%%time
# define the keras model
# Feed-forward regressor: three swish Dense layers (256 -> 128 -> 64), each
# followed by batch normalisation, with 20% dropout after the first two, and a
# single linear output unit. The input width is the number of feature columns.
model = Sequential([
    Dense(256, input_dim=trn_data[FEATURE_NAMES].shape[1], activation='swish'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(128, activation='swish'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(64, activation='swish'),
    BatchNormalization(),
    Dense(1),
])

In [None]:
model.summary()

In [None]:
%%script false --no-raise-error
# compile the keras model
model.compile(loss = 'mse', optimizer = 'adam', metrics = ['mae'])

In [None]:
early_stop = keras.callbacks.EarlyStopping(patience = 25)

In [None]:
%%script false --no-raise-error
# fit the keras model on the dataset
model.fit(X, y, epochs = 100, batch_size = 2048, callbacks=[early_stop], verbose = 1)

In [None]:
%%script false --no-raise-error
# evaluate the keras model
_, mae = model.evaluate(X, y)
print('MAE: %.3f' % (mae))

In [None]:
%%script false --no-raise-error
import tensorflow as tf
def preprocess(X, y):
    """Return the (features, target) pair unchanged.

    Identity hook that make_dataset maps over every element of the pipeline.
    """
    pair = (X, y)
    return pair

def make_dataset(X, y, batch_size = 512, mode = 'train'):
    """Build a batched, prefetched tf.data pipeline from features and targets.

    Args:
        X: features, sliced along the first axis by from_tensor_slices.
        y: targets, sliced in lockstep with ``X``.
        batch_size: number of elements per batch.
        mode: the pipeline is shuffled (buffer of 2048 elements) only when
            this equals "train"; any other value keeps the input order.

    Returns:
        A tf.data.Dataset yielding (features, target) batches.
    """
    ds = tf.data.Dataset.from_tensor_slices((X, y))
    ds = ds.map(preprocess)
    # Cache the un-shuffled elements BEFORE shuffling and batching. Caching
    # after shuffle() would record the first epoch's order and batch
    # composition and replay them identically on every later epoch.
    ds = ds.cache()

    if mode == "train":
        ds = ds.shuffle(2048)
    ds = ds.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    return ds

In [None]:
optimizer = tf.optimizers.Adam(0.001)

In [None]:
%%time

# Out-of-fold training: one independently initialised network per fold.
# `models` collects the trained networks (later averaged by inference) and
# `predictions` the per-fold predictions on tst_data.
models = []
predictions = []
# Folds are stratified on investment_id.
kfold = StratifiedKFold(5, shuffle = True, random_state = 42)

for index, (train_indices, valid_indices) in enumerate(kfold.split(trn_data[FEATURE_NAMES], trn_data['investment_id'])):

    print(f'Training Fold Number:: {index}')
    X_train, X_val = trn_data[FEATURE_NAMES].iloc[train_indices], trn_data[FEATURE_NAMES].iloc[valid_indices]
    y_train, y_val = trn_data[TARGET_FEATURE_NAME].iloc[train_indices], trn_data[TARGET_FEATURE_NAME].iloc[valid_indices]

    # Build a fresh copy of the architecture for this fold. Re-using the one
    # global `model` would keep training the same weights fold after fold, so
    # later folds would already have been fitted on their own validation rows
    # and `models` would hold five references to a single network.
    # clone_model creates new layers with newly initialised weights.
    fold_model = keras.models.clone_model(model)
    # Likewise give each fold its own optimizer state, configured exactly like
    # the shared `optimizer` template.
    fold_optimizer = type(optimizer).from_config(optimizer.get_config())

    fold_model.compile(loss = 'mae', optimizer = fold_optimizer, metrics = ['mae'])
    history = fold_model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 100, batch_size = 1024, callbacks = [early_stop], verbose = 1)
    print('........')
    _, mae = fold_model.evaluate(X_val, y_val, batch_size = 32)
    print('........')
    pred = fold_model.predict(tst_data[FEATURE_NAMES])

    print(f'Fold {index}, MAE:: {mae}')
    print('')
    models.append(fold_model)
    predictions.append(pred)

    # Release the per-fold slices before the next fold allocates its own.
    del X_train
    del X_val
    del y_train
    del y_val
    gc.collect()
    

In [None]:
def inference(models, test_data, features):
    """Average the predictions of several models.

    Args:
        models: iterable of objects exposing ``predict``.
        test_data: container indexed with ``features`` to obtain model inputs.
        features: key(s) selecting the input columns from ``test_data``.

    Returns:
        Element-wise mean (over the models) of the individual predictions.
    """
    per_model = [fitted.predict(test_data[features]) for fitted in models]
    return np.mean(per_model, axis = 0)

In [None]:
import ubiquant
# Create the competition environment and obtain its iterator over test batches.
env = ubiquant.make_env()
iter_test = env.iter_test() 

# Each iteration yields a test frame and a prediction template. Fill the
# template's 'target' column with the mean prediction returned by inference
# over `models`, then hand the template back through env.predict.
for (test_df, sample_prediction_df) in iter_test:
    sample_prediction_df['target'] = inference(models, test_df, FEATURE_NAMES)
    env.predict(sample_prediction_df)