In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import os 
import glob
tqdm.pandas()
import matplotlib.pyplot as plt
import gc

Welcome to this notebook for the Ubiquant Market Prediction challenge. I hope you enjoy the tutorial. **(WORK IN PROGRESS)**

![tf2](https://blogger.googleusercontent.com/img/a/AVvXsEhCZhxqPJb0_Qa4DD8pRS65AmohKJL49y9kMRGj1tADlHxteSKQjUovAZohPop-ej9dfw-Z4_vKyq4aS8Smvro2aSDWmLq3LzwEmfICqTf3ipmSPiWAjix1fSddgyLoajK2y387-pi1r_aRiE3FEH0L_hu-CEboaAloznbVw9aroBzJPmJWfAuhzVGu)

Content:

1. **Reducing the Size of dataset by bringing every attribute to the correct datatype and saving them in pickle chunks.**
2. **Loading the reduced version in the memory.**
3. **Trying to explore some of the variables**
4. **Finding Variables which are correlated to the target variable.**
5. **Create tensorflow data API which can be effectively passed to the model for training.**
6. **Defining the Architecture of the Model**
7. **Defining the callbacks.**
8. **Finally training the model.**

**LET'S START BY DEFINING SOME OF THE PATHS**

In [None]:
# Locations of the competition's training CSV and of the example test CSV.
train_path, test_path = (
    "../input/ubiquant-market-prediction/train.csv",
    "../input/ubiquant-market-prediction/example_test.csv",
)

Let's reduce the size of the **dataset** first! The code is fairly self-explanatory :)

In [None]:
# Lets first try to reduce the size of the dataframe by bringing it to right dtype and saving those chunks.

def reduce_memory_usage(df, chunk):
    """Downcast the columns of ``df`` in place and pickle it as ``chunk_<chunk>.pkl``.

    * ``int*`` columns are cast to the smallest of int8/int16/int32/int64 that
      can hold the column's minimum and maximum.
    * ``float*`` columns are cast to the smallest of float16/float32/float64
      whose finite range covers the column's minimum and maximum.  Only the
      *range* is checked: float16/float32 store fewer significant digits, so
      values may be rounded.
    * ``object`` columns become ``category``.
    * Every other dtype (bool, datetime, unsigned ints, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    chunk : int or str
        Identifier used in the output file name ``chunk_<chunk>.pkl``, which is
        written to the current working directory.

    Returns
    -------
    None
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print("Initial Memory chunk: {:.3f}".format(start_mem))

    for col in df.columns:
        dtype_name = str(df[col].dtype)

        if dtype_name == "object":
            df[col] = df[col].astype("category")
            continue

        # Candidate types from narrowest to widest, the numpy helper that
        # reports their limits, and the type used when nothing narrower fits.
        if dtype_name.startswith("int"):
            candidates, limits, fallback = (np.int8, np.int16, np.int32), np.iinfo, np.int64
        elif dtype_name.startswith("float"):
            candidates, limits, fallback = (np.float16, np.float32), np.finfo, np.float64
        else:
            continue

        # The bounds must be computed for *every* numeric column; previously
        # they were only computed for integer columns, so float columns either
        # raised a NameError or reused the bounds of an earlier column.
        min_ = df[col].min()
        max_ = df[col].max()

        new_type = fallback
        for candidate in candidates:
            info = limits(candidate)
            # Inclusive comparison: a value equal to the limit still fits.
            # NaN bounds (e.g. an all-NaN column) compare False and therefore
            # end up with the widest type.
            if min_ >= info.min and max_ <= info.max:
                new_type = candidate
                break
        df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Final Memory chunk: {:.3f}".format(end_mem))
    print("Reduced by: {:.2f}".format((start_mem - end_mem) / start_mem))

    df.to_pickle(f"chunk_{chunk}.pkl")
    print(f"chunk_{chunk}.pkl","saved!")
                    
    

**Since the dataset is very large, we read it in chunks of the size shown below and process every chunk separately.**

In [None]:
gc.collect()

# Number of CSV rows handed to reduce_memory_usage at a time.
chunksize = 1_000_000

# Stream the training CSV chunk by chunk; each chunk is shrunk and pickled
# under its running index.
for chunk_id, chunk in enumerate(pd.read_csv(train_path, chunksize=chunksize), start=0):
    reduce_memory_usage(chunk, chunk_id)

**Since the dataset is now drastically smaller, let's concatenate everything into one frame.**

In [None]:
# glob returns files in arbitrary order.  Sorting by (length, name) puts
# chunk_2.pkl before chunk_10.pkl, so the rows are re-assembled in the order
# in which the chunks were read from the CSV.
path = sorted(
    glob.glob(os.path.join(os.curdir, "chunk_*.pkl")),
    key=lambda chunk_file: (len(chunk_file), chunk_file),
)
if not path:
    raise FileNotFoundError("No chunk_*.pkl files found - run the chunking cell first.")

# NOTE: read_pickle executes arbitrary code from the file; only load pickles
# that were produced by this notebook.
appended_list = [pd.read_pickle(item) for item in tqdm(path)]

final_frame = pd.concat(appended_list, axis = 0, ignore_index=True)

# Drop the per-chunk frames so they do not stay in memory next to final_frame.
appended_list.clear()
gc.collect()
    

In [None]:
# Summary statistics of the concatenated frame (displayed as the cell output).
final_frame.describe()

In [None]:
# Print pandas' structural summary (index, dtypes, memory usage) of the concatenated frame.
final_frame.info()

**Lets try to do some EDA here!!**

In [None]:
def plot_counts(dataframe, name = None,bins = 1000):
    """Histogram of how often each distinct value of ``dataframe`` occurs.

    The occurrences of every distinct value are counted and the distribution
    of those counts is drawn as a density-normalised histogram on the current
    matplotlib axes.

    Parameters
    ----------
    dataframe : object with ``value_counts()`` (e.g. a pandas Series)
    name : str, optional
        Plot title; omitted when falsy.
    bins : int
        Number of histogram bins.
    """
    occurrences = dataframe.value_counts().sort_index()
    occurrences.plot.hist(bins=bins, density=True)

    if name:
        plt.title(name, fontsize=12)
    plt.xlabel("Instance")
    plt.ylabel("Unique Count")
    plt.grid()

In [None]:
# Names of all columns whose dtype string contains "int".
int_col = [name for name, dtype in final_frame.dtypes.items() if 'int' in str(dtype)]
int_col

In [None]:
# Lets Visualize them: the first two integer columns side by side.
plt.rcParams["figure.figsize"] = (16, 8)

for position in (1, 2):
    column = int_col[position - 1]
    plt.subplot(1, 2, position)
    plot_counts(final_frame[column], name=column, bins=25)

plt.subplots_adjust(wspace=0.8, hspace=0.8)

In [None]:
# Remove the row identifier column.  errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
final_frame.drop(columns=["row_id"], inplace=True, errors="ignore")

In [None]:
# Move the label column out of the feature frame into its own NumPy array.
# The guard keeps the cell re-runnable: once the column has been removed,
# `target` from the first run is simply kept.
if "target" in final_frame.columns:
    target = final_frame.pop("target").values

In [None]:
# Kernel density estimate of the target values.
fig, ax = plt.subplots(figsize = (8,8))
# `fill=` replaces the deprecated `shade=` keyword and matches plot_me below.
sns.kdeplot(target, ax = ax, fill=True)
ax.set_xlabel("Target Distribution")
ax.set_ylabel("Density")

LETS SEE THE **DISTRIBUTION** OF THE VARIABLES

In [None]:
def plot_me(data, n_cols=6):
    """Draw one filled KDE plot per column of ``data`` on a grid of subplots.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted, one subplot each.
    n_cols : int, default 6
        Number of subplots per row.

    The grid has at least 5 rows (the original fixed 5 x 6 layout) and grows
    when ``data`` has more columns than fit, instead of failing in
    ``plt.subplot``.
    """
    columns = list(data.columns)

    # Ceiling division without importing math; never fewer than 5 rows so the
    # layout for up to 30 columns is unchanged.
    n_rows = max(5, -(-len(columns) // n_cols))

    # 30 x 30 inches for the default 5-row layout, 6 inches per row beyond that.
    plt.figure(figsize = (30, 6 * n_rows))

    for idx, col in enumerate(columns):
        ax = plt.subplot(n_rows, n_cols, idx+1)
        sns.kdeplot(data[col], ax=ax, fill=True)
        # Boolean, not the string "True" (any non-empty string is truthy, so
        # even "False" would have switched the grid on).
        ax.grid(True)
        ax.set_xlabel(col)
        ax.set_ylabel("Density")

    plt.subplots_adjust(wspace=0.2, hspace=0.2)


In [None]:
# Skip the first two columns (presumably the id columns - confirm against
# final_frame.columns) and plot the rest.
columns = final_frame.columns[2:]

# plot_me lays its panels out on a 5 x 6 grid, so feed it 30 columns at a time.
# This loop replaces ten copy-pasted cells that differed only in the slice.
PLOTS_PER_FIGURE = 30

for start in range(0, len(columns), PLOTS_PER_FIGURE):
    plot_me(final_frame.loc[:, columns[start:start + PLOTS_PER_FIGURE]])
    # Render the figure now so that only one large figure is open at a time.
    plt.show()

**Let's see if there are some features which are correlated with our target variable.**

In [None]:
# Lets check which features are correlated to the target variable

# Pearson correlation coefficient between the target and every column of
# final_frame, in column order ([0, 1] is the off-diagonal entry of the
# 2 x 2 correlation matrix).
correlation = [
    np.corrcoef(target, final_frame[col])[0, 1]
    for col in tqdm(final_frame.columns)
]

It turns out that the target variable is not strongly linearly correlated with any of the given features.

The plot below shows that the maximum correlation only reaches about 0.05.

We will add more EDA to this section later.
**But let's start with the learning part!**

**Deep Learning**

![tf2](https://www.marketing-branding.com/wp-content/uploads/2020/05/tensorflow-beneficios.jpg)

In [None]:
!nvidia-smi

**We need to install right tensorflow version according to CUDA requirements**

In [None]:
!pip uninstall tensorflow -y

In [None]:
!pip install tensorflow-gpu==2.4.0

In [None]:
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
# To ignore some of the warnings

***Lets Split the dataset into train and validation set.***

In [None]:
def split_set_index(data, size, seed=None):
    """Randomly split the positions ``0 .. len(data) - 1`` into two index sets.

    Parameters
    ----------
    data : sized object
        Only ``len(data)`` is used.
    size : float
        Fraction (between 0 and 1) of the positions that go into the first set.
    seed : int, optional
        Operation-level seed forwarded to ``tf.random.shuffle`` so the split
        can be reproduced; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (tf.Tensor, tf.Tensor)
        Shuffled train indices and the remaining validation indices.

    Raises
    ------
    ValueError
        If ``size`` is outside ``[0, 1]``.
    """
    if not 0 <= size <= 1:
        raise ValueError(f"size must be between 0 and 1, got {size!r}")

    n_samples = len(data)
    train_size = int(n_samples * size)
    index = tf.random.shuffle(tf.range(n_samples), seed=seed)
    return index[:train_size], index[train_size:]

In [None]:
# split_set_index only needs len(data), so pass the frame itself:
# `final_frame.values` would first materialise the whole frame as one array.
train_idx, val_idx = split_set_index(final_frame, size=0.8)

Dataloader : There are two different ways for generating the data. Both are shown.

1. **Keras Generator Sequence**
2. **Tensorflow Data API**

Lets first create a **Keras Generator** which generates batch of data

To learn more about Keras data generators, refer [here](https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly)

In [None]:
class Keras_Dataloader(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves (features, target) batches by row position.

    Parameters
    ----------
    data : array-like of int
        Row positions (into ``features`` / ``labels``) this loader may serve.
    batchsize : int, default 16
        Number of rows per batch; a trailing incomplete batch is dropped.
    features : pandas.DataFrame, optional
        Feature frame indexed with ``.iloc``; defaults to the notebook-level
        ``final_frame``.
    labels : numpy.ndarray, optional
        Target array; defaults to the notebook-level ``target``.
    """

    def __init__(self, data, batchsize = 16, features = None, labels = None):
        super().__init__()
        # Own NumPy copy of the row positions: batches can then be cut with a
        # single fancy-index instead of building a Python list of scalar
        # tensors for every batch, and the caller's array is never mutated.
        self.data = np.array(data)
        self.index = np.arange(len(self.data))
        self.batchsize = batchsize
        self.features = final_frame if features is None else features
        self.labels = target if labels is None else labels

    def on_epoch_end(self):              # Always called at end of each epoch. Here we simply shuffle the data
        self.data = np.random.permutation(self.data)

    def __len__(self):                    # Returns the number of (full) batches per epoch
        return len(self.data) // self.batchsize

    def __getitem__(self, idx):           # Returns the elements of batch number `idx`.
        index = self.index[idx * self.batchsize : self.batchsize * (idx + 1)]
        return self.return_elements(self.data[index])

    def return_elements(self, idx):            # Helper function
        """Return ``(X, Y)`` for the row positions ``idx`` as float32 / float16 arrays."""
        x_batch = self.features.iloc[idx, :].values
        y_batch = self.labels[idx]

        return np.asarray(x_batch, dtype=np.float32), np.asarray(y_batch, dtype=np.float16)
        

Now lets see how to build the same thing using **Tensorflow Data API**

In [None]:
class Dataloader:
    """Build ``tf.data`` pipelines that turn row positions into (features, target) pairs.

    The rows are looked up in the notebook-level ``final_frame`` and ``target``.

    Parameters
    ----------
    train, val : array-like of int
        Row positions of the training and validation samples.
    batchsize : int, default 16
        Number of samples per batch.
    buffersize : int, default 8
        Size of the shuffle buffer.
    """

    def __init__(self, train, val,batchsize = 16, buffersize = 8):
        self.batchsize = batchsize
        self.buffersize = buffersize
        self.train = tf.data.Dataset.from_tensor_slices(train)
        self.val = tf.data.Dataset.from_tensor_slices(val)

    def weiter_process(self, instance):
        """Eager helper: look up the feature row and target for one row position."""
        idx = instance.numpy()
        instance = final_frame.iloc[idx].values
        label = target[idx]

        # Any other preprocessing here if needs to be done

        return tf.cast(instance, dtype=tf.float32), tf.cast(label, dtype=tf.float16)

    def process(self, instance):  # Since eager tensor is passed to work on it we call tf.py_function and work on instance
        instance, label = tf.py_function(self.weiter_process, [instance],[tf.float32, tf.float16])
        # tf.py_function returns tensors without static shape information;
        # restore it so downstream batching and Keras layers know the feature
        # dimension.
        instance.set_shape([final_frame.shape[1]])
        label.set_shape([])
        return instance, label

    def create_loader(self, dataset):
        """Map, cache, shuffle, batch and prefetch ``dataset``."""
        # Note: map sends every element through `process` first; the remaining
        # steps operate on its output.
        dataset = dataset.map(self.process, num_parallel_calls=tf.data.AUTOTUNE)

        # then we cache the data, shuffle it and batch it
        dataset = dataset.cache().shuffle(self.buffersize).batch(self.batchsize).repeat(1)

        # prefetching means during the process start to prepare the next batch
        dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
        return dataset

    def return_loaders(self):
        """Return ``(train_loader, val_loader)`` built with ``create_loader``."""
        train_loader = self.create_loader(self.train)
        val_loader = self.create_loader(self.val)
        return train_loader, val_loader
        

In [None]:
# Training configuration.  Uncomment BUFFERSIZE if you want to use the
# tensorflow data api loader.

BATCHSIZE, EPOCHS = 32, 10
#BUFFERSIZE = 8

**For Keras Generator**: Lets use Keras generator API for it

In [None]:
gc.collect()

# One Keras generator per split, both using the configured batch size.
train_loader, val_loader = (
    Keras_Dataloader(data=row_ids, batchsize=BATCHSIZE)
    for row_ids in (train_idx, val_idx)
)

**For Tensorflow DATA API**

In [None]:
"""gc.collect()
dataloader = Dataloader(train = train_idx, val = val_idx, batchsize=BATCHSIZE, buffersize=BUFFERSIZE)
train_loader_tf, val_loader_tf = dataloader.return_loaders()"""

In [None]:
# Sanity check: print the shapes of the first batch served by the Keras generator.
for X,Y in train_loader:
    print(X.shape)
    print(Y.shape)
    break

# For the tensorflow data api loader use the snippet below instead.  It is
# kept as comments (a bare string would be echoed as the cell output) and
# iterates train_loader_tf, the tf.data loader - the Keras generator has no
# .take().
# for X, Y in train_loader_tf.take(1):
#     print(X.shape)
#     print(Y.shape)

**Let's define a plain Sequential model. The code is self-explanatory.**

In [None]:
# Lets define a Sequential Model

# Shape of a single sample: one value per column of final_frame.
input_shape = final_frame.iloc[0].values.shape

# Fully connected regression network.  Every hidden stage is
# Dense(relu) -> BatchNormalization -> Dropout; the final Dense(1) emits the
# prediction.  (A `model.compile(...)` call that had been pasted into the
# middle of this list was removed: it was a syntax error and referenced
# `model` before it existed.  The model is compiled in its own cell.)
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape = input_shape),
    tf.keras.layers.Dropout(rate = 0.8),
    tf.keras.layers.Dense(400, activation=tf.keras.activations.relu),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate = 0.5),
    tf.keras.layers.Dense(500, activation=tf.keras.activations.relu),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate = 0.5),
    tf.keras.layers.Dense(750, activation=tf.keras.activations.relu),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate = 0.5),
    tf.keras.layers.Dense(500, activation=tf.keras.activations.relu),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate = 0.5),
    tf.keras.layers.Dense(400, activation=tf.keras.activations.relu),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate = 0.5),
    tf.keras.layers.Dense(128, activation=tf.keras.activations.relu),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate = 0.5),
    tf.keras.layers.Dense(8, activation=tf.keras.activations.relu),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate = 0.3),
    tf.keras.layers.Dense(1)
])

In [None]:
# Render the layer graph of `model`, annotated with dtypes, shapes and layer names.
tf.keras.utils.plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True)

**Defining Callbacks.**

To learn more refer [here](https://www.tensorflow.org/guide/keras/custom_callback)

Callbacks: There are some callbacks that you can use directly like Earlystop and Checkpoints. But tensorflow also have the option to define custom callbacks which I used here for Onecycle,Exponential Schedule and ResultCallback. How to define custom callbacks ? Ok let me break it down.

For creating custom callback you need to subclass **tf.keras.callbacks.Callback** class.
Now you have predefined function like

`on_(train|test|predict)_begin(self, logs=None)`, `on_(train|test|predict)_end(self, logs=None)`, `on_(train|test|predict)_batch_begin(self, batch, logs=None)`, etc.

Each of these functions is called at the beginning or end of a batch, an epoch, or a whole train/test/predict run, depending on its name. All we have to do is override the ones we need.

In [None]:
K = tf.keras.backend
class Track_Progress(tf.keras.callbacks.Callback):
    """Callback that records loss, validation loss, learning rate and epoch number.

    After training the values are available as the lists ``loss``,
    ``val_loss``, ``lr`` and ``epoch`` (one entry per finished epoch).
    """

    def __init__(self,**kwargs):
        super().__init__(**kwargs)
        self.loss = []
        self.val_loss = []
        self.epoch = []
        self.lr = []

    def on_epoch_end(self, epoch, logs=None):
        # `logs` may be None, and "val_loss" is absent when training runs
        # without validation data; record None in that case so all four lists
        # stay the same length.
        logs = logs or {}
        self.loss.append(logs.get("loss"))
        self.val_loss.append(logs.get("val_loss"))
        self.lr.append(K.get_value(self.model.optimizer.learning_rate))
        self.epoch.append(epoch)

In [None]:
# Lets define some callbacks
def give_id(path):
    """Return ``path`` joined with a timestamped run name.

    The name has the form ``run_YYYY_MM_DD_HH_MM_SS`` (local time).
    """
    import time
    # %d is the day of the month.  The previous %D expands to "mm/dd/yy",
    # whose slashes turned the run name into nested directories.
    id_ = time.strftime("run_%Y_%m_%d_%H_%M_%S")
    return os.path.join(path, id_)


# Write TensorBoard runs below ./logs so that `%tensorboard --logdir logs` finds them.
logdir = give_id(os.path.join(os.curdir, "logs"))
tensorboard = tf.keras.callbacks.TensorBoard(log_dir = logdir)
earlystop = tf.keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

# Separate files for the best and the most recent weights.  Previously both
# checkpoints pointed at the current directory itself, so they would have
# shared one target.
modelcheckpoint_best = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(os.curdir, "best.weights.h5"), save_best_only=True,save_weights_only=True)
modelcheckpoint_last = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(os.curdir, "last.weights.h5"), save_best_only=False,save_weights_only=True)
tracker = Track_Progress()

# modelcheckpoint_last is now registered (modelcheckpoint_best used to be listed twice).
Callbacks = [tensorboard, earlystop, modelcheckpoint_best, modelcheckpoint_last, tracker]

In [None]:
# Configure training: mean-squared-error loss, Adam optimizer with its default settings.
model.compile(loss=tf.keras.losses.mean_squared_error, optimizer=tf.keras.optimizers.Adam())

In [None]:
# Model.fit accepts Sequence objects directly; fit_generator is deprecated.
# workers=-1 is not a documented value, so it and the related multiprocessing
# arguments are dropped and Keras' defaults are used.
# shuffle=False: batch order is left to the loader.
history = model.fit(
    train_loader,
    validation_data=val_loader,
    callbacks=Callbacks,
    epochs=EPOCHS,
    shuffle=False,
    verbose=1,
)

In [None]:
"""model.fit(train_loader, epochs=EPOCHS, validation_data=val_loader, callbacks=[tensorboard, earlystop, modelcheckpoint_best, modelcheckpoint_best, tracker],
         verbose=1)"""

In [None]:
# Per-epoch histories recorded by the Track_Progress callback.
val_loss, train_loss, epoch = tracker.val_loss, tracker.loss, tracker.epoch

# Validation loss over the epochs.
plt.figure(figsize=(8, 8))
plt.plot(epoch, val_loss)
plt.xlabel("Epoch")
plt.ylabel("Validation Loss")




In [None]:
# Training loss over the epochs (uses `epoch` and `train_loss` from the cell above).
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(epoch, train_loss)
ax.set_xlabel("Epoch")
ax.set_ylabel("Training Loss")

In [None]:
# Lets train a bit more using a smaller learning rate

# Re-compiling installs a fresh Adam optimizer with the lower learning rate.
model.compile(loss=tf.keras.losses.mean_squared_error, optimizer=tf.keras.optimizers.Adam(learning_rate = 1e-4))

# Model.fit accepts Sequence objects directly; fit_generator is deprecated.
# The undocumented workers=-1 and the related multiprocessing arguments are
# dropped so Keras' defaults are used.
history_finetune = model.fit(
    train_loader,
    validation_data=val_loader,
    callbacks=Callbacks,
    epochs=5,
    shuffle=False,
    verbose=1,
)

In [None]:
# Persist the trained model (architecture + weights) as a single HDF5 file.
model.save("trained_model.h5")

import json

# Additionally store the architecture alone as JSON text.
model_json = model.to_json()
graph_file = os.path.join(os.curdir,"model_graph.json")
with open(graph_file, "w") as json_file:
    json_file.write(model_json)

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs

![Work in Progress](https://t4.ftcdn.net/jpg/04/33/46/65/360_F_433466592_JpXOCCvbV3kMKTWo3jZKhGBnqEafnmfw.jpg)