In [None]:
import os
import sys
import time
import logging
import importlib

import numpy as np
import pandas as pd
np.set_printoptions(precision=2, linewidth=125)

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

from tensorflow.keras        import optimizers 
from tensorflow.keras.layers import LSTM, GRU

# Put the parent directory on the import path so the project-local helper
# modules below can be found.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Custom libraries
import ann_utils as ann
import gen_utils as gen

# Re-import the helpers so edits made to them are picked up without
# restarting the kernel.
for _helper_module in (ann, gen):
    importlib.reload(_helper_module)

# Output locations provided by the project helper (figures/results and models).
save_folder, model_folder = gen.create_model_and_figure_folders()

## Load the data

In [None]:
# Folder holding the competition files, relative to the notebook.
path = '../input/ventilator-pressure-prediction'

# Read the training table and keep a NumPy copy alongside the DataFrame.
train_data_df = pd.read_csv(path + "/train.csv")
train_data    = train_data_df.to_numpy()

# The rows are treated as consecutive groups of `sample_length`;
# report how many complete groups the file contains.
sample_length           = 80
total_number_of_samples = len(train_data) // sample_length
print("Total number of samples: ", total_number_of_samples)

train_data_df.head()

## Divide the data into training, validation and testing sets

In [None]:
# Fractions of the complete samples used for training and validation;
# everything after the validation block becomes the hold-out test split.
train_fraction = 0.8
valid_fraction = 0.1
sample_length  = 80

# Compute the split points in units of whole samples and convert back to rows.
# Splitting on raw row counts (int(len * fraction)) can place a boundary in the
# middle of a sample whenever the product is not a multiple of sample_length.
number_of_complete_samples = len(train_data) // sample_length
train_end_idx   = int(number_of_complete_samples * train_fraction) * sample_length
# NOTE: despite its name, this is the row where validation ENDS and the test
# split starts; the name is kept because other cells may refer to it.
valid_start_idx = int(number_of_complete_samples * valid_fraction) * sample_length + train_end_idx

# Divide the data into train, validation and test
train_x_y = train_data[:train_end_idx, :]
valid_x_y = train_data[train_end_idx:valid_start_idx, :]
test_x_y  = train_data[valid_start_idx:, :]

# Number of complete samples in each split
number_of_train_samples = len(train_x_y) // sample_length
number_of_valid_samples = len(valid_x_y) // sample_length
number_of_test_samples  = len(test_x_y)  // sample_length

print("Number of training samples: ",   number_of_train_samples)
print("Number of validation samples: ", number_of_valid_samples)
print("Number of testing samples:  ",   number_of_test_samples)
print("Total number of samples: ",      number_of_train_samples + number_of_valid_samples + number_of_test_samples)

## Create a Validation Timeseries Generator

In [None]:
importlib.reload(ann)

# Window length and batch size handed to the project's data generator.
number_of_timesteps = 80
train_batch_size    = 80

# Settings dictionary consumed by ann.Data_Generator (keys are read by name).
generator_parameters = dict([
    ('Sample Length',       sample_length),
    ('Number of Timesteps', number_of_timesteps),
    ('Data Batch Size',     train_batch_size),
    ('Padding',             True),
])

# NOTE(review): the second argument (2) is a fixed index into valid_x_y --
# presumably the sample to generate from; confirm against ann.Data_Generator.
validation_data = ann.Data_Generator(valid_x_y, 2, generator_parameters)

number_of_batches = len(validation_data)
print("Number of batches: ", number_of_batches)

# Uncomment to inspect the generated batches:
# for idx, (valid_x, valid_y) in enumerate(validation_data):
#     print(valid_x.shape)
#     print(idx)

## Create a GRU model

In [None]:
importlib.reload(ann)

# Candidate optimizers; only the one placed in my_model_parameters is used.
sgd = optimizers.SGD(
    learning_rate=0.05, decay=1e-4, momentum=0.7, nesterov=True)
rmsprop = optimizers.RMSprop(
    learning_rate=0.00001, rho=0.9, epsilon=None, decay=0.00001)
adam = optimizers.Adam(
    learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None,
    decay=0.0, amsgrad=False)
nadam = optimizers.Nadam(
    learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=None,
    schedule_decay=0.004)

# Architecture and training settings passed to the project's model factory.
# NOTE(review): cell_type is LSTM although the section heading and the factory
# name (create_GRU_model) both say GRU -- confirm which cell is intended.
my_model_parameters = dict(
    train_batch_size=train_batch_size,
    number_of_timesteps=number_of_timesteps,
    number_of_features=5,
    number_of_outputs=1,
    cell_type=LSTM,
    number_of_recurrent_layers=4,
    number_of_recurrent_units=256,
    number_of_FC_layers=3,
    number_of_FC_hidden_units=128,
    stateful_state=True,
    hidden_dropout_rate=0.25,
    output_dropout_rate=0.15,
    optimizer=sgd,
)

my_model = ann.create_GRU_model(my_model_parameters)

## Train the model

In [None]:
# Train the model one sample at a time: for every epoch, each training sample
# gets its own generator, its batches are fed through train_on_batch, and the
# model states are reset before the next sample. The model is saved after the
# last sample of each epoch and the logged losses are plotted at the end.
importlib.reload(ann)
importlib.reload(gen)

# NOTE(review): these two zero arrays are not referenced anywhere else in this
# cell -- confirm whether they are still needed.
train_x_pad = np.zeros((number_of_timesteps, number_of_timesteps + sample_length))
train_y_pad = np.zeros(number_of_timesteps)

number_of_epochs  = 1

# Settings handed to the training logger.
# NOTE(review): number_of_batches was taken from len(validation_data) in the
# validation-generator cell -- confirm that is the value the logger expects.
my_train_parameters = {'number_of_epochs'        : number_of_epochs,
                       'number_of_train_samples' : number_of_train_samples,
                       'number_of_batches'       : number_of_batches,
                       'batch_size'              : train_batch_size
                      }

# Create a logger object for saving the loss data and calculate ETA
logger = gen.training_logger(my_train_parameters)

# Iterate epochs
for e_idx in range(number_of_epochs):

    # Iterate all the samples
    for s_idx in range(number_of_train_samples):

        # Get the current sample using s_idx and create a generator
        train_generator = ann.Data_Generator(train_x_y, s_idx , generator_parameters)

        # Reset the sample set timer
        logger.reset_mean_loss_arrays()
        logger.start_set_time()

        # Iterate all the batches and save the losses
        for g_idx, (batch_x, batch_y) in enumerate(train_generator):

            # Train on batch
            # (unpacking two values assumes the model reports the loss plus
            # exactly one metric -- TODO confirm against create_GRU_model)
            tr_loss, tr_acc = my_model.train_on_batch(batch_x, batch_y)

            # Update the loss arrays and print the progress
            logger.update_mean_loss_arrays(tr_loss, tr_acc, g_idx)
            # NOTE(review): this per-batch call and the per-sample call below
            # both pass set_flag=True -- confirm that is intended here.
            logger.print_training_progress(e_idx, s_idx, g_idx, set_flag=True)

        # g_idx still holds the index of the last batch here; it would be
        # unbound if the very first generator yielded no batches.
        logger.update_set_time()
        logger.print_training_progress(e_idx, s_idx, g_idx, set_flag=True)

        # After finishing a sample, reset the model states
        my_model.reset_states()

        # Once the last sample of the epoch is done, save the model
        # (the validation pass below is currently commented out)
        if (s_idx+1) % number_of_train_samples == 0:
            model_save_dir  = model_folder
            model_save_addr = os.path.join(model_save_dir, "GRU_model_2")

            my_model.save(model_save_addr)

#             for valid_x, valid_y in validation_data:
#                 validation_loss, _ = my_model.test_on_batch(valid_x, valid_y)
#                 logger.update_validation_loss_array(validation_loss)


# Plot the losses
plt.plot(logger.loss_array)

In [None]:
# Save the trained model into the model folder under a fixed name.
model_save_dir  = model_folder
model_save_addr = os.path.join(model_folder, "GRU_model_2")

my_model.save(model_save_addr)

## Load the test dataset

In [None]:
# Folder holding the competition files, relative to the notebook.
test_path = '../input/ventilator-pressure-prediction'

# Read the test table and keep a NumPy copy alongside the DataFrame.
test_data_df = pd.read_csv(test_path + "/test.csv")
test_data    = test_data_df.to_numpy()

# From here on, total_number_of_samples refers to the TEST set:
# the number of complete `sample_length`-row groups in test.csv.
sample_length           = 80
total_number_of_samples = len(test_data) // sample_length
print("Total number of samples: ", total_number_of_samples)

test_data_df.head()

## Do the prediction for the test dataset

In [None]:
# Predictions are buffered in pred_array and written to disk in chunks.
# Size the buffer directly from the chunk size in samples so that it is always
# a whole number of samples and agrees with the save interval used by the
# prediction loop, instead of dividing the row count by a hard-coded file count.
samples_per_file = 100
array_len  = samples_per_file * sample_length
pred_array = np.zeros(array_len)


# Give the generator parameters
number_of_timesteps = 80
test_batch_size     = train_batch_size
number_of_batches   = 1

# Settings dictionary consumed by ann.Data_Generator for the test samples
generator_parameters = {'Sample Length'           : sample_length,
                        'Number of Timesteps'     : number_of_timesteps,
                        'Data Batch Size'         : test_batch_size,
                        'Padding'                 : True
                       }

# Display the number of predictions stored per result file
array_len

In [None]:
# Predict every test sample and write the predictions to disk in chunks of
# `array_len` rows (one CSV per chunk, columns: id, prediction).
importlib.reload(ann)

# Number of whole samples that fit into one result file
samples_per_file = array_len // sample_length

save_idx   = 0   # samples stored in pred_array since the last file was written
name_idx   = 0   # index of the next result file; raise it to resume a partial run
total_time = 0

# First sample belonging to result file `name_idx`
test_start_idx = (name_idx * array_len) // sample_length

# Valid sample indices are 0 .. total_number_of_samples - 1
for test_idx in range(test_start_idx, total_number_of_samples):

    sample_start = time.time()

    # Start every sample from clean model states
    my_model.reset_states()
    test_sample        = ann.Data_Generator(test_data, test_idx, generator_parameters)
    sample_predictions = my_model.predict(test_sample,  batch_size = test_batch_size,
                                                        steps      = 1,
                                                        verbose    = 0)
    # Store this sample's predictions in its slot of the chunk buffer
    pred_array[(save_idx*sample_length):(save_idx+1)*sample_length]    = sample_predictions[:, 0]
    save_idx += 1

    sample_time = time.time() - sample_start
    total_time  = total_time + sample_time

    # Average over the samples processed in THIS run so the estimate stays
    # correct when the loop is resumed from a later file index
    samples_done = test_idx - test_start_idx + 1
    mean_time    = total_time / samples_done
    eta          = int(mean_time*(total_number_of_samples - test_idx - 1))

    sample_str = "Sample: ["  + str(test_idx+1) + "/" + str(total_number_of_samples) + "]. "
    time_str   = "Time passed : {0:.1f}. ".format(total_time)
    eta_str    = "ETA: {0:.0f}. ".format(eta)

    progress_str = sample_str + time_str + eta_str

    print(progress_str, end='\r')

    # Write the file only once the buffer is completely filled. save_idx has
    # already been incremented above, so it equals the number of stored
    # samples; testing (save_idx+1) would flush one sample too early and
    # misalign every prediction with its id.
    if save_idx == samples_per_file:

        # Create the ID array (ids are 1-based and contiguous across files)
        id_idx_str = (name_idx*array_len) + 1
        id_idx_end = ((name_idx+1)*array_len) + 1
        id_array   = np.arange(id_idx_str, id_idx_end)

        # Combine with the predictions
        output_array = np.vstack((id_array, pred_array)).T

        # Create the save name string and save the array
        save_name  = "model_2_results_" + str(name_idx) + ".csv"
        save_dir   = os.path.join(save_folder, save_name)
        np.savetxt(save_dir, output_array, delimiter=",")

        del output_array

        name_idx +=  1
        save_idx  =  0

# NOTE: a trailing partial chunk (fewer than samples_per_file samples) is not
# written; this only matters if the sample count is not a multiple of the
# chunk size.

In [None]:
# Collect the per-chunk result files of the most recent run and load them in
# chunk order, so that position i of result_list holds chunk i.
result_dir       = os.path.join(save_folder, "../")
# os.walk yields directory names in arbitrary order; sort so that "[-1]" is
# deterministic.
# NOTE(review): this assumes the run folders sort chronologically by name --
# confirm against gen.create_model_and_figure_folders.
save_folder_list = sorted(next(os.walk(result_dir))[1])

desired_save_folder = os.path.join(result_dir, save_folder_list[-1])
print(desired_save_folder)


def _chunk_index(file_name):
    """Return N for a file named 'model_2_results_N.csv', otherwise None."""
    stem, extension = os.path.splitext(file_name)
    prefix, _, suffix = stem.rpartition("_")
    if extension != ".csv" or prefix != "model_2_results" or not suffix.isdigit():
        return None
    return int(suffix)


model_list = []
result_list =[]

print(next(os.walk(desired_save_folder))[1])

# Pair every result file with its numeric chunk index and skip unrelated files
chunk_files = []
for file_name in next(os.walk(desired_save_folder))[2]:
    chunk_idx = _chunk_index(file_name)
    if chunk_idx is not None:
        chunk_files.append((chunk_idx, file_name))

# Load in numeric order: directory order is arbitrary and a plain string sort
# would put "..._10.csv" before "..._2.csv", but the chunks are later placed
# into the final array by their position in result_list.
for chunk_idx, model_name in sorted(chunk_files):
    print(model_name)
    model_addr = os.path.join(desired_save_folder, model_name)
    model_list.append(model_addr)

    result_dummy = np.loadtxt(model_addr, delimiter=",")
    result_list.append(result_dummy)

In [None]:
# Two-column buffer (one row per test row) that receives the loaded chunks.
result_array = np.zeros((test_data.shape[0], 2))

# Chunk i occupies rows [i*array_len, (i+1)*array_len).
for idx in range(len(result_list)):
    result = result_list[idx]

    result_idx_str = idx * array_len
    result_idx_end = result_idx_str + array_len

    result_array[result_idx_str:result_idx_end, :] = result

len(result_array)

result_array

In [None]:
# Write the assembled predictions to the Kaggle working directory as a CSV
# with the columns 'id' and 'pressure'.
save_name   = "GRU_model_2_results_all.csv"
main_folder = "/kaggle/working/"
save_dir    = os.path.join(main_folder, save_name)

print(save_dir)

# Build the frame here so the cell runs on a fresh kernel (previously `df` was
# only defined in a commented-out line).
df = pd.DataFrame(result_array, columns=['id', 'pressure'])

# Only the id column is integral; casting the whole frame to int would
# truncate the predicted pressures.
df = df.astype({'id': int})

# Save the dataframe as a csv file
df.to_csv(save_dir, index = False)

In [None]:
import os

# Scratch cell: look at the top level of the Kaggle working directory.
# The first os.walk entry is the tuple (dirpath, dirnames, filenames).
_working_dir_walk = os.walk('/kaggle/working')
a = next(_working_dir_walk)
a