<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/DNN_reusable_workflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#1. Import libraries - numpy, pandas, matplotlib.pyplot, tensorflow, keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.initializers import HeUniform
from datetime import datetime
from keras.callbacks import EarlyStopping

Weight initializer references:
https://www.tensorflow.org/api_docs/python/tf/keras/initializers/HeUniform

https://towardsdatascience.com/hyper-parameters-in-action-part-ii-weight-initializers-35aee1a28404


In [None]:
#2. Read the train and test CSV files from the Colab sample-data folder (/content/sample_data/)
data_filepath='/content/sample_data/'
train_filename='california_housing_train.csv'
test_filename='california_housing_test.csv'
#Both files are parsed the same way; the train file is read first, then the test file
train_df,test_df=(
  pd.read_csv(data_filepath+csv_name,engine='python')
  for csv_name in (train_filename,test_filename)
)

In [None]:
#4. Define preprocessing pipeline
#a. Separate the target column (median_house_value) from the feature columns.
#   The Y_* frames keep the target as a one-column DataFrame; the X_* frames hold every other column.
X_train=train_df.drop('median_house_value',axis='columns')
Y_train=train_df.loc[:,['median_house_value']]
X_test=test_df.drop('median_house_value',axis='columns')
Y_test=test_df.loc[:,['median_house_value']]

In [None]:
#Plot the training and validation loss curves of one fitted model
def plot_loss(history,model_name):
  """Plot the per-epoch 'loss' and 'val_loss' series stored in ``history.history``.

  Args:
    history: object whose ``history`` attribute is a dict holding the
      'loss' and 'val_loss' sequences (as returned by ``model.fit``).
    model_name: text appended to each legend label.
  """
  for curve in ('loss','val_loss'):
    plt.plot(history.history[curve], label=curve+'_'+model_name)
  plt.xlabel('Epoch')
  plt.ylabel('Error')
  plt.legend()
  plt.grid(True)

In [None]:
def evaluate_models(models,config,test_results,X_train,Y_train,X_test,Y_test,plot_lossYN=False):
  """Train each model, score it on the test data and log one summary row per model.

  Args:
    models: iterable of compiled models exposing ``fit``, ``evaluate`` and ``name``.
    config: dict providing 'validation_split', 'epochs', 'batch_size', 'loss' and 'optimizer'.
    test_results: DataFrame holding the rows of earlier runs; it is not modified.
    X_train, Y_train: inputs/targets passed to ``model.fit``.
    X_test, Y_test: inputs/targets passed to ``model.evaluate``.
    plot_lossYN: when True, plot the loss curves of every model with ``plot_loss``.

  Returns:
    DataFrame with the rows of ``test_results`` followed by one new row per model
    (``test_results`` itself when ``models`` is empty).
  """
  records=[]
  for model in models:
    #Train the model
    history=model.fit(X_train,Y_train,
                    validation_split=config['validation_split'],
                    epochs=config['epochs'],
                    batch_size=config['batch_size'],
                    verbose=0)

    #Epoch positions are 0-based indices into the recorded loss history
    records.append({
      'model_name':model.name,
      'batch_size':config['batch_size'],
      'loss':config['loss'],
      'optimizer':config['optimizer'],
      'total_epochs':config['epochs'],
      'min_train_loss':min(history.history['loss']),
      'epoch_at_min_train_loss':np.argmin(history.history['loss']),
      'min_val_loss':min(history.history['val_loss']),
      'epoch_at_min_val_loss':np.argmin(history.history['val_loss']),
      'test_eval_results': model.evaluate(X_test,Y_test),
      #"%m/%d/%y" is the portable spelling of the platform-specific "%D"
      'timestamp':datetime.now().strftime("%m/%d/%y %H:%M:%S")
    })

    if plot_lossYN:
      plot_loss(history,model.name)

  if not records:
    return test_results

  #DataFrame.append was removed in pandas 2.0; build the new rows once and concatenate once
  new_rows=pd.DataFrame(records)
  if len(test_results)==0:
    #Nothing to concatenate with: keep the declared column order and skip concat with an empty frame
    columns=list(test_results.columns)+[c for c in new_rows.columns if c not in test_results.columns]
    return new_rows.reindex(columns=columns)
  return pd.concat([test_results,new_rows],ignore_index=True)

In [None]:
#Select the best run(s) from the results table
def get_best_model(test_results):
  """Return the row(s) of ``test_results`` whose ``min_val_loss`` equals the column minimum.

  Ties are all returned, in their original order.
  """
  val_losses=test_results.min_val_loss
  is_best=val_losses==val_losses.min()
  return test_results[is_best]


In [None]:
def expand_hlayers(hlayers_arr):
  """Expand each (start, stop, step) spec into the explicit list ``list(range(start, stop, step))``.

  Args:
    hlayers_arr: sequence of indexable specs; items 0, 1 and 2 of each spec are
      used as the start, stop (exclusive) and step of a ``range``.

  Returns:
    List with one list of integers per spec, in the same order.
  """
  return [list(range(spec[0],spec[1],spec[2])) for spec in hlayers_arr]

In [None]:
def return_allcombinations(arr):
  """Build every combination (Cartesian product) that takes one value from each list in ``arr``.

  Args:
    arr: non-empty sequence of lists of integers, e.g. the output of ``expand_hlayers``.

  Returns:
    Integer array of shape (n_combinations, len(arr)); the first column varies
    slowest and the last column fastest. When ``arr`` holds a single list the
    result is 1-D with shape (n_combinations,).

  Raises:
    IndexError: if ``arr`` is empty.
  """
  #calculate dimensions of final array
  sizes=[len(values) for values in arr]
  cum_prod=np.cumprod(sizes)
  m=int(cum_prod[-1])  #total number of combinations

  if m==0:
    #at least one list is empty, so there is nothing to combine
    return np.empty((0,len(arr)) if len(arr)>1 else (0,),dtype=int)

  columns=[]
  for i,values in enumerate(arr):
    #each value of list i is held for `inner` consecutive rows, and the whole
    #pattern is repeated `outer` times (once per combination of the earlier lists)
    inner=m//int(cum_prod[i])
    outer=int(cum_prod[i])//len(values)
    columns.append(np.tile(np.repeat(np.array(values,dtype=int),inner),outer))

  if len(columns)==1:
    return columns[0]
  return np.vstack(columns).T

In [None]:
#Empty results table with one column per logged field; used to compare models across runs
test_results = pd.DataFrame(
  columns=pd.Index([
    'model_name',
    'batch_size',
    'loss',
    'optimizer',
    'total_epochs',
    'min_train_loss',
    'epoch_at_min_train_loss',
    'min_val_loss',
    'epoch_at_min_val_loss',
    'test_eval_results',
    'timestamp',
  ])
)

In [None]:
#Define reusable model params (shared by every model that is compiled and trained below)
config={
  #batch size is 30% of the training rows; builtin int replaces np.int, which NumPy 1.24 removed
  'batch_size':int(0.3*len(X_train)),
  #number of input feature columns
  'feature_size':X_train.shape[1],
  'loss':'mean_absolute_error',
  #`learning_rate` is the supported keyword; `lr` is a deprecated alias
  'optimizer':optimizers.Adadelta(learning_rate=0.05),
  'epochs':1000,
  #fraction of the training data that model.fit holds out for validation
  'validation_split':0.25
}

In [None]:
#b. Normalize the features with the Keras Normalization preprocessing layer.
#Important: the layer is adapted on the TRAINING data only. The adapted layer is then applied
#as-is to the test data - never re-adapt it on the test set.
normalizer=preprocessing.Normalization()
normalizer.adapt(X_train.to_numpy())

In [None]:
#Shared building blocks referenced by later cells.
#NOTE(review): `input` shadows the Python builtin of the same name and is not referenced by the
#other visible cells - confirm whether it is still needed.
input=layers.Dense(units=8,activation='relu',input_shape=(None,8))
output=layers.Dense(units=1)
#He-uniform kernel initializer with a fixed seed
initializer = HeUniform(seed=1760)

In [None]:
#Folder under which every trained model is saved (one sub-folder per model name)
project_name='CA_houseprices_regression'
model_path=''.join(('/content/drive/My Drive/saved models/',project_name))

In [None]:
#Grid-search specification: one tuple per search, laid out as
#  (first layer of the model, [(start, stop, step) per hidden layer], output layer).
#Each (start, stop, step) is expanded with range(), so `stop` is exclusive.
#Alternative, wider grid kept for reference: [(5,8,1),(2,4,1),(1,4,1)]
model_config=[
  (
    normalizer,
    [(5,7,1),(2,3,1),(1,2,1)],
    output,
  ),
]

In [None]:
%%time
#Grid search: build, train, evaluate and save one model per combination of hidden-layer sizes
for i in range(len(model_config)):
  #Expand the (start, stop, step) specs into explicit lists of layer sizes
  arr=expand_hlayers(model_config[i][1])

  #One row per model, one column per hidden layer
  hlayer_arr=return_allcombinations(arr)
  if len(hlayer_arr.shape)==1:
    #a single hidden layer yields a 1-D array; make it a column so the indexing below is uniform
    hlayer_arr=hlayer_arr.reshape(hlayer_arr.shape[0],1)

  for j in range(hlayer_arr.shape[0]):
    tf.keras.backend.clear_session()
    #e.g. sizes [5, 2, 1] -> 'Dense5.2.1'
    model_name='Dense'+'.'.join(str(units) for units in hlayer_arr[j])
    #Define and initialize model
    model=keras.Sequential(name=model_name)
    #Add input layer
    model.add(model_config[i][0])
    #add hidden layers (builtin int replaces np.int, which NumPy 1.24 removed)
    for units in hlayer_arr[j]:
      model.add(layers.Dense(int(units),activation='relu',kernel_initializer=initializer))
    #Add output layer
    model.add(layers.Dense(1))
    #Give every model its own optimizer instance, rebuilt from the configured one, so that
    #optimizer state is not shared between the models of the search
    optimizer=config['optimizer']
    if hasattr(optimizer,'get_config'):
      optimizer=type(optimizer).from_config(optimizer.get_config())
    model.compile(loss=config['loss'],optimizer=optimizer)
    models=[model]
    test_results=evaluate_models(models,config,test_results,X_train,Y_train,X_test,Y_test)
    model.save(model_path+'/'+model.name+'/')

get_best_model(test_results)

INFO:tensorflow:Assets written to: /content/drive/My Drive/saved models/CA_houseprices_regression/Dense5.2.1/assets
INFO:tensorflow:Assets written to: /content/drive/My Drive/saved models/CA_houseprices_regression/Dense6.2.1/assets
CPU times: user 50.8 s, sys: 2.27 s, total: 53 s
Wall time: 44.3 s


In [None]:
#Display the row(s) of test_results with the lowest validation loss
get_best_model(test_results)

Unnamed: 0,model_name,batch_size,loss,optimizer,total_epochs,min_train_loss,epoch_at_min_train_loss,min_val_loss,epoch_at_min_val_loss,test_eval_results,timestamp
12,Dense6.3.3,3400,mean_absolute_error,<tensorflow.python.keras.optimizer_v2.adadelta...,1000,41736.664062,996,49702.117188,976,44194.230469,11/02/20 02:15:01


In [None]:
#Early-stopping callback: monitors val_loss with a patience of 100 epochs and
#restores the best weights when training stops.
#NOTE(review): it is not passed to any fit() call in the visible cells - confirm intended use.
callback_ES = keras.callbacks.EarlyStopping(
  monitor='val_loss',
  patience=100,
  restore_best_weights=True,
)

In [None]:
#Reload the saved 'Dense6.2.1' model from the model folder and print its layer summary
m_final = tf.keras.models.load_model(''.join((model_path,'/Dense6.2.1')))
m_final.summary()

Model: "Dense6.2.1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 8)                 17        
_________________________________________________________________
dense (Dense)                (None, 6)                 54        
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 14        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 3         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 2         
Total params: 90
Trainable params: 73
Non-trainable params: 17
_________________________________________________________________


In [None]:
#Build the final model by hand: normalizer -> Dense(5) -> Dense(2) -> Dense(3) -> Dense(1)
tf.keras.backend.clear_session()
m_final=keras.Sequential(name='final',
  layers=[
  normalizer,
  layers.Dense(5,activation='relu',kernel_initializer=initializer),
  layers.Dense(2,activation='relu',kernel_initializer=initializer),
  layers.Dense(3,activation='relu',kernel_initializer=initializer),
  layers.Dense(1)
])
#Compile with a fresh optimizer rebuilt from the configured one; the instance stored in
#config has already been used to train the grid-search models
final_optimizer=config['optimizer']
if hasattr(final_optimizer,'get_config'):
  final_optimizer=type(final_optimizer).from_config(final_optimizer.get_config())
m_final.compile(loss=config['loss'],optimizer=final_optimizer)


In [None]:
#Train and score the final model, adding its row to the results table
models=[m_final]
test_results=evaluate_models(
  models=models,
  config=config,
  test_results=test_results,
  X_train=X_train,
  Y_train=Y_train,
  X_test=X_test,
  Y_test=Y_test,
)


