# HyperParameters

In [None]:
# Module-level hyperparameters read by the training code below. (A `global`
# statement is a no-op at module scope, so the names are simply assigned.)

# Optimisation schedule.
training_epochs = 150
early_stopping_epochs = 25
batch_size = 1024
learning_rate = 1e-3
decay_rate = 0.94

# Regularisation.
dropout = 0.1
feature_dropout = 0.0
output_regularization = 0
l2_regularization = 0

# Model architecture.
activation = 'exu'
shallow = True
use_dnn = False
num_basis_functions = 512
units_multiplier = 6
n_models = 1

# Task, data and splits.
dataset_name = 'CMS_Muon_Momentum'
regression = True
cross_val = False
_N_FOLDS = 5
fold_num = 1
num_splits = 2
data_split = 1

# Reproducibility, logging and checkpoints.
tf_seed = 242
debug = False
logdir = '/kaggle/working/logs'
max_checkpoints_to_keep = 1
save_checkpoint_every_n_epochs = 1

# Imports

In [None]:
!pip uninstall tensorflow -y
!pip -qq install tensorflow==1.15

In [None]:
from typing import Union, List, Tuple, Iterator, Dict
import functools
from typing import Union, List, Optional, Tuple, Callable, Dict
from sklearn import metrics as sk_metrics
import tensorflow.compat.v1 as tf
import operator
import os
import gzip
import os.path as osp
import tarfile

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
# Short alias for TensorFlow's filesystem API (used for checkpoint file
# removal and copying further down).
gfile = tf.io.gfile

# NOTE(review): eager execution is switched on globally here, while `training`
# builds its ops inside an explicit `tf.Graph()` and runs them in a session --
# confirm that eager mode is actually required.
tf.enable_eager_execution()

# Model.py

In [None]:
# Type alias for model inputs/targets: a NumPy array or a TensorFlow tensor.
TfInput = Union[np.ndarray, tf.Tensor]


def exu(x, weight, bias):
  """Applies an ExU hidden unit: `exp(weight) * (x - bias)`."""
  scale = tf.exp(weight)
  shifted = x - bias
  return scale * shifted


# Activation Functions
def relu(x, weight, bias):
  """Standard hidden unit: ReLU applied to `weight * (x - bias)`."""
  pre_activation = weight * (x - bias)
  return tf.nn.relu(pre_activation)


def relu_n(x, n = 1):
  """Clamps `x` to the interval [0, n], i.e. a ReLU capped at `n`."""
  lower, upper = 0, n
  return tf.clip_by_value(x, lower, upper)

In [None]:
class ActivationLayer(tf.keras.layers.Layer):
  """Custom activation Layer to support ExU hidden units.

  The layer owns two weights, `beta` (per-unit weight) and `c` (per-unit
  center), and applies the configured activation to `(x, beta, c)`.
  """

  def __init__(self,
               num_units,
               name = None,
               activation = 'exu',
               trainable = True):
    """Initializes ActivationLayer hyperparameters.
    Args:
      num_units: Number of hidden units in the layer.
      name: The name of the layer.
      activation: Activation to use. `'exu'` (the default) uses ExU units
        followed by a ReLU clipped at 1, while `'relu'` uses standard hidden
        units with ReLU activation.
      trainable: Whether the layer parameters are trainable or not.
    Raises:
      ValueError: If `activation` is neither `'relu'` nor `'exu'`.
    """
    super(ActivationLayer, self).__init__(trainable=trainable, name=name)
    self.num_units = num_units
    self._trainable = trainable
    if activation == 'relu':
      self._activation = relu
      self._beta_initializer = 'glorot_uniform'
    elif activation == 'exu':
      # ExU output is passed through `relu_n` with its default cap of 1.
      self._activation = lambda x, weight, bias: relu_n(exu(x, weight, bias))
      # ExU weights are exponentiated in `exu`, hence the dedicated initializer.
      self._beta_initializer = tf.initializers.truncated_normal(
          mean=4.0, stddev=0.5)
    else:
      raise ValueError('{} is not a valid activation'.format(activation))

  def build(self, input_shape):
    """Builds the layer weight and bias parameters."""
    # Weight of shape [last input dimension, num_units].
    self._beta = self.add_weight(
        name='beta',
        shape=[input_shape[-1], self.num_units],
        initializer=self._beta_initializer,
        trainable=self._trainable)
    # Per-unit center of shape [1, num_units]; tiled over the batch in `call`.
    self._c = self.add_weight(
        name='c',
        shape=[1, self.num_units],
        initializer=tf.initializers.truncated_normal(stddev=0.5),
        trainable=self._trainable)
    super(ActivationLayer, self).build(input_shape)

  @tf.function
  def call(self, x):
    """Computes the output activations."""
    # Repeat the centers along the first axis so there is one row per row of x.
    center = tf.tile(self._c, [tf.shape(x)[0], 1])
    out = self._activation(x, self._beta, center)
    return out

In [None]:
class FeatureNN(tf.keras.layers.Layer):
  """Neural Network model for each individual feature.
  Attributes:
    hidden_layers: A list containing hidden layers. The first layer is an
      `ActivationLayer` containing `num_units` neurons with specified
      `activation`. If `shallow` is False, then it additionally contains 2
      tf.keras.layers.Dense ReLU layers with 64, 32 hidden units respectively.
    linear: Fully connected layer with a single output unit and no bias.
  """

  def __init__(self,
               num_units,
               dropout = 0.5,
               trainable = True,
               shallow = True,
               feature_num = 0,
               name_scope = 'model',
               activation = 'exu'):
    """Initializes FeatureNN hyperparameters.
    Args:
      num_units: Number of hidden units in first hidden layer.
      dropout: Coefficient for dropout regularization.
      trainable: Whether the FeatureNN parameters are trainable or not.
      shallow: If True, then a shallow network with a single hidden layer is
        created, otherwise, a network with 3 hidden layers is created.
      feature_num: Feature Index used for naming the hidden layers.
      name_scope: TF name scope str for the model.
      activation: Activation and type of hidden unit(ExUs/Standard) used in the
        first hidden layer.
    """
    super(FeatureNN, self).__init__()
    self._num_units = num_units
    self._dropout = dropout
    self._trainable = trainable
    self._tf_name_scope = name_scope
    self._feature_num = feature_num
    self._shallow = shallow
    self._activation = activation

  def build(self, input_shape):
    """Builds the feature net layers."""
    # The first hidden layer is always the custom ExU/ReLU activation layer.
    self.hidden_layers = [
        ActivationLayer(
            self._num_units,
            trainable=self._trainable,
            activation=self._activation,
            name='activation_layer_{}'.format(self._feature_num))
    ]
    if not self._shallow:
      # Deep variant: append two Dense ReLU layers (64 then 32 units).
      self._h1 = tf.keras.layers.Dense(
          64,
          activation='relu',
          use_bias=True,
          trainable=self._trainable,
          name='h1_{}'.format(self._feature_num),
          kernel_initializer='glorot_uniform')
      self._h2 = tf.keras.layers.Dense(
          32,
          activation='relu',
          use_bias=True,
          trainable=self._trainable,
          name='h2_{}'.format(self._feature_num),
          kernel_initializer='glorot_uniform')
      self.hidden_layers += [self._h1, self._h2]
    # Final projection down to one value per example.
    self.linear = tf.keras.layers.Dense(
        1,
        use_bias=False,
        trainable=self._trainable,
        name='dense_{}'.format(self._feature_num),
        kernel_initializer='glorot_uniform')
    super(FeatureNN, self).build(input_shape)

  @tf.function
  def call(self, x, training):
    """Computes FeatureNN output with either evaluation or training mode.

    Args:
      x: Input for this feature net.
      training: Predicate passed to `tf.cond` to pick the dropout rate
        (`dropout` when true, 0.0 otherwise).
        NOTE(review): presumably this has to be a boolean tensor rather than a
        Python bool -- confirm against the `tf.cond` contract.

    Returns:
      The output of the final linear layer with axis 1 squeezed out.
    """
    with tf.name_scope(self._tf_name_scope):
      # Dropout is applied after every hidden layer, including the first.
      for l in self.hidden_layers:
        x = tf.nn.dropout(
            l(x), rate=tf.cond(training, lambda: self._dropout, lambda: 0.0))
      x = tf.squeeze(self.linear(x), axis=1)
    return x

In [None]:
class NAM(tf.keras.Model):
  """Neural additive model.

  Each input feature is fed to its own `FeatureNN`. The per-feature outputs are
  stacked, optionally dropped out feature-wise, summed, shifted by a learned
  scalar bias and finally passed through a sigmoid.

  Attributes:
    feature_nns: List of FeatureNN, one per input feature.
  """

  def __init__(self,
               num_inputs,
               num_units,
               trainable = True,
               shallow = True,
               feature_dropout = 0.0,
               dropout = 0.0,
               **kwargs):
    """Initializes NAM hyperparameters.
    Args:
      num_inputs: Number of feature inputs in input data.
      num_units: Number of hidden units in first layer of each feature net.
        Either a single integer shared by every feature net, or a list/tuple
        with one entry per input feature.
      trainable: Whether the NAM parameters are trainable or not.
      shallow: If True, then shallow feature nets with a single hidden layer are
        created, otherwise, feature nets with 3 hidden layers are created.
      feature_dropout: Coefficient for dropping out entire Feature NNs.
      dropout: Coefficient for dropout within each Feature NNs.
      **kwargs: Arbitrary keyword arguments. Used for passing the `activation`
        function as well as the `name_scope`.
    Raises:
      ValueError: If `num_units` is a sequence whose length differs from
        `num_inputs`.
      TypeError: If `num_units` is neither an integer nor a list/tuple.
    """
    super(NAM, self).__init__()
    self._num_inputs = num_inputs
    if isinstance(num_units, (list, tuple)):
      if len(num_units) != num_inputs:
        raise ValueError(
            'num_units has {} entries but num_inputs is {}'.format(
                len(num_units), num_inputs))
      self._num_units = list(num_units)
    elif isinstance(num_units, (int, np.integer)):
      self._num_units = [num_units for _ in range(self._num_inputs)]
    else:
      # Previously this case fell through silently and only failed later, in
      # `build`, with an AttributeError on `_num_units`.
      raise TypeError(
          'num_units must be an int or a list of ints, got {}'.format(
              type(num_units).__name__))
    self._trainable = trainable
    self._shallow = shallow
    self._feature_dropout = feature_dropout
    self._dropout = dropout
    self._kwargs = kwargs

  def build(self, input_shape):
    """Builds the FeatureNNs on the first call."""
    self.feature_nns = [None] * self._num_inputs
    for i in range(self._num_inputs):
      self.feature_nns[i] = FeatureNN(
          num_units=self._num_units[i],
          dropout=self._dropout,
          trainable=self._trainable,
          shallow=self._shallow,
          feature_num=i,
          **self._kwargs)
    self._bias = self.add_weight(
        name='bias',
        initializer=tf.keras.initializers.Zeros(),
        shape=(1,),
        trainable=self._trainable)
    # Boolean tensors handed to `tf.cond` in place of Python bools.
    self._true = tf.constant(True, dtype=tf.bool)
    self._false = tf.constant(False, dtype=tf.bool)

  def call(self, x, training = True):
    """Computes NAM output from the outputs of individual feature nets.

    Args:
      x: Input batch; it is split into `num_inputs` pieces along the last axis.
      training: Python truth value selecting training (dropout active) or
        evaluation mode.

    Returns:
      `sigmoid(sum of per-feature outputs + bias)`.
      NOTE(review): the result is already squashed by a sigmoid here, so any
      caller that applies its own sigmoid would squash twice -- confirm intent.
    """
    individual_outputs = self.calc_outputs(x, training=training)
    stacked_out = tf.stack(individual_outputs, axis=-1)
    training = self._true if training else self._false
    # Feature dropout: zero out whole per-feature contributions while training.
    dropout_out = tf.nn.dropout(
        stacked_out,
        rate=tf.cond(training, lambda: self._feature_dropout, lambda: 0.0))
    out = tf.reduce_sum(dropout_out, axis=-1)
    return tf.sigmoid(out + self._bias)

  def _name_scope(self):
    """Overrides the default function to fix name_scope for bias."""
    tf_name_scope = self._kwargs.get('name_scope', None)
    name_scope = super(NAM, self)._name_scope()
    if tf_name_scope:
      return tf_name_scope + '/' + name_scope
    else:
      return name_scope

  def calc_outputs(self, x, training = True):
    """Returns the output computed by each feature net."""
    training = self._true if training else self._false
    list_x = tf.split(x, self._num_inputs, axis=-1)
    return [
        self.feature_nns[i](x_i, training=training)
        for i, x_i in enumerate(list_x)
    ]

# Graph_builder.py

In [None]:
# Suppress warnings for the rest of the notebook.
# NOTE(review): `np.warnings` is presumably the standard `warnings` module
# re-exported by NumPy, in which case this silences all Python warnings, not
# only NumPy's -- confirm.
np.warnings.filterwarnings('ignore')
# A loss takes (model, inputs, targets) and returns a tensor.
LossFunction = Callable[[tf.keras.Model, TfInput, TfInput], tf.Tensor]
# Mapping from a name to a tensor, operation or model.
GraphOpsAndTensors = Dict[str, Union[tf.Tensor, tf.Operation, tf.keras.Model]]
# An evaluation metric takes a session and returns a float score.
EvaluationMetric = Callable[[tf.Session], float]

In [None]:
def penalized_loss(loss_func,
                   model,
                   inputs,
                   targets,
                   output_regularization,
                   l2_regularization = 0.0,
                   use_dnn = False):
  """Adds the optional output and L2 penalties to a base loss.

  Args:
    loss_func: Base loss, called as `loss_func(model, inputs, targets)`.
    model: Neural network model.
    inputs: Input values fed into the model for computing predictions.
    targets: Target values containing either real values or binary labels.
    output_regularization: Coefficient for the feature output penalty; the
      penalty is skipped unless this is positive.
    l2_regularization: Coefficient for L2 regularization; skipped unless
      positive.
    use_dnn: If True the L2 term is normalised by a single network, otherwise
      by the number of feature nets in `model.feature_nns`.

  Returns:
    The base loss plus the enabled penalty terms.
  """
  base_loss = loss_func(model, inputs, targets)
  penalty = 0.0
  if output_regularization > 0:
    output_term = feature_output_regularization(model, inputs)
    penalty += output_regularization * output_term
  if l2_regularization > 0:
    if use_dnn:
      num_networks = 1
    else:
      num_networks = len(model.feature_nns)
    l2_term = weight_decay(model, num_networks=num_networks)
    penalty += l2_regularization * l2_term
  return base_loss + penalty

In [None]:
def penalized_mse_loss(model,
                       inputs,
                       targets,
                       output_regularization,
                       l2_regularization = 0.0,
                       use_dnn = False):
  """Returns `penalized_loss` with `mse_loss` as the base loss."""
  return penalized_loss(
      mse_loss,
      model,
      inputs,
      targets,
      output_regularization,
      l2_regularization=l2_regularization,
      use_dnn=use_dnn)

def feature_output_regularization(model,
                                  inputs):
  """Averages, over feature nets, the mean squared output of each net.

  The per-feature outputs are computed in evaluation mode (`training=False`).
  """
  feature_outputs = model.calc_outputs(inputs, training=False)
  mean_squares = []
  for feature_out in feature_outputs:
    mean_squares.append(tf.reduce_mean(tf.square(feature_out)))
  return tf.add_n(mean_squares) / len(mean_squares)


def weight_decay(model, num_networks = 1):
  """Sums `tf.nn.l2_loss` over trainable variables, divided by `num_networks`."""
  penalties = []
  for variable in model.trainable_variables:
    penalties.append(tf.nn.l2_loss(variable))
  total = tf.add_n(penalties)
  return total / num_networks


def mse_loss(model, inputs,
             targets):
  """Mean squared error loss for regression.

  Args:
    model: Callable model; invoked in training mode (`training=True`).
    inputs: Batch of inputs passed to the model.
    targets: Ground-truth values for the batch.

  Returns:
    The mean squared error between the squeezed targets and predictions.
  """
  predicted = tf.squeeze(model(inputs, training=True))
  targets = tf.squeeze(targets)
  # Pass by keyword: the API order is (labels, predictions), which the former
  # positional call had swapped.
  return tf.losses.mean_squared_error(labels=targets, predictions=predicted)

In [None]:
def generate_predictions(pred_tensor, dataset_init_op,
                         sess):
  """Runs `pred_tensor` until its dataset is exhausted and gathers the output.

  Args:
    pred_tensor: Nested structure representing the next prediction element
      obtained from the `get_next` call on a `tf.compat.v1.data.Iterator`.
    dataset_init_op: Dataset iterator initializer for `pred_tensor`; run once
      before iterating.
    sess: Tensorflow session.

  Returns:
    A flat list with the predictions of every batch, in iteration order.
  """
  sess.run(dataset_init_op)
  collected = []
  try:
    # The iterator signals exhaustion by raising OutOfRangeError.
    while True:
      collected.extend(sess.run(pred_tensor))
  except tf.errors.OutOfRangeError:
    pass
  return collected

In [None]:
def rmse_loss(sess, y_true, pred_tensor,
              dataset_init_op):
  """Returns the RMSE between `y_true` and the predictions from `pred_tensor`."""
  return rmse(y_true,
              generate_predictions(pred_tensor, dataset_init_op, sess))

def rmse_loss_2(sess, y_true, pred_tensor,
              dataset_init_op):
  """Calculates the RMSE error and logs the predictions to `valid.csv`.

  Each call appends the current predictions to `valid.csv` in the working
  directory as a new `pred<k>` column. If the file is missing or cannot be
  extended, it is recreated with an `actual` column and a `pred0` column.

  Args:
    sess: Tensorflow session.
    y_true: Ground-truth values.
    pred_tensor: Prediction tensor iterated via `generate_predictions`.
    dataset_init_op: Dataset iterator initializer for `pred_tensor`.

  Returns:
    The RMSE between `y_true` and the generated predictions.
  """
  y_pred = generate_predictions(pred_tensor, dataset_init_op, sess)
  flat_pred = np.array(y_pred).reshape((-1))
  try:
    df = pd.read_csv('valid.csv')
    last_index = max(
        int(col.split('pred')[-1]) for col in df.columns
        if 'actual' not in col)
    df['pred' + str(last_index + 1)] = flat_pred
  except Exception:  # Best-effort log, but let KeyboardInterrupt propagate.
    # Missing, stale or malformed file: start a fresh log.
    df = pd.DataFrame()
    df['actual'] = np.array(y_true).reshape((-1))
    df['pred0'] = flat_pred
  df.to_csv('valid.csv', index=False)
  return rmse(y_true, y_pred)


def rmse(y_true, y_pred):
  """Returns the root mean squared error of `y_pred` w.r.t. `y_true` as a float."""
  mean_squared = sk_metrics.mean_squared_error(y_true, y_pred)
  return float(np.sqrt(mean_squared))

In [None]:
def grad(
    model,
    inputs,
    targets,
    loss_fn = None,
    train_vars = None
):
  """Calculates gradient w.r.t. `train_vars` of the `loss_fn` for `model`.

  Args:
    model: Model whose loss is differentiated.
    inputs: Batch of inputs.
    targets: Batch of targets.
    loss_fn: Callable invoked as `loss_fn(model, inputs, targets)`. Defaults to
      `mse_loss`. (The former default, `rmse_loss`, takes a session and an
      initializer op and could never be called with these arguments.)
    train_vars: Variables to differentiate with respect to. Defaults to
      `model.trainable_variables`.

  Returns:
    A `(loss_value, gradients)` pair, with one gradient per entry of
    `train_vars`.
  """
  if loss_fn is None:
    loss_fn = mse_loss
  loss_value = loss_fn(model, inputs, targets)
  if train_vars is None:
    train_vars = model.trainable_variables
  return loss_value, tf.gradients(loss_value, train_vars)

In [None]:
def create_iterators(
    datasets,
    batch_size):
  """Builds one shared batch tensor plus an initializer per input dataset.

  Args:
    datasets: Single or pair of input numpy arrays containing features.
    batch_size: Batch size for iterating over the datasets.

  Returns:
    A `(x_batch, init_ops)` pair. `x_batch` is the `get_next` tensor of a
    reinitializable iterator whose structure is taken from the first dataset;
    `init_ops[i]` points that iterator at `datasets[i]`.
  """
  batched = []
  for data in datasets:
    batched.append(tf.data.Dataset.from_tensor_slices(data).batch(batch_size))
  reference = batched[0]
  input_iterator = tf.data.Iterator.from_structure(
      reference.output_types, reference.output_shapes)
  init_ops = [input_iterator.make_initializer(ds) for ds in batched]
  return input_iterator.get_next(), init_ops

In [None]:
def create_nam_model(x_train,
                     dropout,
                     feature_dropout = 0.0,
                     num_basis_functions = 1000,
                     units_multiplier = 2,
                     activation = 'exu',
                     name_scope = 'model',
                     shallow = True,
                     trainable = True):
  """Creates a NAM sized from the number of unique values of each feature.

  Feature `i` gets `min(num_basis_functions, unique_i * units_multiplier)`
  hidden units, where `unique_i` is the number of distinct values in column `i`
  of `x_train`. As a side effect the module-level names `num_unique_vals`,
  `num_units` and `num_inputs` are (re)assigned.

  Returns:
    The constructed (not yet built) `NAM` instance.
  """
  global num_unique_vals, num_units, num_inputs
  num_unique_vals = []
  for column in range(x_train.shape[1]):
    num_unique_vals.append(len(np.unique(x_train[:, column])))
  num_units = [
      min(num_basis_functions, unique_count * units_multiplier)
      for unique_count in num_unique_vals
  ]
  num_inputs = x_train.shape[-1]
  return NAM(
      num_inputs=num_inputs,
      num_units=num_units,
      dropout=np.float32(dropout),
      feature_dropout=np.float32(feature_dropout),
      activation=activation,
      shallow=shallow,
      trainable=trainable,
      name_scope=name_scope)

In [None]:
def build_graph(
    x_train,
    y_train,
    x_test,
    y_test,
    learning_rate,
    batch_size,
    output_regularization,
    dropout,
    decay_rate,
    shallow,
    l2_regularization = 0.0,
    feature_dropout = 0.0,
    num_basis_functions = 1000,
    units_multiplier = 2,
    activation = 'exu',
    name_scope = 'model',
    regression = False,
    use_dnn = False,
    trainable = True
):
  """Constructs the computation graph with specified hyperparameters.

  Args:
    x_train: Training inputs.
    y_train: Training targets.
    x_test: Held-out inputs used for the 'test' metric.
    y_test: Held-out targets used for the 'test' metric.
    learning_rate: Initial learning rate of the Adam optimizer.
    batch_size: Batch size for training and evaluation.
    output_regularization: Coefficient for the feature output penalty.
    dropout: Dropout coefficient inside the network(s).
    decay_rate: Factor the learning rate is multiplied by when `lr_decay_op`
      is run.
    shallow: Whether to build shallow feature nets.
    l2_regularization: Coefficient for L2 regularization.
    feature_dropout: Coefficient for dropping out entire feature nets.
    num_basis_functions: Upper bound on hidden units per feature net.
    units_multiplier: Hidden units per unique feature value.
    activation: Activation of the first hidden layer.
    name_scope: TF name scope for the model.
    regression: If True, train with penalized MSE and evaluate with RMSE;
      otherwise use the classification loss and ROC AUC.
    use_dnn: If True, build a `DNN` instead of a NAM.
    trainable: Whether the model parameters are trainable.

  Returns:
    A pair `(graph_tensors, eval_metric_scores)`: a dict of ops/tensors needed
    for training, and a dict with 'train' and 'test' metric callables that
    each take a session.
  """
  if regression:
    # Shuffle over the whole training set. The buffer has to be sized by the
    # number of examples; it used to be `len(x_train[0])`, i.e. the number of
    # features, which left the data almost unshuffled.
    ds_tensors = tf.data.Dataset.from_tensor_slices((x_train, y_train)).apply(
        tf.data.experimental.shuffle_and_repeat(buffer_size=len(x_train)))
    ds_tensors = ds_tensors.batch(batch_size)
  else:
    # Create a balanced dataset to handle class imbalance
    ds_tensors = create_balanced_dataset(x_train, y_train, batch_size)
  x_batch, (train_init_op, test_init_op) = create_iterators((x_train, x_test),
                                                            batch_size)

  if use_dnn:
    nn_model = DNN(dropout=dropout, trainable=trainable)
  else:
    nn_model = create_nam_model(
        x_train=x_train,
        dropout=dropout,
        feature_dropout=feature_dropout,
        activation=activation,
        num_basis_functions=num_basis_functions,
        shallow=shallow,
        units_multiplier=units_multiplier,
        trainable=trainable,
        name_scope=name_scope)

  global_step = tf.train.get_or_create_global_step()
  # The learning rate lives in a variable so it can be decayed in place.
  learning_rate = tf.Variable(learning_rate, trainable=False)
  lr_decay_op = learning_rate.assign(decay_rate * learning_rate)
  optimizer = tf.train.AdamOptimizer(learning_rate)

  # Evaluation-mode predictions over whichever dataset the shared iterator
  # was last initialised with.
  predictions = nn_model(x_batch, training=False)
  tf.logging.info(nn_model.summary())
  train_vars = nn_model.trainable_variables
  if regression:
    loss_fn, y_pred = penalized_mse_loss, predictions
  else:
    # Apply sigmoid transformation for binary classification
    loss_fn, y_pred = penalized_cross_entropy_loss, tf.nn.sigmoid(predictions)
  loss_fn = functools.partial(
      loss_fn,
      output_regularization=output_regularization,
      l2_regularization=l2_regularization,
      use_dnn=use_dnn)

  iterator = ds_tensors.make_initializable_iterator()
  x1, y1 = iterator.get_next()
  loss_tensor, grads = grad(nn_model, x1, y1, loss_fn, train_vars)
  update_step = optimizer.apply_gradients(
      zip(grads, train_vars), global_step=global_step)
  avg_loss, avg_loss_update_op = tf.metrics.mean(
      loss_tensor, name='avg_train_loss')
  tf.summary.scalar('avg_train_loss', avg_loss)

  # Local variables backing the running mean, so that it can be reset.
  running_mean_vars = tf.get_collection(
      tf.GraphKeys.LOCAL_VARIABLES, scope='avg_train_loss')
  running_vars_initializer = tf.variables_initializer(
      var_list=running_mean_vars)

  # Use RMSE for regression and ROC AUC for classification.
  evaluation_metric = rmse_loss if regression else roc_auc_score
  train_metric = functools.partial(
      evaluation_metric,
      y_true=y_train,
      pred_tensor=y_pred,
      dataset_init_op=train_init_op)
  # The held-out metric must be of the same kind as the training metric: the
  # caller maximises it for classification, so RMSE would be wrong there.
  # `rmse_loss_2` additionally logs the predictions to `valid.csv`.
  test_evaluation_metric = rmse_loss_2 if regression else roc_auc_score
  test_metric = functools.partial(
      test_evaluation_metric,
      y_true=y_test,
      pred_tensor=y_pred,
      dataset_init_op=test_init_op)

  summary_op = tf.summary.merge_all()

  graph_tensors = {
      'train_op': [update_step, avg_loss_update_op],
      'lr_decay_op': lr_decay_op,
      'summary_op': summary_op,
      'iterator_initializer': iterator.initializer,
      'running_vars_initializer': running_vars_initializer,
      'nn_model': nn_model,
      'global_step': global_step,
  }
  eval_metric_scores = {'test': test_metric, 'train': train_metric}
  return graph_tensors, eval_metric_scores

# Nam_train.py

In [None]:
def data_split_with_cross_validation():
  """Returns True if `data_split` is 1 or cross-validation is disabled."""
  is_first_split = data_split == 1
  return is_first_split or not cross_val

def _get_train_and_lr_decay_ops(
    graph_tensors_and_ops,
    early_stopping):
  """Collects the train and LR-decay ops of models that are still training.

  Args:
    graph_tensors_and_ops: One dict per model with 'train_op' and
      'lr_decay_op' entries.
    early_stopping: One flag per model; True means the model has stopped.

  Returns:
    A `(train_ops, lr_decay_ops)` pair restricted to the active models.
  """
  active = [
      graph for index, graph in enumerate(graph_tensors_and_ops)
      if not early_stopping[index]
  ]
  train_ops = [graph['train_op'] for graph in active]
  lr_decay_ops = [graph['lr_decay_op'] for graph in active]
  return train_ops, lr_decay_ops


def _update_latest_checkpoint(checkpoint_dir,
                              best_checkpoint_dir):
  """Replaces the `model.*` files in `best_checkpoint_dir` with those from `checkpoint_dir`."""
  # Drop whatever was previously stored as the best checkpoint.
  stale_pattern = os.path.join(best_checkpoint_dir, 'model.*')
  for stale_file in tf.io.gfile.glob(stale_pattern):
    gfile.remove(stale_file)
  # Copy the current checkpoint files over, keeping their base names.
  fresh_pattern = os.path.join(checkpoint_dir, 'model.*')
  for source in tf.io.gfile.glob(fresh_pattern):
    target = os.path.join(best_checkpoint_dir, os.path.basename(source))
    gfile.copy(source, target, overwrite=True)

In [None]:
def _create_computation_graph(
    x_train, y_train, x_validation,
    y_validation, batch_size
):
  """Builds one training graph per model using the module-level hyperparameters.

  Returns:
    Two lists of length `n_models`: the graph tensors/ops dict of each model
    and the metric-callables dict of each model, as returned by `build_graph`.
  """
  # Arguments shared by every model; only the name scope differs per model.
  shared_kwargs = dict(
      x_train=x_train,
      y_train=y_train,
      x_test=x_validation,
      y_test=y_validation,
      activation=activation,
      learning_rate=learning_rate,
      batch_size=batch_size,
      shallow=shallow,
      output_regularization=output_regularization,
      l2_regularization=l2_regularization,
      dropout=dropout,
      num_basis_functions=num_basis_functions,
      units_multiplier=units_multiplier,
      decay_rate=decay_rate,
      feature_dropout=feature_dropout,
      regression=regression,
      use_dnn=use_dnn,
      trainable=True)
  graph_tensors_and_ops, metric_scores = [], []
  for model_index in range(n_models):
    tensors_and_ops, scores = build_graph(
        name_scope=f'model_{model_index}', **shared_kwargs)
    graph_tensors_and_ops.append(tensors_and_ops)
    metric_scores.append(scores)
  return graph_tensors_and_ops, metric_scores

In [None]:
def _create_graph_saver(graph_tensors_and_ops,
                        logdir, num_steps_per_epoch):
  """Create saving hook(s) as well as model and checkpoint directories.

  Args:
    graph_tensors_and_ops: One dict per model; the 'nn_model' entry supplies
      the trainable variables to checkpoint.
    logdir: Root directory; model `n` is saved under `<logdir>/model_<n>`.
    num_steps_per_epoch: Number of training steps in one epoch.

  Returns:
    Three lists of length `n_models`: the checkpoint saver hooks, the model
    directories and the `best_checkpoint` sub-directories (which are created).
  """
  saver_hooks, model_dirs, best_checkpoint_dirs = [], [], []
  # The MonitoredTraining Session counter increments by `n_models`
  save_steps = num_steps_per_epoch * save_checkpoint_every_n_epochs * n_models
  for n in range(n_models):
    saver = tf.train.Saver(
        var_list=graph_tensors_and_ops[n]['nn_model'].trainable_variables,
        save_relative_paths=True,
        max_to_keep=max_checkpoints_to_keep)
    scaffold = tf.train.Scaffold(saver=saver)
    # Format only the directory component: formatting the joined path would
    # misinterpret (or fail on) braces contained in `logdir`.
    model_dir = os.path.join(logdir, 'model_{}'.format(n))
    best_checkpoint_dir = os.path.join(model_dir, 'best_checkpoint')
    tf.io.gfile.makedirs(best_checkpoint_dir)
    saver_hook = tf.train.CheckpointSaverHook(
        checkpoint_dir=model_dir, save_steps=save_steps, scaffold=scaffold)
    model_dirs.append(model_dir)
    best_checkpoint_dirs.append(best_checkpoint_dir)
    saver_hooks.append(saver_hook)
  return saver_hooks, model_dirs, best_checkpoint_dirs

In [None]:
def _update_metrics_and_checkpoints(sess,
                                    epoch,
                                    metric_scores,
                                    curr_best_epoch,
                                    best_validation_metric,
                                    best_train_metric,
                                    model_dir,
                                    best_checkpoint_dir,
                                    graph,
                                    metric_name = 'RMSE'
                                    ):
  """Evaluates the validation metric and records a new best if it improved.

  Args:
    sess: Session passed to the metric callables.
    epoch: Current epoch number.
    metric_scores: Dict with 'test' and 'train' metric callables.
    curr_best_epoch: Epoch of the best validation metric so far.
    best_validation_metric: Best validation metric so far.
    best_train_metric: Train metric recorded at the best epoch so far.
    model_dir: Directory holding the latest checkpoint files.
    best_checkpoint_dir: Directory receiving a copy of the best checkpoint.
    graph: Unused.
    metric_name: Metric label used in the log output.

  Returns:
    The (possibly updated) `(best epoch, best validation metric, train metric
    at the best epoch)` triple.
  """
  # Calculate the AUROC/RMSE on the validation split
  validation_metric = metric_scores['test'](sess)
  tf.logging.info('Epoch %d %s Val %.4f', epoch, metric_name,
                validation_metric)
  print('Epoch ', epoch, metric_name,' Val ', validation_metric)
  # Minimize RMSE and maximize AUROC
  is_improvement = operator.lt if regression else operator.gt
  if not is_improvement(validation_metric, best_validation_metric):
    return curr_best_epoch, best_validation_metric, best_train_metric
  new_train_metric = metric_scores['train'](sess)
  # Copy the checkpoint files (*.meta, *.index, *.data*) each time there is a
  # better result.
  _update_latest_checkpoint(model_dir, best_checkpoint_dir)
  return epoch, validation_metric, new_train_metric

In [None]:
def training(x_train, y_train, x_validation,
             y_validation,
             logdir):
  """Trains the Neural Additive Model (NAM).

  Builds `n_models` graphs, trains them for up to `training_epochs` epochs
  with per-epoch learning-rate decay, evaluates the validation metric every
  `save_checkpoint_every_n_epochs` epochs and early-stops a model once it has
  not improved for more than `early_stopping_epochs` epochs.

  Note: the module-level `batch_size` is clamped to the number of training
  examples and stays clamped after this function returns.

  Args:
    x_train: Training inputs.
    y_train: Training labels.
    x_validation: Validation inputs.
    y_validation: Validation labels.
    logdir: dir to save the checkpoints.
  Returns:
    Best train and validation evaluation metric obtained during NAM training,
    each averaged over the `n_models` models.
  """
  tf.logging.info('Started training with logdir %s', logdir)
  print('Started training with logdir ', logdir)
  global batch_size
  batch_size = min(batch_size, x_train.shape[0])
  num_steps_per_epoch = x_train.shape[0] // batch_size
  # Keep track of the best validation RMSE/AUROC and train AUROC score which
  # corresponds to the best validation metric score.
  if regression:
    best_train_metric = np.inf * np.ones(n_models)
    best_validation_metric = np.inf * np.ones(n_models)
  else:
    best_train_metric = np.zeros(n_models)
    best_validation_metric = np.zeros(n_models)
  # Set to a large value to avoid early stopping initially during training
  curr_best_epoch = np.full(n_models, np.inf)
  # Boolean variables to indicate whether the training of a specific model has
  # been early stopped.
  early_stopping = [False] * n_models
  # Classification: AUROC, Regression : RMSE Score
  metric_name = 'RMSE' if regression else 'AUROC'
  tf.reset_default_graph()
  with tf.Graph().as_default():
    tf.set_random_seed(tf_seed)
    # Setup your training.
    graph_tensors_and_ops, metric_scores = _create_computation_graph(
        x_train, y_train, x_validation, y_validation, batch_size)

    train_ops, lr_decay_ops = _get_train_and_lr_decay_ops(
        graph_tensors_and_ops, early_stopping)
    global_step = tf.train.get_or_create_global_step()
    # Run in place of a training step for models that have stopped early.
    increment_global_step = tf.assign(global_step, global_step + 1)
    saver_hooks, model_dirs, best_checkpoint_dirs = _create_graph_saver(
        graph_tensors_and_ops, logdir, num_steps_per_epoch)
    summary_writer = None
    if debug:
      summary_writer = tf.summary.FileWriter(os.path.join(logdir, 'tb_log'))

    with tf.train.MonitoredSession(hooks=saver_hooks) as sess:
      for n in range(n_models):
        sess.run([
            graph_tensors_and_ops[n]['iterator_initializer'],
            graph_tensors_and_ops[n]['running_vars_initializer']
        ])
      for epoch in range(1, training_epochs + 1):
        if not all(early_stopping):
          for _ in range(num_steps_per_epoch):
            sess.run(train_ops)  # Train the network
          # Decay the learning rate by a fixed ratio every epoch
          sess.run(lr_decay_ops)
        else:
          tf.logging.info('All models early stopped at epoch %d', epoch)
          print('All models early stopped at epoch ', epoch)
          break

        for n in range(n_models):
          if early_stopping[n]:
            sess.run(increment_global_step)
            continue
          # Log summaries
          if summary_writer is not None:
            # Use separate names so the `global_step` tensor is not rebound.
            summary_value, step_value = sess.run([
                graph_tensors_and_ops[n]['summary_op'],
                graph_tensors_and_ops[n]['global_step']
            ])
            summary_writer.add_summary(summary_value, step_value)

          if epoch % save_checkpoint_every_n_epochs == 0:
            (curr_best_epoch[n], best_validation_metric[n],
             best_train_metric[n]) = _update_metrics_and_checkpoints(
                 sess, epoch, metric_scores[n], curr_best_epoch[n],
                 best_validation_metric[n], best_train_metric[n], model_dirs[n],
                 best_checkpoint_dirs[n], graph_tensors_and_ops[n], metric_name)
            if curr_best_epoch[n] + early_stopping_epochs < epoch:
              tf.logging.info('Early stopping at epoch {}'.format(epoch))
              print('Early stopping at epoch {}'.format(epoch))
              early_stopping[n] = True  # Set early stopping for model `n`.
              train_ops, lr_decay_ops = _get_train_and_lr_decay_ops(
                  graph_tensors_and_ops, early_stopping)
          # Reset running variable counters
          sess.run(graph_tensors_and_ops[n]['running_vars_initializer'])

    if summary_writer is not None:
      # Flush pending events to disk.
      summary_writer.close()

  tf.logging.info('Finished training.')
  print('Finished training.')
  for n in range(n_models):
    # Mirrors the log line below; the previous print emitted a literal
    # '%.4f' instead of formatting the validation score.
    print(
        'Model {}: Best Epoch {:.0f}, Individual {}: Train {:.4f}, '
        'Validation {:.4f}'.format(n, curr_best_epoch[n], metric_name,
                                   best_train_metric[n],
                                   best_validation_metric[n]))
    tf.logging.info(
        'Model %d: Best Epoch %d, Individual %s: Train %.4f, Validation %.4f',
        n, curr_best_epoch[n], metric_name, best_train_metric[n],
        best_validation_metric[n])

  return np.mean(best_train_metric), np.mean(best_validation_metric)

In [None]:
def create_test_train_fold(
    fold_num
):
  """Loads the configured dataset and carves out one cross-validation fold.

  Args:
    fold_num: Index of the fold to use as the held-out test set.

  Returns:
    A `(data_gen, test_dataset)` pair: the generator of (train, validation)
    splits of the training portion, and the held-out test fold. Splits are
    stratified only for classification (`regression` False).
  """
  features, labels, _ = load_dataset(dataset_name)
  num_examples = features.shape[0]
  use_stratified = not regression
  print('Dataset: ', dataset_name, ', Size: ', num_examples)
  print('Cross-val fold: ', fold_num, _N_FOLDS)
  tf.logging.info('Dataset: %s, Size: %d', dataset_name, num_examples)
  tf.logging.info('Cross-val fold: %d/%d', fold_num, _N_FOLDS)
  # Get the training and test set based on the StratifiedKFold split
  train_fold, test_dataset = get_train_test_fold(
      features,
      labels,
      fold_num=fold_num,
      num_folds=_N_FOLDS,
      stratified=use_stratified)
  x_train_all, y_train_all = train_fold
  data_gen = split_training_dataset(
      x_train_all,
      y_train_all,
      num_splits,
      stratified=use_stratified)
  return data_gen, test_dataset


In [None]:
def single_split_training(data_gen,
                          logdir):
  """Uses a specific (training, validation) split for NAM training.

  Advances `data_gen` to the split selected by the module-level `data_split`
  (1-based) and runs `training` on it, logging under
  `<logdir>/fold_<fold_num>/split_<data_split>`.

  Args:
    data_gen: Iterator yielding ((x_train, y_train), (x_validation,
      y_validation)) tuples.
    logdir: Base directory for the training logs. It is used verbatim, so any
      braces it contains are not treated as format fields.

  Raises:
    ValueError: If `data_split` is smaller than 1, or if `data_gen` yields
      fewer than `data_split` splits.
  """
  if data_split < 1:
    # Without at least one iteration no split would ever be selected.
    raise ValueError(
        'data_split must be >= 1, got {}.'.format(data_split))

  try:
    # Discard the earlier splits; only the `data_split`-th one is kept.
    for _ in range(data_split):
      (x_train, y_train), (x_validation, y_validation) = next(data_gen)
  except StopIteration:
    raise ValueError(
        'data_gen produced fewer than data_split={} splits.'.format(
            data_split))

  # Format each path component on its own so that `logdir` is never
  # interpreted as a format string.
  curr_logdir = os.path.join(logdir,
                             'fold_{}'.format(fold_num),
                             'split_{}'.format(data_split))
  training(x_train, y_train, x_validation, y_validation, curr_logdir)

# Data_utils.py

In [None]:
def load_cms_muon_momentum(
    csv_path='/kaggle/input/cmsnewsamples/new-smaples.csv'
):
  """Loads the CMS muon momentum samples as a regression dataset.

  The CSV is read, every column whose name contains '_1' is dropped, and only
  rows whose 'mask' columns sum to zero are kept. The target is the absolute
  value of the 'q/pt' column. Features are the phi/theta/old-phi columns for
  suffixes 0, 2, 3 and 4, plus all pairwise differences inside each of those
  three groups; all features are standardized with `StandardScaler` fitted on
  the full frame.

  Args:
    csv_path: Location of the samples CSV. Defaults to the Kaggle input path
      used by this notebook.

  Returns:
    A dict with keys:
      'problem': The string 'regression'.
      'X': pd.DataFrame of the standardized features with positional
        (integer) column labels.
      'y': np.ndarray of shape (n_rows, 1) holding |q/pt|.
  """
  df = pd.read_csv(csv_path).drop(columns='Unnamed: 0')
  df = df.drop(columns=[col for col in df.columns if '_1' in col])

  # Keep only rows whose mask columns sum to zero.
  mask_columns = [col for col in df.columns if 'mask' in col]
  df['non_hits'] = df[mask_columns].sum(axis=1)
  df = df[df['non_hits'] == 0].reset_index(drop=True)
  df['1/pT'] = df['q/pt'].abs()

  suffixes = [0, 2, 3, 4]
  prefixes = ['emtf_phi', 'emtf_theta', 'old_emtf_phi']
  features = [
      '{}_{}'.format(prefix, suffix)
      for prefix in prefixes
      for suffix in suffixes
  ]

  # Pairwise differences (later minus earlier) within each prefix group.
  # The two trailing numbers in a delta name are positions inside the group
  # (0-3), not the column suffixes themselves.
  group_size = len(suffixes)
  new_features = []
  for start in range(0, len(features), group_size):
    stop = start + group_size
    for i in range(start, stop):
      base = features[i].rsplit('_', 1)[0]
      for j in range(i + 1, stop):
        name = 'delta_{}_{}_{}'.format(base, j - start, i - start)
        df[name] = df[features[j]] - df[features[i]]
        new_features.append(name)

  features = features + new_features

  # Standardize raw and delta features together.
  scaler_1 = StandardScaler()
  df[features] = scaler_1.fit_transform(df[features])

  return {
      'problem': 'regression',
      'X': pd.DataFrame(df[features].to_numpy()),
      'y': df[['1/pT']].to_numpy(),
  }

In [None]:
def load_dataset(dataset_name):
  """Loads and preprocesses the dataset identified by `dataset_name`.

  Args:
    dataset_name: Name of the dataset to be loaded. Only 'CMS_Muon_Momentum'
      is recognised.

  Returns:
    data_x: float32 np.ndarray with one row per example, containing the
      features after `transform_data` has been applied.
    data_y: float32 np.ndarray containing the label/target for each example.
    column_names: The feature names reported by `transform_data`.

  Raises:
    ValueError: If `dataset_name` is not a known dataset.
  """
  # Guard clause: reject anything but the single supported dataset.
  if dataset_name != 'CMS_Muon_Momentum':
    raise ValueError('{} not found!'.format(dataset_name))
  dataset = load_cms_muon_momentum()

  # Work on copies so the loader's objects are never modified.
  data_x = dataset['X'].copy()
  data_y = dataset['y'].copy()
  problem_type = dataset['problem']

  data_x, column_names = transform_data(data_x)
  data_x = data_x.astype('float32')

  # Non-array classification labels are turned into integer class indices.
  is_classification = problem_type == 'classification'
  if is_classification and not isinstance(data_y, np.ndarray):
    one_hot = pd.get_dummies(data_y).values
    data_y = np.argmax(one_hot, axis=-1)

  data_y = data_y.astype('float32')
  return data_x, data_y, column_names

In [None]:
def get_train_test_fold(
    data_x,
    data_y,
    fold_num,
    num_folds,
    stratified = True,
    random_state = 42):
  """Picks one (train, test) partition out of a shuffled K-Fold split.

  The data is shuffled and divided into `num_folds` folds. Fold number
  `fold_num` (counting from 1) becomes the test set and the other
  `num_folds` - 1 folds together form the training set.

  Args:
    data_x: Training data, with shape (n_samples, n_features), where n_samples
      is the number of samples and n_features is the number of features.
    data_y: The target variable, with shape (n_samples), for supervised learning
      problems.  Stratification is done based on the y labels.
    fold_num: Number of the fold used for testing; must satisfy
      0 < `fold_num` <= `num_folds`.
    num_folds: Number of folds.
    stratified: Whether to preserve the percentage of samples for each class in
      the different folds (only applicable for classification).
    random_state: Seed used by the random number generator.

  Returns:
    (x_train, y_train): Training folds containing 1 - (1/`num_folds`) fraction
      of entire data.
    (x_test, y_test): Test fold containing 1/`num_folds` fraction of data.
  """
  splitter_cls = StratifiedKFold if stratified else KFold
  splitter = splitter_cls(
      n_splits=num_folds, shuffle=True, random_state=random_state)
  assert 0 < fold_num <= num_folds, 'Pass a valid fold number.'

  # Walk the splits in order and stop at the requested one.
  all_splits = splitter.split(data_x, data_y)
  for position, (train_index, test_index) in enumerate(all_splits, start=1):
    if position != fold_num:
      continue
    train_fold = (data_x[train_index], data_y[train_index])
    test_fold = (data_x[test_index], data_y[test_index])
    return train_fold, test_fold

In [None]:
def split_training_dataset(
    data_x,
    data_y,
    n_splits,
    stratified = True,
    test_size = 0.2,
    random_state = 1337):
  """Generates random (train, validation) partitions of the data.

  Each partition is an independent shuffle-and-split of the full input. The
  train part is meant for fitting the DNNs/NAMs and the validation part for
  early stopping.

  Args:
    data_x: Training data, with shape (n_samples, n_features), where n_samples
      is the number of samples and n_features is the number of features.
    data_y: The target variable, with shape (n_samples), for supervised learning
      problems.  Stratification is done based on the y labels.
    n_splits: Number of re-shuffling & splitting iterations.
    stratified: Whether to preserve the percentage of samples for each class in
      the (train, validation) splits. (only applicable for classification).
    test_size: The proportion of the dataset to include in the validation split.
    random_state: Seed used by the random number generator.

  Yields:
    (x_train, y_train): The training data split.
    (x_validation, y_validation): The validation data split.
  """
  splitter_cls = StratifiedShuffleSplit if stratified else ShuffleSplit
  splitter = splitter_cls(
      n_splits=n_splits, test_size=test_size, random_state=random_state)

  for train_index, validation_index in splitter.split(data_x, data_y):
    train_part = (data_x[train_index], data_y[train_index])
    validation_part = (data_x[validation_index], data_y[validation_index])
    # Features and targets of the training part must stay aligned.
    assert train_part[0].shape[0] == train_part[1].shape[0]
    yield train_part, validation_part

In [None]:
def transform_data(df):
  """Encodes and rescales the feature dataframe `df`.

  Object-dtype columns are treated as categorical and one-hot encoded; every
  other column is passed through an identity `FunctionTransformer`. The
  combined result is then min-max scaled to the range (-1, 1).

  Args:
    df: Input dataframe containing features.

  Returns:
    The transformed data and a list of column names, in which each
    categorical column is expanded to one '<column>: <value>' entry per
    distinct value.
  """
  # NOTE(review): names are built in the original column order and use `set`
  # ordering for category values, whereas the transformer lists 'cat' before
  # 'num'. With categorical columns present the names may not line up with
  # the transformed columns -- confirm before relying on them.
  all_columns = df.columns
  categorical_mask = np.array([dtype.kind == 'O' for dtype in df.dtypes])
  categorical_cols = all_columns.values[categorical_mask]
  numerical_cols = all_columns.values[~categorical_mask]

  new_column_names = []
  for col_name, is_cat in zip(all_columns, categorical_mask):
    if not is_cat:
      new_column_names.append(col_name)
      continue
    new_column_names.extend(
        '{}: {}'.format(col_name, val) for val in set(df[col_name]))

  one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore')
  identity = FunctionTransformer(validate=True)
  column_transform = ColumnTransformer(transformers=[
      ('cat', Pipeline([('ohe', one_hot)]), categorical_cols),
      ('num', Pipeline([('identity', identity)]), numerical_cols),
  ])

  # The trailing ('dummy', None) step is a placeholder: `apply_transformation`
  # runs every step except the last one.
  steps = [
      ('column_transform', column_transform),
      ('min_max', MinMaxScaler((-1, 1))),
      ('dummy', None),
  ]
  transformed = CustomPipeline(steps).apply_transformation(df)
  return transformed, new_column_names

In [None]:
class CustomPipeline(Pipeline):
  """sklearn Pipeline that can run its transforms while skipping the last step."""

  def apply_transformation(self, x):
    """Fits and applies every step but the final one, in order.

    Each step's `fit_transform` is called on the output of the previous step,
    so the steps are (re)fitted on the data passed in.

    Args:
      x: Iterable data to transform. Must fulfill input requirements of first
        step of the pipeline.

    Returns:
      xt: Transformed data.
    """
    # Fold the data through all (name, transform) pairs except the last.
    return functools.reduce(
        lambda data, step: step[1].fit_transform(data),
        self.steps[:-1],
        x)

# Main

In [None]:
# Restrict TensorFlow logging to the WARN verbosity level.
tf.logging.set_verbosity(tf.logging.WARN)
# Build the (train, validation) split generator and the held-out test fold for
# the configured `fold_num`. Despite its name, `test_data_gen` holds the
# (x_test, y_test) tuple returned by `create_test_train_fold`, not a generator;
# it is not used in this cell.
data_gen, test_data_gen = create_test_train_fold(fold_num)
# Train on the split selected by `data_split`, logging under `logdir`.
single_split_training(data_gen, logdir)

In [None]:
# tf.reset_default_graph()
# model = NAM(
#       num_inputs=num_inputs,
#       num_units=num_units,
#       dropout=np.float32(dropout),
#       feature_dropout=np.float32(feature_dropout),
#       activation=activation,
#       shallow=shallow,
#       trainable=True)

In [None]:
# Preview the first rows of valid.csv from the working directory.
# NOTE(review): presumably written by the training run above -- confirm where
# this file is produced.
pd.read_csv('valid.csv').head()