## Jupyter notebook for running transformer metrics

In [1]:
!pip install networkx sklearn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to track which package uses
  [31m   [0m   'sklearn' instead of 'scikit-le

In [2]:
!git clone https://github.com/fmsnew/nas-bench-nlp-release.git
%cd nas-bench-nlp-release
!unzip -nq data/datasets.zip -d data/
!unzip -nq train_logs_multi_runs/logs.zip -d train_logs_multi_runs/

fatal: destination path 'nas-bench-nlp-release' already exists and is not an empty directory.
/data2/xxx/workspace/PRUNE/LPZero/nas-bench-nlp-release


In [3]:
import csv
import datetime
import json
import math
import os
import random
from argparse import Namespace

import data as nas_data
import numpy as np
import torch
from model import AWDRNNModel
from sklearn.metrics import pairwise_distances
from splitcross import SplitCrossEntropyLoss
from train import evaluate, train
from utils import batchify

In [4]:
def calculate_activations(inp):
    try:
        if isinstance(inp, tuple):
            inp = inp[0]
        # reshape input tensor to be batch size x single dimensional
        inp = inp[0].view(inp.size(1), -1)
        # will ReLU unit be active or not for each input? (binary codes) store in new tensor
        x = (inp > 0).float()
        # calculations for hamming distance
        K = x @ x.t()
        K2 = (1.0 - x) @ (1.0 - x.t())
        # sum above to rest of calculations, store as numpy array in cpu memory
        global K_score
        K_score = K_score + K.cpu().numpy() + K2.cpu().numpy()
    except:
        pass

In [5]:
import math

import networkx as nx
import torch
import torch.nn
from multilinear import MultiLinear

# From NAS-Bench-NLP https://github.com/fmsnew/nas-bench-nlp-release
class CustomRNNCell(torch.nn.Module):
    elementwise_ops_dict = {"prod": torch.mul, "sum": torch.add}

    def __init__(self, input_size, hidden_size, recepie):
        super(CustomRNNCell, self).__init__()

        self.activations_dict = {
            "tanh": torch.nn.Tanh(),
            "sigm": torch.nn.Sigmoid(),
            "leaky_relu": torch.nn.LeakyReLU(),
        }

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.recepie = recepie
        self.hidden_tuple_size = 0

        components_dict = {}

        self.G = nx.DiGraph()
        for k in recepie.keys():
            if k not in components_dict:

                component = self._make_component(recepie[k])
                if component is not None:
                    components_dict[k] = component
                if k.startswith("h_new"):
                    suffix = k.replace("h_new_", "")
                    if suffix.isdigit():
                        self.hidden_tuple_size = max(
                            [self.hidden_tuple_size, int(suffix) + 1]
                        )

                if k not in self.G.nodes():
                    self.G.add_node(k)
                for i, n in enumerate(recepie[k]["input"]):
                    if n not in self.G.nodes():
                        self.G.add_node(k)
                    self.G.add_edge(n, k)

        self.components = torch.nn.ModuleDict(components_dict)
        self.nodes_order = list(nx.algorithms.dag.topological_sort(self.G))

    def forward(self, x, hidden_tuple):
        calculated_nodes = {}
        # Modified to be able to get hidden states
        hidden_tuple[0].requires_grad_()
        hidden_tuple[0].retain_grad()
        hidden_states.append(hidden_tuple[0])
        for n in self.nodes_order:
            if n == "x":
                calculated_nodes["x"] = x.unsqueeze(0)
            elif n.startswith("h_prev") and n.replace("h_prev_", "").isdigit():
                calculated_nodes[n] = hidden_tuple[
                    int(n.replace("h_prev_", ""))
                ].unsqueeze(0)
            elif n in self.components:
                inputs = [calculated_nodes[k] for k in self.recepie[n]["input"]]
                calculated_nodes[n] = self.components[n](*inputs)
            else:
                # simple operations
                op = self.recepie[n]["op"]
                inputs = [calculated_nodes[k] for k in self.recepie[n]["input"]]
                if op in ["elementwise_prod", "elementwise_sum"]:
                    op_func = CustomRNNCell.elementwise_ops_dict[
                        op.replace("elementwise_", "")
                    ]
                    calculated_nodes[n] = op_func(inputs[0], inputs[1])
                    for inp in range(2, len(inputs)):
                        calculated_nodes[n] = op_func(calculated_nodes[n], inputs[i])
                elif op == "blend":
                    calculated_nodes[n] = (
                        inputs[0] * inputs[1] + (1 - inputs[0]) * inputs[2]
                    )
                elif op.startswith("activation"):
                    op_func = self.activations_dict[op.replace("activation_", "")]
                    calculated_nodes[n] = op_func(inputs[0])
                    # calculate and store K codes for activations in RNN - LeakyReLU, TanH, Sigmoid
                    calculate_activations(calculated_nodes[n])
        return tuple(
            [calculated_nodes[f"h_new_{i}"][0] for i in range(self.hidden_tuple_size)]
        )

    def _make_component(self, spec):
        if spec["op"] == "linear":
            input_sizes = [
                self.input_size if inp == "x" else self.hidden_size
                for inp in spec["input"]
            ]
            return MultiLinear(input_sizes, self.hidden_size)


class CustomRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, recepie):
        super(CustomRNN, self).__init__()
        self.hidden_size = hidden_size
        self.cell = CustomRNNCell(input_size, hidden_size, recepie)
        self.reset_parameters()

    def forward(self, inputs, hidden_tuple=None):
        batch_size = inputs.size(1)
        if hidden_tuple is None:
            hidden_tuple = tuple(
                [
                    self.init_hidden(batch_size)
                    for _ in range(self.cell.hidden_tuple_size)
                ]
            )

        self.check_hidden_size(hidden_tuple, batch_size)

        hidden_tuple = tuple([x[0] for x in hidden_tuple])
        outputs = []
        for x in torch.unbind(inputs, dim=0):
            hidden_tuple = self.cell(x, hidden_tuple)
            outputs.append(hidden_tuple[0].clone())

        return torch.stack(outputs, dim=0), tuple(
            [x.unsqueeze(0) for x in hidden_tuple]
        )

    def init_hidden(self, batch_size):
        # num_layers == const (1)
        return torch.zeros(1, batch_size, self.hidden_size).to(
            next(self.parameters()).device
        )

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for param in self.parameters():
            torch.nn.init.uniform_(param, -stdv, stdv)

    def check_hidden_size(self, hidden_tuple, batch_size):
        expected_hidden_size = (1, batch_size, self.hidden_size)
        msg = "Expected hidden size {}, got {}"
        for hx in hidden_tuple:
            if hx.size() != expected_hidden_size:
                raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))

In [6]:
import json

import numpy as np
import torch
import torch.nn
from embed_regularize import embedded_dropout
from locked_dropout import LockedDropout
from weight_drop import ParameterListWeightDrop, WeightDrop

# From NAS-Bench-NLP https://github.com/fmsnew/nas-bench-nlp-release
class AWDRNNModel(torch.nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    # add batch_size parameter
    def __init__(
        self,
        rnn_type,
        ntoken,
        ninp,
        nhid,
        nlayers,
        dropout=0.5,
        dropouth=0.5,
        dropouti=0.5,
        dropoute=0.1,
        wdrop=0,
        tie_weights=False,
        recepie=None,
        verbose=True,
    ):
        super(AWDRNNModel, self).__init__()
        self.lockdrop = LockedDropout()
        self.idrop = torch.nn.Dropout(dropouti)
        self.hdrop = torch.nn.Dropout(dropouth)
        self.drop = torch.nn.Dropout(dropout)
        self.encoder = torch.nn.Embedding(ntoken, ninp)
        self.wdrop = wdrop
        self.verbose = verbose
        self.ntoken = ntoken

        if recepie is not None:
            recepie = json.loads(recepie)

        self.rnns = []
        for i in range(nlayers):
            input_size = ninp if i == 0 else nhid
            hidden_size = nhid if i != nlayers - 1 else (ninp if tie_weights else nhid)
            if rnn_type == "LSTM":
                self.rnns.append(torch.nn.LSTM(input_size, hidden_size))
            elif rnn_type == "CustomRNN":
                self.rnns.append(CustomRNN(input_size, hidden_size, recepie))

        if wdrop:
            if rnn_type == "LSTM":
                self.rnns = [
                    WeightDrop(rnn, ["weight_hh_l0"], dropout=wdrop)
                    for rnn in self.rnns
                ]
            elif rnn_type == "CustomRNN":
                wd_rnns = []
                for rnn in self.rnns:
                    multilinear_components = []
                    for k, v in rnn.cell.components.items():
                        if rnn.cell.recepie[k]["op"] == "linear":
                            for i in np.where(
                                np.array(rnn.cell.recepie[k]["input"]) != "x"
                            )[0]:
                                multilinear_components.append(
                                    f"cell.components.{k}.weights.{i}"
                                )
                    wd_rnns.append(
                        ParameterListWeightDrop(
                            rnn, multilinear_components, dropout=wdrop
                        )
                    )
                    self.rnns = wd_rnns

        if self.verbose:
            print(self.rnns)
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.decoder = torch.nn.Linear(nhid, ntoken)

        if tie_weights:
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.tie_weights = tie_weights
        self.recepie = recepie

    def reset(self):
        pass

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden, return_h=False, skip_embedding=False):

        emb = (
            input
            if skip_embedding
            else embedded_dropout(
                self.encoder, input, dropout=self.dropoute if self.training else 0
            )
        )

        # store embedding output
        self.embeddings = emb
        # emb = self.idrop(emb)

        emb = self.lockdrop(emb, self.dropouti)

        raw_output = emb
        new_hidden = []
        raw_outputs = []
        outputs = []
        for i, rnn in enumerate(self.rnns):
            raw_output, new_h = rnn(raw_output, hidden[i])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if i != self.nlayers - 1:
                # self.hdrop(raw_output) add???
                raw_output = self.lockdrop(raw_output, self.dropouth)
                outputs.append(raw_output)
        hidden = new_hidden

        output = self.lockdrop(raw_output, self.dropout)
        outputs.append(output)
        result = output.view(output.size(0) * output.size(1), output.size(2))
        if return_h:
            return result, hidden, raw_outputs, outputs
        return result, hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        hidden = []
        for i in range(self.nlayers):
            if self.rnn_type == "LSTM":
                hidden_tuple_size = 2
            elif self.rnn_type == "CustomRNN":
                if self.wdrop:
                    # wrapped with ParameterListWeightDrop
                    hidden_tuple_size = self.rnns[0].module.cell.hidden_tuple_size
                else:
                    hidden_tuple_size = self.rnns[0].cell.hidden_tuple_size
            hidden_size = (
                self.nhid
                if i != self.nlayers - 1
                else (self.ninp if self.tie_weights else self.nhid)
            )
            hidden.append(
                tuple(
                    [
                        weight.new(1, bsz, hidden_size).zero_()
                        for _ in range(hidden_tuple_size)
                    ]
                )
            )

        return hidden

In [7]:
def get_batch(source, i, args, seq_len=None, evaluation=False):
    seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
    data_ = source[i : i + seq_len]
    target = source[i + 1 : i + 1 + seq_len].view(-1)
    return data_, target

In [8]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors,
    to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [9]:
# Get Jacobian of layer
def get_batch_jacobian(net, x, target, device, hidden, args=None, skip_embedding=False):
    # reset gradient of network
    net.zero_grad()
    # begin recording all operations on input
    # run network on input batch with hooks, returns classified and raw output
    y, out = net(x, hidden, skip_embedding=skip_embedding)
    net.embeddings.retain_grad()
    # backpropogate from output with regards to gradients of 1 (used for old jacobian metric)
    y.backward(torch.ones_like(y))
    # get gradients of inputs
    jacob = net.embeddings.grad.detach()
    outputs = []
    for output in out:
        outputs.append(output[0].detach())
    return jacob, target.detach(), y.detach(), outputs

In [10]:
def hooklogdet(K, labels=None):
    # compute natural logarithm of determinant of array (the final score)
    s, ld = np.linalg.slogdet(K)
    return ld

In [11]:
# Jacobian Cosine Score
def jacobian_score(jacobs):
    jacob = torch.transpose(jacobs, 0, 1).reshape(jacobs.size(1), -1).cpu().numpy()
    # calculate Pearson product-moment correlation coefficients
    correlations = np.corrcoef(jacob)
    # compute eignenvalues of matrix, discard normalized eigenvectors
    v, _ = np.linalg.eig(correlations)
    # compute final score
    k = 1e-5
    return -np.sum(np.log(v + k) + 1.0 / (v + k))

In [12]:
# Jacobian Cosine Score
def cosine_score(jacobs):
    jacob = torch.transpose(jacobs, 0, 1).reshape(jacobs.size(1), -1).cpu().numpy()
    norm = np.linalg.norm(jacob, axis=1)
    normed = jacob / norm[:, None]
    cosines = (-pairwise_distances(normed, metric="cosine") + 1) - np.identity(
        normed.shape[0]
    )
    summed = np.sum(np.power(np.absolute(cosines.flatten()), 1.0 / 20)) / 2
    return 1 - (1 / (pow(cosines.shape[0], 2) - cosines.shape[0]) * summed)

In [13]:
# Jacobian Noise Score
def noised_jacobian(
    network, amplitude, embedding_output, variance, noise, target, device, hidden, args
):
    noisy = embedding_output + amplitude * variance * noise
    noisy = torch.tensor(noisy, requires_grad=True).float().to(device)
    noise_jacobs, labels, y, out = get_batch_jacobian(
        network, noisy, target, device, hidden, args, skip_embedding=True
    )
    return noise_jacobs

In [14]:
# Synflow (Synaptic Saliency) Score
def synflow(net, x, target, device, hidden, args=None, skip_embedding=False):
    net.zero_grad()
    y, out = net(x, hidden, skip_embedding=False)
    y.backward(y)

    metric_array = []
    for layer in net.modules():
        if isinstance(layer, MultiLinear):
            for i in range(1, len(layer.weights)):
                if layer.weights[i].grad is not None:
                    metric_array.append(
                        torch.abs(layer.weights[i] * layer.weights[i].grad)
                    )
                else:
                    metric_array.append(torch.zeros_like(layer.weights[i]))
        if isinstance(layer, torch.nn.Linear):

            if layer.weight is not None:
                metric_array.append(
                    torch.abs(layer.weight.double() * layer.weight.grad.double())
                )
            else:
                metric_array.append(torch.zeros_like(layer.weight))
    
    sum = 0.0
    for i in range(len(metric_array)):
        sum += torch.nansum(metric_array[i])
    return sum.detach().item()

In [15]:
# Hidden state scores
def hidden_scores(hiddens):
    metric_array = []
    for hidden in hidden_states:
        if hidden.grad is not None:
            metric_array.append(torch.abs(hidden * hidden.grad))
        else:
            metric_array.append(torch.zeros_like(hidden))
    sum = 0.0
    for i in range(len(metric_array)):
        sum += torch.nansum(metric_array[i])

    hidden_state1 = torch.dstack([i.detach() for i in hidden_states[:70]])
    hidden_state2 = torch.dstack([i.detach() for i in hidden_states[70:140]])
    hidden_state3 = torch.dstack([i.detach() for i in hidden_states[-70:]])

    return [
        sum.detach().item(),
        jacobian_score(hidden_state1),
        jacobian_score(hidden_state2),
        jacobian_score(hidden_state3),
    ]

In [16]:
# Run scores on all architectures
def run_scores(log_file, test_data, ntokens, criterion, writer, batch_num=0, seed_num=0):
    log = json.load(open(log_file, "r"))
    args = Namespace(**log)
    args.cuda = True

    global K_score
    K_score = np.zeros((args.eval_batch_size, args.eval_batch_size))

    global hidden_states
    hidden_states = []
    
    # set seed for reproducability
    random.seed(seed_num)
    np.random.seed(seed_num)
    torch.manual_seed(seed_num)

    network = AWDRNNModel(
        args.model,
        ntokens,
        args.emsize,
        args.nhid,
        args.nlayers,
        args.dropout,
        args.dropouth,
        args.dropouti,
        args.dropoute,
        args.wdrop,
        args.tied,
        args.recepie,
        verbose=False,
    )

    try:
        test_loss = args.test_losses[-1]
    except:
        test_loss = math.nan
    print("-" * 89)
    try:
        test_perplexity = math.exp(test_loss)
    except:
        test_perplexity = math.nan
    test_bpw = test_loss / math.log(2)

    # set device to run model on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    network.to(device)
    
    param = sum(p.numel() for p in network.parameters())
    train_param = sum(p.numel() for p in network.parameters() if p.requires_grad)

    network.eval()
    hidden = network.init_hidden(args.eval_batch_size)
    data, target = get_batch(test_data, batch_num, args, evaluation=True)

    # put network on device
    network = network.to(device)

    # inputs for binary codes score (K)
    x2 = torch.clone(data)
    x2 = x2.to(device)
    data, target = data.to(device), target.to(device)

    # get jacobian for old score
    jacobs, labels, y, out = get_batch_jacobian(
        network, data, target, device, hidden, args
    )

    # calculate K score
    network(x2.to(device), hidden)
    k_score = hooklogdet(K_score, target)
    print("K Score: " + str(k_score))

    ### Old Jacobian Score Computation
    j_score = jacobian_score(jacobs)
    print("Jacobian Score: " + str(j_score))

    ### Jacobian Score Cosine
    c_score = cosine_score(jacobs)
    print("Jacobian Score Cosine: " + str(c_score))
    embedding_output = network.embeddings.detach().cpu().numpy()
    variance = np.var(embedding_output, axis=1, keepdims=True)
    noise = np.random.normal(size=embedding_output.shape)

    ### Small Noise Score
    sn_jacobs = noised_jacobian(
        network,
        1.0 / 10.0,
        embedding_output,
        variance,
        noise,
        target,
        device,
        hidden,
        args,
    )
    sn_score = 1 - abs(j_score - jacobian_score(sn_jacobs))
    print("Small Noise Score: " + str(sn_score))

    ### Small Noise Score Cosine
    snc_score = 1 - abs(c_score - cosine_score(sn_jacobs))
    print("Small Noise Score Cosine: " + str(snc_score))

    ### Large Noise Score
    ln_jacobs = noised_jacobian(
        network, 10, embedding_output, variance, noise, target, device, hidden, args
    )
    ln_score = 1 - abs(j_score - jacobian_score(ln_jacobs))
    print("Large Noise Score: " + str(ln_score))

    ### Large Noise Score Cosine
    lnc_score = 1 - abs(c_score - cosine_score(ln_jacobs))
    print("Large Noise Score Cosine: " + str(lnc_score))

    ### More Noised Jacobian Score
    vln_jacobs = noised_jacobian(
        network, 100, embedding_output, variance, noise, target, device, hidden, args
    )
    mnj_score = (
        j_score
        * jacobian_score(sn_jacobs)
        * jacobian_score(ln_jacobs)
        * jacobian_score(vln_jacobs)
    )
    print("More Noised Jacobian Score: " + str(mnj_score))

    ### More Noised Jacobian Score Cosine
    mnjc_score = (
        c_score
        * cosine_score(sn_jacobs)
        * cosine_score(ln_jacobs)
        * cosine_score(vln_jacobs)
    )
    print("More Noised Jacobian Score Cosine: " + str(mnjc_score))

    ### Synflow
    synflow_score = synflow(network, data, target, device, hidden, args)
    print("Synflow Score: " + str(synflow_score))
    
    ### Hiden States Scores
    hidden_states_scores = hidden_scores(hidden_states)
    print("Hidden State Scores: " + str(hidden_states_scores))

    row = [
        os.path.basename(log_file),
        test_loss,
        test_perplexity,
        test_bpw,
        k_score,
        j_score,
        c_score,
        sn_score,
        snc_score,
        ln_score,
        lnc_score,
        mnj_score,
        mnjc_score,
        synflow_score,
    ]

    print(row)
    writer.writerow(row + hidden_states_scores)

In [18]:
!ls

benchmarking_examples.ipynb  model.py			  search_space.py
calculate_ged.py	     models_weights		  setup.py
custom_rnn.py		     multilinear.py		  splitcross.py
data			     nas_environment.py		  train_logs_multi_runs
data.py			     plotting.py		  train_logs_single_run
embed_regularize.py	     __pycache__		  train_logs_wikitext-2
__init__.py		     README.md			  train.py
LICENSE			     reproduce_model.ipynb	  utils.py
locked_dropout.py	     requirements.txt		  weight_drop.py
main_one_model_train.py      search_space_analysis.ipynb
make_arch_embeddings.ipynb   search_space_examples.ipynb


In [19]:
# Load NAS-Bench-NLP with Penn Treebank dataset
suffix = "0025_2020-04-19_13-24-21_999981968"
log = json.load(
    open("train_logs_multi_runs/log_stats_model_100" + suffix + ".json", "r")
)
args = Namespace(**log)
args.cuda = True
cuda = "cuda:0"

if "test_data" not in globals():
    corpus = nas_data.Corpus(args.data)
    test_data = batchify(corpus.test, args.eval_batch_size, args, cuda)

ntokens = len(corpus.dictionary)

criterion = SplitCrossEntropyLoss(args.emsize, splits=[], verbose=False)

K_score = np.zeros((args.eval_batch_size, args.eval_batch_size))

In [20]:
ablation_models = ["log_stats_model_1009532_2020-04-13_04-50-42_999940095.json",
"log_stats_model_1003424_2020-04-22_06-45-47_999996414.json",
"log_stats_model_1006809_2020-04-14_02-17-10_999987074.json",
"log_stats_model_1004163_2020-04-21_07-14-54_999984871.json",
"log_stats_model_1011260_2020-04-15_13-12-42_999971989.json",
"log_stats_model_1002763_2020-04-22_17-21-45_999986705.json",
"log_stats_model_1013157_2020-04-14_06-17-11_999980530.json",
"log_stats_model_1002378_2020-04-16_11-37-11_999966726.json",
"log_stats_model_1001317_2020-04-21_07-33-43_999989525.json",
"log_stats_model_1013823_2020-04-16_09-58-08_999916458.json",]

In [21]:
# Ablation on different intializations
with open("data/RNN_intialization_ablation.csv", "a") as f:
    writer = csv.writer(f)
    header = [
        "log name",
        "loss",
        "perplexity",
        "bits per word",
        "K Score",
        "Jacobian Score",
        "Jacobian Score Cosine",
        "Small Noise Score",
        "Small Noise Score Cosine",
        "Large Noise Score",
        "Large Noise Score Cosine",
        "More Noised Jacobian Score",
        "More Noised Jacobian Score Cosine",
        "Synflow Score",
        "Hidden Synflow Score",
        "Hidden Layer 1 Covariance Score",
        "Hidden Layer 2 Covariance Score",
        "Hidden Layer 3 Covariance Score",
    ]
    writer.writerow(header)
    f.flush()

    directory = "nas-bench-nlp-release/train_logs_multi_runs"
    i = 1
    
    for filename in os.listdir(directory):
        log_file = os.path.join(directory, filename)
        print(i)
        try:
            run_scores(log_file, test_data, ntokens, criterion, writer)
        except:
            row = [os.path.basename(log_file)] + [math.nan] * len(header)
            writer.writerow(row)
        f.flush()
        i = i + 1
        print()

FileNotFoundError: [Errno 2] No such file or directory: 'nas-bench-nlp-release/train_logs_multi_runs'

In [23]:
# Ablation on different minibatches
with open("data/RNN_bach_ablation.csv", "a") as f:
    writer = csv.writer(f)
    header = [
        "log name",
        "loss",
        "perplexity",
        "bits per word",
        "K Score",
        "Jacobian Score",
        "Jacobian Score Cosine",
        "Small Noise Score",
        "Small Noise Score Cosine",
        "Large Noise Score",
        "Large Noise Score Cosine",
        "More Noised Jacobian Score",
        "More Noised Jacobian Score Cosine",
        "Synflow Score",
        "Hidden Synflow Score",
        "Hidden Layer 1 Covariance Score",
        "Hidden Layer 2 Covariance Score",
        "Hidden Layer 3 Covariance Score",
    ]
    writer.writerow(header)
    f.flush()

    directory = "train_logs_multi_runs"
    i = 1
    
    for filename in ablation_models:
        log_file = os.path.join(directory, filename)
        print(i)
        for j in range(0, 10):
            run_scores(log_file, test_data, ntokens, criterion, writer, seed_num=j)
            f.flush()
        i = i + 1
        print()

1
-----------------------------------------------------------------------------------------
K Score: 467.13094737260195
Jacobian Score: -971.2166577207325
Jacobian Score Cosine: 0.5011517375415844
Small Noise Score: 0.25286743696688063
Small Noise Score Cosine: 0.9999994587419816
Large Noise Score: -171.7227279373435
Large Noise Score Cosine: 0.9997801420989116
More Noised Jacobian Score: 76003042030.19421
More Noised Jacobian Score Cosine: 0.06417502913889579
Synflow Score: 2274100.5
Hidden State Scores: [3917744.75, -340872.8637571794, -5030771.8383138785, -9369538.49248696]
['log_stats_model_1009532_2020-04-13_04-50-42_999940095.json', 4.622077719679156, 101.70512749998062, 6.6682486047844876, 467.13094737260195, -971.2166577207325, 0.5011517375415844, 0.25286743696688063, 0.9999994587419816, -171.7227279373435, 0.9997801420989116, 76003042030.19421, 0.06417502913889579, 2274100.5]
-----------------------------------------------------------------------------------------
K Score: 477