Dynamic swap mvp #2

Closed · wants to merge 53 commits

Changes from all commits · 53 commits
0eefedf
Dynamic swap working, as long as the vocabularies are the same
XapaJIaMnu Mar 24, 2021
521f634
Model and GPUSlot separation, add vocab support
kpu Mar 28, 2021
67190db
Add vocabulary padding script
kpu Mar 28, 2021
b165af8
Split code into main and library h/cpp
kpu Mar 28, 2021
4d8e327
Restore ensemble support
kpu Mar 28, 2021
203a9bb
Minor logging improvements
kpu Mar 28, 2021
c71d488
Return Histories
kpu Mar 28, 2021
47feb2b
Alignments
kpu Mar 28, 2021
8fc8d02
Fix enit
kpu Mar 28, 2021
b4bded3
Merge github.com:marian-nmt/marian-dev into dynamic_swap_mvp
kpu Mar 29, 2021
b9bc153
Merge https://github.com/kpu/marian-dev into dynamic_swap_mvp
kpu Mar 29, 2021
9b3e76a
Add an option to force loading
kpu Mar 30, 2021
cf12178
Allow CPU only compilation
XapaJIaMnu Mar 30, 2021
7e06801
Add explicit gpu device index when creating the object
XapaJIaMnu Mar 30, 2021
635cfb0
Allow multiple mini-batches
XapaJIaMnu Mar 30, 2021
ee6ff75
No stringstreams
XapaJIaMnu Mar 30, 2021
57ddeba
Sort the histories before returning them
XapaJIaMnu Apr 1, 2021
4f2b218
SwappableSlot: add GPU-to-GPU reset feature
Apr 1, 2021
fa51460
Merge pull request #1 from davidecaroselli/dynamic_swap_mvp
XapaJIaMnu Apr 1, 2021
2062438
Merge branch 'dynamic_swap_mvp' of https://github.com/kpu/marian-dev …
XapaJIaMnu Apr 2, 2021
e3f5388
Separate graph from loading to GPU
XapaJIaMnu Apr 2, 2021
ba4d166
Abort if not initialized
kpu Apr 2, 2021
f8523b7
Go back to Load instead of OverwriteFrom
kpu Apr 2, 2021
8bcfdcc
Check device index
kpu Apr 2, 2021
9c906fe
Add optional parameter "max_length_factor" to BeamSearch::search(); d…
Apr 13, 2021
194c5d2
Remove unused function
kpu Apr 13, 2021
1183436
Free unused parameters of the GPUEngine
kpu Apr 13, 2021
9960571
Completely deallocate Allocator for parameters instead.
kpu Apr 13, 2021
c228ab1
Avoid calling allocator on clear
kpu Apr 13, 2021
ce9ad6c
Set inference only flag on engine
kpu Apr 13, 2021
90e161f
Merge pull request #3 from davidecaroselli/dynamic_swap_mvp
kpu Apr 14, 2021
d6550e4
Add CopyTo(ExpressionGraph) to CPU model
Apr 22, 2021
9931c7b
Remove redundant DeviceId parameter to CPULoadedModel.CopyTo()
Apr 22, 2021
ade2c04
Add on-the-fly restore capability to Adam optimizer
Apr 23, 2021
c88319b
Add restore(state) to base class OptimizerBase
Apr 26, 2021
453f805
Add CPULoadedModel constructor with vector<item> argument
Apr 27, 2021
6ee6a13
Bugfix of Tensorbase::set() - skip set when Item is memory mapped
Apr 27, 2021
a6d99c9
Add eps_ reset in Adagrad::restore()
Apr 27, 2021
8d4cff7
Make swap.h functions available to external projects
May 3, 2021
d30bf12
Add GPUSlot in swappable.h
May 3, 2021
62cddaf
added new condition to postpone validation and checkpoint saving
nicolabertoldi Apr 22, 2022
c01f4c5
added parameters to the config parser
nicolabertoldi Apr 22, 2022
5649097
renamed variable names
nicolabertoldi Apr 22, 2022
cfadf44
Merge pull request #1 from davidecaroselli/features/delayed_valid_and…
Apr 22, 2022
5dab1fa
added new condition to postpone validation and checkpoint saving
nicolabertoldi Apr 22, 2022
79e2c0a
fixings
nicolabertoldi Apr 22, 2022
2ecc247
Merge branch 'dynamic_swap_mvp' into features/postpones_valid_and_save
Apr 27, 2022
d641922
Merge pull request #3 from davidecaroselli/features/postpones_valid_a…
Apr 27, 2022
02fcd05
changed defaults
nicolabertoldi Apr 27, 2022
695b0ef
fix for handling values equals to 0
nicolabertoldi Apr 27, 2022
b31aabe
Update training_state.h
nicolabertoldi May 11, 2022
4d56bd0
Merge pull request #4 from davidecaroselli/features/postpones_valid_a…
May 17, 2022
1cc16bc
Add BeamSearch::setBeamSize() method to dynamically update beam-size
Oct 10, 2022
52 changes: 52 additions & 0 deletions scripts/contrib/pad_model_vocabulary.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# Pads a Marian model's vocabulary to have greater size. The added tokens have
# zero probability.
# ./pad_model_vocabulary.py input.npz output.npz desired_vocab_size
#
# You'll also need to separately pad your vocabulary file like so:
# old=$(wc -l input.vocab |cut -d " " -f 1)
# (cat input.vocab; seq -f "<PADDING%g>" $((desired_vocab_size-old))) >output.vocab
#
# Warning: probably only works with shared vocabulary models.
import math
import numpy as np
import sys
import yaml

# Amend the vocab size in the raw ["special:model.yml"] data from a Marian npz.
# Returns the raw data to use for ["special:model.yml"].
def substitute_vocab_config(raw, new_size):
    print("Old yml: ", raw.tobytes())
    raw_yaml = raw.tobytes().decode("utf-8")
    # Python yaml doesn't like null bytes.
    if raw_yaml.endswith("\x00"):
        raw_yaml = raw_yaml[:-1]
    config = yaml.safe_load(raw_yaml)
    config['dim-vocabs'] = [new_size] * len(config['dim-vocabs'])
    raw_yaml = yaml.dump(config)
    if raw_yaml.endswith("\n"):
        raw_yaml = raw_yaml[:-1]
    raw_yaml += "\x00"
    return np.array(bytearray(raw_yaml, 'utf-8'))

if len(sys.argv) != 4:
    print("Usage: " + sys.argv[0] + " input.npz output.npz desired_vocab_size")
    sys.exit(1)

resized_path = sys.argv[2]
new_size = int(sys.argv[3])
old_model = np.load(sys.argv[1])

new_model = dict(old_model)
old_size = len(old_model["Wemb"])
if old_size > new_size:
    sys.stderr.write("New size is smaller than original. Cowardly refusing to clip vocab.\n")
    sys.exit(2)
print("Before: ", new_model["decoder_ff_logit_out_b"].shape, new_model["Wemb"].shape)
bias = new_model["decoder_ff_logit_out_b"]
new_model["decoder_ff_logit_out_b"] = np.pad(bias, [(0,0),(0,new_size - bias.shape[1])], mode='constant', constant_values = -math.inf)
new_model["Wemb"] = np.pad(new_model["Wemb"], [(0,new_size - bias.shape[1]), (0,0)], mode='constant', constant_values = 0)
print("After: ", new_model["decoder_ff_logit_out_b"].shape, new_model["Wemb"].shape)
new_model["special:model.yml"] = substitute_vocab_config(new_model["special:model.yml"], new_size)
print("New yml: ", new_model["special:model.yml"].tobytes())
np.savez(resized_path, **new_model)
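Why the script pads the way it does: the decoder's output distribution is a softmax over logits, so giving every padding token a bias of negative infinity gives it exactly zero probability, while the zero rows added to the embedding matrix are inert unless a padding token is ever fed in. Formally,

    p_i = \frac{e^{z_i}}{\sum_j e^{z_j}}, \qquad z = Wx + b, \qquad b_i = -\infty \;\Rightarrow\; e^{z_i} = 0 \;\Rightarrow\; p_i = 0.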
8 changes: 7 additions & 1 deletion src/CMakeLists.txt
@@ -100,6 +100,7 @@ set(MARIAN_SOURCES
   translator/nth_element.cpp
   translator/helpers.cpp
   translator/scorers.cpp
+  translator/swappable.cpp

   training/graph_group_async.cpp
   training/graph_group_sync.cpp
@@ -182,6 +183,7 @@ if(CUDA_FOUND)
     tensors/gpu/add_all.cu
     tensors/gpu/tensor_operators.cu
     tensors/gpu/cudnn_wrappers.cu
+    tensors/gpu/swap.cu
     translator/nth_element.cu
     translator/helpers.cu
   STATIC)
@@ -213,6 +215,10 @@ if (NOT COMPILE_LIBRARY_ONLY)
   set_target_properties(marian_decoder PROPERTIES OUTPUT_NAME marian-decoder)
   target_compile_options(marian_decoder PRIVATE ${ALL_WARNINGS})

+  add_executable(marian_swapper command/marian_swapper.cpp)
+  set_target_properties(marian_swapper PROPERTIES OUTPUT_NAME marian_swapper)
+  target_compile_options(marian_swapper PRIVATE ${ALL_WARNINGS})
+
   add_executable(marian_scorer command/marian_scorer.cpp)
   set_target_properties(marian_scorer PROPERTIES OUTPUT_NAME marian-scorer)
   target_compile_options(marian_scorer PRIVATE ${ALL_WARNINGS})
@@ -225,7 +231,7 @@ if (NOT COMPILE_LIBRARY_ONLY)
   set_target_properties(marian_conv PROPERTIES OUTPUT_NAME marian-conv)
   target_compile_options(marian_conv PRIVATE ${ALL_WARNINGS})

-  set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv)
+  set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_swapper marian_scorer marian_vocab marian_conv)

   # marian.zip and marian.tgz
   # This combines marian, marian_decoder in a single ZIP or TAR file for
98 changes: 98 additions & 0 deletions src/command/marian_swapper.cpp
@@ -0,0 +1,98 @@
#include "translator/history.h"
#include "translator/output_printer.h"
#include "translator/swappable.h"

#include <iostream>
#include <string>
#include <unordered_map>

namespace marian {
void LoadBig(Ptr<Options> options, std::unordered_map<std::string, CPULoadedModel> &to) {
to.emplace("pten", CPULoadedModel(options,
"/home/ubuntu/consistent-big-models/padded/pten.npz",
{"/home/ubuntu/consistent-big-models/padded/pten.vocab"},
"/home/ubuntu/consistent-big-models/padded/pten.vocab"));

to.emplace("enit", CPULoadedModel(options,
"/home/ubuntu/consistent-big-models/padded/enit.npz",
{"/home/ubuntu/consistent-big-models/padded/enit.vocab"},
"/home/ubuntu/consistent-big-models/padded/enit.vocab"));
}

void LoadTiny(Ptr<Options> options, std::unordered_map<std::string, CPULoadedModel> &to) {
std::vector<std::string> models = {"csen", "encs", "enet", "eten", "esen", "enes"};
for (const std::string m : models) {
std::string base = "/home/ubuntu/consistent-bergamot-students/padded/";
base += m + ".";
to.emplace(m, CPULoadedModel(options, base + "npz", {base + "spm"}, base + "spm"));
}
}

} // namespace

/* Demo program: run with options for any of the models */
int main(int argc, char** argv) {
using namespace marian;
Ptr<Options> options = parseOptions(argc, argv, cli::mode::translation);

Ptr<GPUEngine> engine = New<GPUEngine>(options, 0);
GPULoadedModel slot(engine);

std::unordered_map<std::string, CPULoadedModel> models;
// LoadBig(options, models);
LoadTiny(options, models);

// begin with a space to avoid conflict with a real sentence.
const std::string kSwitchPrefix(" CHANGE ");

bool alignments = !options->get<std::string>("alignment").empty();

bool loaded = false;
std::string line;
while (std::getline(std::cin, line)) {
// Switch out which model is used.
if (line.substr(0, kSwitchPrefix.size()) == kSwitchPrefix) {
std::string key = line.substr(kSwitchPrefix.size());
auto found = models.find(key);
if (found == models.end()) {
std::cerr << "Model for " << key << " not loaded." << std::endl;
return 1;
}
slot.Load(found->second);
loaded = true;
continue;
}
if (!loaded) {
std::cerr << "Select a model first." << std::endl;
continue;
}

// Actually translating with a model.
marian::Histories histories = slot.Translate({line});
// In practice there is one history because we provided one line.
for(auto history : histories) {
Result result(history->top());
Words words = std::get<0>(result);
std::cout << slot.TrgVocab()->decode(words) << std::endl;

/* Print alignments */
if (alignments) {
Hypothesis &hypo = *std::get<1>(result);
// [t][s] -> P(s|t)
marian::data::SoftAlignment alignment(hypo.tracebackAlignment());
// An easier call for this is:
// std:cout << data::SoftAlignToString(alignment);
// The below is just there to show how access them programatically.
// NB you can convert to hard with data::ConvertSoftAlignToHardAlign(alignment, threshold)
for (auto target : alignment) {
for (float source : target) {
std::cout << source << ' ';
}
std::cout << '\n';
}
}
}
}

return 0;
}
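A quick way to exercise the demo loop (the paths and config are illustrative; it assumes the hard-coded padded student models above exist on disk):

    printf ' CHANGE csen\nDobrý den.\n' | ./marian_swapper -c config.yml

The first line swaps the csen model into the GPU slot (note the mandatory leading space in the CHANGE command); every following line is translated with the currently loaded model until another CHANGE line swaps in a different one, without restarting the process.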
13 changes: 10 additions & 3 deletions src/common/config_parser.cpp
@@ -396,6 +396,9 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
       "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt). "
       "Second parameter defines width of fractional display, 0 by default.",
       {"1e", "0"});
+  cli.add<std::string>("--save-from",
+      "Save model not before arg updates (append 't' for every arg target labels)",
+      "0u");

   addSuboptionsInputLength(cli);
   addSuboptionsTSV(cli);
@@ -572,8 +575,11 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
   cli.add<bool>("--valid-reset-stalled",
       "Reset all stalled validation metrics when the training is restarted");
   cli.add<size_t>("--early-stopping",
-    "Stop if the first validation metric does not improve for arg consecutive validation steps",
-    10);
+      "Stop if the first validation metric does not improve for arg consecutive validation steps",
+      10);
+  cli.add<std::string>("--valid-from",
+      "Validate model not before arg updates (append 't' for every arg target labels)",
+      "0u");

   // decoding options
   cli.add<size_t>("--beam-size,-b",
@@ -691,7 +697,8 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
   cli.add<std::vector<int>>("--output-approx-knn",
       "Use approximate knn search in output layer (currently only in transformer)")
       ->implicit_val("100 1024");
-
+  cli.add<std::string>("--swap-model",
+      "Path to model to swap to.");
 #if 0 // @TODO: Ask Hany if there are any decoding-time options
   // add ULR settings
   addSuboptionsULR(cli);
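For reference, these "--valid-from"/"--save-from" values pair a count with a unit suffix, in the style of "100Ku" from the logical-epoch help text above: updates by default, target labels with a trailing 't', and the "0u" default disables the gate. A minimal self-contained sketch of the postponement check this implies; all names below are illustrative, not the PR's actual implementation:

#include <cstddef>

// Illustrative sketch: skip validation/saving until training progress
// passes a "from" threshold parsed out of --valid-from / --save-from.
enum class Unit { Updates, TargetLabels };

struct Threshold {
  std::size_t n = 0;             // e.g. 100000 for "100Ku"
  Unit unit = Unit::Updates;     // 'u' = updates, 't' = target labels
};

// Progress counters a trainer already tracks.
struct Progress {
  std::size_t updates = 0;       // optimizer steps so far
  std::size_t targetLabels = 0;  // target tokens seen so far
};

bool mayValidate(const Threshold &validFrom, const Progress &p) {
  std::size_t seen =
      (validFrom.unit == Unit::Updates) ? p.updates : p.targetLabels;
  return seen >= validFrom.n;    // with the default "0u" this is always true
}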
39 changes: 25 additions & 14 deletions src/graph/parameters.h
@@ -2,6 +2,7 @@

 #include <fstream>
 #include <map>
+#include <unordered_map>
 #include <unordered_set>

 #include "common/definitions.h"
@@ -22,12 +23,12 @@ class Parameters {

   /** @brief List of all parameter nodes of this expression graph. */
   std::vector<Expr> params_;
-  std::map<std::string, Expr> named_;
+  std::unordered_map<std::string, Expr> named_;

-  Ptr<TensorAllocator> vals_;
-  Ptr<TensorAllocator> grads_;
+  std::unique_ptr<TensorAllocator> vals_;
+  std::unique_ptr<TensorAllocator> grads_;

-  size_t totalCapacity(Ptr<TensorAllocator> alloc) {
+  size_t totalCapacity(const TensorAllocator *alloc) {
     size_t sum = 0;
     for(auto p : params_) {
       sum += alloc->capacity(p->shape(), p->value_type());
@@ -73,18 +74,18 @@ class Parameters {
   }

   virtual void init(Ptr<Backend> backend) {
-    vals_ = New<TensorAllocator>(backend);
-    grads_ = New<TensorAllocator>(backend);
+    vals_.reset(new TensorAllocator(backend));
+    grads_.reset(new TensorAllocator(backend));
   }

   virtual void init(Ptr<Backend> backend, Ptr<Device> device) {
-    vals_ = New<TensorAllocator>(backend, device);
-    grads_ = New<TensorAllocator>(backend, device);
+    vals_.reset(new TensorAllocator(backend, device));
+    grads_.reset(new TensorAllocator(backend, device));
   }

   virtual void allocateForward() {
-    if(!params_.empty() && vals_->size() == 0) {
-      vals_->reserveExact(totalCapacity(vals_));
+    if(!params_.empty() && vals_ && vals_->size() == 0) {
+      vals_->reserveExact(totalCapacity(vals_.get()));

       // sort parameters by name before allocation to make sure the memory layout after allocation is always the same
       std::sort(params_.begin(), params_.end(), [](Expr n1, Expr n2){ return n1->name() < n2->name(); });
@@ -98,12 +99,12 @@ class Parameters {
   }

   virtual void allocateBackward() {
-    if(!params_.empty() && grads_->size() == 0) {
+    if(!params_.empty() && grads_ && grads_->size() == 0) {

       // sort parameters by name before allocation to make sure the memory layout after allocation is always the same
       std::sort(params_.begin(), params_.end(), [](Expr n1, Expr n2){ return n1->name() < n2->name(); });

-      grads_->reserveExact(totalCapacity(grads_));
+      grads_->reserveExact(totalCapacity(grads_.get()));
       for(auto p : params_)
         if(!p->grad())
           grads_->allocate(p->grad(), p->shape(), p->value_type());
@@ -120,8 +121,14 @@ class Parameters {
     params_.clear();
     named_.clear();

-    vals_->clear();
-    grads_->clear();
+    if (vals_) vals_->clear();
+    if (grads_) grads_->clear();
+  }
+
+  // Free the underlying memory but leave the parameters and names with dangling pointers. Only use this if you're going to edit the memory pointers manually.
+  virtual void freeMemory() {
+    vals_.reset();
+    grads_.reset();
   }
 };

@@ -167,6 +174,10 @@ class MappedParameters : public Parameters {
     params_.clear();
     named_.clear();
   }
+
+  void freeMemory() override {
+    ABORT("Have not implemented partial munmap for parameters");
+  }
 };

 } // namespace marian
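Design note on the changes above: moving vals_ and grads_ from the shared Ptr to std::unique_ptr makes the Parameters object the sole owner of its allocators, which is what lets the new freeMemory() release a swapped-out model's memory deterministically. The added null checks in allocateForward(), allocateBackward(), and clear() keep those paths safe after such a release, and MappedParameters aborts because memory-mapped parameters cannot be partially unmapped.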
29 changes: 27 additions & 2 deletions src/optimizers/optimizers.h
@@ -60,6 +60,8 @@ class OptimizerBase : public TrainingObserver, public ExponentialSmoothing {
     batchesSeen_ = state.batches;
   }

+  virtual void restore(const std::vector<io::Item> &state) {};
+
   virtual void actAfterLoaded(TrainingState& state) override {
     eta_ = state.eta;
     batchesSeen_ = state.batches;
@@ -195,19 +197,32 @@ class Adagrad : public OptimizerBase {

   void setParams(const std::vector<float>& params) override {
     if(params.size() > 0)
-      eps_ = params[0];
+      def_eps_ = eps_ = params[0];
   }

-    std::vector<Tensor> getShards() override {
+  std::vector<Tensor> getShards() override {
     auto shards = OptimizerBase::getShards();
     shards.push_back(gt_);
     return shards;
   }

+  void restore(const std::vector<io::Item> &state) override {
+    OptimizerBase::restore(state);
+
+    for (auto &item : state) {
+      if ("adagrad_gt" == item.name) {
+        if (gt_) gt_->set(item);
+      }
+    }
+
+    eps_ = def_eps_;
+  }
+
 private:
   void updateImpl(Tensor params, Tensor grads, size_t actualMBSize) override;
   void resetStats() override;

+  float def_eps_ = 1e-8f;
   float eps_ = 1e-8f;
   Ptr<TensorAllocator> alloc_;
   Tensor gt_;
@@ -242,6 +257,16 @@ class Adam : public OptimizerBase {
     return shards;
   }

+  void restore(const std::vector<io::Item> &state) override {
+    for (auto &item : state) {
+      if ("adam_mt" == item.name) {
+        if (mt_) mt_->set(item);
+      } else if ("adam_vt" == item.name) {
+        if (vt_) vt_->set(item);
+      }
+    }
+  }
+
 private:
   void updateImpl(Tensor params, Tensor grads, size_t actualMBSize) override;
   void resetStats() override;
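A sketch of how the new restore() entry point might be driven, assuming optimizer shards were saved the way marian checkpoints them; the wrapper function and file name below are illustrative, not part of the PR:

#include "common/io.h"
#include "optimizers/optimizers.h"

#include <string>
#include <vector>

// Illustrative: rehydrate a live optimizer's shards (Adam's "adam_mt"/"adam_vt",
// or Adagrad's "adagrad_gt") from a saved checkpoint file, without going
// through the full OptimizerBase::load() path.
void restoreOptimizer(marian::Ptr<marian::OptimizerBase> opt,
                      const std::string &path /* e.g. "model.npz.optimizer.npz" */) {
  std::vector<marian::io::Item> items = marian::io::loadItems(path);
  opt->restore(items);  // each optimizer picks out the item names it recognizes
}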
4 changes: 2 additions & 2 deletions src/tensors/allocator.h
@@ -175,7 +175,7 @@ class Allocator {
     reserve(bytes);
   }

-  size_t alignedSize(size_t size) {
+  size_t alignedSize(size_t size) const {
     return (size_t)(ceil(size / (double)alignment_) * alignment_);
   }

@@ -189,7 +189,7 @@
   }

   template <typename T>
-  size_t capacity(size_t num) {
+  size_t capacity(size_t num) const {
     return alignedSize(num * sizeof(T));
   }
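These const qualifiers presumably tie into the parameters.h change above: totalCapacity() now receives a const TensorAllocator*, so the size queries it ultimately relies on, capacity() and alignedSize(), have to be callable on const objects. Both are pure size computations, so marking them const changes no behavior.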