From 45c3fe8145627d9bda80f58b226e4ab0c03bffbb Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 14 Dec 2019 12:56:17 -0800
Subject: [PATCH 01/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 12 ++++++
 nestedtensor/csrc/jit_list_apply.h   |  4 +-
 nestedtensor/csrc/py_init.cpp        |  1 +
 nestedtensor/nested/utils.py         | 62 ++++++++++++++++++++++++++++
 nestedtensor/version.py              |  4 +-
 5 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 99ee21bd..e96999c7 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -81,5 +81,17 @@ THP_ListNestedTensor jit_apply_function(
   return THP_ListNestedTensor(_ListNestedTensor(nested_node));
 }
 
+py::cpp_function jit_tensorwise() {
+  return py::cpp_function([](py::function f) {
+    return py::cpp_function([f](py::args args, py::kwargs kwargs) {
+        std::vector<py::object> result;
+      for (size_t i = 0; i < args.size(); i++) {
+        result.push_back(f(args[i]));
+      }
+      return THP_ListNestedTensor(py::list(result));
+    });
+  });
+}
+
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 7d774439..e95a84df 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -4,11 +4,13 @@
 #include <torch/csrc/jit/pybind_utils.h>
 #include <torch/csrc/utils/python_strings.h>
 #include <torch/extension.h>
+#include <pybind11/functional.h>
 
 namespace torch {
 namespace nested_tensor {
 THP_ListNestedTensor jit_apply_function(
     std::vector<THP_ListNestedTensor> nts_,
     py::object fn);
-}
+py::cpp_function jit_tensorwise();
+} // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp
index c5a5bf03..548ec0ec 100644
--- a/nestedtensor/csrc/py_init.cpp
+++ b/nestedtensor/csrc/py_init.cpp
@@ -150,4 +150,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
           &torch::nested_tensor::THP_BufferNestedTensor::get_buffer);
 
   m.def("jit_apply_function", &torch::nested_tensor::jit_apply_function);
+  m.def("jit_tensorwise", &torch::nested_tensor::jit_tensorwise);
 }
diff --git a/nestedtensor/nested/utils.py b/nestedtensor/nested/utils.py
index d4851447..e10b3c17 100644
--- a/nestedtensor/nested/utils.py
+++ b/nestedtensor/nested/utils.py
@@ -163,8 +163,70 @@ def match_type_signature_prefix(types, args):
 # Make nested_stride optional (cont. by default)
 # Return flattened tensor pairs, then create _BufferNestedTensor impl directly
 
+def __gen_unbound(*args, **kwargs):
+    # Unbind everything via __getitem__ that is either NestedTensor or in unbind_args
+    # All args to-be-unbound should match in length
+
+    dispatch_key = find_nested_tensor_dispatch_key(*args)
+    key_len = len(dispatch_key)
+
+    unbound_args = []
+    for i, arg in enumerate(args):
+        if is_nested_tensor(arg):
+            assert len(arg) == key_len
+            unbound_args.append(tuple(arg[i] for i in range(key_len)))
+        else:
+            unbound_args.append(tuple(arg for _ in range(key_len)))
+
+    unbound_kwargs = []
+    for k, arg in kwargs.items():
+        if is_nested_tensor(arg) or k in unb_args:
+            assert len(arg) == key_len
+            new_kwarg = tuple((k, arg[i]) for i in range(key_len))
+        else:
+            new_kwarg = tuple((k, arg) for _ in range(key_len))
+        unbound_kwargs.append(new_kwarg)
+
+    args_gen = zip(*unbound_args)
+    for new_args in args_gen:
+        yield (new_args, {})
+
+def _tensorwise():
+
+    def wrapper(f):
+        @wraps(f)
+        def decorator(*_args, **_kwargs):
+            def _func(*args, **kwargs):
+                if find_nested_tensor_dispatch_key(*args) is None:
+                    # import pdb; pdb.set_trace()
+                    result = f(*args, **kwargs)
+                    if not torch.is_tensor(result):
+                        return tuple(result)
+                    return result
+                else:
+                    results = []
+                    for local_args, local_kwargs in __gen_unbound(*args, **kwargs):
+                        results.append(_func(*local_args, **local_kwargs))
+                    return results
+            dispatch_key = find_nested_tensor_dispatch_key(*_args)
+            if dispatch_key is None:
+                return f(*_args, **_kwargs)
+            else:
+                args = _args
+                kwargs = _kwargs
+                results = _func(*args, **kwargs)
+                results = _unwrap_tensor_tuples(results)
+                if len(results) == 1:
+                    return creation.nested_tensor(results[0])
+                return tuple(map(creation.nested_tensor, results))
+
+        return decorator
+    return wrapper
 
 def tensorwise(unbind_args=None, dim_args=None, wrap_dim_args=True):
+    if unbind_args is None and dim_args is None and wrap_dim_args is False:
+        return _tensorwise()
+
     if unbind_args is None:
         unbind_args = []
     if dim_args is None:
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 9db2b704..3ccbf551 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912143+f576372'
-git_version = 'f576372b54f808280193f936cd6079e3037e2303'
+__version__ = '0.0.1.dev2019121420+a5dfdf7'
+git_version = 'a5dfdf783b4186371112e56ed9addac5dc98117e'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 331190b31d0288a74082e5321aadde8337280fb7 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 14 Dec 2019 13:59:54 -0800
Subject: [PATCH 02/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 17 ++++++++++++-----
 nestedtensor/csrc/jit_list_apply.h   |  1 +
 nestedtensor/csrc/nested_node.h      |  2 ++
 nestedtensor/version.py              |  4 ++--
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index e96999c7..1c8e21d8 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -1,4 +1,5 @@
 #include <jit_list_apply.h>
+#include <python_list_nested_tensor.h>
 
 namespace torch {
 namespace nested_tensor {
@@ -82,13 +83,19 @@ THP_ListNestedTensor jit_apply_function(
 }
 
 py::cpp_function jit_tensorwise() {
-  return py::cpp_function([](py::function f) {
-    return py::cpp_function([f](py::args args, py::kwargs kwargs) {
-        std::vector<py::object> result;
+  return py::cpp_function([](py::object fn) {
+    return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
+      auto sfn = py::cast<StrongFunctionPtr>(fn);
+      Function& f = *sfn.function_;
+      std::vector<TensorNode> nested_nodes;
       for (size_t i = 0; i < args.size(); i++) {
-        result.push_back(f(args[i]));
+        nested_nodes.push_back(
+            py::cast<THP_ListNestedTensor>(args[i]).data().get_structure());
       }
-      return THP_ListNestedTensor(py::list(result));
+      py::gil_scoped_release release;
+      TensorNode result = apply_jit_function(nested_nodes, f);
+      py::gil_scoped_acquire acquire;
+      return THP_ListNestedTensor(_ListNestedTensor(result));
     });
   });
 }
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index e95a84df..315467e7 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -5,6 +5,7 @@
 #include <torch/csrc/utils/python_strings.h>
 #include <torch/extension.h>
 #include <pybind11/functional.h>
+#include <pybind11/stl.h>
 
 namespace torch {
 namespace nested_tensor {
diff --git a/nestedtensor/csrc/nested_node.h b/nestedtensor/csrc/nested_node.h
index 5746067b..46240ba5 100644
--- a/nestedtensor/csrc/nested_node.h
+++ b/nestedtensor/csrc/nested_node.h
@@ -88,6 +88,7 @@ inline py::object wrap_nested_node(NestedNode<A> nested_node) {
   return result1;
 }
 
+// TODO: Add correct indentation
 static std::string _NestedNode___str__(const TensorNode& nested_node) {
   std::stringstream result;
   result << "nested_tensor([";
@@ -97,6 +98,7 @@ static std::string _NestedNode___str__(const TensorNode& nested_node) {
       PyObject* objectsRepresentation =
           PyObject_Str(THPVariable_Wrap(nested_node.payload(i)));
       result << THPUtils_unpackString(objectsRepresentation);
+      result << "," << std::endl;
     }
   } else {
     result << "  ";
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 3ccbf551..218fa26a 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev2019121420+a5dfdf7'
-git_version = 'a5dfdf783b4186371112e56ed9addac5dc98117e'
+__version__ = '0.0.1.dev2019121421+45c3fe8'
+git_version = '45c3fe8145627d9bda80f58b226e4ab0c03bffbb'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 06a70b913ddf9365632a50c3a8b2c1df48d226db Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 14 Dec 2019 14:07:29 -0800
Subject: [PATCH 03/49] Checkpoint

---
 benchmarks/jit_tensorwise.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 benchmarks/jit_tensorwise.py

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
new file mode 100644
index 00000000..a2bac43c
--- /dev/null
+++ b/benchmarks/jit_tensorwise.py
@@ -0,0 +1,16 @@
+import torch
+import nestedtensor
+import utils
+
+
+@nestedtensor._C.jit_tensorwise()
+@torch.jit.script
+def f(i, w):
+    return torch.conv2d(i, w)
+
+
+if __name__ == "__main__":
+    r = f(nestedtensor._C._ListNestedTensor([torch.randn(1, 3, 10, 20)]),
+        nestedtensor._C._ListNestedTensor([torch.randn(5, 3, 3, 3)]))
+    
+    print(r.nested_size())

From ee5f59f8f7311b5a99a956e0fc3de9a37e3f7c19 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Mon, 16 Dec 2019 19:46:27 -0800
Subject: [PATCH 04/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 102 +++++++++++++++++++--------
 nestedtensor/csrc/jit_list_apply.h   |   4 +-
 nestedtensor/version.py              |   4 +-
 3 files changed, 75 insertions(+), 35 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 1c8e21d8..7e9eae4c 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -3,53 +3,93 @@
 
 namespace torch {
 namespace nested_tensor {
+
+struct ArgWrapper {
+  ArgWrapper(TensorNode nested_tensor)
+      : _is_nested_tensor(true), _nested_tensor(nested_tensor) {}
+  ArgWrapper(c10::IValue ivalue) : _is_nested_tensor(false), _ivalue(ivalue) {}
+
+  bool is_nested_tensor() {
+    return _is_nested_tensor;
+  }
+
+  c10::IValue ivalue() {
+    return _ivalue;
+  }
+
+  TensorNode nested_tensor() {
+    return _nested_tensor;
+  }
+
+ private:
+  bool _is_nested_tensor;
+  c10::IValue _ivalue;
+  TensorNode _nested_tensor;
+};
+
+// TODO: Assert that one arg must be a nestedtensor?
 static TensorNode apply_jit_function(
-    const std::vector<TensorNode>& nested_nodes,
+    const std::vector<ArgWrapper>& args,
     Function& fn) {
   bool all_leaf = true;
-  for (size_t i = 0; i < nested_nodes.size(); i++) {
-    all_leaf = all_leaf && nested_nodes[i].is_leaf();
+  for (size_t i = 0; i < args.size(); i++) {
+    if (args[i].is_nested_tensor()) {
+      all_leaf = all_leaf && args[i].nested_tensor().is_leaf();
+    }
   }
   if (all_leaf) {
-    // NOTE: Assuming this is a pure function not a method (no self!)
-    // NOTE: We assume there is only one Tensor inputs.
     // NOTE: We assume no named tensors and no sparse variables as
-    // appropriate
-    // for TorchScript. NOTE: We know the IValues of the argument, there is
-    // no
-    // need to cast around.
-    c10::List<at::Tensor> result;
-    for (size_t j = 0; j < nested_nodes[0].size(); j++) {
-      Stack stack;
-      for (size_t i = 0; i < nested_nodes.size(); i++) {
-        push(stack, nested_nodes[i].payload(j));
+    // appropriate for TorchScript.
+    // TODO: Assert leaf sizes match and are non-zero, otherwise this isn't
+    // a NestedTensor function.
+    size_t leaf_size = 0;
+    for (size_t i = 0; i < args.size(); i++) {
+      if (args[i].is_nested_tensor()) {
+        leaf_size = args[i].size();
+        break;
+      }
+    }
+    std::vector<std::vector<IValue>> stacks(leaf_size);
+    for (size_t j = 0; j < leaf_size; j++) {
+      for (size_t i = 0; i < args.size(); i++) {
+        if (args[i].is_nested_tensor() {
+          stacks[j].push_back(args[i].nested_tensor().payload(j));
+        } else {
+          stacks[j].push_back(args[i].ivalue());
+        }
       }
-      fn.run(stack);
-      result.push_back(stack.back().toTensor());
+    }
+    c10::List<at::Tensor> results;
+    for (size_t i = 0; i < stacks.size(); i++) {
+      result.push_back(fn(stacks[i]));
     }
     return TensorNode(result);
   } else {
     bool broadcastable = true;
     size_t num_children = 0;
-    for (size_t i = 0; i < nested_nodes.size(); i++) {
-      if (!nested_nodes[i].is_leaf()) {
+    for (size_t i = 0; i < args.size(); i++) {
+      if (args[i].is_nested_tensor() && !args[i].is_leaf()) {
         if (num_children > 0) {
-          broadcastable =
-              broadcastable && (num_children == nested_nodes[i].degree());
+          broadcastable = broadcastable && (num_children == args[i].degree());
         } else {
-          num_children = nested_nodes[i].degree();
+          num_children = args[i].degree();
         }
       }
     }
     TORCH_CHECK(broadcastable, "Can't broadcast given nested tensors");
     std::vector<TensorNode> result;
     for (size_t i = 0; i < num_children; i++) {
-      std::vector<TensorNode> local_args;
-      for (size_t j = 0; j < nested_nodes.size(); j++) {
-        if (nested_nodes[j].is_leaf()) {
-          local_args.push_back(nested_nodes[j]);
+      std::vector<ArgWrapper> local_args;
+      for (size_t j = 0; j < args.size(); j++) {
+        if (args[j].is_nested_tensor()) {
+          if (args[j].nested_tensor().is_leaf()) {
+            local_args.push_back(args[j]);
+          } else {
+            local_args.push_back(
+                ArgWrapper(args[j].nested_tensor().children(i)));
+          }
         } else {
-          local_args.push_back(nested_nodes[j].children(i));
+          local_args.push_back(ArgWrapper(args[j].ivalue()));
         }
       }
       result.push_back(apply_jit_function(local_args, fn));
@@ -72,9 +112,9 @@ THP_ListNestedTensor jit_apply_function(
   TORCH_CHECK(
       schema.arguments().size() == nts.size(),
       "Give NestedTensors don't match function args.");
-  std::vector<TensorNode> nested_nodes;
+  std::vector<ArgWrapper> nested_nodes;
   for (size_t i = 0; i < nts.size(); i++) {
-    nested_nodes.push_back(nts[i].get_structure());
+    nested_nodes.push_back(ArgWrapper(nts[i].get_structure()));
   }
   py::gil_scoped_release release;
   TensorNode nested_node = apply_jit_function(nested_nodes, callee);
@@ -87,10 +127,10 @@ py::cpp_function jit_tensorwise() {
     return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
       auto sfn = py::cast<StrongFunctionPtr>(fn);
       Function& f = *sfn.function_;
-      std::vector<TensorNode> nested_nodes;
+      std::vector<ArgWrapper> nested_nodes;
       for (size_t i = 0; i < args.size(); i++) {
-        nested_nodes.push_back(
-            py::cast<THP_ListNestedTensor>(args[i]).data().get_structure());
+        nested_nodes.push_back(ArgWrapper(
+            py::cast<THP_ListNestedTensor>(args[i]).data().get_structure()));
       }
       py::gil_scoped_release release;
       TensorNode result = apply_jit_function(nested_nodes, f);
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 315467e7..5177a42d 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -1,11 +1,11 @@
 #include <Python.h>
+#include <pybind11/functional.h>
+#include <pybind11/stl.h>
 #include <python_list_nested_tensor.h>
 #include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/jit/pybind_utils.h>
 #include <torch/csrc/utils/python_strings.h>
 #include <torch/extension.h>
-#include <pybind11/functional.h>
-#include <pybind11/stl.h>
 
 namespace torch {
 namespace nested_tensor {
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index f0d848f7..793cd0d9 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912171+ddcee83'
-git_version = 'ddcee83837d2bea240224e272cbd09d803a52f36'
+__version__ = '0.0.1.dev201912173+d14cf1e'
+git_version = 'd14cf1e8c83d12c6483992ced0cebbc66ffb6c41'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From ff04e1cec065b69d617819cb6537bd637493ed7a Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Mon, 16 Dec 2019 20:44:19 -0800
Subject: [PATCH 05/49] Checkpoint

---
 benchmarks/nearest_neighbors.py          | 20 ++++---
 nestedtensor/csrc/buffer_nested_tensor.h |  8 ++-
 nestedtensor/csrc/jit_list_apply.cpp     | 28 ++++++----
 nestedtensor/nested/nested.py            |  7 ++-
 nestedtensor/nested/utils.py             | 66 ++----------------------
 nestedtensor/version.py                  |  4 +-
 6 files changed, 44 insertions(+), 89 deletions(-)

diff --git a/benchmarks/nearest_neighbors.py b/benchmarks/nearest_neighbors.py
index be8cf5a9..fc5326a2 100644
--- a/benchmarks/nearest_neighbors.py
+++ b/benchmarks/nearest_neighbors.py
@@ -1,11 +1,11 @@
-from nestedtensor import torch
 import nestedtensor
+import torch
 import argparse
 import time
 import random
 import pprint
 
-EMBED_DIM = 1024
+EMBED_DIM = 128
 
 SEED = 0
 
@@ -60,8 +60,8 @@ def gen_algorithm_nested_mv(keys, sub_clusters):
     for sub_cluster in sub_clusters:
         new_sub_cluster = [torch.tensor(list(map(list, cluster))) for cluster in sub_cluster]
         new_sub_clusters.append(new_sub_cluster)
-    nested_sub_clusters = torch.nested_tensor(sub_clusters).to_tensor(2)
-    nested_keys = torch.nested_tensor(keys)
+    nested_sub_clusters = nestedtensor.nested_tensor(sub_clusters).to_tensor(2)
+    nested_keys = nestedtensor.nested_tensor(keys)
     def _nested_mv():
         return torch.mv(nested_sub_clusters, nested_keys)
     return _nested_mv
@@ -75,17 +75,15 @@ def gen_algorithm_nested_jit_mv(keys, sub_clusters):
             new_sub_cluster.append(torch.stack(cluster))
         new_sub_clusters.append(new_sub_cluster)
     nested_sub_clusters = nestedtensor._ListNestedTensor(new_sub_clusters)
-    print("HERE")
-    print(nested_sub_clusters.nested_size())
     nested_keys = nestedtensor._ListNestedTensor(keys)
-    print(nested_keys.nested_size())
 
+    @nestedtensor._C.jit_tensorwise()
     @torch.jit.script
     def my_fun(x, y):
         return torch.mv(x, y)
 
     def _nested_jit_mv():
-        return nestedtensor._C.jit_apply_function((nested_sub_clusters, nested_keys), my_fun)
+        return my_fun(nested_sub_clusters, nested_keys)
     return _nested_jit_mv
 
 
@@ -141,9 +139,9 @@ def benchmark_fn(fn, run_time = 15.0):
     gen_results_nested_mv = gen_algorithm_nested_mv(keys, sub_clusters)
     gen_results_nested_jit_mv = gen_algorithm_nested_jit_mv(keys, sub_clusters)
 
-    # print(benchmark_fn(gen_results_naive))
-    # print(benchmark_fn(gen_results_mv))
-    # print(benchmark_fn(gen_results_nested_mv))
+    print(benchmark_fn(gen_results_naive))
+    print(benchmark_fn(gen_results_mv))
+    print(benchmark_fn(gen_results_nested_mv))
     print(benchmark_fn(gen_results_nested_jit_mv))
     # import cProfile, pstats, io
     # pr = cProfile.Profile()
diff --git a/nestedtensor/csrc/buffer_nested_tensor.h b/nestedtensor/csrc/buffer_nested_tensor.h
index 5806a2b6..3d904a89 100644
--- a/nestedtensor/csrc/buffer_nested_tensor.h
+++ b/nestedtensor/csrc/buffer_nested_tensor.h
@@ -130,9 +130,13 @@ struct TORCH_API _BufferNestedTensor {
       new_size.push_back(start->degree());
       start = start->children_data(0);
     }
-    for (size_t i = 0; i < start->payload(0).size(); i++) {
-      new_size.push_back(start->payload(0)[i]);
+    new_size.push_back(start->size());
+    if (start->size() > 0) {
+      for (size_t i = 0; i < start->payload(0).size(); i++) {
+        new_size.push_back(start->payload(0)[i]);
+      }
     }
+    std::cout << "new_size: " << new_size << std::endl;
     return _buffer.reshape(at::IntArrayRef(new_size));
   }
 
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 7e9eae4c..70a0562a 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -29,7 +29,7 @@ struct ArgWrapper {
 
 // TODO: Assert that one arg must be a nestedtensor?
 static TensorNode apply_jit_function(
-    const std::vector<ArgWrapper>& args,
+    std::vector<ArgWrapper>& args,
     Function& fn) {
   bool all_leaf = true;
   for (size_t i = 0; i < args.size(); i++) {
@@ -45,14 +45,14 @@ static TensorNode apply_jit_function(
     size_t leaf_size = 0;
     for (size_t i = 0; i < args.size(); i++) {
       if (args[i].is_nested_tensor()) {
-        leaf_size = args[i].size();
+        leaf_size = args[i].nested_tensor().size();
         break;
       }
     }
     std::vector<std::vector<IValue>> stacks(leaf_size);
     for (size_t j = 0; j < leaf_size; j++) {
       for (size_t i = 0; i < args.size(); i++) {
-        if (args[i].is_nested_tensor() {
+        if (args[i].is_nested_tensor()) {
           stacks[j].push_back(args[i].nested_tensor().payload(j));
         } else {
           stacks[j].push_back(args[i].ivalue());
@@ -61,18 +61,19 @@ static TensorNode apply_jit_function(
     }
     c10::List<at::Tensor> results;
     for (size_t i = 0; i < stacks.size(); i++) {
-      result.push_back(fn(stacks[i]));
+      results.push_back(fn(stacks[i]).toTensor());
     }
-    return TensorNode(result);
+    return TensorNode(results);
   } else {
     bool broadcastable = true;
     size_t num_children = 0;
     for (size_t i = 0; i < args.size(); i++) {
-      if (args[i].is_nested_tensor() && !args[i].is_leaf()) {
+      if (args[i].is_nested_tensor() && !args[i].nested_tensor().is_leaf()) {
         if (num_children > 0) {
-          broadcastable = broadcastable && (num_children == args[i].degree());
+          broadcastable = broadcastable &&
+              (num_children == args[i].nested_tensor().degree());
         } else {
-          num_children = args[i].degree();
+          num_children = args[i].nested_tensor().degree();
         }
       }
     }
@@ -129,8 +130,15 @@ py::cpp_function jit_tensorwise() {
       Function& f = *sfn.function_;
       std::vector<ArgWrapper> nested_nodes;
       for (size_t i = 0; i < args.size(); i++) {
-        nested_nodes.push_back(ArgWrapper(
-            py::cast<THP_ListNestedTensor>(args[i]).data().get_structure()));
+        if (py::isinstance<THP_ListNestedTensor>(args[i])) {
+          nested_nodes.push_back(ArgWrapper(
+              py::cast<THP_ListNestedTensor>(args[i]).data().get_structure()));
+        } else if (py::isinstance<THP_BufferNestedTensor>(args[i])) {
+          nested_nodes.push_back(ArgWrapper(
+              py::cast<THP_BufferNestedTensor>(args[i]).data().get_structure()));
+        } else {
+          nested_nodes.push_back(ArgWrapper(toTypeInferredIValue(args[i])));
+        }
       }
       py::gil_scoped_release release;
       TensorNode result = apply_jit_function(nested_nodes, f);
diff --git a/nestedtensor/nested/nested.py b/nestedtensor/nested/nested.py
index a8aa2f3d..cadfefc9 100644
--- a/nestedtensor/nested/nested.py
+++ b/nestedtensor/nested/nested.py
@@ -263,7 +263,12 @@ def to_tensor(self, dim=0):
         if dim == 0:
             if None in self.size():
                 raise ValueError("Shape not Tensor compliant")
-            return self._impl.to_tensor()
+            print('self.nested_size()')
+            print(self.nested_size())
+            result = self._impl.to_tensor()
+            print('result.size()')
+            print(result.size())
+            return result
         # If dim is bigger than nested_dim the NestedTensor is already
         # of Tensor for dimensions bigger than the given.
         if self.nested_dim() == 1:
diff --git a/nestedtensor/nested/utils.py b/nestedtensor/nested/utils.py
index c8e196aa..6100f591 100644
--- a/nestedtensor/nested/utils.py
+++ b/nestedtensor/nested/utils.py
@@ -157,76 +157,16 @@ def match_type_signature_prefix(types, args):
             return False
     return True
 
+def jit_tensorwise():
+    pass
+
 # The assumption is that f can handle a list of tensors
 # This is used to write tensor-wise functions
 # The resulting function accepts a multiple NestedTensors as arguments
 # and calls f tensor-wise
 # Make nested_stride optional (cont. by default)
 # Return flattened tensor pairs, then create _BufferNestedTensor impl directly
-
-def __gen_unbound(*args, **kwargs):
-    # Unbind everything via __getitem__ that is either NestedTensor or in unbind_args
-    # All args to-be-unbound should match in length
-
-    dispatch_key = find_nested_tensor_dispatch_key(*args)
-    key_len = len(dispatch_key)
-
-    unbound_args = []
-    for i, arg in enumerate(args):
-        if is_nested_tensor(arg):
-            assert len(arg) == key_len
-            unbound_args.append(tuple(arg[i] for i in range(key_len)))
-        else:
-            unbound_args.append(tuple(arg for _ in range(key_len)))
-
-    unbound_kwargs = []
-    for k, arg in kwargs.items():
-        if is_nested_tensor(arg) or k in unb_args:
-            assert len(arg) == key_len
-            new_kwarg = tuple((k, arg[i]) for i in range(key_len))
-        else:
-            new_kwarg = tuple((k, arg) for _ in range(key_len))
-        unbound_kwargs.append(new_kwarg)
-
-    args_gen = zip(*unbound_args)
-    for new_args in args_gen:
-        yield (new_args, {})
-
-def _tensorwise():
-
-    def wrapper(f):
-        @wraps(f)
-        def decorator(*_args, **_kwargs):
-            def _func(*args, **kwargs):
-                if find_nested_tensor_dispatch_key(*args) is None:
-                    # import pdb; pdb.set_trace()
-                    result = f(*args, **kwargs)
-                    if not torch.is_tensor(result):
-                        return tuple(result)
-                    return result
-                else:
-                    results = []
-                    for local_args, local_kwargs in __gen_unbound(*args, **kwargs):
-                        results.append(_func(*local_args, **local_kwargs))
-                    return results
-            dispatch_key = find_nested_tensor_dispatch_key(*_args)
-            if dispatch_key is None:
-                return f(*_args, **_kwargs)
-            else:
-                args = _args
-                kwargs = _kwargs
-                results = _func(*args, **kwargs)
-                results = _unwrap_tensor_tuples(results)
-                if len(results) == 1:
-                    return creation.nested_tensor(results[0])
-                return tuple(map(creation.nested_tensor, results))
-
-        return decorator
-    return wrapper
-
 def tensorwise(unbind_args=None, dim_args=None, wrap_dim_args=True):
-    if unbind_args is None and dim_args is None and wrap_dim_args is False:
-        return _tensorwise()
 
     if unbind_args is None:
         unbind_args = []
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 793cd0d9..bafad79e 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912173+d14cf1e'
-git_version = 'd14cf1e8c83d12c6483992ced0cebbc66ffb6c41'
+__version__ = '0.0.1.dev201912174+ee5f59f'
+git_version = 'ee5f59f8f7311b5a99a956e0fc3de9a37e3f7c19'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 925416dd39e8469dbf50541ff2566694e0c95a73 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Mon, 16 Dec 2019 20:58:24 -0800
Subject: [PATCH 06/49] Checkpoint

---
 nestedtensor/nested/monkey_patch.py | 8 ++++++++
 nestedtensor/nested/nested.py       | 5 +++++
 nestedtensor/nested/utils.py        | 3 ---
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/nestedtensor/nested/monkey_patch.py b/nestedtensor/nested/monkey_patch.py
index 6bbe308d..64daa2f7 100644
--- a/nestedtensor/nested/monkey_patch.py
+++ b/nestedtensor/nested/monkey_patch.py
@@ -12,8 +12,10 @@ def monkey_patch(NestedTensor):
     from nestedtensor.nested import functions
     import torch
     from nestedtensor.nested import utils
+    from nestedtensor import _C
 
     function_dispatch = {}
+    jit_function_dispatch = {}
 
     def _check_meaningful_overwrite(cls, method_name):
         import os
@@ -34,6 +36,10 @@ def set_wrapped_torch_function(function_name, wrapper):
         function_dispatch[getattr(torch, function_name)] = wrapper(
             getattr(torch, function_name))
 
+    def set_wrapped_jit_torch_function(function_name, wrapper):
+        jit_function_dispatch[getattr(torch, function_name)] = wrapper(
+            torch.jit.script(getattr(torch, function_name)))
+
     def set_function(key, function):
         function_dispatch[key] = function
 
@@ -83,6 +89,7 @@ def set_function(key, function):
         if function_name in ['fill']:
             continue
         set_wrapped_torch_function(function_name, utils.tensorwise())
+        set_wrapped_jit_torch_function(function_name, _C.jit_tensorwise())
         set_nt_method(function_name, utils.tensorwise())
     # <
 
@@ -222,3 +229,4 @@ def set_function(key, function):
     # module.NestedTensor = NestedTensor
 
     setattr(NestedTensor, '_NestedTensor__function_dispatch', function_dispatch)
+    setattr(NestedTensor, '_NestedTensor__jit_function_dispatch', jit_function_dispatch)
diff --git a/nestedtensor/nested/nested.py b/nestedtensor/nested/nested.py
index cadfefc9..2440781e 100644
--- a/nestedtensor/nested/nested.py
+++ b/nestedtensor/nested/nested.py
@@ -323,6 +323,11 @@ def nested_stride(self, dim=None):
 
     def __torch_function__(self, func, args=(), kwargs=None):
         _local_func = None
+        if func in NestedTensor.__jit_function_dispatch:
+            if kwargs is None:
+                _jit_local_func = NestedTensor.__jit_function_dispatch[func]
+                impl_args = [a._impl if isinstance(a, NestedTensor) else a for a in args]
+                return _jit_local_func(*impl_args)
         if func in NestedTensor.__function_dispatch:
             _local_func = NestedTensor.__function_dispatch[func]
             return _local_func(*args) if kwargs is None else _local_func(*args, **kwargs)
diff --git a/nestedtensor/nested/utils.py b/nestedtensor/nested/utils.py
index 6100f591..c02371a0 100644
--- a/nestedtensor/nested/utils.py
+++ b/nestedtensor/nested/utils.py
@@ -157,9 +157,6 @@ def match_type_signature_prefix(types, args):
             return False
     return True
 
-def jit_tensorwise():
-    pass
-
 # The assumption is that f can handle a list of tensors
 # This is used to write tensor-wise functions
 # The resulting function accepts a multiple NestedTensors as arguments

From 9a5d8faf4a88f15e6870a8ceda16956714473cb3 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Wed, 18 Dec 2019 16:00:44 -0800
Subject: [PATCH 07/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 28 ++++++++++++++++++++++++++++
 nestedtensor/csrc/jit_list_apply.h   |  1 +
 nestedtensor/csrc/py_init.cpp        |  1 +
 nestedtensor/nested/monkey_patch.py  |  5 +++--
 nestedtensor/version.py              |  4 ++--
 5 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 70a0562a..8689ce19 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -1,5 +1,9 @@
 #include <jit_list_apply.h>
 #include <python_list_nested_tensor.h>
+#include <python_buffer_nested_tensor.h>
+#include <torch/csrc/jit/script/builtin_functions.h>
+#include <ATen/core/interned_strings.h>
+#include <torch/csrc/jit/script/sugared_value.h>
 
 namespace torch {
 namespace nested_tensor {
@@ -123,6 +127,10 @@ THP_ListNestedTensor jit_apply_function(
   return THP_ListNestedTensor(_ListNestedTensor(nested_node));
 }
 
+// TODO: This should support 3 types of functions
+// fn might be scripted (i.e. StrongFunctionPtr)
+// fn might be a builtin (need to resolve!)
+// fn might be neither, so we just dispatch to some regular python for-loops (not fast!)
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
     return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
@@ -148,5 +156,25 @@ py::cpp_function jit_tensorwise() {
   });
 }
 
+void resolve_builtin(py::object obj) {
+  py::object builtin_name =
+      py::module::import("torch.jit").attr("_find_builtin")(obj);
+  auto asdf = std::make_shared<BuiltinFunction>(
+        Symbol::fromQualString(py::str(builtin_name)), c10::nullopt);
+  // torch::jit::script::BuiltinFunction asdf = py::cast<torch::jit::script::BuiltinFunction>(builtin);
+  std::cout << "asdf: " << asdf << std::endl;
+  // Symbol ss = c10::InternedStrings::symbol("add");
+  // const std::vector<Function*>& s = torch::jit::script::getAllBuiltinFunctionsFor(ss);
+  // for (size_t i = 0; i < s.size(); i++) {
+  //   std::cout << s[i] << std::endl;
+  // }
+  // std::cout << "HEEE" << std::endl;
+  // std::vector<std::shared_ptr<Operator>> ops = torch::jit::getAllOperators();
+  // for (size_t i = 0; i < ops.size(); i++) {
+  //   std::cout << "000" << std::endl;
+  //   std::cout << ops[i]->schema() << std::endl;
+  // }
+}
+
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 5177a42d..081b5089 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -13,5 +13,6 @@ THP_ListNestedTensor jit_apply_function(
     std::vector<THP_ListNestedTensor> nts_,
     py::object fn);
 py::cpp_function jit_tensorwise();
+void resolve_builtin(py::object);
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp
index b1092b16..73152bea 100644
--- a/nestedtensor/csrc/py_init.cpp
+++ b/nestedtensor/csrc/py_init.cpp
@@ -165,4 +165,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 
   m.def("jit_apply_function", &torch::nested_tensor::jit_apply_function);
   m.def("jit_tensorwise", &torch::nested_tensor::jit_tensorwise);
+  m.def("resolve_builtin", &torch::nested_tensor::resolve_builtin);
 }
diff --git a/nestedtensor/nested/monkey_patch.py b/nestedtensor/nested/monkey_patch.py
index 64daa2f7..1c936d60 100644
--- a/nestedtensor/nested/monkey_patch.py
+++ b/nestedtensor/nested/monkey_patch.py
@@ -37,8 +37,9 @@ def set_wrapped_torch_function(function_name, wrapper):
             getattr(torch, function_name))
 
     def set_wrapped_jit_torch_function(function_name, wrapper):
-        jit_function_dispatch[getattr(torch, function_name)] = wrapper(
-            torch.jit.script(getattr(torch, function_name)))
+        return 
+        # jit_function_dispatch[getattr(torch, function_name)] = wrapper(
+        #     torch.jit.script(getattr(torch, function_name)))
 
     def set_function(key, function):
         function_dispatch[key] = function
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index bafad79e..d42caa89 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912174+ee5f59f'
-git_version = 'ee5f59f8f7311b5a99a956e0fc3de9a37e3f7c19'
+__version__ = '0.0.1.dev201912190+925416d'
+git_version = '925416dd39e8469dbf50541ff2566694e0c95a73'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From ba84c9cc4075fe00fd151295e1ea4b3e9251177c Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Wed, 18 Dec 2019 19:22:19 -0800
Subject: [PATCH 08/49] resolve_builtin: print operator overloads whose argument kinds match

---
 nestedtensor/csrc/jit_list_apply.cpp | 70 +++++++++++++++++++---------
 nestedtensor/csrc/jit_list_apply.h   |  2 +-
 nestedtensor/version.py              |  4 +-
 3 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 8689ce19..cc951f8d 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -1,8 +1,8 @@
+#include <ATen/core/interned_strings.h>
 #include <jit_list_apply.h>
-#include <python_list_nested_tensor.h>
 #include <python_buffer_nested_tensor.h>
+#include <python_list_nested_tensor.h>
 #include <torch/csrc/jit/script/builtin_functions.h>
-#include <ATen/core/interned_strings.h>
 #include <torch/csrc/jit/script/sugared_value.h>
 
 namespace torch {
@@ -130,7 +130,8 @@ THP_ListNestedTensor jit_apply_function(
 // TODO: This should support 3 types of functions
 // fn might be scripted (i.e. StrongFunctionPtr)
 // fn might be a builtin (need to resolve!)
-// fn might be neither, so we just dispatch to some regular python for-loops (not fast!)
+// fn might be neither, so we just dispatch to some regular python for-loops
+// (not fast!)
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
     return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
@@ -142,8 +143,10 @@ py::cpp_function jit_tensorwise() {
           nested_nodes.push_back(ArgWrapper(
               py::cast<THP_ListNestedTensor>(args[i]).data().get_structure()));
         } else if (py::isinstance<THP_BufferNestedTensor>(args[i])) {
-          nested_nodes.push_back(ArgWrapper(
-              py::cast<THP_BufferNestedTensor>(args[i]).data().get_structure()));
+          nested_nodes.push_back(
+              ArgWrapper(py::cast<THP_BufferNestedTensor>(args[i])
+                             .data()
+                             .get_structure()));
         } else {
           nested_nodes.push_back(ArgWrapper(toTypeInferredIValue(args[i])));
         }
@@ -156,24 +159,49 @@ py::cpp_function jit_tensorwise() {
   });
 }
 
-void resolve_builtin(py::object obj) {
+// const std::vector<Function*>& w =
+//     torch::jit::script::getAllBuiltinFunctionsFor(asdf->symbol);
+// for (size_t i = 0; i < w.size(); i++) {
+//   std::cout << w[i]->getSchema() << std::endl;
+// }
+
+void resolve_builtin(py::object obj, py::args args) {
+  std::vector<TypePtr> arg_types;
+  for (size_t i = 0; i < args.size(); i++) {
+    arg_types.push_back(toTypeInferredIValue(args[i]).type());
+  }
+  for (size_t i = 0; i < arg_types.size(); i++) {
+    std::cout << "\targ_types[" << i << "]: " << arg_types[i]->str();
+  }
+  std::cout << std::endl;
   py::object builtin_name =
       py::module::import("torch.jit").attr("_find_builtin")(obj);
-  auto asdf = std::make_shared<BuiltinFunction>(
-        Symbol::fromQualString(py::str(builtin_name)), c10::nullopt);
-  // torch::jit::script::BuiltinFunction asdf = py::cast<torch::jit::script::BuiltinFunction>(builtin);
-  std::cout << "asdf: " << asdf << std::endl;
-  // Symbol ss = c10::InternedStrings::symbol("add");
-  // const std::vector<Function*>& s = torch::jit::script::getAllBuiltinFunctionsFor(ss);
-  // for (size_t i = 0; i < s.size(); i++) {
-  //   std::cout << s[i] << std::endl;
-  // }
-  // std::cout << "HEEE" << std::endl;
-  // std::vector<std::shared_ptr<Operator>> ops = torch::jit::getAllOperators();
-  // for (size_t i = 0; i < ops.size(); i++) {
-  //   std::cout << "000" << std::endl;
-  //   std::cout << ops[i]->schema() << std::endl;
-  // }
+  auto builtin = std::make_shared<torch::jit::script::BuiltinFunction>(
+      c10::Symbol::fromQualString(py::str(builtin_name)), c10::nullopt);
+  const std::vector<std::shared_ptr<Operator>>& ops =
+      torch::jit::getAllOperatorsFor(builtin->symbol);
+  std::vector<std::vector<TypePtr>> candidate_arg_types;
+  for (size_t i = 0; i < ops.size(); i++) {
+    const std::vector<Argument>& op_args = ops[i]->schema().arguments();
+    for (size_t j = 0; j < op_args.size(); j++) {
+      std::cout << "args[" << j << "]: " << op_args[j].type()->str();
+    }
+    std::cout << std::endl;
+
+    if (op_args.size() != arg_types.size()) {
+      continue;
+    }
+    bool match = true;
+    for (size_t j = 0; j < op_args.size(); j++) {
+      match = match && (op_args[j].type()->kind() == arg_types[j]->kind());
+    }
+    if (match) {
+      for (size_t j = 0; j < op_args.size(); j++) {
+        std::cout << "\targs[" << j << "]: " << op_args[j].type()->str();
+      }
+      std::cout << std::endl;
+    }
+  }
 }
 
 } // namespace nested_tensor
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 081b5089..780b2478 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -13,6 +13,6 @@ THP_ListNestedTensor jit_apply_function(
     std::vector<THP_ListNestedTensor> nts_,
     py::object fn);
 py::cpp_function jit_tensorwise();
-void resolve_builtin(py::object);
+void resolve_builtin(py::object, py::args);
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index d42caa89..35dcf4e6 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912190+925416d'
-git_version = '925416dd39e8469dbf50541ff2566694e0c95a73'
+__version__ = '0.0.1.dev201912193+9a5d8fa'
+git_version = '9a5d8faf4a88f15e6870a8ceda16956714473cb3'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 98a1ebc2f76eda00aafa03ec5e32df34f01eb1c2 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Wed, 18 Dec 2019 19:40:58 -0800
Subject: [PATCH 09/49] resolve_builtin: comment out kind matching; print operator argument names

---
 nestedtensor/csrc/jit_list_apply.cpp | 55 +++++++++++++++-------------
 nestedtensor/version.py              |  4 +-
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index cc951f8d..1eaff834 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -165,42 +165,45 @@ py::cpp_function jit_tensorwise() {
 //   std::cout << w[i]->getSchema() << std::endl;
 // }
 
-void resolve_builtin(py::object obj, py::args args) {
-  std::vector<TypePtr> arg_types;
-  for (size_t i = 0; i < args.size(); i++) {
-    arg_types.push_back(toTypeInferredIValue(args[i]).type());
-  }
-  for (size_t i = 0; i < arg_types.size(); i++) {
-    std::cout << "\targ_types[" << i << "]: " << arg_types[i]->str();
-  }
-  std::cout << std::endl;
+void resolve_builtin(py::object obj, py::args py_args) {
+  // std::vector<Argument> args;
+  // for (size_t i = 0; i < args.size(); i++) {
+  //   Argument
+  //   TypePtr type_ptr = toTypeInferredIValue(args[i]).type();
+  //   Argument(
+  // }
+  // for (size_t i = 0; i < arg_types.size(); i++) {
+  //   std::cout << "\targ_types[" << i << "]: " << arg_types[i]->str();
+  // }
+  // std::cout << std::endl;
   py::object builtin_name =
       py::module::import("torch.jit").attr("_find_builtin")(obj);
   auto builtin = std::make_shared<torch::jit::script::BuiltinFunction>(
       c10::Symbol::fromQualString(py::str(builtin_name)), c10::nullopt);
-  const std::vector<std::shared_ptr<Operator>>& ops =
-      torch::jit::getAllOperatorsFor(builtin->symbol);
-  std::vector<std::vector<TypePtr>> candidate_arg_types;
+  auto ops = torch::jit::getAllOperatorsFor(builtin->symbol);
+
   for (size_t i = 0; i < ops.size(); i++) {
     const std::vector<Argument>& op_args = ops[i]->schema().arguments();
     for (size_t j = 0; j < op_args.size(); j++) {
-      std::cout << "args[" << j << "]: " << op_args[j].type()->str();
+      std::cout << "\top_args[" << j << "]: " << op_args[j].type()->str();
+      std::cout << "\top_args[" << j << "] name: " << op_args[j].name();
     }
     std::cout << std::endl;
 
-    if (op_args.size() != arg_types.size()) {
-      continue;
-    }
-    bool match = true;
-    for (size_t j = 0; j < op_args.size(); j++) {
-      match = match && (op_args[j].type()->kind() == arg_types[j]->kind());
-    }
-    if (match) {
-      for (size_t j = 0; j < op_args.size(); j++) {
-        std::cout << "\targs[" << j << "]: " << op_args[j].type()->str();
-      }
-      std::cout << std::endl;
-    }
+    // if (op_args.size() != arg_types.size()) {
+    //   continue;
+    // }
+    // bool match = true;
+    // for (size_t j = 0; j < op_args.size(); j++) {
+    //   match = match && (op_args[j].type()->kind() == arg_types[j]->kind());
+    // }
+    // if (match) {
+    //   std::cout << "MATCHED: ";
+    //   for (size_t j = 0; j < op_args.size(); j++) {
+    //     std::cout << "\top_args[" << j << "]: " << op_args[j].type()->str();
+    //   }
+    //   std::cout << std::endl;
+    // }
   }
 }
 
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 35dcf4e6..ad27be6d 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912193+9a5d8fa'
-git_version = '9a5d8faf4a88f15e6870a8ceda16956714473cb3'
+__version__ = '0.0.1.dev201912193+ba84c9c'
+git_version = 'ba84c9cc4075fe00fd151295e1ea4b3e9251177c'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 30be2c7a467e3d9ebf36d98fa252485a2d061e4d Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Wed, 18 Dec 2019 21:30:41 -0800
Subject: [PATCH 10/49] resolve_builtin: run overloads with matching positional arity; return placeholder tensor

---
 nestedtensor/csrc/jit_list_apply.cpp | 78 +++++++++++++++++++---------
 nestedtensor/csrc/jit_list_apply.h   |  2 +-
 nestedtensor/version.py              |  4 +-
 3 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 1eaff834..963c46e7 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -165,13 +165,13 @@ py::cpp_function jit_tensorwise() {
 //   std::cout << w[i]->getSchema() << std::endl;
 // }
 
-void resolve_builtin(py::object obj, py::args py_args) {
-  // std::vector<Argument> args;
-  // for (size_t i = 0; i < args.size(); i++) {
-  //   Argument
-  //   TypePtr type_ptr = toTypeInferredIValue(args[i]).type();
-  //   Argument(
-  // }
+// TODO: Write comparison operation based on a subset of Argument comparison
+at::Tensor resolve_builtin(py::object obj, py::args py_args) {
+  std::vector<Argument> args;
+  for (size_t i = 0; i < py_args.size(); i++) {
+    TypePtr type_ptr = tryToInferType(py_args[i]).type();
+    args.push_back(Argument("", type_ptr));
+  }
   // for (size_t i = 0; i < arg_types.size(); i++) {
   //   std::cout << "\targ_types[" << i << "]: " << arg_types[i]->str();
   // }
@@ -180,31 +180,59 @@ void resolve_builtin(py::object obj, py::args py_args) {
       py::module::import("torch.jit").attr("_find_builtin")(obj);
   auto builtin = std::make_shared<torch::jit::script::BuiltinFunction>(
       c10::Symbol::fromQualString(py::str(builtin_name)), c10::nullopt);
-  auto ops = torch::jit::getAllOperatorsFor(builtin->symbol);
+  const std::vector<std::shared_ptr<Operator>>& ops =
+      torch::jit::getAllOperatorsFor(builtin->symbol);
 
   for (size_t i = 0; i < ops.size(); i++) {
     const std::vector<Argument>& op_args = ops[i]->schema().arguments();
+    size_t num_args = 0; // TODO: Kwarg support
     for (size_t j = 0; j < op_args.size(); j++) {
-      std::cout << "\top_args[" << j << "]: " << op_args[j].type()->str();
-      std::cout << "\top_args[" << j << "] name: " << op_args[j].name();
+      if (!op_args[j].kwarg_only()) {
+        num_args++;
+      }
+    }
+    if (args.size() != num_args) {
+      continue;
+    }
+    bool match = true;
+    // NOTE: Assuming args come before kwargs
+    for (size_t j = 0; j < args.size(); j++) {
+      Argument op_arg = op_args[j];
+      // TODO: Check why_not using isSubtypeOfExt
+      std::cout << "\top_arg.type(): " << op_arg.type()->str();
+      std::cout << "\top_args[" << j << "].type(): " << args[j].type()->str();
+      // TODO: Separate this out into two runs, first for exact match, then
+      // subtype match (maybe)
+      // TODO: This doesn't seem to work with float < Scalar?
+      match =
+          match && (args[j].type()->isSubtypeOfExt(op_arg.type(), &std::cout));
+      // NOTE: Ignoring name!
+      // TODO: Ignoring N.value() (argument order)
+      // TODO: The first number of arguments must not be kwarg because of our
+      // size check. This also rests on the assuming that args come before
+      // kwargs.
+      TORCH_CHECK(!op_args[j].kwarg_only());
+      // TODO: Ignoring alias_info().value()
     }
     std::cout << std::endl;
-
-    // if (op_args.size() != arg_types.size()) {
-    //   continue;
-    // }
-    // bool match = true;
-    // for (size_t j = 0; j < op_args.size(); j++) {
-    //   match = match && (op_args[j].type()->kind() == arg_types[j]->kind());
-    // }
-    // if (match) {
-    //   std::cout << "MATCHED: ";
-    //   for (size_t j = 0; j < op_args.size(); j++) {
-    //     std::cout << "\top_args[" << j << "]: " << op_args[j].type()->str();
-    //   }
-    //   std::cout << std::endl;
-    // }
+    if (true) {
+      std::cout << "MATCHED: ";
+      for (size_t j = 0; j < op_args.size(); j++) {
+        std::cout << "\top_args[" << j << "]: " << op_args[j].type()->str();
+      }
+      std::cout << std::endl;
+      std::shared_ptr<Operator> op_i = ops[i];
+      Operation operation = op_i->getOperation();
+      std::vector<c10::IValue> operation_args;
+      for (size_t i = 0; i < py_args.size(); i++) {
+        operation_args.push_back(toTypeInferredIValue(py_args[i]));
+      }
+      // TODO: Needs to take default value into account.
+      std::cout << "RESULT: " << operation(operation_args) << std::endl;
+      std::cout << "RAN IT" << std::endl;
+    }
   }
+  return torch::ones({});
 }
 
 } // namespace nested_tensor
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 780b2478..28a0404e 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -13,6 +13,6 @@ THP_ListNestedTensor jit_apply_function(
     std::vector<THP_ListNestedTensor> nts_,
     py::object fn);
 py::cpp_function jit_tensorwise();
-void resolve_builtin(py::object, py::args);
+at::Tensor resolve_builtin(py::object, py::args);
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index ad27be6d..2126f9cb 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912193+ba84c9c'
-git_version = 'ba84c9cc4075fe00fd151295e1ea4b3e9251177c'
+__version__ = '0.0.1.dev201912195+98a1ebc'
+git_version = '98a1ebc2f76eda00aafa03ec5e32df34f01eb1c2'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From d610758a184dc9bf59e4f29e9a7ad75ed670b1ab Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 19 Dec 2019 12:37:13 -0800
Subject: [PATCH 11/49] resolve_builtin: accept kwargs and gather candidate schemas (WIP, does not compile)

---
 nestedtensor/csrc/jit_list_apply.cpp | 160 +++++++++++++++++----------
 nestedtensor/version.py              |   4 +-
 2 files changed, 105 insertions(+), 59 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 963c46e7..3ecb7716 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -3,11 +3,15 @@
 #include <python_buffer_nested_tensor.h>
 #include <python_list_nested_tensor.h>
 #include <torch/csrc/jit/script/builtin_functions.h>
+#include <torch/csrc/jit/script/schema_matching.h>
 #include <torch/csrc/jit/script/sugared_value.h>
 
 namespace torch {
 namespace nested_tensor {
 
+using namespace torch::jit;
+using namespace torch::jit::script;
+
 struct ArgWrapper {
   ArgWrapper(TensorNode nested_tensor)
       : _is_nested_tensor(true), _nested_tensor(nested_tensor) {}
@@ -166,72 +170,114 @@ py::cpp_function jit_tensorwise() {
 // }
 
 // TODO: Write comparison operation based on a subset of Argument comparison
-at::Tensor resolve_builtin(py::object obj, py::args py_args) {
-  std::vector<Argument> args;
-  for (size_t i = 0; i < py_args.size(); i++) {
-    TypePtr type_ptr = tryToInferType(py_args[i]).type();
-    args.push_back(Argument("", type_ptr));
-  }
+at::Tensor resolve_builtin(
+    py::object obj,
+    py::args py_args,
+    py::kwargs py_kwargs) {
   // for (size_t i = 0; i < arg_types.size(); i++) {
   //   std::cout << "\targ_types[" << i << "]: " << arg_types[i]->str();
   // }
   // std::cout << std::endl;
   py::object builtin_name =
       py::module::import("torch.jit").attr("_find_builtin")(obj);
-  auto builtin = std::make_shared<torch::jit::script::BuiltinFunction>(
-      c10::Symbol::fromQualString(py::str(builtin_name)), c10::nullopt);
-  const std::vector<std::shared_ptr<Operator>>& ops =
-      torch::jit::getAllOperatorsFor(builtin->symbol);
-
-  for (size_t i = 0; i < ops.size(); i++) {
-    const std::vector<Argument>& op_args = ops[i]->schema().arguments();
-    size_t num_args = 0; // TODO: Kwarg support
-    for (size_t j = 0; j < op_args.size(); j++) {
-      if (!op_args[j].kwarg_only()) {
-        num_args++;
-      }
-    }
-    if (args.size() != num_args) {
-      continue;
-    }
-    bool match = true;
-    // NOTE: Assuming args come before kwargs
-    for (size_t j = 0; j < args.size(); j++) {
-      Argument op_arg = op_args[j];
-      // TODO: Check why_not using isSubtypeOfExt
-      std::cout << "\top_arg.type(): " << op_arg.type()->str();
-      std::cout << "\top_args[" << j << "].type(): " << args[j].type()->str();
-      // TODO: Separate this out into two runs, first for exact match, then
-      // subtype match (maybe)
-      // TODO: This doesn't seem to work with float < Scalar?
-      match =
-          match && (args[j].type()->isSubtypeOfExt(op_arg.type(), &std::cout));
-      // NOTE: Ignoring name!
-      // TODO: Ignoring N.value() (argument order)
-      // TODO: The first number of arguments must not be kwarg because of our
-      // size check. This also rests on the assuming that args come before
-      // kwargs.
-      TORCH_CHECK(!op_args[j].kwarg_only());
-      // TODO: Ignoring alias_info().value()
-    }
-    std::cout << std::endl;
-    if (true) {
-      std::cout << "MATCHED: ";
-      for (size_t j = 0; j < op_args.size(); j++) {
-        std::cout << "\top_args[" << j << "]: " << op_args[j].type()->str();
-      }
-      std::cout << std::endl;
-      std::shared_ptr<Operator> op_i = ops[i];
-      Operation operation = op_i->getOperation();
-      std::vector<c10::IValue> operation_args;
-      for (size_t i = 0; i < py_args.size(); i++) {
-        operation_args.push_back(toTypeInferredIValue(py_args[i]));
+  auto name = c10::Symbol::fromQualString(py::str(builtin_name));
+
+  std::cout << "builtin_name: " << builtin_name << std::endl;
+  std::cout << "name: " << name << std::endl;
+
+  const auto& variants = getAllOperatorsFor(name);
+  const auto& builtin_functions = getAllBuiltinFunctionsFor(name);
+
+  std::stringstream failure_messages;
+  std::vector<const FunctionSchema*> schemas;
+  for (const std::shared_ptr<Operator>& op : variants) {
+    schemas.push_back(&op->schema());
+  }
+  for (const auto method : builtin_functions) {
+    method->ensure_defined();
+    schemas.push_back(&method->getSchema());
+  }
+
+  // Go through each Schema candidate based on the overloads
+  // The order here matters and is given by the way we construct schemas.
+  // This is a subset of matchSchemas within jit/script/schema_matching.cpp
+  // and only implements the argument matching based on features such as types.
+  // It could eventually live in the JIT as a subcomponent that can implement
+  // overload resolution generically and outside a graph context.
+  //
+  // In essence we spend most of our time resolving types (e.g. turn
+  // single floats into lists of floats, resolving concrete types) or dealing
+  // with the unordered nature of kwargs.
+  for (size_t i = 0; i < schemas.size(); i++) {
+    const FunctionSchema* schema = schemas[i];
+    std::cout << "schema[" << i << "]:\t" << *schemas[i];
+    std::cout << " - overload_name: " << schemas[i]->overload_name()
+              << std::endl;
+    size_t processed_py_args = 0;
+    const std::vector<Argument>& schema_args = schema.arguments();
+    // For each argument in the Schema, see if it can be matched up with the
+    // given python arguments to determine whether it's the right overload.
+    for (size_t j = 0; j < schema_args.size(); j++) {
+      // TODO: Support for self as in tryMatchArgument?
+      Argument schema_arg = schema_args[i];
+      if (!schema_arg.only() && processed_py_args < py_args.size() {
+        // TODO: Add support to allow conversions.
+        TypePtr type_ptr = tryToInferType(py_args[i]).type();
       }
-      // TODO: Needs to take default value into account.
-      std::cout << "RESULT: " << operation(operation_args) << std::endl;
-      std::cout << "RAN IT" << std::endl;
     }
   }
+
+  // for (size_t i = 0; i < ops.size(); i++) {
+  //   const std::vector<Argument>& op_args = ops[i]->schema().arguments();
+  //   size_t num_args = 0; // TODO: Kwarg support
+  //   for (size_t j = 0; j < op_args.size(); j++) {
+  //     if (!op_args[j].kwarg_only()) {
+  //       num_args++;
+  //     }
+  //   }
+  //   if (args.size() != num_args) {
+  //     continue;
+  //   }
+  //   bool match = true;
+  //   // NOTE: Assuming args come before kwargs
+  //   for (size_t j = 0; j < args.size(); j++) {
+  //     Argument op_arg = op_args[j];
+  //     // TODO: Check why_not using isSubtypeOfExt
+  //     std::cout << "\top_arg.type(): " << op_arg.type()->str();
+  //     std::cout << "\top_args[" << j << "].type(): " <<
+  //     args[j].type()->str();
+  //     // TODO: Separate this out into two runs, first for exact match, then
+  //     // subtype match (maybe)
+  //     // TODO: This doesn't seem to work with float < Scalar?
+  //     match =
+  //         match && (args[j].type()->isSubtypeOfExt(op_arg.type(),
+  //         &std::cout));
+  //     // NOTE: Ignoring name!
+  //     // TODO: Ignoring N.value() (argument order)
+  //     // TODO: The first number of arguments must not be kwarg because of our
+  //     // size check. This also rests on the assuming that args come before
+  //     // kwargs.
+  //     TORCH_CHECK(!op_args[j].kwarg_only());
+  //     // TODO: Ignoring alias_info().value()
+  //   }
+  //   std::cout << std::endl;
+  //   if (true) {
+  //     std::cout << "MATCHED: ";
+  //     for (size_t j = 0; j < op_args.size(); j++) {
+  //       std::cout << "\top_args[" << j << "]: " << op_args[j].type()->str();
+  //     }
+  //     std::cout << std::endl;
+  //     std::shared_ptr<Operator> op_i = ops[i];
+  //     Operation operation = op_i->getOperation();
+  //     std::vector<c10::IValue> operation_args;
+  //     for (size_t i = 0; i < py_args.size(); i++) {
+  //       operation_args.push_back(toTypeInferredIValue(py_args[i]));
+  //     }
+  //     // TODO: Needs to take default value into account.
+  //     std::cout << "RESULT: " << operation(operation_args) << std::endl;
+  //     std::cout << "RAN IT" << std::endl;
+  //   }
+  // }
   return torch::ones({});
 }
 
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 2126f9cb..7e562699 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912195+98a1ebc'
-git_version = '98a1ebc2f76eda00aafa03ec5e32df34f01eb1c2'
+__version__ = '0.0.1.dev2019121916+30be2c7'
+git_version = '30be2c7a467e3d9ebf36d98fa252485a2d061e4d'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From eee010ddbec748509e3aeeaa10d545d1f50bac5c Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 19 Dec 2019 18:54:54 -0800
Subject: [PATCH 12/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 66 +++++++++++++++++++++++++---
 nestedtensor/csrc/jit_list_apply.h   |  5 ++-
 nestedtensor/version.py              |  4 +-
 3 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 3ecb7716..1662a899 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -16,6 +16,8 @@ struct ArgWrapper {
   ArgWrapper(TensorNode nested_tensor)
       : _is_nested_tensor(true), _nested_tensor(nested_tensor) {}
   ArgWrapper(c10::IValue ivalue) : _is_nested_tensor(false), _ivalue(ivalue) {}
+  ArgWrapper(std::string name, c10::IValue ivalue)
+      : _name(name), _is_nested_tensor(false), _ivalue(ivalue) {}
 
   bool is_nested_tensor() {
     return _is_nested_tensor;
@@ -29,7 +31,12 @@ struct ArgWrapper {
     return _nested_tensor;
   }
 
+  std::string name() {
+    return _name;
+  }
+
  private:
+  std::string _name;
   bool _is_nested_tensor;
   c10::IValue _ivalue;
   TensorNode _nested_tensor;
@@ -169,11 +176,27 @@ py::cpp_function jit_tensorwise() {
 //   std::cout << w[i]->getSchema() << std::endl;
 // }
 
+// c10::optional<size_t> findName(const std::string& name, 
+//     std::vector<std::string> py_kwargs_keys) {
+//   for (size_t i = 0; i < py_kwargs_keys.size(); i++) {
+//     if (name == py_kwargs_keys[i]) {
+//       return i;
+//     }
+//   }
+//   return c10::nullopt;
+// }
+
 // TODO: Write comparison operation based on a subset of Argument comparison
 at::Tensor resolve_builtin(
     py::object obj,
     py::args py_args,
     py::kwargs py_kwargs) {
+  // std::vector<py::object> py_args = py_args_;
+  // std::unordered_map<std::string, py::object> py_kwargs = py_kwargs_;
+  // std::vector<std::string> py_kwargs_keys;
+  // for (size_t i = 0; i < py_kwargs.size(); i++) {
+  //   py_kwargs_keys.push_back(std::string(py::str(std::get<0>(py_kwargs.begin()[i]))));
+  // }
   // for (size_t i = 0; i < arg_types.size(); i++) {
   //   std::cout << "\targ_types[" << i << "]: " << arg_types[i]->str();
   // }
@@ -213,18 +236,47 @@ at::Tensor resolve_builtin(
     std::cout << "schema[" << i << "]:\t" << *schemas[i];
     std::cout << " - overload_name: " << schemas[i]->overload_name()
               << std::endl;
-    size_t processed_py_args = 0;
-    const std::vector<Argument>& schema_args = schema.arguments();
+    // In the end it's only a match when this counter fully depleted the args.
+    size_t py_args_i = 0;
+    size_t used_kwargs = 0;
+    std::vector<bool> used_kwarg(py_kwargs.size(), false);
+    const std::vector<Argument>& schema_args = schema->arguments();
+    std::vector<ArgWrapper> parse_py_args;
     // For each argument in the Schema, see if it can be matched up with the
     // given python arguments to determine whether it's the right overload.
+    //
+    // First we resolve the python arguments to build list of candidate
+    // wrapped arguments. It's not enough to parse these arguments
+    // outside of a given Schema because of the type environment
+    // and conversions. It's possible to match a Python call
+    // signature to an overload with different types such as
+    // Scalar and Tensor etc. simply by requiring conversion.
     for (size_t j = 0; j < schema_args.size(); j++) {
       // TODO: Support for self as in tryMatchArgument?
       Argument schema_arg = schema_args[i];
-      if (!schema_arg.only() && processed_py_args < py_args.size() {
+      if (!schema_arg.kwarg_only() && py_args_i < py_args.size()) {
         // TODO: Add support to allow conversions.
-        TypePtr type_ptr = tryToInferType(py_args[i]).type();
+        IValue type_ptr = toTypeInferredIValue(py_args[py_args_i]);
+        parse_py_args.emplace_back(ArgWrapper(type_ptr));
+        py_args_i++;
+      } else if (py_kwargs.contains(schema_arg.name().c_str())) {
+        // TODO: Check for no presence of duplicates in given schemas[i]
+        // auto item = py_kwargs.begin()[*kwarg_idx];
+        // std::string py_kwarg_key = std::string(py::str(std::get<0>(item)));
+        py::handle py_kwarg_object = py_kwargs[schema_arg.name().c_str()];
+        parse_py_args.emplace_back(
+            ArgWrapper(schema_arg.name(), toTypeInferredIValue(py_kwarg_object)));
+        used_kwargs++;
+      } else if (schema_arg.default_value()) {
+        parse_py_args.emplace_back(ArgWrapper(*schema_arg.default_value()));
+      } else {
+        std::cout << "FAIL" << std::endl;
       }
     }
+    if (py_args_i == py_args.size() - 1 && used_kwargs == py_kwargs.size()) {
+      std::cout << "WIN - ";
+      std::cout << "schema: " << schema;
+    }
   }
 
   // for (size_t i = 0; i < ops.size(); i++) {
@@ -254,7 +306,8 @@ at::Tensor resolve_builtin(
   //         &std::cout));
   //     // NOTE: Ignoring name!
   //     // TODO: Ignoring N.value() (argument order)
-  //     // TODO: The first number of arguments must not be kwarg because of our
+  //     // TODO: The first number of arguments must not be kwarg because of
+  //     our
   //     // size check. This also rests on the assuming that args come before
   //     // kwargs.
   //     TORCH_CHECK(!op_args[j].kwarg_only());
@@ -264,7 +317,8 @@ at::Tensor resolve_builtin(
   //   if (true) {
   //     std::cout << "MATCHED: ";
   //     for (size_t j = 0; j < op_args.size(); j++) {
-  //       std::cout << "\top_args[" << j << "]: " << op_args[j].type()->str();
+  //       std::cout << "\top_args[" << j << "]: " <<
+  //       op_args[j].type()->str();
   //     }
   //     std::cout << std::endl;
   //     std::shared_ptr<Operator> op_i = ops[i];
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 28a0404e..45e5d851 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -13,6 +13,9 @@ THP_ListNestedTensor jit_apply_function(
     std::vector<THP_ListNestedTensor> nts_,
     py::object fn);
 py::cpp_function jit_tensorwise();
-at::Tensor resolve_builtin(py::object, py::args);
+at::Tensor resolve_builtin(
+    py::object obj,
+    py::args py_args,
+    py::kwargs py_kwargs);
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 7e562699..7dc14f05 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev2019121916+30be2c7'
-git_version = '30be2c7a467e3d9ebf36d98fa252485a2d061e4d'
+__version__ = '0.0.1.dev201912202+d610758'
+git_version = 'd610758a184dc9bf59e4f29e9a7ad75ed670b1ab'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 4e7555a68e89838619b88fe459947418ab02417f Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 19 Dec 2019 18:56:38 -0800
Subject: [PATCH 13/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 86 +---------------------------
 nestedtensor/version.py              |  4 +-
 2 files changed, 4 insertions(+), 86 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 1662a899..243b6f80 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -170,37 +170,11 @@ py::cpp_function jit_tensorwise() {
   });
 }
 
-// const std::vector<Function*>& w =
-//     torch::jit::script::getAllBuiltinFunctionsFor(asdf->symbol);
-// for (size_t i = 0; i < w.size(); i++) {
-//   std::cout << w[i]->getSchema() << std::endl;
-// }
-
-// c10::optional<size_t> findName(const std::string& name, 
-//     std::vector<std::string> py_kwargs_keys) {
-//   for (size_t i = 0; i < py_kwargs_keys.size(); i++) {
-//     if (name == py_kwargs_keys[i]) {
-//       return i;
-//     }
-//   }
-//   return c10::nullopt;
-// }
-
 // TODO: Write comparison operation based on a subset of Argument comparison
 at::Tensor resolve_builtin(
     py::object obj,
     py::args py_args,
     py::kwargs py_kwargs) {
-  // std::vector<py::object> py_args = py_args_;
-  // std::unordered_map<std::string, py::object> py_kwargs = py_kwargs_;
-  // std::vector<std::string> py_kwargs_keys;
-  // for (size_t i = 0; i < py_kwargs.size(); i++) {
-  //   py_kwargs_keys.push_back(std::string(py::str(std::get<0>(py_kwargs.begin()[i]))));
-  // }
-  // for (size_t i = 0; i < arg_types.size(); i++) {
-  //   std::cout << "\targ_types[" << i << "]: " << arg_types[i]->str();
-  // }
-  // std::cout << std::endl;
   py::object builtin_name =
       py::module::import("torch.jit").attr("_find_builtin")(obj);
   auto name = c10::Symbol::fromQualString(py::str(builtin_name));
@@ -261,11 +235,9 @@ at::Tensor resolve_builtin(
         py_args_i++;
       } else if (py_kwargs.contains(schema_arg.name().c_str())) {
         // TODO: Check for no presence of duplicates in given schemas[i]
-        // auto item = py_kwargs.begin()[*kwarg_idx];
-        // std::string py_kwarg_key = std::string(py::str(std::get<0>(item)));
         py::handle py_kwarg_object = py_kwargs[schema_arg.name().c_str()];
-        parse_py_args.emplace_back(
-            ArgWrapper(schema_arg.name(), toTypeInferredIValue(py_kwarg_object)));
+        parse_py_args.emplace_back(ArgWrapper(
+            schema_arg.name(), toTypeInferredIValue(py_kwarg_object)));
         used_kwargs++;
       } else if (schema_arg.default_value()) {
         parse_py_args.emplace_back(ArgWrapper(*schema_arg.default_value()));
@@ -278,60 +250,6 @@ at::Tensor resolve_builtin(
       std::cout << "schema: " << schema;
     }
   }
-
-  // for (size_t i = 0; i < ops.size(); i++) {
-  //   const std::vector<Argument>& op_args = ops[i]->schema().arguments();
-  //   size_t num_args = 0; // TODO: Kwarg support
-  //   for (size_t j = 0; j < op_args.size(); j++) {
-  //     if (!op_args[j].kwarg_only()) {
-  //       num_args++;
-  //     }
-  //   }
-  //   if (args.size() != num_args) {
-  //     continue;
-  //   }
-  //   bool match = true;
-  //   // NOTE: Assuming args come before kwargs
-  //   for (size_t j = 0; j < args.size(); j++) {
-  //     Argument op_arg = op_args[j];
-  //     // TODO: Check why_not using isSubtypeOfExt
-  //     std::cout << "\top_arg.type(): " << op_arg.type()->str();
-  //     std::cout << "\top_args[" << j << "].type(): " <<
-  //     args[j].type()->str();
-  //     // TODO: Separate this out into two runs, first for exact match, then
-  //     // subtype match (maybe)
-  //     // TODO: This doesn't seem to work with float < Scalar?
-  //     match =
-  //         match && (args[j].type()->isSubtypeOfExt(op_arg.type(),
-  //         &std::cout));
-  //     // NOTE: Ignoring name!
-  //     // TODO: Ignoring N.value() (argument order)
-  //     // TODO: The first number of arguments must not be kwarg because of
-  //     our
-  //     // size check. This also rests on the assuming that args come before
-  //     // kwargs.
-  //     TORCH_CHECK(!op_args[j].kwarg_only());
-  //     // TODO: Ignoring alias_info().value()
-  //   }
-  //   std::cout << std::endl;
-  //   if (true) {
-  //     std::cout << "MATCHED: ";
-  //     for (size_t j = 0; j < op_args.size(); j++) {
-  //       std::cout << "\top_args[" << j << "]: " <<
-  //       op_args[j].type()->str();
-  //     }
-  //     std::cout << std::endl;
-  //     std::shared_ptr<Operator> op_i = ops[i];
-  //     Operation operation = op_i->getOperation();
-  //     std::vector<c10::IValue> operation_args;
-  //     for (size_t i = 0; i < py_args.size(); i++) {
-  //       operation_args.push_back(toTypeInferredIValue(py_args[i]));
-  //     }
-  //     // TODO: Needs to take default value into account.
-  //     std::cout << "RESULT: " << operation(operation_args) << std::endl;
-  //     std::cout << "RAN IT" << std::endl;
-  //   }
-  // }
   return torch::ones({});
 }
 
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 7dc14f05..0b3c305c 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912202+d610758'
-git_version = 'd610758a184dc9bf59e4f29e9a7ad75ed670b1ab'
+__version__ = '0.0.1.dev201912202+eee010d'
+git_version = 'eee010ddbec748509e3aeeaa10d545d1f50bac5c'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From c9b2bef345cde60912cc9d78ef46b96c54e8fb3e Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 19 Dec 2019 19:56:14 -0800
Subject: [PATCH 14/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 39 +++++++++++++++++++++-------
 nestedtensor/csrc/jit_list_apply.h   |  2 +-
 nestedtensor/version.py              |  4 +--
 3 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 243b6f80..ae911603 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -208,8 +208,7 @@ at::Tensor resolve_builtin(
   for (size_t i = 0; i < schemas.size(); i++) {
     const FunctionSchema* schema = schemas[i];
     std::cout << "schema[" << i << "]:\t" << *schemas[i];
-    std::cout << " - overload_name: " << schemas[i]->overload_name()
-              << std::endl;
+    std::cout << " - overload_name: " << schemas[i]->overload_name();
     // In the end it's only a match when this counter fully depleted the args.
     size_t py_args_i = 0;
     size_t used_kwargs = 0;
@@ -225,13 +224,14 @@ at::Tensor resolve_builtin(
     // and conversions. It's possible to match a Python call
     // signature to an overload with different types such as
     // Scalar and Tensor etc. simply by requiring conversion.
+    bool fail = false;
     for (size_t j = 0; j < schema_args.size(); j++) {
       // TODO: Support for self as in tryMatchArgument?
-      Argument schema_arg = schema_args[i];
+      Argument schema_arg = schema_args[j];
       if (!schema_arg.kwarg_only() && py_args_i < py_args.size()) {
         // TODO: Add support to allow conversions.
-        IValue type_ptr = toTypeInferredIValue(py_args[py_args_i]);
-        parse_py_args.emplace_back(ArgWrapper(type_ptr));
+        IValue ivalue = toTypeInferredIValue(py_args[py_args_i]);
+        parse_py_args.emplace_back(ArgWrapper(ivalue));
         py_args_i++;
       } else if (py_kwargs.contains(schema_arg.name().c_str())) {
         // TODO: Check for no presence of duplicates in given schemas[i]
@@ -242,13 +242,34 @@ at::Tensor resolve_builtin(
       } else if (schema_arg.default_value()) {
         parse_py_args.emplace_back(ArgWrapper(*schema_arg.default_value()));
       } else {
-        std::cout << "FAIL" << std::endl;
+        // The given schema cannot find either a positional or keyword argument to match against
+        // for this given schema argument. There also is no default value specified for this 
+        // schema argument. Therefore this schema cannot be the correct overload.
+        fail = true;
+        break;
       }
     }
-    if (py_args_i == py_args.size() - 1 && used_kwargs == py_kwargs.size()) {
-      std::cout << "WIN - ";
-      std::cout << "schema: " << schema;
+    if (!fail && 
+        // Check whether all positional arguments were matched by given Schema
+        (py_args.size() == py_args_i) &&
+        // Check if all kwargs were matched by given Schema
+        (used_kwargs == py_kwargs.size())
+        ) {
+      bool types_match = true;
+      TypeEnv type_env;
+      for (size_t j = 0; j < parse_py_args.size(); j++) {
+        std::cout << "parse_py_args[" << j << "]: " << parse_py_args[j].ivalue().type()->str()
+                  << std::endl;
+        MatchTypeReturn match = matchTypeVariables(
+            schema_args[j].type(), parse_py_args[j].ivalue().type(), type_env);
+        types_match = types_match && match.success();
+      }
+      if (types_match) {
+        std::cout << "\t=== WIN" << std::endl;
+        continue;
+      }
     }
+    std::cout << "\t=== FAIL" << std::endl;
   }
   return torch::ones({});
 }
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 45e5d851..5a21a4aa 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -16,6 +16,6 @@ py::cpp_function jit_tensorwise();
 at::Tensor resolve_builtin(
     py::object obj,
     py::args py_args,
-    py::kwargs py_kwargs);
+    py::kwargs py_kwargs = {});
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 0b3c305c..aee3c7fa 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912202+eee010d'
-git_version = 'eee010ddbec748509e3aeeaa10d545d1f50bac5c'
+__version__ = '0.0.1.dev201912203+4e7555a'
+git_version = '4e7555a68e89838619b88fe459947418ab02417f'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 49d9887b58b1453e009c0256ba2dac7f1e19ce67 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 19 Dec 2019 20:08:43 -0800
Subject: [PATCH 15/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 4 ++--
 nestedtensor/version.py              | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index ae911603..056e70b9 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -258,8 +258,8 @@ at::Tensor resolve_builtin(
       bool types_match = true;
       TypeEnv type_env;
       for (size_t j = 0; j < parse_py_args.size(); j++) {
-        std::cout << "parse_py_args[" << j << "]: " << parse_py_args[j].ivalue().type()->str()
-                  << std::endl;
+        std::cout << " ; parse_py_args[" << j << "]: " << parse_py_args[j].ivalue().type()->str();
+        // Now that we found that the overall schema matches, we need to check whether the types match.
         MatchTypeReturn match = matchTypeVariables(
             schema_args[j].type(), parse_py_args[j].ivalue().type(), type_env);
         types_match = types_match && match.success();
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index aee3c7fa..cf4b9529 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912203+4e7555a'
-git_version = '4e7555a68e89838619b88fe459947418ab02417f'
+__version__ = '0.0.1.dev201912204+c9b2bef'
+git_version = 'c9b2bef345cde60912cc9d78ef46b96c54e8fb3e'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 5347bdad6c72e1248069f4e4b122d7b83ea5ca95 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 19 Dec 2019 20:29:08 -0800
Subject: [PATCH 16/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 203 ++++++++++++++-------------
 nestedtensor/version.py              |   4 +-
 2 files changed, 109 insertions(+), 98 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 056e70b9..2391c54d 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -138,40 +138,75 @@ THP_ListNestedTensor jit_apply_function(
   return THP_ListNestedTensor(_ListNestedTensor(nested_node));
 }
 
-// TODO: This should support 3 types of functions
-// fn might be scripted (i.e. StrongFunctionPtr)
-// fn might be a builtin (need to resolve!)
-// fn might be neither, so we just dispatch to some regular python for-loops
-// (not fast!)
-py::cpp_function jit_tensorwise() {
-  return py::cpp_function([](py::object fn) {
-    return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
-      auto sfn = py::cast<StrongFunctionPtr>(fn);
-      Function& f = *sfn.function_;
-      std::vector<ArgWrapper> nested_nodes;
-      for (size_t i = 0; i < args.size(); i++) {
-        if (py::isinstance<THP_ListNestedTensor>(args[i])) {
-          nested_nodes.push_back(ArgWrapper(
-              py::cast<THP_ListNestedTensor>(args[i]).data().get_structure()));
-        } else if (py::isinstance<THP_BufferNestedTensor>(args[i])) {
-          nested_nodes.push_back(
-              ArgWrapper(py::cast<THP_BufferNestedTensor>(args[i])
-                             .data()
-                             .get_structure()));
-        } else {
-          nested_nodes.push_back(ArgWrapper(toTypeInferredIValue(args[i])));
-        }
-      }
-      py::gil_scoped_release release;
-      TensorNode result = apply_jit_function(nested_nodes, f);
-      py::gil_scoped_acquire acquire;
-      return THP_ListNestedTensor(_ListNestedTensor(result));
-    });
-  });
+static bool try_match_schema(
+    const FunctionSchema* schema,
+    py::args py_args,
+    py::kwargs py_kwargs) {
+  // In the end it's only a match when this counter fully depleted the args.
+  size_t py_args_i = 0;
+  size_t used_kwargs = 0;
+  std::vector<bool> used_kwarg(py_kwargs.size(), false);
+  const std::vector<Argument>& schema_args = schema->arguments();
+  std::vector<ArgWrapper> parse_py_args;
+  // For each argument in the Schema, see if it can be matched up with the
+  // given python arguments to determine whether it's the right overload.
+  //
+  // First we resolve the python arguments to build list of candidate
+  // wrapped arguments. It's not enough to parse these arguments
+  // outside of a given Schema because of the type environment
+  // and conversions. It's possible to match a Python call
+  // signature to an overload with different types such as
+  // Scalar and Tensor etc. simply by requiring conversion.
+  bool fail = false;
+  for (size_t j = 0; j < schema_args.size(); j++) {
+    // TODO: Support for self as in tryMatchArgument?
+    Argument schema_arg = schema_args[j];
+    if (!schema_arg.kwarg_only() && py_args_i < py_args.size()) {
+      // TODO: Add support to allow conversions.
+      IValue ivalue = toTypeInferredIValue(py_args[py_args_i]);
+      parse_py_args.emplace_back(ArgWrapper(ivalue));
+      py_args_i++;
+    } else if (py_kwargs.contains(schema_arg.name().c_str())) {
+      // TODO: Check for no presence of duplicates in given schema
+      py::handle py_kwarg_object = py_kwargs[schema_arg.name().c_str()];
+      parse_py_args.emplace_back(
+          ArgWrapper(schema_arg.name(), toTypeInferredIValue(py_kwarg_object)));
+      used_kwargs++;
+    } else if (schema_arg.default_value()) {
+      parse_py_args.emplace_back(ArgWrapper(*schema_arg.default_value()));
+    } else {
+      // The given schema cannot find either a positional or keyword argument to
+      // match against for this given schema argument. There also is no default
+      // value specified for this schema argument. Therefore this schema cannot
+      // be the correct overload.
+      return false;
+    }
+  }
+  if (
+      // Check whether all positional arguments were matched by given Schema
+      (py_args.size() == py_args_i) &&
+      // Check if all kwargs were matched by given Schema
+      (used_kwargs == py_kwargs.size())) {
+    bool types_match = true;
+    TypeEnv type_env;
+    for (size_t j = 0; j < parse_py_args.size(); j++) {
+      std::cout << " ; parse_py_args[" << j
+                << "]: " << parse_py_args[j].ivalue().type()->str();
+      // Now that we found that the overall schema matches, we need to check
+      // whether the types match.
+      MatchTypeReturn match = matchTypeVariables(
+          schema_args[j].type(), parse_py_args[j].ivalue().type(), type_env);
+      types_match = types_match && match.success();
+    }
+    if (types_match) {
+      return true;
+    }
+  }
+  return false;
 }
 
 // TODO: Write comparison operation based on a subset of Argument comparison
-at::Tensor resolve_builtin(
+c10::optional<Function*> resolve_builtin(
     py::object obj,
     py::args py_args,
     py::kwargs py_kwargs) {
@@ -185,14 +220,17 @@ at::Tensor resolve_builtin(
   const auto& variants = getAllOperatorsFor(name);
   const auto& builtin_functions = getAllBuiltinFunctionsFor(name);
 
+  // TODO: Move this into jit_tensorwise and add support for all 3 cases.
   std::stringstream failure_messages;
   std::vector<const FunctionSchema*> schemas;
   for (const std::shared_ptr<Operator>& op : variants) {
     schemas.push_back(&op->schema());
   }
-  for (const auto method : builtin_functions) {
+  for (Function* method : builtin_functions) {
     method->ensure_defined();
-    schemas.push_back(&method->getSchema());
+    if (try_match_schema(&method->getSchema())) {
+      return method;
+    }
   }
 
   // Go through each Schema candidate based on the overloads
@@ -206,73 +244,46 @@ at::Tensor resolve_builtin(
   // single floats into lists of floats, resolving concrete types) or dealing
   // with the unordered nature of kwargs.
   for (size_t i = 0; i < schemas.size(); i++) {
-    const FunctionSchema* schema = schemas[i];
-    std::cout << "schema[" << i << "]:\t" << *schemas[i];
-    std::cout << " - overload_name: " << schemas[i]->overload_name();
-    // In the end it's only a match when this counter fully depleted the args.
-    size_t py_args_i = 0;
-    size_t used_kwargs = 0;
-    std::vector<bool> used_kwarg(py_kwargs.size(), false);
-    const std::vector<Argument>& schema_args = schema->arguments();
-    std::vector<ArgWrapper> parse_py_args;
-    // For each argument in the Schema, see if it can be matched up with the
-    // given python arguments to determine whether it's the right overload.
-    //
-    // First we resolve the python arguments to build list of candidate
-    // wrapped arguments. It's not enough to parse these arguments
-    // outside of a given Schema because of the type environment
-    // and conversions. It's possible to match a Python call
-    // signature to an overload with different types such as
-    // Scalar and Tensor etc. simply by requiring conversion.
-    bool fail = false;
-    for (size_t j = 0; j < schema_args.size(); j++) {
-      // TODO: Support for self as in tryMatchArgument?
-      Argument schema_arg = schema_args[j];
-      if (!schema_arg.kwarg_only() && py_args_i < py_args.size()) {
-        // TODO: Add support to allow conversions.
-        IValue ivalue = toTypeInferredIValue(py_args[py_args_i]);
-        parse_py_args.emplace_back(ArgWrapper(ivalue));
-        py_args_i++;
-      } else if (py_kwargs.contains(schema_arg.name().c_str())) {
-        // TODO: Check for no presence of duplicates in given schemas[i]
-        py::handle py_kwarg_object = py_kwargs[schema_arg.name().c_str()];
-        parse_py_args.emplace_back(ArgWrapper(
-            schema_arg.name(), toTypeInferredIValue(py_kwarg_object)));
-        used_kwargs++;
-      } else if (schema_arg.default_value()) {
-        parse_py_args.emplace_back(ArgWrapper(*schema_arg.default_value()));
-      } else {
-        // The given schema cannot find either a positional or keyword argument to match against
-        // for this given schema argument. There also is no default value specified for this 
-        // schema argument. Therefore this schema cannot be the correct overload.
-        fail = true;
-        break;
-      }
-    }
-    if (!fail && 
-        // Check whether all positional arguments were matched by given Schema
-        (py_args.size() == py_args_i) &&
-        // Check if all kwargs were matched by given Schema
-        (used_kwargs == py_kwargs.size())
-        ) {
-      bool types_match = true;
-      TypeEnv type_env;
-      for (size_t j = 0; j < parse_py_args.size(); j++) {
-        std::cout << " ; parse_py_args[" << j << "]: " << parse_py_args[j].ivalue().type()->str();
-        // Now that we found that the overall schema matches, we need to check whether the types match.
-        MatchTypeReturn match = matchTypeVariables(
-            schema_args[j].type(), parse_py_args[j].ivalue().type(), type_env);
-        types_match = types_match && match.success();
-      }
-      if (types_match) {
-        std::cout << "\t=== WIN" << std::endl;
-        continue;
-      }
+    if (try_match_schema(schemas[i], py_args, py_kwargs)) {
+      std::cout << "schema[" << i << "]:\t" << *schemas[i];
+      std::cout << " - overload_name: " << schemas[i]->overload_name();
+      std::cout << "WIN" << std::endl;
     }
-    std::cout << "\t=== FAIL" << std::endl;
   }
   return torch::ones({});
 }
 
+// TODO: This should support 3 types of functions
+// fn might be scripted (i.e. StrongFunctionPtr)
+// fn might be a builtin (need to resolve!)
+// fn might be neither, so we just dispatch to some regular python for-loops
+// (not fast!)
+py::cpp_function jit_tensorwise() {
+  return py::cpp_function([](py::object fn) {
+    return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
+      auto sfn = py::cast<StrongFunctionPtr>(fn);
+      Function& f = *sfn.function_;
+      std::vector<ArgWrapper> nested_nodes;
+      for (size_t i = 0; i < args.size(); i++) {
+        if (py::isinstance<THP_ListNestedTensor>(args[i])) {
+          nested_nodes.push_back(ArgWrapper(
+              py::cast<THP_ListNestedTensor>(args[i]).data().get_structure()));
+        } else if (py::isinstance<THP_BufferNestedTensor>(args[i])) {
+          nested_nodes.push_back(
+              ArgWrapper(py::cast<THP_BufferNestedTensor>(args[i])
+                             .data()
+                             .get_structure()));
+        } else {
+          nested_nodes.push_back(ArgWrapper(toTypeInferredIValue(args[i])));
+        }
+      }
+      py::gil_scoped_release release;
+      TensorNode result = apply_jit_function(nested_nodes, f);
+      py::gil_scoped_acquire acquire;
+      return THP_ListNestedTensor(_ListNestedTensor(result));
+    });
+  });
+}
+
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index cf4b9529..91c3a076 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912204+c9b2bef'
-git_version = 'c9b2bef345cde60912cc9d78ef46b96c54e8fb3e'
+__version__ = '0.0.1.dev201912204+49d9887'
+git_version = '49d9887b58b1453e009c0256ba2dac7f1e19ce67'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 8574dc883bb198df8a4c5cadc5fd6baf2f6c54a1 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 20 Dec 2019 17:22:10 -0800
Subject: [PATCH 17/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 153 ++++++++++++++-------------
 nestedtensor/version.py              |   4 +-
 2 files changed, 84 insertions(+), 73 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 2391c54d..340c1d18 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -43,9 +43,8 @@ struct ArgWrapper {
 };
 
 // TODO: Assert that one arg must be a nestedtensor?
-static TensorNode apply_jit_function(
-    std::vector<ArgWrapper>& args,
-    Function& fn) {
+template <class F>
+static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F fn) {
   bool all_leaf = true;
   for (size_t i = 0; i < args.size(); i++) {
     if (args[i].is_nested_tensor()) {
@@ -133,15 +132,15 @@ THP_ListNestedTensor jit_apply_function(
     nested_nodes.push_back(ArgWrapper(nts[i].get_structure()));
   }
   py::gil_scoped_release release;
-  TensorNode nested_node = apply_jit_function(nested_nodes, callee);
+  TensorNode nested_node = apply_jit_function<Function&>(nested_nodes, callee);
   py::gil_scoped_acquire acquire;
   return THP_ListNestedTensor(_ListNestedTensor(nested_node));
 }
 
 static bool try_match_schema(
     const FunctionSchema* schema,
-    py::args py_args,
-    py::kwargs py_kwargs) {
+    const std::vector<ArgWrapper>& py_args,
+    const std::unordered_map<std::string, ArgWrapper>& py_kwargs) {
   // In the end it's only a match when this counter fully depleted the args.
   size_t py_args_i = 0;
   size_t used_kwargs = 0;
@@ -157,20 +156,16 @@ static bool try_match_schema(
   // and conversions. It's possible to match a Python call
   // signature to an overload with different types such as
   // Scalar and Tensor etc. simply by requiring conversion.
-  bool fail = false;
   for (size_t j = 0; j < schema_args.size(); j++) {
     // TODO: Support for self as in tryMatchArgument?
     Argument schema_arg = schema_args[j];
     if (!schema_arg.kwarg_only() && py_args_i < py_args.size()) {
       // TODO: Add support to allow conversions.
-      IValue ivalue = toTypeInferredIValue(py_args[py_args_i]);
-      parse_py_args.emplace_back(ArgWrapper(ivalue));
+      parse_py_args.push_back(py_args[py_args_i]);
       py_args_i++;
     } else if (py_kwargs.contains(schema_arg.name().c_str())) {
       // TODO: Check for no presence of duplicates in given schema
-      py::handle py_kwarg_object = py_kwargs[schema_arg.name().c_str()];
-      parse_py_args.emplace_back(
-          ArgWrapper(schema_arg.name(), toTypeInferredIValue(py_kwarg_object)));
+      parse_py_args.push_back(py_kwargs[schema_arg.name().c_str()]);
       used_kwargs++;
     } else if (schema_arg.default_value()) {
       parse_py_args.emplace_back(ArgWrapper(*schema_arg.default_value()));
@@ -206,51 +201,55 @@ static bool try_match_schema(
 }
 
 // TODO: Write comparison operation based on a subset of Argument comparison
-c10::optional<Function*> resolve_builtin(
-    py::object obj,
-    py::args py_args,
-    py::kwargs py_kwargs) {
+// TODO: Move this into jit_tensorwise and add support for all 3 cases.
+// TODO: Template apply_jit_function to work with Operation and Function.
+c10::optional<Symbol> is_builtin(py::object fn) {
   py::object builtin_name =
-      py::module::import("torch.jit").attr("_find_builtin")(obj);
-  auto name = c10::Symbol::fromQualString(py::str(builtin_name));
-
-  std::cout << "builtin_name: " << builtin_name << std::endl;
-  std::cout << "name: " << name << std::endl;
+      py::module::import("torch.jit").attr("_find_builtin")(fn);
+  Symbol name = c10::Symbol::fromQualString(py::str(builtin_name));
 
+  // TODO: Is there a cheaper way to do this?
   const auto& variants = getAllOperatorsFor(name);
-  const auto& builtin_functions = getAllBuiltinFunctionsFor(name);
-
-  // TODO: Move this into jit_tensorwise and add support for all 3 cases.
-  std::stringstream failure_messages;
-  std::vector<const FunctionSchema*> schemas;
-  for (const std::shared_ptr<Operator>& op : variants) {
-    schemas.push_back(&op->schema());
+  if (variants.size() == 0) {
+    return c10::nullopt;
   }
-  for (Function* method : builtin_functions) {
-    method->ensure_defined();
-    if (try_match_schema(&method->getSchema())) {
-      return method;
-    }
+  const auto& builtin_functions = getAllBuiltinFunctionsFor(name);
+  if (builtin_functions.size() == 0) {
+    return c10::nullopt;
   }
+  return name;
+}
 
-  // Go through each Schema candidate based on the overloads
-  // The order here matters and is given by the way we construct schemas.
-  // This is a subset of matchSchemas within jit/script/schema_matching.cpp
-  // and only implements the argument matching based on features such as types.
-  // It could eventually live in the JIT as a subcomponent that can implement
-  // overload resolution generically and outside a graph context.
-  //
-  // In essence we spend most of our time resolving types (e.g. turn
-  // single floats into lists of floats, resolving concrete types) or dealing
-  // with the unordered nature of kwargs.
-  for (size_t i = 0; i < schemas.size(); i++) {
-    if (try_match_schema(schemas[i], py_args, py_kwargs)) {
-      std::cout << "schema[" << i << "]:\t" << *schemas[i];
-      std::cout << " - overload_name: " << schemas[i]->overload_name();
-      std::cout << "WIN" << std::endl;
-    }
+//  // Go through each Schema candidate based on the overloads
+//  // The order here matters and is given by the way we construct schemas.
+//  // This is a subset of matchSchemas within jit/script/schema_matching.cpp
+//  // and only implements the argument matching based on features such as
+//  types.
+//  // It could eventually live in the JIT as a subcomponent that can implement
+//  // overload resolution generically and outside a graph context.
+//  //
+//  // In essence we spend most of our time resolving types (e.g. turn
+//  // single floats into lists of floats, resolving concrete types) or dealing
+//  // with the unordered nature of kwargs.
+//  for (size_t i = 0; i < schemas.size(); i++) {
+//    if (try_match_schema(schemas[i], py_args, py_kwargs)) {
+//      std::cout << "schema[" << i << "]:\t" << *schemas[i];
+//      std::cout << " - overload_name: " << schemas[i]->overload_name();
+//      std::cout << "WIN" << std::endl;
+//    }
+//  }
+//  return torch::ones({});
+} // namespace nested_tensor
+
+ArgWrapper wrap_arg(py::object arg) {
+  if (py::isinstance<THP_ListNestedTensor>(arg)) {
+    return ArgWrapper(
+        py::cast<THP_ListNestedTensor>(arg).data().get_structure());
+  } else if (py::isinstance<THP_BufferNestedTensor>(arg)) {
+    return ArgWrapper(
+        py::cast<THP_BufferNestedTensor>(arg).data().get_structure());
   }
-  return torch::ones({});
+  return ArgWrapper(toTypeInferredIValue(arg));
 }
 
 // TODO: This should support 3 types of functions
@@ -260,30 +259,42 @@ c10::optional<Function*> resolve_builtin(
 // (not fast!)
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
-    return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
-      auto sfn = py::cast<StrongFunctionPtr>(fn);
-      Function& f = *sfn.function_;
-      std::vector<ArgWrapper> nested_nodes;
-      for (size_t i = 0; i < args.size(); i++) {
-        if (py::isinstance<THP_ListNestedTensor>(args[i])) {
-          nested_nodes.push_back(ArgWrapper(
-              py::cast<THP_ListNestedTensor>(args[i]).data().get_structure()));
-        } else if (py::isinstance<THP_BufferNestedTensor>(args[i])) {
-          nested_nodes.push_back(
-              ArgWrapper(py::cast<THP_BufferNestedTensor>(args[i])
-                             .data()
-                             .get_structure()));
-        } else {
-          nested_nodes.push_back(ArgWrapper(toTypeInferredIValue(args[i])));
+    return py::cpp_function([fn](py::args args_, py::kwargs kwargs_) {
+      std::vector<ArgWrapper> args;
+      for (size_t i = 0; i < args_.size(); i++) {
+        nested_nodes.push_back(wrap_arg(args_[i]));
+      }
+      std::unordered_map<std::string, ArgWrapper> kwargs;
+      for (const auto& pair : kwargs_) {
+        kwargs[pair.first] = wrap_arg(paid.second);
+      }
+
+      if (py::isinstance<StrongFunctionPtr>(fn)) {
+        auto sfn = py::cast<StrongFunctionPtr>(fn);
+        Function& f = *sfn.function_;
+        py::gil_scoped_release release;
+        result = apply_jit_function(args, f);
+        py::gil_scoped_acquire acquire;
+        return THP_ListNestedTensor(_ListNestedTensor(result));
+      }
+      if (auto names = is_builtin(fn)) {
+        for (const auto& op : getAllOperatorsFor(name)) {
+          if (try_match_schema(&op->schema)) {
+            return apply_jit_function(args, op->getOperation());
+          }
+        }
+        for (const auto& op : getAllBuiltinFunctionsFor(name)) {
+          if (try_match_schema(&op->schema)) {
+            return apply_jit_function(args, op);
+          }
         }
       }
-      py::gil_scoped_release release;
-      TensorNode result = apply_jit_function(nested_nodes, f);
-      py::gil_scoped_acquire acquire;
-      return THP_ListNestedTensor(_ListNestedTensor(result));
+      // TODO: Need implementation of generic python version.
+      std::cout << "FAIL!" << std::endl;
+      return torch::ones({});
     });
   });
-}
+} // namespace torch
 
-} // namespace nested_tensor
+} // namespace torch
 } // namespace torch
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 91c3a076..4e87a71e 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912204+49d9887'
-git_version = '49d9887b58b1453e009c0256ba2dac7f1e19ce67'
+__version__ = '0.0.1.dev201912211+5347bda'
+git_version = '5347bdad6c72e1248069f4e4b122d7b83ea5ca95'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 37a0ea4e4a1b26e6568f76283ce32fde41d57de0 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 20 Dec 2019 18:06:29 -0800
Subject: [PATCH 18/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 6 +++---
 nestedtensor/version.py              | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 340c1d18..1aff1430 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -163,9 +163,9 @@ static bool try_match_schema(
       // TODO: Add support to allow conversions.
       parse_py_args.push_back(py_args[py_args_i]);
       py_args_i++;
-    } else if (py_kwargs.contains(schema_arg.name().c_str())) {
+    } else if (py_kwargs.find(schema_arg.name().c_str()) != py_kwargs.end()) {
       // TODO: Check for no presence of duplicates in given schema
-      parse_py_args.push_back(py_kwargs[schema_arg.name().c_str()]);
+      parse_py_args.push_back(py_kwargs.at(schema_arg.name().c_str()));
       used_kwargs++;
     } else if (schema_arg.default_value()) {
       parse_py_args.emplace_back(ArgWrapper(*schema_arg.default_value()));
@@ -241,7 +241,7 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 //  return torch::ones({});
 } // namespace nested_tensor
 
-ArgWrapper wrap_arg(py::object arg) {
+static ArgWrapper wrap_arg(py::object arg) {
   if (py::isinstance<THP_ListNestedTensor>(arg)) {
     return ArgWrapper(
         py::cast<THP_ListNestedTensor>(arg).data().get_structure());
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 4e87a71e..1d6e7ab3 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912211+5347bda'
-git_version = '5347bdad6c72e1248069f4e4b122d7b83ea5ca95'
+__version__ = '0.0.1.dev201912211+8574dc8'
+git_version = '8574dc883bb198df8a4c5cadc5fd6baf2f6c54a1'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 5cd97488b62fbf3366787b72a01a8d624380b54b Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 21 Dec 2019 21:55:05 -0800
Subject: [PATCH 19/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 68 ++++++++++++++++++----------
 nestedtensor/csrc/jit_list_apply.h   | 22 +++++++--
 nestedtensor/csrc/py_init.cpp        |  1 -
 nestedtensor/version.py              |  4 +-
 4 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 1aff1430..d0f7e4e4 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -42,9 +42,10 @@ struct ArgWrapper {
   TensorNode _nested_tensor;
 };
 
+
 // TODO: Assert that one arg must be a nestedtensor?
 template <class F>
-static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F fn) {
+static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F& fn) {
   bool all_leaf = true;
   for (size_t i = 0; i < args.size(); i++) {
     if (args[i].is_nested_tensor()) {
@@ -73,9 +74,10 @@ static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F fn) {
         }
       }
     }
+    // TODO: getSchema().checkAndNormalizeInputs(stack, kwargs);?
     c10::List<at::Tensor> results;
     for (size_t i = 0; i < stacks.size(); i++) {
-      results.push_back(fn(stacks[i]).toTensor());
+      results.push_back(run_function<F>(stacks[i], fn));
     }
     return TensorNode(results);
   } else {
@@ -107,11 +109,30 @@ static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F fn) {
           local_args.push_back(ArgWrapper(args[j].ivalue()));
         }
       }
-      result.push_back(apply_jit_function(local_args, fn));
+      result.push_back(apply_jit_function<F>(local_args, fn));
     }
     return TensorNode(result);
   }
 }
+
+template <class F>
+static THP_ListNestedTensor apply_jit_function_helper(
+    std::vector<ArgWrapper>& args,
+    std::unordered_map<std::string, ArgWrapper> kwargs,
+    F& op) {
+  std::vector<ArgWrapper> flat_args;
+  for (size_t i = 0; i < args.size(); i++) {
+    flat_args.push_back(args[i]);
+  }
+  for (auto kwarg : kwargs) {
+    flat_args.push_back(kwarg.second);
+  }
+  py::gil_scoped_release release;
+  TensorNode result = apply_jit_function(flat_args, op);
+  py::gil_scoped_acquire acquire;
+  return THP_ListNestedTensor(_ListNestedTensor(result));
+}
+
 THP_ListNestedTensor jit_apply_function(
     std::vector<THP_ListNestedTensor> nts_,
     py::object fn) {
@@ -132,7 +153,7 @@ THP_ListNestedTensor jit_apply_function(
     nested_nodes.push_back(ArgWrapper(nts[i].get_structure()));
   }
   py::gil_scoped_release release;
-  TensorNode nested_node = apply_jit_function<Function&>(nested_nodes, callee);
+  TensorNode nested_node = apply_jit_function<Function>(nested_nodes, callee);
   py::gil_scoped_acquire acquire;
   return THP_ListNestedTensor(_ListNestedTensor(nested_node));
 }
@@ -239,7 +260,6 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 //    }
 //  }
 //  return torch::ones({});
-} // namespace nested_tensor
 
 static ArgWrapper wrap_arg(py::object arg) {
   if (py::isinstance<THP_ListNestedTensor>(arg)) {
@@ -262,39 +282,39 @@ py::cpp_function jit_tensorwise() {
     return py::cpp_function([fn](py::args args_, py::kwargs kwargs_) {
       std::vector<ArgWrapper> args;
       for (size_t i = 0; i < args_.size(); i++) {
-        nested_nodes.push_back(wrap_arg(args_[i]));
+        args.push_back(wrap_arg(args_[i]));
       }
       std::unordered_map<std::string, ArgWrapper> kwargs;
-      for (const auto& pair : kwargs_) {
-        kwargs[pair.first] = wrap_arg(paid.second);
+      for (const std::pair<py::handle, py::handle>& pair : kwargs_) {
+        kwargs.emplace(std::make_pair(
+            std::string(py::str(pair.first)),
+            wrap_arg(py::reinterpret_borrow<py::object>(pair.second))));
       }
 
       if (py::isinstance<StrongFunctionPtr>(fn)) {
         auto sfn = py::cast<StrongFunctionPtr>(fn);
-        Function& f = *sfn.function_;
-        py::gil_scoped_release release;
-        result = apply_jit_function(args, f);
-        py::gil_scoped_acquire acquire;
-        return THP_ListNestedTensor(_ListNestedTensor(result));
+        Function& op = *sfn.function_;
+        return apply_jit_function_helper<Function>(args, kwargs, op);
       }
-      if (auto names = is_builtin(fn)) {
-        for (const auto& op : getAllOperatorsFor(name)) {
-          if (try_match_schema(&op->schema)) {
-            return apply_jit_function(args, op->getOperation());
+      if (auto name = is_builtin(fn)) {
+        for (const auto& op : getAllOperatorsFor(*name)) {
+          if (try_match_schema(&op->schema(), args, kwargs)) {
+            Operation actual = op->getOperation();
+            return apply_jit_function_helper<Operation>(args, kwargs, actual);
           }
         }
-        for (const auto& op : getAllBuiltinFunctionsFor(name)) {
-          if (try_match_schema(&op->schema)) {
-            return apply_jit_function(args, op);
+        for (const auto& op : getAllBuiltinFunctionsFor(*name)) {
+          if (try_match_schema(&op->getSchema(), args, kwargs)) {
+            return apply_jit_function_helper<Function>(args, kwargs, *op);
           }
         }
       }
       // TODO: Need implementation of generic python version.
       std::cout << "FAIL!" << std::endl;
-      return torch::ones({});
+      TensorNode result;
+      return THP_ListNestedTensor(_ListNestedTensor(result));
     });
   });
-} // namespace torch
-
-} // namespace torch
+}
+} // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 5a21a4aa..eb6c272e 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -9,13 +9,27 @@
 
 namespace torch {
 namespace nested_tensor {
+
+// TODO Expand to IValues to support generic lists?
+template <class F>
+inline at::Tensor run_function(std::vector<c10::IValue> stack, F& fn);
+
+template <>
+inline at::Tensor run_function(std::vector<c10::IValue> stack, Function& fn) {
+  return fn(stack).toTensor();
+}
+
+template <>
+inline at::Tensor run_function(std::vector<c10::IValue> stack, Operation& fn) {
+  fn(stack);
+  return stack.front().toTensor();
+}
+
 THP_ListNestedTensor jit_apply_function(
     std::vector<THP_ListNestedTensor> nts_,
     py::object fn);
+
 py::cpp_function jit_tensorwise();
-at::Tensor resolve_builtin(
-    py::object obj,
-    py::args py_args,
-    py::kwargs py_kwargs = {});
+
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp
index 73152bea..b1092b16 100644
--- a/nestedtensor/csrc/py_init.cpp
+++ b/nestedtensor/csrc/py_init.cpp
@@ -165,5 +165,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 
   m.def("jit_apply_function", &torch::nested_tensor::jit_apply_function);
   m.def("jit_tensorwise", &torch::nested_tensor::jit_tensorwise);
-  m.def("resolve_builtin", &torch::nested_tensor::resolve_builtin);
 }
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 1d6e7ab3..2b3c86c5 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912211+8574dc8'
-git_version = '8574dc883bb198df8a4c5cadc5fd6baf2f6c54a1'
+__version__ = '0.0.1.dev201912225+37a0ea4'
+git_version = '37a0ea4e4a1b26e6568f76283ce32fde41d57de0'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From ca4faafe2a96329b404bdf1cbec000eba8e16378 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 21 Dec 2019 22:31:39 -0800
Subject: [PATCH 20/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         | 16 ++++++++++++----
 nestedtensor/csrc/jit_list_apply.cpp | 20 +++++++++++++++-----
 nestedtensor/csrc/jit_list_apply.h   | 16 ++++++++++++++--
 nestedtensor/version.py              |  4 ++--
 4 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index a2bac43c..547ad9ac 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -10,7 +10,15 @@ def f(i, w):
 
 
 if __name__ == "__main__":
-    r = f(nestedtensor._C._ListNestedTensor([torch.randn(1, 3, 10, 20)]),
-        nestedtensor._C._ListNestedTensor([torch.randn(5, 3, 3, 3)]))
-    
-    print(r.nested_size())
+    # r = f(nestedtensor._C._ListNestedTensor([torch.randn(1, 3, 10, 20)]),
+    #     nestedtensor._C._ListNestedTensor([torch.randn(5, 3, 3, 3)]))
+    # 
+    # print(r.nested_size())
+
+    na = nestedtensor._C.jit_tensorwise()(torch.add)
+    print("111")
+    print(na(nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
+        nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
+        torch.tensor(3.0),
+        ))
+    print("222")
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index d0f7e4e4..3d7a670f 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -24,6 +24,11 @@ struct ArgWrapper {
   }
 
   c10::IValue ivalue() {
+    if (_is_nested_tensor) {
+      TensorNode first_tensor_node = get_first_leaf(_nested_tensor);
+      // TODO: What if this is empty?
+      return c10::IValue(first_tensor_node.payload(0));
+    }
     return _ivalue;
   }
 
@@ -42,7 +47,6 @@ struct ArgWrapper {
   TensorNode _nested_tensor;
 };
 
-
 // TODO: Assert that one arg must be a nestedtensor?
 template <class F>
 static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F& fn) {
@@ -206,12 +210,13 @@ static bool try_match_schema(
     bool types_match = true;
     TypeEnv type_env;
     for (size_t j = 0; j < parse_py_args.size(); j++) {
-      std::cout << " ; parse_py_args[" << j
-                << "]: " << parse_py_args[j].ivalue().type()->str();
+      // std::cout << " ; parse_py_args[" << j
+      //           << "]: " << parse_py_args[j].ivalue().type()->str();
       // Now that we found that the overall schema matches, we need to check
       // whether the types match.
-      MatchTypeReturn match = matchTypeVariables(
-          schema_args[j].type(), parse_py_args[j].ivalue().type(), type_env);
+      TypePtr type_j = parse_py_args[j].ivalue().type();
+      MatchTypeReturn match =
+          matchTypeVariables(schema_args[j].type(), type_j, type_env);
       types_match = types_match && match.success();
     }
     if (types_match) {
@@ -292,6 +297,7 @@ py::cpp_function jit_tensorwise() {
       }
 
       if (py::isinstance<StrongFunctionPtr>(fn)) {
+        std::cout << "is StrongFunctionPtr" << std::endl;
         auto sfn = py::cast<StrongFunctionPtr>(fn);
         Function& op = *sfn.function_;
         return apply_jit_function_helper<Function>(args, kwargs, op);
@@ -299,12 +305,16 @@ py::cpp_function jit_tensorwise() {
       if (auto name = is_builtin(fn)) {
         for (const auto& op : getAllOperatorsFor(*name)) {
           if (try_match_schema(&op->schema(), args, kwargs)) {
+            std::cout << "is builtin Operation with schema: " << op->schema()
+                      << std::endl;
             Operation actual = op->getOperation();
             return apply_jit_function_helper<Operation>(args, kwargs, actual);
           }
         }
         for (const auto& op : getAllBuiltinFunctionsFor(*name)) {
           if (try_match_schema(&op->getSchema(), args, kwargs)) {
+            std::cout << "is builtin Function with schema: " << op->getSchema()
+                      << std::endl;
             return apply_jit_function_helper<Function>(args, kwargs, *op);
           }
         }
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index eb6c272e..d13e9563 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -16,13 +16,25 @@ inline at::Tensor run_function(std::vector<c10::IValue> stack, F& fn);
 
 template <>
 inline at::Tensor run_function(std::vector<c10::IValue> stack, Function& fn) {
-  return fn(stack).toTensor();
+  std::cout << "run_function_Function" << std::endl;
+  c10::IValue result = fn(stack);
+  std::cout << "finished result_Function" << std::endl;
+  return result.toTensor();
 }
 
 template <>
 inline at::Tensor run_function(std::vector<c10::IValue> stack, Operation& fn) {
+  size_t i = 0;
+  for (c10::IValue& ival : stack) {
+    std::cout << "ival " << i << " : " << ival << std::endl;
+    i++;
+  }
+  std::cout << "run_function_Operation" << std::endl;
   fn(stack);
-  return stack.front().toTensor();
+  std::cout << "run_function_Operation stack finished" << std::endl;
+  c10::IValue result = stack.front();
+  std::cout << "finished result_Operation" << std::endl;
+  return result.toTensor();
 }
 
 THP_ListNestedTensor jit_apply_function(
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 2b3c86c5..03303579 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912225+37a0ea4'
-git_version = '37a0ea4e4a1b26e6568f76283ce32fde41d57de0'
+__version__ = '0.0.1.dev201912226+5cd9748'
+git_version = '5cd97488b62fbf3366787b72a01a8d624380b54b'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 0a9e50df27d6580d5f2f2d8be344c2fe36709d09 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 21 Dec 2019 22:36:33 -0800
Subject: [PATCH 21/49] Checkpoint

---
 benchmarks/jit_tensorwise.py       | 4 ++--
 nestedtensor/csrc/jit_list_apply.h | 2 +-
 nestedtensor/version.py            | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index 547ad9ac..3c195a27 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -18,7 +18,7 @@ def f(i, w):
     na = nestedtensor._C.jit_tensorwise()(torch.add)
     print("111")
     print(na(nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
-        nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
-        torch.tensor(3.0),
+        3.0,
+        3.0
         ))
     print("222")
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index d13e9563..d66e254e 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -26,7 +26,7 @@ template <>
 inline at::Tensor run_function(std::vector<c10::IValue> stack, Operation& fn) {
   size_t i = 0;
   for (c10::IValue& ival : stack) {
-    std::cout << "ival " << i << " : " << ival << std::endl;
+    std::cout << "ival " << i << " : " << ival.tagKind() << std::endl;
     i++;
   }
   std::cout << "run_function_Operation" << std::endl;
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 03303579..e2428445 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912226+5cd9748'
-git_version = '5cd97488b62fbf3366787b72a01a8d624380b54b'
+__version__ = '0.0.1.dev201912226+ca4faaf'
+git_version = 'ca4faafe2a96329b404bdf1cbec000eba8e16378'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 636e4fdd6b55c1b73c97ed3cb0455b771db4fdd9 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 21 Dec 2019 22:39:05 -0800
Subject: [PATCH 22/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         | 6 +++---
 nestedtensor/csrc/jit_list_apply.cpp | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index 3c195a27..0e03b4fc 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -17,8 +17,8 @@ def f(i, w):
 
     na = nestedtensor._C.jit_tensorwise()(torch.add)
     print("111")
-    print(na(nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
-        3.0,
-        3.0
+    print(na(
+        nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
+        nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
         ))
     print("222")
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 3d7a670f..ce12c5da 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -162,6 +162,7 @@ THP_ListNestedTensor jit_apply_function(
   return THP_ListNestedTensor(_ListNestedTensor(nested_node));
 }
 
+// TODO: Write separate C++ test for overloads as test cases
 static bool try_match_schema(
     const FunctionSchema* schema,
     const std::vector<ArgWrapper>& py_args,

From ffa374ef98019505f681206b389f71170df11690 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 21 Dec 2019 23:31:06 -0800
Subject: [PATCH 23/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         |  1 +
 nestedtensor/csrc/jit_list_apply.cpp | 25 +++++++++++++++++++++----
 nestedtensor/version.py              |  4 ++--
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index 0e03b4fc..38eefecb 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -20,5 +20,6 @@ def f(i, w):
     print(na(
         nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
         nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
+        3.0
         ))
     print("222")
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index ce12c5da..2ec9bdd7 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -167,6 +167,7 @@ static bool try_match_schema(
     const FunctionSchema* schema,
     const std::vector<ArgWrapper>& py_args,
     const std::unordered_map<std::string, ArgWrapper>& py_kwargs) {
+  std::cout << "Checking match for schema: " << *schema << std::endl;
   // In the end it's only a match when this counter fully depleted the args.
   size_t py_args_i = 0;
   size_t used_kwargs = 0;
@@ -194,12 +195,15 @@ static bool try_match_schema(
       parse_py_args.push_back(py_kwargs.at(schema_arg.name().c_str()));
       used_kwargs++;
     } else if (schema_arg.default_value()) {
+      // TODO: How is this converted to ScalarType if it's a int (usually)?
+      // What mechanism currently does this kind of conversion.
       parse_py_args.emplace_back(ArgWrapper(*schema_arg.default_value()));
     } else {
       // The given schema cannot find either a positional or keyword argument to
       // match against for this given schema argument. There also is no default
       // value specified for this schema argument. Therefore this schema cannot
       // be the correct overload.
+      std::cout << "ARGS COUNT OFF!" << std::endl;
       return false;
     }
   }
@@ -212,18 +216,31 @@ static bool try_match_schema(
     TypeEnv type_env;
     for (size_t j = 0; j < parse_py_args.size(); j++) {
       // std::cout << " ; parse_py_args[" << j
-      //           << "]: " << parse_py_args[j].ivalue().type()->str();
+      //           << "]: " << type_j->str();
       // Now that we found that the overall schema matches, we need to check
       // whether the types match.
+      // TODO: Need Subtypes and argument type conversions (e.g. convert one
+      // float to list of floats with right number of elements).
+      // MatchTypeReturn match =
+      //     matchTypeVariables(schema_args[j].type(), type_j, type_env);
       TypePtr type_j = parse_py_args[j].ivalue().type();
-      MatchTypeReturn match =
-          matchTypeVariables(schema_args[j].type(), type_j, type_env);
-      types_match = types_match && match.success();
+      std::cout << " x parse_py_args[" << j << "]: " << type_j->str();
+      std::cout << "\t=\t"
+                << "schema_args[" << j << "]: " << schema_args[j].type()->str();
+      // TODO: We want to know whether the actual argument is a convertible
+      // subtype to the one used in the schema.
+      // XXX: CONTINUE HERE!
+      types_match = types_match && matchTypeVariables(schema_args[j].type(), type_j, type_env).success();
+      std::cout << "\t types_match: " << types_match;
+      std::cout << std::endl;
     }
+    std::cout << std::endl;
     if (types_match) {
+      std::cout << "FOUND IT!" << std::endl;
       return true;
     }
   }
+  std::cout << "ARGS SIZES MISMATCHED" << std::endl;
   return false;
 }
 
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index e2428445..0200dc30 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912226+ca4faaf'
-git_version = 'ca4faafe2a96329b404bdf1cbec000eba8e16378'
+__version__ = '0.0.1.dev201912227+636e4fd'
+git_version = '636e4fdd6b55c1b73c97ed3cb0455b771db4fdd9'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From a7d9095b88155c245af3e9a16d1f8a88b0df87d7 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Wed, 25 Dec 2019 18:46:16 -0800
Subject: [PATCH 24/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         |  1 -
 nestedtensor/csrc/jit_list_apply.cpp | 20 +++++++++++++++++---
 nestedtensor/version.py              |  4 ++--
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index 38eefecb..0e03b4fc 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -20,6 +20,5 @@ def f(i, w):
     print(na(
         nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
         nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
-        3.0
         ))
     print("222")
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 2ec9bdd7..e0148d87 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -40,6 +40,15 @@ struct ArgWrapper {
     return _name;
   }
 
+  // XXX: CONTINUE!
+  void standardize() {
+    if (!is_nested_tensor()) {
+      if (_ivalue.isScalar()) {
+        _ivalue = c10::IValue(_ivalue.toScalar());
+      }
+    }
+  }
+
  private:
   std::string _name;
   bool _is_nested_tensor;
@@ -197,7 +206,9 @@ static bool try_match_schema(
     } else if (schema_arg.default_value()) {
       // TODO: How is this converted to ScalarType if it's a int (usually)?
       // What mechanism currently does this kind of conversion.
-      parse_py_args.emplace_back(ArgWrapper(*schema_arg.default_value()));
+      auto default_arg_wrapper = ArgWrapper(*schema_arg.default_value());
+      default_arg_wrapper.standardize();
+      parse_py_args.emplace_back(default_arg_wrapper);
     } else {
       // The given schema cannot find either a positional or keyword argument to
       // match against for this given schema argument. There also is no default
@@ -229,8 +240,11 @@ static bool try_match_schema(
                 << "schema_args[" << j << "]: " << schema_args[j].type()->str();
       // TODO: We want to know whether the actual argument is a convertible
       // subtype to the one used in the schema.
-      // XXX: CONTINUE HERE!
-      types_match = types_match && matchTypeVariables(schema_args[j].type(), type_j, type_env).success();
+      // TODO: Need type env?
+      // types_match = types_match && matchTypeVariables(schema_args[j].type(),
+      // type_j, type_env).success();
+      types_match =
+          types_match && (schema_args[j].type()->kind() == type_j->kind());
       std::cout << "\t types_match: " << types_match;
       std::cout << std::endl;
     }
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 0200dc30..02629aa6 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912227+636e4fd'
-git_version = '636e4fdd6b55c1b73c97ed3cb0455b771db4fdd9'
+__version__ = '0.0.1.dev2019122422+ffa374e'
+git_version = 'ffa374ef98019505f681206b389f71170df11690'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 8911706629b5b310b60d8cace987c75c8d138ba5 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Wed, 25 Dec 2019 19:31:19 -0800
Subject: [PATCH 25/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 171 +++++++++++++--------------
 nestedtensor/version.py              |   4 +-
 2 files changed, 86 insertions(+), 89 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index e0148d87..b5dc288e 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -40,15 +40,6 @@ struct ArgWrapper {
     return _name;
   }
 
-  // XXX: CONTINUE!
-  void standardize() {
-    if (!is_nested_tensor()) {
-      if (_ivalue.isScalar()) {
-        _ivalue = c10::IValue(_ivalue.toScalar());
-      }
-    }
-  }
-
  private:
   std::string _name;
   bool _is_nested_tensor;
@@ -128,18 +119,34 @@ static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F& fn) {
   }
 }
 
-template <class F>
-static THP_ListNestedTensor apply_jit_function_helper(
-    std::vector<ArgWrapper>& args,
-    std::unordered_map<std::string, ArgWrapper> kwargs,
-    F& op) {
+static ArgWrapper wrap_arg(py::object arg) {
+  if (py::isinstance<THP_ListNestedTensor>(arg)) {
+    return ArgWrapper(
+        py::cast<THP_ListNestedTensor>(arg).data().get_structure());
+  } else if (py::isinstance<THP_BufferNestedTensor>(arg)) {
+    return ArgWrapper(
+        py::cast<THP_BufferNestedTensor>(arg).data().get_structure());
+  }
+  return ArgWrapper(toTypeInferredIValue(arg));
+}
+
+static c10::optional<std::vector<ArgWrapper>> flatten_args(
+    py::args args_,
+    py::kwargs kwargs_) {
   std::vector<ArgWrapper> flat_args;
-  for (size_t i = 0; i < args.size(); i++) {
-    flat_args.push_back(args[i]);
+  for (size_t i = 0; i < args_.size(); i++) {
+    flat_args.push_back(wrap_arg(args_[i]));
   }
-  for (auto kwarg : kwargs) {
-    flat_args.push_back(kwarg.second);
+  std::unordered_map<std::string, ArgWrapper> kwargs;
+  for (const std::pair<py::handle, py::handle>& pair : kwargs_) {
+    flat_args.push_back(py::reinterpret_borrow<py::object>(pair.second));
   }
+}
+
+template <class F>
+static THP_ListNestedTensor apply_jit_function_helper(
+    std::vector<ArgWrapper>& flat_args,
+    F& op) {
   py::gil_scoped_release release;
   TensorNode result = apply_jit_function(flat_args, op);
   py::gil_scoped_acquire acquire;
@@ -172,17 +179,19 @@ THP_ListNestedTensor jit_apply_function(
 }
 
 // TODO: Write separate C++ test for overloads as test cases
-static bool try_match_schema(
+static c10::optional<std::vector<ArgWrapper>> try_match_schema(
     const FunctionSchema* schema,
-    const std::vector<ArgWrapper>& py_args,
-    const std::unordered_map<std::string, ArgWrapper>& py_kwargs) {
+    py::args py_args,
+    py::kwargs py_kwargs) {
+  // const std::vector<ArgWrapper>& py_args,
+  // const std::unordered_map<std::string, ArgWrapper>& py_kwargs) {
   std::cout << "Checking match for schema: " << *schema << std::endl;
   // In the end it's only a match when this counter fully depleted the args.
   size_t py_args_i = 0;
   size_t used_kwargs = 0;
-  std::vector<bool> used_kwarg(py_kwargs.size(), false);
   const std::vector<Argument>& schema_args = schema->arguments();
   std::vector<ArgWrapper> parse_py_args;
+
   // For each argument in the Schema, see if it can be matched up with the
   // given python arguments to determine whether it's the right overload.
   //
@@ -195,67 +204,75 @@ static bool try_match_schema(
   for (size_t j = 0; j < schema_args.size(); j++) {
     // TODO: Support for self as in tryMatchArgument?
     Argument schema_arg = schema_args[j];
+    py::object py_arg;
     if (!schema_arg.kwarg_only() && py_args_i < py_args.size()) {
       // TODO: Add support to allow conversions.
-      parse_py_args.push_back(py_args[py_args_i]);
+      py_arg = py_args[py_args_i];
       py_args_i++;
     } else if (py_kwargs.find(schema_arg.name().c_str()) != py_kwargs.end()) {
       // TODO: Check for no presence of duplicates in given schema
-      parse_py_args.push_back(py_kwargs.at(schema_arg.name().c_str()));
+      py_arg = py_kwargs[schema_arg.name().c_str()];
       used_kwargs++;
     } else if (schema_arg.default_value()) {
       // TODO: How is this converted to ScalarType if it's a int (usually)?
       // What mechanism currently does this kind of conversion.
-      auto default_arg_wrapper = ArgWrapper(*schema_arg.default_value());
-      default_arg_wrapper.standardize();
-      parse_py_args.emplace_back(default_arg_wrapper);
+      py_arg = toPyObject(*schema_arg.default_value());
     } else {
       // The given schema cannot find either a positional or keyword argument to
       // match against for this given schema argument. There also is no default
       // value specified for this schema argument. Therefore this schema cannot
       // be the correct overload.
       std::cout << "ARGS COUNT OFF!" << std::endl;
-      return false;
+      return c10::nullopt;
     }
+    // TODO: NestedTensor support
+    IValue ivalue = toIValue(py_arg, schema_arg.type());
+    parse_py_args.push_back(ArgWrapper(ivalue));
   }
   if (
       // Check whether all positional arguments were matched by given Schema
       (py_args.size() == py_args_i) &&
       // Check if all kwargs were matched by given Schema
       (used_kwargs == py_kwargs.size())) {
-    bool types_match = true;
-    TypeEnv type_env;
-    for (size_t j = 0; j < parse_py_args.size(); j++) {
-      // std::cout << " ; parse_py_args[" << j
-      //           << "]: " << type_j->str();
-      // Now that we found that the overall schema matches, we need to check
-      // whether the types match.
-      // TODO: Need Subtypes and argument type conversions (e.g. convert one
-      // float to list of floats with right number of elements).
-      // MatchTypeReturn match =
-      //     matchTypeVariables(schema_args[j].type(), type_j, type_env);
-      TypePtr type_j = parse_py_args[j].ivalue().type();
-      std::cout << " x parse_py_args[" << j << "]: " << type_j->str();
-      std::cout << "\t=\t"
-                << "schema_args[" << j << "]: " << schema_args[j].type()->str();
-      // TODO: We want to know whether the actual argument is a convertible
-      // subtype to the one used in the schema.
-      // TODO: Need type env?
-      // types_match = types_match && matchTypeVariables(schema_args[j].type(),
-      // type_j, type_env).success();
-      types_match =
-          types_match && (schema_args[j].type()->kind() == type_j->kind());
-      std::cout << "\t types_match: " << types_match;
-      std::cout << std::endl;
-    }
-    std::cout << std::endl;
-    if (types_match) {
-      std::cout << "FOUND IT!" << std::endl;
-      return true;
-    }
+    //    bool types_match = true;
+    //    TypeEnv type_env;
+    //    for (size_t j = 0; j < parse_py_args.size(); j++) {
+    //      // std::cout << " ; parse_py_args[" << j
+    //      //           << "]: " << type_j->str();
+    //      // Now that we found that the overall schema matches, we need to
+    //      check
+    //      // whether the types match.
+    //      // TODO: Need Subtypes and argument type conversions (e.g. convert
+    //      one
+    //      // float to list of floats with right number of elements).
+    //      // MatchTypeReturn match =
+    //      //     matchTypeVariables(schema_args[j].type(), type_j, type_env);
+    //      TypePtr type_j = parse_py_args[j].ivalue().type();
+    //      std::cout << " x parse_py_args[" << j << "]: " << type_j->str();
+    //      std::cout << "\t=\t"
+    //                << "schema_args[" << j << "]: " <<
+    //                schema_args[j].type()->str();
+    //      // TODO: We want to know whether the actual argument is a
+    //      convertible
+    //      // subtype to the one used in the schema.
+    //      // TODO: Need type env?
+    //      // types_match = types_match &&
+    //      matchTypeVariables(schema_args[j].type(),
+    //      // type_j, type_env).success();
+    //      types_match =
+    //          types_match && (schema_args[j].type()->kind() ==
+    //          type_j->kind());
+    //      std::cout << "\t types_match: " << types_match;
+    //      std::cout << std::endl;
+    //    }
+    //    std::cout << std::endl;
+    //    if (types_match) {
+    std::cout << "FOUND IT!" << std::endl;
+    return parse_py_args;
+    //    }
   }
   std::cout << "ARGS SIZES MISMATCHED" << std::endl;
-  return false;
+  return c10::nullopt;
 }
 
 // TODO: Write comparison operation based on a subset of Argument comparison
@@ -298,17 +315,6 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 //  }
 //  return torch::ones({});
 
-static ArgWrapper wrap_arg(py::object arg) {
-  if (py::isinstance<THP_ListNestedTensor>(arg)) {
-    return ArgWrapper(
-        py::cast<THP_ListNestedTensor>(arg).data().get_structure());
-  } else if (py::isinstance<THP_BufferNestedTensor>(arg)) {
-    return ArgWrapper(
-        py::cast<THP_BufferNestedTensor>(arg).data().get_structure());
-  }
-  return ArgWrapper(toTypeInferredIValue(arg));
-}
-
 // TODO: This should support 3 types of functions
 // fn might be scripted (i.e. StrongFunctionPtr)
 // fn might be a builtin (need to resolve!)
@@ -316,38 +322,29 @@ static ArgWrapper wrap_arg(py::object arg) {
 // (not fast!)
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
-    return py::cpp_function([fn](py::args args_, py::kwargs kwargs_) {
-      std::vector<ArgWrapper> args;
-      for (size_t i = 0; i < args_.size(); i++) {
-        args.push_back(wrap_arg(args_[i]));
-      }
-      std::unordered_map<std::string, ArgWrapper> kwargs;
-      for (const std::pair<py::handle, py::handle>& pair : kwargs_) {
-        kwargs.emplace(std::make_pair(
-            std::string(py::str(pair.first)),
-            wrap_arg(py::reinterpret_borrow<py::object>(pair.second))));
-      }
-
+    return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
       if (py::isinstance<StrongFunctionPtr>(fn)) {
         std::cout << "is StrongFunctionPtr" << std::endl;
         auto sfn = py::cast<StrongFunctionPtr>(fn);
         Function& op = *sfn.function_;
-        return apply_jit_function_helper<Function>(args, kwargs, op);
+        std::vector<ArgWrapper> flat_args = flatten_args(args, kwargs);
+        return apply_jit_function_helper<Function>(flat_args, op);
       }
       if (auto name = is_builtin(fn)) {
         for (const auto& op : getAllOperatorsFor(*name)) {
-          if (try_match_schema(&op->schema(), args, kwargs)) {
+          if (auto flat_args = try_match_schema(&op->schema(), args, kwargs)) {
             std::cout << "is builtin Operation with schema: " << op->schema()
                       << std::endl;
             Operation actual = op->getOperation();
-            return apply_jit_function_helper<Operation>(args, kwargs, actual);
+            return apply_jit_function_helper<Operation>(*flat_args, actual);
           }
         }
         for (const auto& op : getAllBuiltinFunctionsFor(*name)) {
-          if (try_match_schema(&op->getSchema(), args, kwargs)) {
+          if (auto flat_args =
+                  try_match_schema(&op->getSchema(), args, kwargs)) {
             std::cout << "is builtin Function with schema: " << op->getSchema()
                       << std::endl;
-            return apply_jit_function_helper<Function>(args, kwargs, *op);
+            return apply_jit_function_helper<Function>(*flat_args, *op);
           }
         }
       }
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 02629aa6..1582265c 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev2019122422+ffa374e'
-git_version = 'ffa374ef98019505f681206b389f71170df11690'
+__version__ = '0.0.1.dev201912262+a7d9095'
+git_version = 'a7d9095b88155c245af3e9a16d1f8a88b0df87d7'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 27bc9dc76a80351c2d97ad1e8b9d12792c1e3012 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Wed, 25 Dec 2019 20:05:39 -0800
Subject: [PATCH 26/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         | 10 ++++++++-
 nestedtensor/csrc/jit_list_apply.cpp | 31 +++++++++++++++++++++-------
 nestedtensor/version.py              |  4 ++--
 3 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index 0e03b4fc..627e1656 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -16,9 +16,17 @@ def f(i, w):
     # print(r.nested_size())
 
     na = nestedtensor._C.jit_tensorwise()(torch.add)
+
     print("111")
     print(na(
-        nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
+        4.0,
         nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
         ))
     print("222")
+
+    # print("333")
+    # print(na(
+    #     torch.randn(1, 2),
+    #     torch.randn(1, 2),
+    #     ))
+    # print("444")
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index b5dc288e..30a21408 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -119,18 +119,27 @@ static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F& fn) {
   }
 }
 
-static ArgWrapper wrap_arg(py::object arg) {
+// NestedTensor taken as Tensor type
+static ArgWrapper wrap_arg(
+    py::object arg,
+    c10::optional<c10::TypePtr> type_ptr = c10::nullopt) {
   if (py::isinstance<THP_ListNestedTensor>(arg)) {
+    TORCH_CHECK((*type_ptr)->kind() == TensorType::Kind);
     return ArgWrapper(
         py::cast<THP_ListNestedTensor>(arg).data().get_structure());
   } else if (py::isinstance<THP_BufferNestedTensor>(arg)) {
+    TORCH_CHECK((*type_ptr)->kind() == TensorType::Kind);
     return ArgWrapper(
         py::cast<THP_BufferNestedTensor>(arg).data().get_structure());
   }
-  return ArgWrapper(toTypeInferredIValue(arg));
+  if (type_ptr) {
+    return ArgWrapper(toIValue(arg, *type_ptr));
+  } else {
+    return ArgWrapper(toTypeInferredIValue(arg));
+  }
 }
 
-static c10::optional<std::vector<ArgWrapper>> flatten_args(
+static std::vector<ArgWrapper> flatten_args(
     py::args args_,
     py::kwargs kwargs_) {
   std::vector<ArgWrapper> flat_args;
@@ -139,8 +148,10 @@ static c10::optional<std::vector<ArgWrapper>> flatten_args(
   }
   std::unordered_map<std::string, ArgWrapper> kwargs;
   for (const std::pair<py::handle, py::handle>& pair : kwargs_) {
-    flat_args.push_back(py::reinterpret_borrow<py::object>(pair.second));
+    flat_args.push_back(
+        wrap_arg(py::reinterpret_borrow<py::object>(pair.second)));
   }
+  return flat_args;
 }
 
 template <class F>
@@ -209,7 +220,7 @@ static c10::optional<std::vector<ArgWrapper>> try_match_schema(
       // TODO: Add support to allow conversions.
       py_arg = py_args[py_args_i];
       py_args_i++;
-    } else if (py_kwargs.find(schema_arg.name().c_str()) != py_kwargs.end()) {
+    } else if (py_kwargs.contains(schema_arg.name().c_str())) {
       // TODO: Check for no presence of duplicates in given schema
       py_arg = py_kwargs[schema_arg.name().c_str()];
       used_kwargs++;
@@ -226,8 +237,13 @@ static c10::optional<std::vector<ArgWrapper>> try_match_schema(
       return c10::nullopt;
     }
     // TODO: NestedTensor support
-    IValue ivalue = toIValue(py_arg, schema_arg.type());
-    parse_py_args.push_back(ArgWrapper(ivalue));
+    try {
+      ArgWrapper arg = wrap_arg(py_arg, schema_arg.type());
+      parse_py_args.push_back(arg);
+    } catch (std::exception& e) {
+      // std::cout << "Wrap arg exception: " << e.what() << std::endl;
+      return c10::nullopt;
+    }
   }
   if (
       // Check whether all positional arguments were matched by given Schema
@@ -330,6 +346,7 @@ py::cpp_function jit_tensorwise() {
         std::vector<ArgWrapper> flat_args = flatten_args(args, kwargs);
         return apply_jit_function_helper<Function>(flat_args, op);
       }
+      // TODO: Support for no NestedTensor arguments
       if (auto name = is_builtin(fn)) {
         for (const auto& op : getAllOperatorsFor(*name)) {
           if (auto flat_args = try_match_schema(&op->schema(), args, kwargs)) {
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 1582265c..ca2b54ee 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912262+a7d9095'
-git_version = 'a7d9095b88155c245af3e9a16d1f8a88b0df87d7'
+__version__ = '0.0.1.dev201912264+8911706'
+git_version = '8911706629b5b310b60d8cace987c75c8d138ba5'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From bea2e46df08f2c06375155e1a482db88591ee964 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Wed, 25 Dec 2019 20:24:25 -0800
Subject: [PATCH 27/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         | 8 ++++++--
 benchmarks/nearest_neighbors.py      | 4 ++--
 nestedtensor/csrc/jit_list_apply.cpp | 4 ++--
 nestedtensor/nested/monkey_patch.py  | 6 +++---
 nestedtensor/nested/nested.py        | 1 +
 nestedtensor/version.py              | 4 ++--
 test/test_nested_tensor_nary.py      | 4 ++++
 7 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index 627e1656..17670e32 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -15,14 +15,18 @@ def f(i, w):
     # 
     # print(r.nested_size())
 
-    na = nestedtensor._C.jit_tensorwise()(torch.add)
+    na = nestedtensor._C.jit_tensorwise()(torch.mul)
 
     print("111")
+    out = nestedtensor._C._ListNestedTensor([torch.randn(1, 2)])
     print(na(
-        4.0,
         nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
+        4.0,
+        out = out
         ))
     print("222")
+    print('out')
+    print(out)
 
     # print("333")
     # print(na(
diff --git a/benchmarks/nearest_neighbors.py b/benchmarks/nearest_neighbors.py
index fc5326a2..f9a77fdd 100644
--- a/benchmarks/nearest_neighbors.py
+++ b/benchmarks/nearest_neighbors.py
@@ -139,10 +139,10 @@ def benchmark_fn(fn, run_time = 15.0):
     gen_results_nested_mv = gen_algorithm_nested_mv(keys, sub_clusters)
     gen_results_nested_jit_mv = gen_algorithm_nested_jit_mv(keys, sub_clusters)
 
+    print(benchmark_fn(gen_results_nested_mv))
     print(benchmark_fn(gen_results_naive))
     print(benchmark_fn(gen_results_mv))
-    print(benchmark_fn(gen_results_nested_mv))
-    print(benchmark_fn(gen_results_nested_jit_mv))
+    # print(benchmark_fn(gen_results_nested_jit_mv))
     # import cProfile, pstats, io
     # pr = cProfile.Profile()
     # pr.enable()
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 30a21408..ba56e068 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -241,7 +241,7 @@ static c10::optional<std::vector<ArgWrapper>> try_match_schema(
       ArgWrapper arg = wrap_arg(py_arg, schema_arg.type());
       parse_py_args.push_back(arg);
     } catch (std::exception& e) {
-      // std::cout << "Wrap arg exception: " << e.what() << std::endl;
+      std::cout << "Wrap arg exception: " << e.what() << std::endl;
       return c10::nullopt;
     }
   }
@@ -366,7 +366,7 @@ py::cpp_function jit_tensorwise() {
         }
       }
       // TODO: Need implementation of generic python version.
-      std::cout << "FAIL!" << std::endl;
+      std::cout << "FAIL! Can't find something for " << fn << std::endl;
       TensorNode result;
       return THP_ListNestedTensor(_ListNestedTensor(result));
     });
diff --git a/nestedtensor/nested/monkey_patch.py b/nestedtensor/nested/monkey_patch.py
index 1c936d60..f74aba10 100644
--- a/nestedtensor/nested/monkey_patch.py
+++ b/nestedtensor/nested/monkey_patch.py
@@ -37,9 +37,8 @@ def set_wrapped_torch_function(function_name, wrapper):
             getattr(torch, function_name))
 
     def set_wrapped_jit_torch_function(function_name, wrapper):
-        return 
-        # jit_function_dispatch[getattr(torch, function_name)] = wrapper(
-        #     torch.jit.script(getattr(torch, function_name)))
+        jit_function_dispatch[getattr(torch, function_name)] = wrapper(
+            getattr(torch, function_name))
 
     def set_function(key, function):
         function_dispatch[key] = function
@@ -229,5 +228,6 @@ def set_function(key, function):
 
     # module.NestedTensor = NestedTensor
 
+    jit_function_dispatch[torch.mv] = _C.jit_tensorwise()(torch.mv)
     setattr(NestedTensor, '_NestedTensor__function_dispatch', function_dispatch)
     setattr(NestedTensor, '_NestedTensor__jit_function_dispatch', jit_function_dispatch)
diff --git a/nestedtensor/nested/nested.py b/nestedtensor/nested/nested.py
index 2440781e..74b278f8 100644
--- a/nestedtensor/nested/nested.py
+++ b/nestedtensor/nested/nested.py
@@ -325,6 +325,7 @@ def __torch_function__(self, func, args=(), kwargs=None):
         _local_func = None
         if func in NestedTensor.__jit_function_dispatch:
             if kwargs is None:
+                print("HBBBB")
                 _jit_local_func = NestedTensor.__jit_function_dispatch[func]
                 impl_args = [a._impl if isinstance(a, NestedTensor) else a for a in args]
                 return _jit_local_func(*impl_args)
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index ca2b54ee..ba434db9 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912264+8911706'
-git_version = '8911706629b5b310b60d8cace987c75c8d138ba5'
+__version__ = '0.0.1.dev201912264+27bc9dc'
+git_version = '27bc9dc76a80351c2d97ad1e8b9d12792c1e3012'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION
diff --git a/test/test_nested_tensor_nary.py b/test/test_nested_tensor_nary.py
index 0d6644f5..95577eeb 100644
--- a/test/test_nested_tensor_nary.py
+++ b/test/test_nested_tensor_nary.py
@@ -91,6 +91,10 @@ def method_inplace(x): return method_inplace_(x, 0.3)
         self.assertTrue(a2.nested_dim() == a3.nested_dim())
 
         def _close(t1, t2):
+            print('t1')
+            print(t1)
+            print('t2')
+            print(t2)
             self.assertTrue(((t1 - t2).abs() < 1e-6).all())
 
         if func__ not in ['mvlgamma']:

From 9722e9caec270e3b1b5494f49cc7e087f953690a Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Wed, 25 Dec 2019 20:41:42 -0800
Subject: [PATCH 28/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         | 24 ++++++++++------
 nestedtensor/csrc/jit_list_apply.cpp | 41 ++++++++++++++++++----------
 nestedtensor/csrc/jit_list_apply.h   | 30 ++++++++++++++------
 nestedtensor/nested/nested.py        |  1 -
 nestedtensor/version.py              |  4 +--
 5 files changed, 65 insertions(+), 35 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index 17670e32..dfb7520c 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -15,18 +15,24 @@ def f(i, w):
     # 
     # print(r.nested_size())
 
-    na = nestedtensor._C.jit_tensorwise()(torch.mul)
+    ## na = nestedtensor._C.jit_tensorwise()(torch.mul)
 
-    print("111")
-    out = nestedtensor._C._ListNestedTensor([torch.randn(1, 2)])
-    print(na(
+    ## print("111")
+    ## out = nestedtensor._C._ListNestedTensor([torch.randn(1, 2)])
+    ## print(na(
+    ##     nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
+    ##     4.0,
+    ##     out = out
+    ##     ))
+    ## print("222")
+    ## print('out')
+    ## print(out)
+
+    nv = nestedtensor._C.jit_tensorwise()(torch.mv)
+    print(nv(
         nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
-        4.0,
-        out = out
+        nestedtensor._C._ListNestedTensor([torch.randn(2)]),
         ))
-    print("222")
-    print('out')
-    print(out)
 
     # print("333")
     # print(na(
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index ba56e068..7ac93db6 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -196,7 +196,9 @@ static c10::optional<std::vector<ArgWrapper>> try_match_schema(
     py::kwargs py_kwargs) {
   // const std::vector<ArgWrapper>& py_args,
   // const std::unordered_map<std::string, ArgWrapper>& py_kwargs) {
-  std::cout << "Checking match for schema: " << *schema << std::endl;
+  if (DEBUG) {
+    std::cout << "Checking match for schema: " << *schema << std::endl;
+  }
   // In the end it's only a match when this counter fully depleted the args.
   size_t py_args_i = 0;
   size_t used_kwargs = 0;
@@ -233,7 +235,9 @@ static c10::optional<std::vector<ArgWrapper>> try_match_schema(
       // match against for this given schema argument. There also is no default
       // value specified for this schema argument. Therefore this schema cannot
       // be the correct overload.
-      std::cout << "ARGS COUNT OFF!" << std::endl;
+      if (DEBUG) {
+        std::cout << "ARGS COUNT OFF!" << std::endl;
+      }
       return c10::nullopt;
     }
     // TODO: NestedTensor support
@@ -241,7 +245,9 @@ static c10::optional<std::vector<ArgWrapper>> try_match_schema(
       ArgWrapper arg = wrap_arg(py_arg, schema_arg.type());
       parse_py_args.push_back(arg);
     } catch (std::exception& e) {
-      std::cout << "Wrap arg exception: " << e.what() << std::endl;
+      if (DEBUG) {
+        std::cout << "Wrap arg exception: " << e.what() << std::endl;
+      }
       return c10::nullopt;
     }
   }
@@ -283,11 +289,15 @@ static c10::optional<std::vector<ArgWrapper>> try_match_schema(
     //    }
     //    std::cout << std::endl;
     //    if (types_match) {
-    std::cout << "FOUND IT!" << std::endl;
+    if (DEBUG) {
+      std::cout << "FOUND IT!" << std::endl;
+    }
     return parse_py_args;
     //    }
   }
-  std::cout << "ARGS SIZES MISMATCHED" << std::endl;
+  if (DEBUG) {
+    std::cout << "ARGS SIZES MISMATCHED" << std::endl;
+  }
   return c10::nullopt;
 }
 
@@ -301,11 +311,8 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 
   // TODO: Is there a cheaper way to do this?
   const auto& variants = getAllOperatorsFor(name);
-  if (variants.size() == 0) {
-    return c10::nullopt;
-  }
   const auto& builtin_functions = getAllBuiltinFunctionsFor(name);
-  if (builtin_functions.size() == 0) {
+  if (variants.size() == 0 && builtin_functions.size() == 0) {
     return c10::nullopt;
   }
   return name;
@@ -350,8 +357,10 @@ py::cpp_function jit_tensorwise() {
       if (auto name = is_builtin(fn)) {
         for (const auto& op : getAllOperatorsFor(*name)) {
           if (auto flat_args = try_match_schema(&op->schema(), args, kwargs)) {
-            std::cout << "is builtin Operation with schema: " << op->schema()
-                      << std::endl;
+            if (DEBUG) {
+              std::cout << "is builtin Operation with schema: " << op->schema()
+                        << std::endl;
+            }
             Operation actual = op->getOperation();
             return apply_jit_function_helper<Operation>(*flat_args, actual);
           }
@@ -359,14 +368,18 @@ py::cpp_function jit_tensorwise() {
         for (const auto& op : getAllBuiltinFunctionsFor(*name)) {
           if (auto flat_args =
                   try_match_schema(&op->getSchema(), args, kwargs)) {
-            std::cout << "is builtin Function with schema: " << op->getSchema()
-                      << std::endl;
+            if (DEBUG) {
+              std::cout << "is builtin Function with schema: "
+                        << op->getSchema() << std::endl;
+            }
             return apply_jit_function_helper<Function>(*flat_args, *op);
           }
         }
       }
       // TODO: Need implementation of generic python version.
-      std::cout << "FAIL! Can't find something for " << fn << std::endl;
+      if (DEBUG) {
+        std::cout << "FAIL! Can't find something for " << fn << std::endl;
+      }
       TensorNode result;
       return THP_ListNestedTensor(_ListNestedTensor(result));
     });
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index d66e254e..18403ea0 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -10,30 +10,42 @@
 namespace torch {
 namespace nested_tensor {
 
+static bool DEBUG = false;
+
 // TODO Expand to IValues to support generic lists?
 template <class F>
 inline at::Tensor run_function(std::vector<c10::IValue> stack, F& fn);
 
 template <>
 inline at::Tensor run_function(std::vector<c10::IValue> stack, Function& fn) {
-  std::cout << "run_function_Function" << std::endl;
+  if (DEBUG) {
+    std::cout << "run_function_Function" << std::endl;
+  }
   c10::IValue result = fn(stack);
-  std::cout << "finished result_Function" << std::endl;
+  if (DEBUG) {
+    std::cout << "finished result_Function" << std::endl;
+  }
   return result.toTensor();
 }
 
 template <>
 inline at::Tensor run_function(std::vector<c10::IValue> stack, Operation& fn) {
-  size_t i = 0;
-  for (c10::IValue& ival : stack) {
-    std::cout << "ival " << i << " : " << ival.tagKind() << std::endl;
-    i++;
+  if (DEBUG) {
+    size_t i = 0;
+    for (c10::IValue& ival : stack) {
+      std::cout << "ival " << i << " : " << ival.tagKind() << std::endl;
+      i++;
+    }
+    std::cout << "run_function_Operation" << std::endl;
   }
-  std::cout << "run_function_Operation" << std::endl;
   fn(stack);
-  std::cout << "run_function_Operation stack finished" << std::endl;
+  if (DEBUG) {
+    std::cout << "run_function_Operation stack finished" << std::endl;
+  }
   c10::IValue result = stack.front();
-  std::cout << "finished result_Operation" << std::endl;
+  if (DEBUG) {
+    std::cout << "finished result_Operation" << std::endl;
+  }
   return result.toTensor();
 }
 
diff --git a/nestedtensor/nested/nested.py b/nestedtensor/nested/nested.py
index 74b278f8..2440781e 100644
--- a/nestedtensor/nested/nested.py
+++ b/nestedtensor/nested/nested.py
@@ -325,7 +325,6 @@ def __torch_function__(self, func, args=(), kwargs=None):
         _local_func = None
         if func in NestedTensor.__jit_function_dispatch:
             if kwargs is None:
-                print("HBBBB")
                 _jit_local_func = NestedTensor.__jit_function_dispatch[func]
                 impl_args = [a._impl if isinstance(a, NestedTensor) else a for a in args]
                 return _jit_local_func(*impl_args)
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index ba434db9..daec57b5 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912264+27bc9dc'
-git_version = '27bc9dc76a80351c2d97ad1e8b9d12792c1e3012'
+__version__ = '0.0.1.dev201912264+bea2e46'
+git_version = 'bea2e46df08f2c06375155e1a482db88591ee964'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 688c7b61107326445e55d7ec82875974a64f131b Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 26 Dec 2019 08:38:01 -0800
Subject: [PATCH 29/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 7 ++++---
 nestedtensor/nested/monkey_patch.py  | 1 -
 nestedtensor/nested/nested.py        | 8 ++++----
 nestedtensor/nested/utils.py         | 1 +
 nestedtensor/version.py              | 4 ++--
 test/test_nested_tensor_nary.py      | 6 ++----
 6 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 7ac93db6..c9c50576 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -190,6 +190,7 @@ THP_ListNestedTensor jit_apply_function(
 }
 
 // TODO: Write separate C++ test for overloads as test cases
+// TODO: Match return values!
 static c10::optional<std::vector<ArgWrapper>> try_match_schema(
     const FunctionSchema* schema,
     py::args py_args,
@@ -377,9 +378,9 @@ py::cpp_function jit_tensorwise() {
         }
       }
       // TODO: Need implementation of generic python version.
-      if (DEBUG) {
-        std::cout << "FAIL! Can't find something for " << fn << std::endl;
-      }
+      std::stringstream ss;
+      ss << "FAIL! Can't find something for " << fn;
+      TORCH_CHECK(false, ss.str());
       TensorNode result;
       return THP_ListNestedTensor(_ListNestedTensor(result));
     });
diff --git a/nestedtensor/nested/monkey_patch.py b/nestedtensor/nested/monkey_patch.py
index f74aba10..3bc3e75b 100644
--- a/nestedtensor/nested/monkey_patch.py
+++ b/nestedtensor/nested/monkey_patch.py
@@ -88,7 +88,6 @@ def set_function(key, function):
         set_nt_method(function_name + '_', utils.tensorwise())
         if function_name in ['fill']:
             continue
-        set_wrapped_torch_function(function_name, utils.tensorwise())
         set_wrapped_jit_torch_function(function_name, _C.jit_tensorwise())
         set_nt_method(function_name, utils.tensorwise())
     # <
diff --git a/nestedtensor/nested/nested.py b/nestedtensor/nested/nested.py
index 2440781e..4787a54c 100644
--- a/nestedtensor/nested/nested.py
+++ b/nestedtensor/nested/nested.py
@@ -322,12 +322,12 @@ def nested_stride(self, dim=None):
     # --- dependent on impl ends ---
 
     def __torch_function__(self, func, args=(), kwargs=None):
+        print('func.__name__: {}'.format(func.__name__))
         _local_func = None
         if func in NestedTensor.__jit_function_dispatch:
-            if kwargs is None:
-                _jit_local_func = NestedTensor.__jit_function_dispatch[func]
-                impl_args = [a._impl if isinstance(a, NestedTensor) else a for a in args]
-                return _jit_local_func(*impl_args)
+            _jit_local_func = NestedTensor.__jit_function_dispatch[func]
+            impl_args = [a._impl if isinstance(a, NestedTensor) else a for a in args]
+            return _jit_local_func(*impl_args)
         if func in NestedTensor.__function_dispatch:
             _local_func = NestedTensor.__function_dispatch[func]
             return _local_func(*args) if kwargs is None else _local_func(*args, **kwargs)
diff --git a/nestedtensor/nested/utils.py b/nestedtensor/nested/utils.py
index c02371a0..48021b11 100644
--- a/nestedtensor/nested/utils.py
+++ b/nestedtensor/nested/utils.py
@@ -176,6 +176,7 @@ def decorator(*_args, **_kwargs):
             def _func(*args, **kwargs):
                 if find_nested_tensor_dispatch_key(*args) is None:
                     # import pdb; pdb.set_trace()
+                    print("f.__name__: {}".format(f.__name__))
                     result = f(*args, **kwargs)
                     if not torch.is_tensor(result):
                         return tuple(result)
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index daec57b5..7435b5f3 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912264+bea2e46'
-git_version = 'bea2e46df08f2c06375155e1a482db88591ee964'
+__version__ = '0.0.1.dev2019122616+9722e9c'
+git_version = '9722e9caec270e3b1b5494f49cc7e087f953690a'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION
diff --git a/test/test_nested_tensor_nary.py b/test/test_nested_tensor_nary.py
index 95577eeb..9a516055 100644
--- a/test/test_nested_tensor_nary.py
+++ b/test/test_nested_tensor_nary.py
@@ -91,10 +91,8 @@ def method_inplace(x): return method_inplace_(x, 0.3)
         self.assertTrue(a2.nested_dim() == a3.nested_dim())
 
         def _close(t1, t2):
-            print('t1')
-            print(t1)
-            print('t2')
-            print(t2)
+            print("type(t1): {}".format(type(t1)))
+            print("type(t2): {}".format(type(t2)))
             self.assertTrue(((t1 - t2).abs() < 1e-6).all())
 
         if func__ not in ['mvlgamma']:

From 7a3cb4cd09302a53a1d1c85efe6ab63c04ee14aa Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 26 Dec 2019 18:38:52 -0800
Subject: [PATCH 30/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp     | 65 +++++++++++++++++-------
 nestedtensor/csrc/jit_list_apply.h       | 47 ++---------------
 nestedtensor/csrc/python_nested_tensor.h |  4 ++
 nestedtensor/version.py                  |  4 +-
 4 files changed, 58 insertions(+), 62 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index c9c50576..1692f33d 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -1,7 +1,5 @@
 #include <ATen/core/interned_strings.h>
 #include <jit_list_apply.h>
-#include <python_buffer_nested_tensor.h>
-#include <python_list_nested_tensor.h>
 #include <torch/csrc/jit/script/builtin_functions.h>
 #include <torch/csrc/jit/script/schema_matching.h>
 #include <torch/csrc/jit/script/sugared_value.h>
@@ -9,9 +7,48 @@
 namespace torch {
 namespace nested_tensor {
 
+namespace py = pybind11;
+
 using namespace torch::jit;
 using namespace torch::jit::script;
 
+// TODO Expand to IValues to support generic lists?
+template <class F>
+at::Tensor run_function(std::vector<c10::IValue> stack, F& fn);
+
+template <>
+at::Tensor run_function(std::vector<c10::IValue> stack, Function& fn) {
+  if (DEBUG) {
+    std::cout << "run_function_Function" << std::endl;
+  }
+  c10::IValue result = fn(stack);
+  if (DEBUG) {
+    std::cout << "finished result_Function" << std::endl;
+  }
+  return result.toTensor();
+}
+
+template <>
+at::Tensor run_function(std::vector<c10::IValue> stack, Operation& fn) {
+  if (DEBUG) {
+    size_t i = 0;
+    for (c10::IValue& ival : stack) {
+      std::cout << "ival " << i << " : " << ival.tagKind() << std::endl;
+      i++;
+    }
+    std::cout << "run_function_Operation" << std::endl;
+  }
+  fn(stack);
+  if (DEBUG) {
+    std::cout << "run_function_Operation stack finished" << std::endl;
+  }
+  c10::IValue result = stack.front();
+  if (DEBUG) {
+    std::cout << "finished result_Operation" << std::endl;
+  }
+  return result.toTensor();
+}
+
 struct ArgWrapper {
   ArgWrapper(TensorNode nested_tensor)
       : _is_nested_tensor(true), _nested_tensor(nested_tensor) {}
@@ -123,14 +160,10 @@ static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F& fn) {
 static ArgWrapper wrap_arg(
     py::object arg,
     c10::optional<c10::TypePtr> type_ptr = c10::nullopt) {
-  if (py::isinstance<THP_ListNestedTensor>(arg)) {
+  if (py::isinstance<THPNestedTensor>(arg)) {
     TORCH_CHECK((*type_ptr)->kind() == TensorType::Kind);
     return ArgWrapper(
-        py::cast<THP_ListNestedTensor>(arg).data().get_structure());
-  } else if (py::isinstance<THP_BufferNestedTensor>(arg)) {
-    TORCH_CHECK((*type_ptr)->kind() == TensorType::Kind);
-    return ArgWrapper(
-        py::cast<THP_BufferNestedTensor>(arg).data().get_structure());
+        py::cast<THPNestedTensor>(arg).get_structure());
   }
   if (type_ptr) {
     return ArgWrapper(toIValue(arg, *type_ptr));
@@ -155,22 +188,18 @@ static std::vector<ArgWrapper> flatten_args(
 }
 
 template <class F>
-static THP_ListNestedTensor apply_jit_function_helper(
+static THPNestedTensor apply_jit_function_helper(
     std::vector<ArgWrapper>& flat_args,
     F& op) {
   py::gil_scoped_release release;
   TensorNode result = apply_jit_function(flat_args, op);
   py::gil_scoped_acquire acquire;
-  return THP_ListNestedTensor(_ListNestedTensor(result));
+  return THPNestedTensor(_ListNestedTensor(result));
 }
 
-THP_ListNestedTensor jit_apply_function(
-    std::vector<THP_ListNestedTensor> nts_,
+THPNestedTensor jit_apply_function(
+    std::vector<THPNestedTensor> nts,
     py::object fn) {
-  std::vector<_ListNestedTensor> nts;
-  for (size_t i = 0; i < nts_.size(); i++) {
-    nts.push_back(nts_[i].data());
-  }
   auto sfn = py::cast<StrongFunctionPtr>(fn);
   auto tracing_state = tracer::getTracingState();
   TORCH_CHECK(!tracing_state, "doesnt support tracing");
@@ -186,7 +215,7 @@ THP_ListNestedTensor jit_apply_function(
   py::gil_scoped_release release;
   TensorNode nested_node = apply_jit_function<Function>(nested_nodes, callee);
   py::gil_scoped_acquire acquire;
-  return THP_ListNestedTensor(_ListNestedTensor(nested_node));
+  return THPNestedTensor(_ListNestedTensor(nested_node));
 }
 
 // TODO: Write separate C++ test for overloads as test cases
@@ -382,7 +411,7 @@ py::cpp_function jit_tensorwise() {
       ss << "FAIL! Can't find something for " << fn;
       TORCH_CHECK(false, ss.str());
       TensorNode result;
-      return THP_ListNestedTensor(_ListNestedTensor(result));
+      return THPNestedTensor(_ListNestedTensor(result));
     });
   });
 }
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 18403ea0..44190296 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -1,7 +1,7 @@
 #include <Python.h>
 #include <pybind11/functional.h>
 #include <pybind11/stl.h>
-#include <python_list_nested_tensor.h>
+#include <python_nested_tensor.h>
 #include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/jit/pybind_utils.h>
 #include <torch/csrc/utils/python_strings.h>
@@ -12,48 +12,11 @@ namespace nested_tensor {
 
 static bool DEBUG = false;
 
-// TODO Expand to IValues to support generic lists?
-template <class F>
-inline at::Tensor run_function(std::vector<c10::IValue> stack, F& fn);
+THPNestedTensor jit_apply_function(
+    std::vector<THPNestedTensor> nts_,
+    pybind11::object fn);
 
-template <>
-inline at::Tensor run_function(std::vector<c10::IValue> stack, Function& fn) {
-  if (DEBUG) {
-    std::cout << "run_function_Function" << std::endl;
-  }
-  c10::IValue result = fn(stack);
-  if (DEBUG) {
-    std::cout << "finished result_Function" << std::endl;
-  }
-  return result.toTensor();
-}
-
-template <>
-inline at::Tensor run_function(std::vector<c10::IValue> stack, Operation& fn) {
-  if (DEBUG) {
-    size_t i = 0;
-    for (c10::IValue& ival : stack) {
-      std::cout << "ival " << i << " : " << ival.tagKind() << std::endl;
-      i++;
-    }
-    std::cout << "run_function_Operation" << std::endl;
-  }
-  fn(stack);
-  if (DEBUG) {
-    std::cout << "run_function_Operation stack finished" << std::endl;
-  }
-  c10::IValue result = stack.front();
-  if (DEBUG) {
-    std::cout << "finished result_Operation" << std::endl;
-  }
-  return result.toTensor();
-}
-
-THP_ListNestedTensor jit_apply_function(
-    std::vector<THP_ListNestedTensor> nts_,
-    py::object fn);
-
-py::cpp_function jit_tensorwise();
+pybind11::cpp_function jit_tensorwise();
 
 } // namespace nested_tensor
 } // namespace torch
diff --git a/nestedtensor/csrc/python_nested_tensor.h b/nestedtensor/csrc/python_nested_tensor.h
index 56805e79..a8e01285 100644
--- a/nestedtensor/csrc/python_nested_tensor.h
+++ b/nestedtensor/csrc/python_nested_tensor.h
@@ -94,6 +94,10 @@ struct THPNestedTensor {
     return data_map<bool>(
         _data, [](auto data) { return data.is_contiguous(); });
   }
+  TensorNode get_structure() {
+    return data_map<TensorNode>(
+        _data, [](auto data) { return data.get_structure(); });
+  }
 
  private:
   c10::either<_ListNestedTensor, _BufferNestedTensor> _data;
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 3261bbd0..929c5922 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912271+9e96318'
-git_version = '9e96318f047603470e68e268ec3a0ffa27410262'
+__version__ = '0.0.1.dev201912272+a0bd8a8'
+git_version = 'a0bd8a8e91f4c0d616f177dbd4376a747610321a'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 64c437c9e5a450ede231a6de6901ef46c2b22acb Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 26 Dec 2019 18:42:33 -0800
Subject: [PATCH 31/49] Checkpoint

---
 nestedtensor/nested/nested.py   | 5 -----
 nestedtensor/nested/utils.py    | 1 -
 test/test_nested_tensor_nary.py | 2 --
 3 files changed, 8 deletions(-)

diff --git a/nestedtensor/nested/nested.py b/nestedtensor/nested/nested.py
index 02e91782..3246febc 100644
--- a/nestedtensor/nested/nested.py
+++ b/nestedtensor/nested/nested.py
@@ -263,11 +263,7 @@ def to_tensor(self, dim=0):
         if dim == 0:
             if None in self.size():
                 raise ValueError("Shape not Tensor compliant")
-            print('self.nested_size()')
-            print(self.nested_size())
             result = self._impl.to_tensor()
-            print('result.size()')
-            print(result.size())
             return result
         # If dim is bigger than nested_dim the NestedTensor is already
         # of Tensor for dimensions bigger than the given.
@@ -322,7 +318,6 @@ def nested_stride(self, dim=None):
     # --- dependent on impl ends ---
 
     def __torch_function__(self, func, args=(), kwargs=None):
-        print('func.__name__: {}'.format(func.__name__))
         _local_func = None
         if func in NestedTensor.__jit_function_dispatch:
             _jit_local_func = NestedTensor.__jit_function_dispatch[func]
diff --git a/nestedtensor/nested/utils.py b/nestedtensor/nested/utils.py
index 48021b11..c02371a0 100644
--- a/nestedtensor/nested/utils.py
+++ b/nestedtensor/nested/utils.py
@@ -176,7 +176,6 @@ def decorator(*_args, **_kwargs):
             def _func(*args, **kwargs):
                 if find_nested_tensor_dispatch_key(*args) is None:
                     # import pdb; pdb.set_trace()
-                    print("f.__name__: {}".format(f.__name__))
                     result = f(*args, **kwargs)
                     if not torch.is_tensor(result):
                         return tuple(result)
diff --git a/test/test_nested_tensor_nary.py b/test/test_nested_tensor_nary.py
index 9a516055..0d6644f5 100644
--- a/test/test_nested_tensor_nary.py
+++ b/test/test_nested_tensor_nary.py
@@ -91,8 +91,6 @@ def method_inplace(x): return method_inplace_(x, 0.3)
         self.assertTrue(a2.nested_dim() == a3.nested_dim())
 
         def _close(t1, t2):
-            print("type(t1): {}".format(type(t1)))
-            print("type(t2): {}".format(type(t2)))
             self.assertTrue(((t1 - t2).abs() < 1e-6).all())
 
         if func__ not in ['mvlgamma']:

From 774588f1c41560a21a3aa4bf7620ba817da1948b Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 26 Dec 2019 18:59:41 -0800
Subject: [PATCH 32/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.h | 2 +-
 nestedtensor/nested/nested.py      | 2 +-
 nestedtensor/version.py            | 4 ++--
 test/test_nested_tensor_nary.py    | 8 ++++----
 test/utils.py                      | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index 44190296..c1752556 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -10,7 +10,7 @@
 namespace torch {
 namespace nested_tensor {
 
-static bool DEBUG = false;
+static bool DEBUG = true;
 
 THPNestedTensor jit_apply_function(
     std::vector<THPNestedTensor> nts_,
diff --git a/nestedtensor/nested/nested.py b/nestedtensor/nested/nested.py
index 3246febc..b243a4ca 100644
--- a/nestedtensor/nested/nested.py
+++ b/nestedtensor/nested/nested.py
@@ -322,7 +322,7 @@ def __torch_function__(self, func, args=(), kwargs=None):
         if func in NestedTensor.__jit_function_dispatch:
             _jit_local_func = NestedTensor.__jit_function_dispatch[func]
             impl_args = [a._impl if isinstance(a, NestedTensor) else a for a in args]
-            return _jit_local_func(*impl_args)
+            return NestedTensor(_jit_local_func(*impl_args))
         if func in NestedTensor.__function_dispatch:
             _local_func = NestedTensor.__function_dispatch[func]
             return _local_func(*args) if kwargs is None else _local_func(*args, **kwargs)
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 929c5922..d98060d0 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912272+a0bd8a8'
-git_version = 'a0bd8a8e91f4c0d616f177dbd4376a747610321a'
+__version__ = '0.0.1.dev201912272+64c437c'
+git_version = '64c437c9e5a450ede231a6de6901ef46c2b22acb'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION
diff --git a/test/test_nested_tensor_nary.py b/test/test_nested_tensor_nary.py
index 0d6644f5..69fa4b43 100644
--- a/test/test_nested_tensor_nary.py
+++ b/test/test_nested_tensor_nary.py
@@ -93,10 +93,10 @@ def method_inplace(x): return method_inplace_(x, 0.3)
         def _close(t1, t2):
             self.assertTrue(((t1 - t2).abs() < 1e-6).all())
 
-        if func__ not in ['mvlgamma']:
-            func(a1, out=a3)
-            # TODO: Abstract this
-            _close(func(a1), a3)
+        # if func__ not in ['mvlgamma']:
+        #     func(a1, out=a3)
+        #     # TODO: Abstract this
+        #     _close(func(a1), a3)
         _close(func(a1), a2)
         _close(method(a1), a2)
         _close(method_inplace(a1), a2)
diff --git a/test/utils.py b/test/utils.py
index f73916cc..e050babf 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -75,7 +75,7 @@ def gen_random_int(seed, low=0, high=2 ** 32):
 
 
 # TODO: Something occasionally causes a NaN here...
-def gen_nested_list(seed, nested_dim, tensor_dim, size_low=1, size_high=10):
+def gen_nested_list(seed, nested_dim, tensor_dim, size_low=1, size_high=2):
     tensors = []
     num_tensors = gen_random_int(
         (seed * nested_dim + seed) * 1024, low=size_low, high=size_high)

From bce6d71ee83b71d1f34e5bdf63498b2766deb251 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 27 Dec 2019 09:43:26 -0800
Subject: [PATCH 33/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         | 22 +++++++++++-----------
 nestedtensor/csrc/jit_list_apply.cpp | 18 +++++++++++++-----
 nestedtensor/csrc/jit_list_apply.h   |  9 +--------
 nestedtensor/version.py              |  4 ++--
 4 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index dfb7520c..10fdcf60 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -28,15 +28,15 @@ def f(i, w):
     ## print('out')
     ## print(out)
 
-    nv = nestedtensor._C.jit_tensorwise()(torch.mv)
-    print(nv(
-        nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
-        nestedtensor._C._ListNestedTensor([torch.randn(2)]),
-        ))
-
-    # print("333")
-    # print(na(
-    #     torch.randn(1, 2),
-    #     torch.randn(1, 2),
+    # nv = nestedtensor._C.jit_tensorwise()(torch.mv)
+    # print(nv(
+    #     nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
+    #     nestedtensor._C._ListNestedTensor([torch.randn(2)]),
     #     ))
-    # print("444")
+
+    print("333")
+    print(na(
+        torch.randn(1, 2),
+        torch.randn(1, 2),
+        ))
+    print("444")
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 1692f33d..ecd68f6b 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -1,8 +1,5 @@
-#include <ATen/core/interned_strings.h>
 #include <jit_list_apply.h>
 #include <torch/csrc/jit/script/builtin_functions.h>
-#include <torch/csrc/jit/script/schema_matching.h>
-#include <torch/csrc/jit/script/sugared_value.h>
 
 namespace torch {
 namespace nested_tensor {
@@ -162,8 +159,7 @@ static ArgWrapper wrap_arg(
     c10::optional<c10::TypePtr> type_ptr = c10::nullopt) {
   if (py::isinstance<THPNestedTensor>(arg)) {
     TORCH_CHECK((*type_ptr)->kind() == TensorType::Kind);
-    return ArgWrapper(
-        py::cast<THPNestedTensor>(arg).get_structure());
+    return ArgWrapper(py::cast<THPNestedTensor>(arg).get_structure());
   }
   if (type_ptr) {
     return ArgWrapper(toIValue(arg, *type_ptr));
@@ -385,6 +381,18 @@ py::cpp_function jit_tensorwise() {
       }
       // TODO: Support for no NestedTensor arguments
       if (auto name = is_builtin(fn)) {
+        for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
+          Stack stack;
+          try {
+            std::cout << "trying op->schema(): " << op->schema() << std::endl;
+            stack =
+                createStackForSchema(op->schema(), args, kwargs, c10::nullopt);
+          } catch (...) {
+            continue;
+          }
+          op->getOperation()(stack);
+        }
+        exit(1);
         for (const auto& op : getAllOperatorsFor(*name)) {
           if (auto flat_args = try_match_schema(&op->schema(), args, kwargs)) {
             if (DEBUG) {
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index c1752556..ac1ff09f 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -1,16 +1,9 @@
-#include <Python.h>
-#include <pybind11/functional.h>
-#include <pybind11/stl.h>
 #include <python_nested_tensor.h>
-#include <torch/csrc/autograd/utils/wrap_outputs.h>
-#include <torch/csrc/jit/pybind_utils.h>
-#include <torch/csrc/utils/python_strings.h>
-#include <torch/extension.h>
 
 namespace torch {
 namespace nested_tensor {
 
-static bool DEBUG = true;
+static bool DEBUG = false;
 
 THPNestedTensor jit_apply_function(
     std::vector<THPNestedTensor> nts_,
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index d98060d0..44900380 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev201912272+64c437c'
-git_version = '64c437c9e5a450ede231a6de6901ef46c2b22acb'
+__version__ = '0.0.1.dev2019122717+774588f'
+git_version = '774588f1c41560a21a3aa4bf7620ba817da1948b'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 5f921db935db2b44647f48c0b6fa736164190c0c Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 27 Dec 2019 12:18:08 -0800
Subject: [PATCH 34/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         | 36 ++++++++++++++--------------
 nestedtensor/csrc/jit_list_apply.cpp | 28 +++++++++++++++++-----
 nestedtensor/version.py              |  4 ++--
 3 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index 10fdcf60..c481585a 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -15,18 +15,18 @@ def f(i, w):
     # 
     # print(r.nested_size())
 
-    ## na = nestedtensor._C.jit_tensorwise()(torch.mul)
-
-    ## print("111")
-    ## out = nestedtensor._C._ListNestedTensor([torch.randn(1, 2)])
-    ## print(na(
-    ##     nestedtensor._C._ListNestedTensor([torch.randn(1, 2)]),
-    ##     4.0,
-    ##     out = out
-    ##     ))
-    ## print("222")
-    ## print('out')
-    ## print(out)
+    na = nestedtensor._C.jit_tensorwise()(torch.mul)
+
+    print("111")
+    out = nestedtensor.as_nested_tensor([torch.randn(1, 2)])
+    print(na(
+        nestedtensor.as_nested_tensor([torch.randn(1, 2)])._impl,
+        torch.tensor(4.0),
+        out = out
+        ))
+    print("222")
+    print('out')
+    print(out)
 
     # nv = nestedtensor._C.jit_tensorwise()(torch.mv)
     # print(nv(
@@ -34,9 +34,9 @@ def f(i, w):
     #     nestedtensor._C._ListNestedTensor([torch.randn(2)]),
     #     ))
 
-    print("333")
-    print(na(
-        torch.randn(1, 2),
-        torch.randn(1, 2),
-        ))
-    print("444")
+    # print("333")
+    # print(na(
+    #     torch.randn(1, 2),
+    #     torch.randn(1, 2),
+    #     ))
+    # print("444")
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index ecd68f6b..eba3359f 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -371,30 +371,46 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 // (not fast!)
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
-    return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
+    return py::cpp_function([fn](py::args args_, py::kwargs kwargs_) {
       if (py::isinstance<StrongFunctionPtr>(fn)) {
         std::cout << "is StrongFunctionPtr" << std::endl;
         auto sfn = py::cast<StrongFunctionPtr>(fn);
         Function& op = *sfn.function_;
-        std::vector<ArgWrapper> flat_args = flatten_args(args, kwargs);
+        std::vector<ArgWrapper> flat_args = flatten_args(args_, kwargs_);
         return apply_jit_function_helper<Function>(flat_args, op);
       }
       // TODO: Support for no NestedTensor arguments
       if (auto name = is_builtin(fn)) {
+        py::list args_vector;
+        std::cout << "args.size(): " << args_.size() << std::endl;
+        for (const auto& arg: args_) {
+          if (py::isinstance<THPNestedTensor>(arg)) {
+            std::cout << "assigning first tensor" << std::endl;
+            args_vector.append(_get_first_variable(
+                py::cast<THPNestedTensor>(arg).get_structure()));
+          } else {
+            args_vector.append(arg);
+          }
+        }
+        py::args args = py::args(args_vector);
+        std::cout << "new_args: " << args << std::endl;
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
           Stack stack;
           try {
             std::cout << "trying op->schema(): " << op->schema() << std::endl;
             stack =
-                createStackForSchema(op->schema(), args, kwargs, c10::nullopt);
-          } catch (...) {
+                createStackForSchema(op->schema(), args, kwargs_, c10::nullopt);
+            break;
+          } catch (std::exception& e) {
+            std::cout << "e.what(): " << e.what() << std::endl;
             continue;
           }
           op->getOperation()(stack);
         }
         exit(1);
+        std::cout << "DONE createStackForSchema" << std::endl;
         for (const auto& op : getAllOperatorsFor(*name)) {
-          if (auto flat_args = try_match_schema(&op->schema(), args, kwargs)) {
+          if (auto flat_args = try_match_schema(&op->schema(), args, kwargs_)) {
             if (DEBUG) {
               std::cout << "is builtin Operation with schema: " << op->schema()
                         << std::endl;
@@ -405,7 +421,7 @@ py::cpp_function jit_tensorwise() {
         }
         for (const auto& op : getAllBuiltinFunctionsFor(*name)) {
           if (auto flat_args =
-                  try_match_schema(&op->getSchema(), args, kwargs)) {
+                  try_match_schema(&op->getSchema(), args, kwargs_)) {
             if (DEBUG) {
               std::cout << "is builtin Function with schema: "
                         << op->getSchema() << std::endl;
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 44900380..18572bc2 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev2019122717+774588f'
-git_version = '774588f1c41560a21a3aa4bf7620ba817da1948b'
+__version__ = '0.0.1.dev2019122720+bce6d71'
+git_version = 'bce6d71ee83b71d1f34e5bdf63498b2766deb251'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From b644f1000123f2a142aa3076c32967957fd72aa1 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 27 Dec 2019 13:44:30 -0800
Subject: [PATCH 35/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         |   3 +-
 nestedtensor/csrc/jit_list_apply.cpp | 392 ++++++---------------------
 nestedtensor/csrc/jit_list_apply.h   |   4 -
 nestedtensor/csrc/py_init.cpp        |   1 -
 nestedtensor/version.py              |   4 +-
 5 files changed, 87 insertions(+), 317 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index c481585a..da692181 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -21,8 +21,7 @@ def f(i, w):
     out = nestedtensor.as_nested_tensor([torch.randn(1, 2)])
     print(na(
         nestedtensor.as_nested_tensor([torch.randn(1, 2)])._impl,
-        torch.tensor(4.0),
-        out = out
+        4.0,
         ))
     print("222")
     print('out')
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index eba3359f..908e124a 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -10,11 +10,7 @@ using namespace torch::jit;
 using namespace torch::jit::script;
 
 // TODO Expand to IValues to support generic lists?
-template <class F>
-at::Tensor run_function(std::vector<c10::IValue> stack, F& fn);
-
-template <>
-at::Tensor run_function(std::vector<c10::IValue> stack, Function& fn) {
+at::Tensor run_function(Stack& stack, Function& fn) {
   if (DEBUG) {
     std::cout << "run_function_Function" << std::endl;
   }
@@ -25,8 +21,7 @@ at::Tensor run_function(std::vector<c10::IValue> stack, Function& fn) {
   return result.toTensor();
 }
 
-template <>
-at::Tensor run_function(std::vector<c10::IValue> stack, Operation& fn) {
+at::Tensor run_function(Stack& stack, Operation& fn) {
   if (DEBUG) {
     size_t i = 0;
     for (c10::IValue& ival : stack) {
@@ -46,287 +41,66 @@ at::Tensor run_function(std::vector<c10::IValue> stack, Operation& fn) {
   return result.toTensor();
 }
 
-struct ArgWrapper {
-  ArgWrapper(TensorNode nested_tensor)
-      : _is_nested_tensor(true), _nested_tensor(nested_tensor) {}
-  ArgWrapper(c10::IValue ivalue) : _is_nested_tensor(false), _ivalue(ivalue) {}
-  ArgWrapper(std::string name, c10::IValue ivalue)
-      : _name(name), _is_nested_tensor(false), _ivalue(ivalue) {}
-
-  bool is_nested_tensor() {
-    return _is_nested_tensor;
-  }
-
-  c10::IValue ivalue() {
-    if (_is_nested_tensor) {
-      TensorNode first_tensor_node = get_first_leaf(_nested_tensor);
-      // TODO: What if this is empty?
-      return c10::IValue(first_tensor_node.payload(0));
-    }
-    return _ivalue;
-  }
-
-  TensorNode nested_tensor() {
-    return _nested_tensor;
-  }
-
-  std::string name() {
-    return _name;
-  }
-
- private:
-  std::string _name;
-  bool _is_nested_tensor;
-  c10::IValue _ivalue;
-  TensorNode _nested_tensor;
-};
-
 // TODO: Assert that one arg must be a nestedtensor?
 template <class F>
-static TensorNode apply_jit_function(std::vector<ArgWrapper>& args, F& fn) {
+static TensorNode apply_jit_function(
+    const std::vector<TensorNode>& nested_nodes,
+    const std::set<size_t>& nested_arg_i,
+    Stack& stack_template,
+    F& fn) {
   bool all_leaf = true;
-  for (size_t i = 0; i < args.size(); i++) {
-    if (args[i].is_nested_tensor()) {
-      all_leaf = all_leaf && args[i].nested_tensor().is_leaf();
-    }
+  for (const auto& nested_node : nested_nodes) {
+    all_leaf = all_leaf && nested_node.is_leaf();
   }
   if (all_leaf) {
     // NOTE: We assume no named tensors and no sparse variables as
     // appropriate for TorchScript.
     // TODO: Assert leaf sizes match and are non-zero, otherwise this isn't
     // a NestedTensor function.
-    size_t leaf_size = 0;
-    for (size_t i = 0; i < args.size(); i++) {
-      if (args[i].is_nested_tensor()) {
-        leaf_size = args[i].nested_tensor().size();
-        break;
-      }
-    }
-    std::vector<std::vector<IValue>> stacks(leaf_size);
+    size_t leaf_size = nested_nodes[0].size();
+    c10::List<at::Tensor> results;
     for (size_t j = 0; j < leaf_size; j++) {
-      for (size_t i = 0; i < args.size(); i++) {
-        if (args[i].is_nested_tensor()) {
-          stacks[j].push_back(args[i].nested_tensor().payload(j));
-        } else {
-          stacks[j].push_back(args[i].ivalue());
+      Stack stack(stack_template);
+      size_t ni = 0;
+      for (size_t i = 0; i < stack.size(); i++) {
+        if (nested_arg_i.count(i)) {
+          stack[i] = nested_nodes[ni].payload(j);
         }
       }
-    }
-    // TODO: getSchema().checkAndNormalizeInputs(stack, kwargs);?
-    c10::List<at::Tensor> results;
-    for (size_t i = 0; i < stacks.size(); i++) {
-      results.push_back(run_function<F>(stacks[i], fn));
+      results.push_back(run_function(stack, fn));
     }
     return TensorNode(results);
   } else {
     bool broadcastable = true;
     size_t num_children = 0;
-    for (size_t i = 0; i < args.size(); i++) {
-      if (args[i].is_nested_tensor() && !args[i].nested_tensor().is_leaf()) {
+    for (const auto& nested_node : nested_nodes) {
+      if (!nested_node.is_leaf()) {
         if (num_children > 0) {
-          broadcastable = broadcastable &&
-              (num_children == args[i].nested_tensor().degree());
+          broadcastable =
+              broadcastable && (num_children == nested_node.degree());
         } else {
-          num_children = args[i].nested_tensor().degree();
+          num_children = nested_node.degree();
         }
       }
     }
     TORCH_CHECK(broadcastable, "Can't broadcast given nested tensors");
     std::vector<TensorNode> result;
     for (size_t i = 0; i < num_children; i++) {
-      std::vector<ArgWrapper> local_args;
-      for (size_t j = 0; j < args.size(); j++) {
-        if (args[j].is_nested_tensor()) {
-          if (args[j].nested_tensor().is_leaf()) {
-            local_args.push_back(args[j]);
-          } else {
-            local_args.push_back(
-                ArgWrapper(args[j].nested_tensor().children(i)));
-          }
+      std::vector<TensorNode> local_args;
+      for (const auto& nested_node : nested_nodes) {
+        if (nested_node.is_leaf()) {
+          local_args.push_back(nested_node);
         } else {
-          local_args.push_back(ArgWrapper(args[j].ivalue()));
+          local_args.push_back(nested_node.children(i));
         }
       }
-      result.push_back(apply_jit_function<F>(local_args, fn));
+      result.push_back(
+          apply_jit_function<F>(local_args, nested_arg_i, stack_template, fn));
     }
     return TensorNode(result);
   }
 }
 
-// NestedTensor taken as Tensor type
-static ArgWrapper wrap_arg(
-    py::object arg,
-    c10::optional<c10::TypePtr> type_ptr = c10::nullopt) {
-  if (py::isinstance<THPNestedTensor>(arg)) {
-    TORCH_CHECK((*type_ptr)->kind() == TensorType::Kind);
-    return ArgWrapper(py::cast<THPNestedTensor>(arg).get_structure());
-  }
-  if (type_ptr) {
-    return ArgWrapper(toIValue(arg, *type_ptr));
-  } else {
-    return ArgWrapper(toTypeInferredIValue(arg));
-  }
-}
-
-static std::vector<ArgWrapper> flatten_args(
-    py::args args_,
-    py::kwargs kwargs_) {
-  std::vector<ArgWrapper> flat_args;
-  for (size_t i = 0; i < args_.size(); i++) {
-    flat_args.push_back(wrap_arg(args_[i]));
-  }
-  std::unordered_map<std::string, ArgWrapper> kwargs;
-  for (const std::pair<py::handle, py::handle>& pair : kwargs_) {
-    flat_args.push_back(
-        wrap_arg(py::reinterpret_borrow<py::object>(pair.second)));
-  }
-  return flat_args;
-}
-
-template <class F>
-static THPNestedTensor apply_jit_function_helper(
-    std::vector<ArgWrapper>& flat_args,
-    F& op) {
-  py::gil_scoped_release release;
-  TensorNode result = apply_jit_function(flat_args, op);
-  py::gil_scoped_acquire acquire;
-  return THPNestedTensor(_ListNestedTensor(result));
-}
-
-THPNestedTensor jit_apply_function(
-    std::vector<THPNestedTensor> nts,
-    py::object fn) {
-  auto sfn = py::cast<StrongFunctionPtr>(fn);
-  auto tracing_state = tracer::getTracingState();
-  TORCH_CHECK(!tracing_state, "doesnt support tracing");
-  Function& callee = *sfn.function_;
-  auto schema = callee.getSchema();
-  TORCH_CHECK(
-      schema.arguments().size() == nts.size(),
-      "Give NestedTensors don't match function args.");
-  std::vector<ArgWrapper> nested_nodes;
-  for (size_t i = 0; i < nts.size(); i++) {
-    nested_nodes.push_back(ArgWrapper(nts[i].get_structure()));
-  }
-  py::gil_scoped_release release;
-  TensorNode nested_node = apply_jit_function<Function>(nested_nodes, callee);
-  py::gil_scoped_acquire acquire;
-  return THPNestedTensor(_ListNestedTensor(nested_node));
-}
-
-// TODO: Write separate C++ test for overloads as test cases
-// TODO: Match return values!
-static c10::optional<std::vector<ArgWrapper>> try_match_schema(
-    const FunctionSchema* schema,
-    py::args py_args,
-    py::kwargs py_kwargs) {
-  // const std::vector<ArgWrapper>& py_args,
-  // const std::unordered_map<std::string, ArgWrapper>& py_kwargs) {
-  if (DEBUG) {
-    std::cout << "Checking match for schema: " << *schema << std::endl;
-  }
-  // In the end it's only a match when this counter fully depleted the args.
-  size_t py_args_i = 0;
-  size_t used_kwargs = 0;
-  const std::vector<Argument>& schema_args = schema->arguments();
-  std::vector<ArgWrapper> parse_py_args;
-
-  // For each argument in the Schema, see if it can be matched up with the
-  // given python arguments to determine whether it's the right overload.
-  //
-  // First we resolve the python arguments to build list of candidate
-  // wrapped arguments. It's not enough to parse these arguments
-  // outside of a given Schema because of the type environment
-  // and conversions. It's possible to match a Python call
-  // signature to an overload with different types such as
-  // Scalar and Tensor etc. simply by requiring conversion.
-  for (size_t j = 0; j < schema_args.size(); j++) {
-    // TODO: Support for self as in tryMatchArgument?
-    Argument schema_arg = schema_args[j];
-    py::object py_arg;
-    if (!schema_arg.kwarg_only() && py_args_i < py_args.size()) {
-      // TODO: Add support to allow conversions.
-      py_arg = py_args[py_args_i];
-      py_args_i++;
-    } else if (py_kwargs.contains(schema_arg.name().c_str())) {
-      // TODO: Check for no presence of duplicates in given schema
-      py_arg = py_kwargs[schema_arg.name().c_str()];
-      used_kwargs++;
-    } else if (schema_arg.default_value()) {
-      // TODO: How is this converted to ScalarType if it's a int (usually)?
-      // What mechanism currently does this kind of conversion.
-      py_arg = toPyObject(*schema_arg.default_value());
-    } else {
-      // The given schema cannot find either a positional or keyword argument to
-      // match against for this given schema argument. There also is no default
-      // value specified for this schema argument. Therefore this schema cannot
-      // be the correct overload.
-      if (DEBUG) {
-        std::cout << "ARGS COUNT OFF!" << std::endl;
-      }
-      return c10::nullopt;
-    }
-    // TODO: NestedTensor support
-    try {
-      ArgWrapper arg = wrap_arg(py_arg, schema_arg.type());
-      parse_py_args.push_back(arg);
-    } catch (std::exception& e) {
-      if (DEBUG) {
-        std::cout << "Wrap arg exception: " << e.what() << std::endl;
-      }
-      return c10::nullopt;
-    }
-  }
-  if (
-      // Check whether all positional arguments were matched by given Schema
-      (py_args.size() == py_args_i) &&
-      // Check if all kwargs were matched by given Schema
-      (used_kwargs == py_kwargs.size())) {
-    //    bool types_match = true;
-    //    TypeEnv type_env;
-    //    for (size_t j = 0; j < parse_py_args.size(); j++) {
-    //      // std::cout << " ; parse_py_args[" << j
-    //      //           << "]: " << type_j->str();
-    //      // Now that we found that the overall schema matches, we need to
-    //      check
-    //      // whether the types match.
-    //      // TODO: Need Subtypes and argument type conversions (e.g. convert
-    //      one
-    //      // float to list of floats with right number of elements).
-    //      // MatchTypeReturn match =
-    //      //     matchTypeVariables(schema_args[j].type(), type_j, type_env);
-    //      TypePtr type_j = parse_py_args[j].ivalue().type();
-    //      std::cout << " x parse_py_args[" << j << "]: " << type_j->str();
-    //      std::cout << "\t=\t"
-    //                << "schema_args[" << j << "]: " <<
-    //                schema_args[j].type()->str();
-    //      // TODO: We want to know whether the actual argument is a
-    //      convertible
-    //      // subtype to the one used in the schema.
-    //      // TODO: Need type env?
-    //      // types_match = types_match &&
-    //      matchTypeVariables(schema_args[j].type(),
-    //      // type_j, type_env).success();
-    //      types_match =
-    //          types_match && (schema_args[j].type()->kind() ==
-    //          type_j->kind());
-    //      std::cout << "\t types_match: " << types_match;
-    //      std::cout << std::endl;
-    //    }
-    //    std::cout << std::endl;
-    //    if (types_match) {
-    if (DEBUG) {
-      std::cout << "FOUND IT!" << std::endl;
-    }
-    return parse_py_args;
-    //    }
-  }
-  if (DEBUG) {
-    std::cout << "ARGS SIZES MISMATCHED" << std::endl;
-  }
-  return c10::nullopt;
-}
-
 // TODO: Write comparison operation based on a subset of Argument comparison
 // TODO: Move this into jit_tensorwise and add support for all 3 cases.
 // TODO: Template apply_jit_function to work with Operation and Function.
@@ -337,33 +111,12 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 
   // TODO: Is there a cheaper way to do this?
   const auto& variants = getAllOperatorsFor(name);
-  const auto& builtin_functions = getAllBuiltinFunctionsFor(name);
-  if (variants.size() == 0 && builtin_functions.size() == 0) {
+  if (variants.size() == 0) {
     return c10::nullopt;
   }
   return name;
 }
 
-//  // Go through each Schema candidate based on the overloads
-//  // The order here matters and is given by the way we construct schemas.
-//  // This is a subset of matchSchemas within jit/script/schema_matching.cpp
-//  // and only implements the argument matching based on features such as
-//  types.
-//  // It could eventually live in the JIT as a subcomponent that can implement
-//  // overload resolution generically and outside a graph context.
-//  //
-//  // In essence we spend most of our time resolving types (e.g. turn
-//  // single floats into lists of floats, resolving concrete types) or dealing
-//  // with the unordered nature of kwargs.
-//  for (size_t i = 0; i < schemas.size(); i++) {
-//    if (try_match_schema(schemas[i], py_args, py_kwargs)) {
-//      std::cout << "schema[" << i << "]:\t" << *schemas[i];
-//      std::cout << " - overload_name: " << schemas[i]->overload_name();
-//      std::cout << "WIN" << std::endl;
-//    }
-//  }
-//  return torch::ones({});
-
 // TODO: This should support 3 types of functions
 // fn might be scripted (i.e. StrongFunctionPtr)
 // fn might be a builtin (need to resolve!)
@@ -372,63 +125,86 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
     return py::cpp_function([fn](py::args args_, py::kwargs kwargs_) {
-      if (py::isinstance<StrongFunctionPtr>(fn)) {
-        std::cout << "is StrongFunctionPtr" << std::endl;
-        auto sfn = py::cast<StrongFunctionPtr>(fn);
-        Function& op = *sfn.function_;
-        std::vector<ArgWrapper> flat_args = flatten_args(args_, kwargs_);
-        return apply_jit_function_helper<Function>(flat_args, op);
-      }
+      std::cout << "given args_: " << args_ << std::endl;
+      // if (py::isinstance<StrongFunctionPtr>(fn)) {
+      //   std::cout << "is StrongFunctionPtr" << std::endl;
+      //   auto sfn = py::cast<StrongFunctionPtr>(fn);
+      //   Function& op = *sfn.function_;
+      //   std::vector<ArgWrapper> flat_args = flatten_args(args_, kwargs_);
+      //   return apply_jit_function_helper<Function>(flat_args, op);
+      // }
       // TODO: Support for no NestedTensor arguments
       if (auto name = is_builtin(fn)) {
         py::list args_vector;
+        std::set<size_t> nested_arg_i;
         std::cout << "args.size(): " << args_.size() << std::endl;
-        for (const auto& arg: args_) {
+        for (size_t i = 0; i < args_.size(); i++) {
+          py::object arg = args_[i];
           if (py::isinstance<THPNestedTensor>(arg)) {
             std::cout << "assigning first tensor" << std::endl;
             args_vector.append(_get_first_variable(
                 py::cast<THPNestedTensor>(arg).get_structure()));
+            nested_arg_i.insert(i);
           } else {
             args_vector.append(arg);
           }
         }
         py::args args = py::args(args_vector);
         std::cout << "new_args: " << args << std::endl;
+        Stack stack;
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
-          Stack stack;
           try {
             std::cout << "trying op->schema(): " << op->schema() << std::endl;
             stack =
                 createStackForSchema(op->schema(), args, kwargs_, c10::nullopt);
-            break;
           } catch (std::exception& e) {
             std::cout << "e.what(): " << e.what() << std::endl;
             continue;
           }
-          op->getOperation()(stack);
-        }
-        exit(1);
-        std::cout << "DONE createStackForSchema" << std::endl;
-        for (const auto& op : getAllOperatorsFor(*name)) {
-          if (auto flat_args = try_match_schema(&op->schema(), args, kwargs_)) {
-            if (DEBUG) {
-              std::cout << "is builtin Operation with schema: " << op->schema()
-                        << std::endl;
+          std::vector<TensorNode> nested_nodes;
+          for (const auto& arg : args_) {
+            if (py::isinstance<THPNestedTensor>(arg)) {
+              nested_nodes.push_back(
+                  py::cast<THPNestedTensor>(arg).get_structure());
             }
-            Operation actual = op->getOperation();
-            return apply_jit_function_helper<Operation>(*flat_args, actual);
-          }
-        }
-        for (const auto& op : getAllBuiltinFunctionsFor(*name)) {
-          if (auto flat_args =
-                  try_match_schema(&op->getSchema(), args, kwargs_)) {
-            if (DEBUG) {
-              std::cout << "is builtin Function with schema: "
-                        << op->getSchema() << std::endl;
-            }
-            return apply_jit_function_helper<Function>(*flat_args, *op);
           }
+          auto operation = op->getOperation();
+          return THPNestedTensor(_ListNestedTensor(apply_jit_function(nested_nodes, nested_arg_i, stack, operation)));
+          // Stack stack2(stack);
+          // op->getOperation()(stack2);
+          // std::cout << "return value1: "
+          //           << torch::jit::createPyObjectForStack(std::move(stack2))
+          //           << std::endl;
+          // Stack stack3(stack);
+          // op->getOperation()(stack3);
+          // std::cout << "return value2: "
+          //           << torch::jit::createPyObjectForStack(std::move(stack3))
+          //           << std::endl;
         }
+        exit(1);
+        // std::cout << "DONE createStackForSchema" << std::endl;
+        // for (const auto& op : getAllOperatorsFor(*name)) {
+        //   if (auto flat_args = try_match_schema(&op->schema(), args,
+        //   kwargs_)) {
+        //     if (DEBUG) {
+        //       std::cout << "is builtin Operation with schema: " <<
+        //       op->schema()
+        //                 << std::endl;
+        //     }
+        //     Operation actual = op->getOperation();
+        //     return apply_jit_function_helper<Operation>(*flat_args, actual);
+        //   }
+        // }
+        // for (const auto& op : getAllBuiltinFunctionsFor(*name)) {
+        //   if (auto flat_args =
+        //           try_match_schema(&op->getSchema(), args, kwargs_)) {
+        //     if (DEBUG) {
+        //       std::cout << "is builtin Function with schema: "
+        //                 << op->getSchema() << std::endl;
+        //     }
+        //     return apply_jit_function_helper<Function>(*flat_args, *op);
+        //   }
+        // }
       }
       // TODO: Need implementation of generic python version.
       std::stringstream ss;
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index ac1ff09f..ffa4aa13 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -5,10 +5,6 @@ namespace nested_tensor {
 
 static bool DEBUG = false;
 
-THPNestedTensor jit_apply_function(
-    std::vector<THPNestedTensor> nts_,
-    pybind11::object fn);
-
 pybind11::cpp_function jit_tensorwise();
 
 } // namespace nested_tensor
diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp
index 183c4cfc..f8e9dc11 100644
--- a/nestedtensor/csrc/py_init.cpp
+++ b/nestedtensor/csrc/py_init.cpp
@@ -110,7 +110,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
       .def("__str__", &torch::nested_tensor::THPNestedTensor::str)
       .def("__repr__", &torch::nested_tensor::THPNestedTensor::str);
 
-  m.def("jit_apply_function", &torch::nested_tensor::jit_apply_function);
   m.def("jit_tensorwise", &torch::nested_tensor::jit_tensorwise);
   m.def("as_nested_tensor", &torch::nested_tensor::as_nested_tensor);
   m.def(
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 18572bc2..803bd23c 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev2019122720+bce6d71'
-git_version = 'bce6d71ee83b71d1f34e5bdf63498b2766deb251'
+__version__ = '0.0.1.dev2019122721+5f921db'
+git_version = '5f921db935db2b44647f48c0b6fa736164190c0c'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 7ce1e243f4f622c674fce0dbae37d3b0f9b2a34f Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 27 Dec 2019 13:56:38 -0800
Subject: [PATCH 36/49] Quiet jit_tensorwise debug output; keep clamp*/mvlgamma off the JIT path

---
 benchmarks/nearest_neighbors.py      |  6 +++---
 nestedtensor/csrc/jit_list_apply.cpp | 14 +++++++-------
 nestedtensor/nested/monkey_patch.py  |  6 ++++--
 nestedtensor/version.py              |  4 ++--
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/benchmarks/nearest_neighbors.py b/benchmarks/nearest_neighbors.py
index f9a77fdd..26bd3f05 100644
--- a/benchmarks/nearest_neighbors.py
+++ b/benchmarks/nearest_neighbors.py
@@ -74,8 +74,8 @@ def gen_algorithm_nested_jit_mv(keys, sub_clusters):
         for cluster in sub_cluster:
             new_sub_cluster.append(torch.stack(cluster))
         new_sub_clusters.append(new_sub_cluster)
-    nested_sub_clusters = nestedtensor._ListNestedTensor(new_sub_clusters)
-    nested_keys = nestedtensor._ListNestedTensor(keys)
+    nested_sub_clusters = nestedtensor.as_nested_tensor(new_sub_clusters)
+    nested_keys = nestedtensor.as_nested_tensor(keys)
 
     @nestedtensor._C.jit_tensorwise()
     @torch.jit.script
@@ -137,7 +137,7 @@ def benchmark_fn(fn, run_time = 15.0):
     gen_results_naive = gen_algorithm_naive(keys, sub_clusters)
     gen_results_mv = gen_algorithm_mv(keys, sub_clusters)
     gen_results_nested_mv = gen_algorithm_nested_mv(keys, sub_clusters)
-    gen_results_nested_jit_mv = gen_algorithm_nested_jit_mv(keys, sub_clusters)
+    # gen_results_nested_jit_mv = gen_algorithm_nested_jit_mv(keys, sub_clusters)
 
     print(benchmark_fn(gen_results_nested_mv))
     print(benchmark_fn(gen_results_naive))
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 908e124a..80171592 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -125,7 +125,7 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
     return py::cpp_function([fn](py::args args_, py::kwargs kwargs_) {
-      std::cout << "given args_: " << args_ << std::endl;
+      // std::cout << "given args_: " << args_ << std::endl;
       // if (py::isinstance<StrongFunctionPtr>(fn)) {
       //   std::cout << "is StrongFunctionPtr" << std::endl;
       //   auto sfn = py::cast<StrongFunctionPtr>(fn);
@@ -137,11 +137,11 @@ py::cpp_function jit_tensorwise() {
       if (auto name = is_builtin(fn)) {
         py::list args_vector;
         std::set<size_t> nested_arg_i;
-        std::cout << "args.size(): " << args_.size() << std::endl;
+        // std::cout << "args.size(): " << args_.size() << std::endl;
         for (size_t i = 0; i < args_.size(); i++) {
           py::object arg = args_[i];
           if (py::isinstance<THPNestedTensor>(arg)) {
-            std::cout << "assigning first tensor" << std::endl;
+            // std::cout << "assigning first tensor" << std::endl;
             args_vector.append(_get_first_variable(
                 py::cast<THPNestedTensor>(arg).get_structure()));
             nested_arg_i.insert(i);
@@ -150,15 +150,15 @@ py::cpp_function jit_tensorwise() {
           }
         }
         py::args args = py::args(args_vector);
-        std::cout << "new_args: " << args << std::endl;
+        // std::cout << "new_args: " << args << std::endl;
         Stack stack;
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
           try {
-            std::cout << "trying op->schema(): " << op->schema() << std::endl;
+            // std::cout << "trying op->schema(): " << op->schema() << std::endl;
             stack =
                 createStackForSchema(op->schema(), args, kwargs_, c10::nullopt);
           } catch (std::exception& e) {
-            std::cout << "e.what(): " << e.what() << std::endl;
+            // std::cout << "e.what(): " << e.what() << std::endl;
             continue;
           }
           std::vector<TensorNode> nested_nodes;
@@ -181,7 +181,7 @@ py::cpp_function jit_tensorwise() {
           //           << torch::jit::createPyObjectForStack(std::move(stack3))
           //           << std::endl;
         }
-        exit(1);
+        // exit(1);
         // std::cout << "DONE createStackForSchema" << std::endl;
         // for (const auto& op : getAllOperatorsFor(*name)) {
         //   if (auto flat_args = try_match_schema(&op->schema(), args,
diff --git a/nestedtensor/nested/monkey_patch.py b/nestedtensor/nested/monkey_patch.py
index 3bc3e75b..5f4168ba 100644
--- a/nestedtensor/nested/monkey_patch.py
+++ b/nestedtensor/nested/monkey_patch.py
@@ -88,7 +88,10 @@ def set_function(key, function):
         set_nt_method(function_name + '_', utils.tensorwise())
         if function_name in ['fill']:
             continue
-        set_wrapped_jit_torch_function(function_name, _C.jit_tensorwise())
+        if function_name in ['mvlgamma', 'clamp', 'clamp_min', 'clamp_max']:
+            set_wrapped_torch_function(function_name, utils.tensorwise())
+        else:
+            set_wrapped_jit_torch_function(function_name, _C.jit_tensorwise())
         set_nt_method(function_name, utils.tensorwise())
     # <
 
@@ -227,6 +230,5 @@ def set_function(key, function):
 
     # module.NestedTensor = NestedTensor
 
-    jit_function_dispatch[torch.mv] = _C.jit_tensorwise()(torch.mv)
     setattr(NestedTensor, '_NestedTensor__function_dispatch', function_dispatch)
     setattr(NestedTensor, '_NestedTensor__jit_function_dispatch', jit_function_dispatch)
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 803bd23c..3a6c11ef 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev2019122721+5f921db'
-git_version = '5f921db935db2b44647f48c0b6fa736164190c0c'
+__version__ = '0.0.1.dev2019122721+b644f10'
+git_version = 'b644f1000123f2a142aa3076c32967957fd72aa1'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From a1305f1706cef8c32d52ca02c026e644c538f493 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 27 Dec 2019 14:00:11 -0800
Subject: [PATCH 37/49] Remove DEBUG flag and commented-out code from jit_list_apply

---
 nestedtensor/csrc/jit_list_apply.cpp | 59 ----------------------------
 nestedtensor/csrc/jit_list_apply.h   |  2 -
 nestedtensor/version.py              |  4 +-
 3 files changed, 2 insertions(+), 63 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 80171592..f1e58971 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -11,33 +11,13 @@ using namespace torch::jit::script;
 
 // TODO Expand to IValues to support generic lists?
 at::Tensor run_function(Stack& stack, Function& fn) {
-  if (DEBUG) {
-    std::cout << "run_function_Function" << std::endl;
-  }
   c10::IValue result = fn(stack);
-  if (DEBUG) {
-    std::cout << "finished result_Function" << std::endl;
-  }
   return result.toTensor();
 }
 
 at::Tensor run_function(Stack& stack, Operation& fn) {
-  if (DEBUG) {
-    size_t i = 0;
-    for (c10::IValue& ival : stack) {
-      std::cout << "ival " << i << " : " << ival.tagKind() << std::endl;
-      i++;
-    }
-    std::cout << "run_function_Operation" << std::endl;
-  }
   fn(stack);
-  if (DEBUG) {
-    std::cout << "run_function_Operation stack finished" << std::endl;
-  }
   c10::IValue result = stack.front();
-  if (DEBUG) {
-    std::cout << "finished result_Operation" << std::endl;
-  }
   return result.toTensor();
 }
 
@@ -137,11 +117,9 @@ py::cpp_function jit_tensorwise() {
       if (auto name = is_builtin(fn)) {
         py::list args_vector;
         std::set<size_t> nested_arg_i;
-        // std::cout << "args.size(): " << args_.size() << std::endl;
         for (size_t i = 0; i < args_.size(); i++) {
           py::object arg = args_[i];
           if (py::isinstance<THPNestedTensor>(arg)) {
-            // std::cout << "assigning first tensor" << std::endl;
             args_vector.append(_get_first_variable(
                 py::cast<THPNestedTensor>(arg).get_structure()));
             nested_arg_i.insert(i);
@@ -150,15 +128,12 @@ py::cpp_function jit_tensorwise() {
           }
         }
         py::args args = py::args(args_vector);
-        // std::cout << "new_args: " << args << std::endl;
         Stack stack;
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
           try {
-            // std::cout << "trying op->schema(): " << op->schema() << std::endl;
             stack =
                 createStackForSchema(op->schema(), args, kwargs_, c10::nullopt);
           } catch (std::exception& e) {
-            // std::cout << "e.what(): " << e.what() << std::endl;
             continue;
           }
           std::vector<TensorNode> nested_nodes;
@@ -170,41 +145,7 @@ py::cpp_function jit_tensorwise() {
           }
           auto operation = op->getOperation();
           return THPNestedTensor(_ListNestedTensor(apply_jit_function(nested_nodes, nested_arg_i, stack, operation)));
-          // Stack stack2(stack);
-          // op->getOperation()(stack2);
-          // std::cout << "return value1: "
-          //           << torch::jit::createPyObjectForStack(std::move(stack2))
-          //           << std::endl;
-          // Stack stack3(stack);
-          // op->getOperation()(stack3);
-          // std::cout << "return value2: "
-          //           << torch::jit::createPyObjectForStack(std::move(stack3))
-          //           << std::endl;
         }
-        // exit(1);
-        // std::cout << "DONE createStackForSchema" << std::endl;
-        // for (const auto& op : getAllOperatorsFor(*name)) {
-        //   if (auto flat_args = try_match_schema(&op->schema(), args,
-        //   kwargs_)) {
-        //     if (DEBUG) {
-        //       std::cout << "is builtin Operation with schema: " <<
-        //       op->schema()
-        //                 << std::endl;
-        //     }
-        //     Operation actual = op->getOperation();
-        //     return apply_jit_function_helper<Operation>(*flat_args, actual);
-        //   }
-        // }
-        // for (const auto& op : getAllBuiltinFunctionsFor(*name)) {
-        //   if (auto flat_args =
-        //           try_match_schema(&op->getSchema(), args, kwargs_)) {
-        //     if (DEBUG) {
-        //       std::cout << "is builtin Function with schema: "
-        //                 << op->getSchema() << std::endl;
-        //     }
-        //     return apply_jit_function_helper<Function>(*flat_args, *op);
-        //   }
-        // }
       }
       // TODO: Need implementation of generic python version.
       std::stringstream ss;
diff --git a/nestedtensor/csrc/jit_list_apply.h b/nestedtensor/csrc/jit_list_apply.h
index ffa4aa13..defc5e6f 100644
--- a/nestedtensor/csrc/jit_list_apply.h
+++ b/nestedtensor/csrc/jit_list_apply.h
@@ -3,8 +3,6 @@
 namespace torch {
 namespace nested_tensor {
 
-static bool DEBUG = false;
-
 pybind11::cpp_function jit_tensorwise();
 
 } // namespace nested_tensor
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 3a6c11ef..1ca8f9d2 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev2019122721+b644f10'
-git_version = 'b644f1000123f2a142aa3076c32967957fd72aa1'
+__version__ = '0.0.1.dev2019122721+7ce1e24'
+git_version = '7ce1e243f4f622c674fce0dbae37d3b0f9b2a34f'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From a2ca671ba806919c8744dd944e6f34e2ec4bf1c0 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 27 Dec 2019 15:59:28 -0800
Subject: [PATCH 38/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         | 50 ++++++++++++++------
 nestedtensor/csrc/jit_list_apply.cpp | 69 +++++++++++++++-------------
 nestedtensor/version.py              |  4 +-
 3 files changed, 75 insertions(+), 48 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index da692181..cc9160dc 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -1,6 +1,7 @@
 import torch
 import nestedtensor
 import utils
+import time
 
 
 @nestedtensor._C.jit_tensorwise()
@@ -10,22 +11,43 @@ def f(i, w):
 
 
 if __name__ == "__main__":
-    # r = f(nestedtensor._C._ListNestedTensor([torch.randn(1, 3, 10, 20)]),
-    #     nestedtensor._C._ListNestedTensor([torch.randn(5, 3, 3, 3)]))
-    # 
+    w = torch.randn(128, 3, 7, 7).cuda()
+    inp1 = list(torch.randn(1024, 1, 3, 56, 56).cuda().unbind())
+    inp3 = nestedtensor.as_nested_tensor(inp1)._impl
+    # print(sum(inp.numel() for inp in inp1))
+    # print(inp3.numel())
+
+    t0 = time.time()
+    count = 0
+    while(time.time() - t0 < 10.0):
+        for inp in inp1:
+            r1 = torch.conv2d(inp, w)
+        torch.cuda.synchronize()
+        count += 1
+    print(count)
+
+    t0 = time.time()
+    count = 0
+    while(time.time() - t0 < 10.0):
+        r2 = f(inp3, w)
+        torch.cuda.synchronize()
+        count += 1
+    print(count)
+
+    
     # print(r.nested_size())
 
-    na = nestedtensor._C.jit_tensorwise()(torch.mul)
-
-    print("111")
-    out = nestedtensor.as_nested_tensor([torch.randn(1, 2)])
-    print(na(
-        nestedtensor.as_nested_tensor([torch.randn(1, 2)])._impl,
-        4.0,
-        ))
-    print("222")
-    print('out')
-    print(out)
+    # na = nestedtensor._C.jit_tensorwise()(torch.mul)
+
+    # print("111")
+    # out = nestedtensor.as_nested_tensor([torch.randn(1, 2)])
+    # print(na(
+    #     nestedtensor.as_nested_tensor([torch.randn(1, 2)])._impl,
+    #     4.0,
+    #     ))
+    # print("222")
+    # print('out')
+    # print(out)
 
     # nv = nestedtensor._C.jit_tensorwise()(torch.mv)
     # print(nv(
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index f1e58971..21bdadb1 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -10,12 +10,12 @@ using namespace torch::jit;
 using namespace torch::jit::script;
 
 // TODO Expand to IValues to support generic lists?
-at::Tensor run_function(Stack& stack, Function& fn) {
+at::Tensor run_function(Stack&& stack, Function& fn) {
   c10::IValue result = fn(stack);
   return result.toTensor();
 }
 
-at::Tensor run_function(Stack& stack, Operation& fn) {
+at::Tensor run_function(Stack&& stack, Operation& fn) {
   fn(stack);
   c10::IValue result = stack.front();
   return result.toTensor();
@@ -47,7 +47,7 @@ static TensorNode apply_jit_function(
           stack[i] = nested_nodes[ni].payload(j);
         }
       }
-      results.push_back(run_function(stack, fn));
+      results.emplace_back(run_function(std::move(stack), fn));
     }
     return TensorNode(results);
   } else {
@@ -105,29 +105,39 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
     return py::cpp_function([fn](py::args args_, py::kwargs kwargs_) {
-      // std::cout << "given args_: " << args_ << std::endl;
-      // if (py::isinstance<StrongFunctionPtr>(fn)) {
-      //   std::cout << "is StrongFunctionPtr" << std::endl;
-      //   auto sfn = py::cast<StrongFunctionPtr>(fn);
-      //   Function& op = *sfn.function_;
-      //   std::vector<ArgWrapper> flat_args = flatten_args(args_, kwargs_);
-      //   return apply_jit_function_helper<Function>(flat_args, op);
-      // }
       // TODO: Support for no NestedTensor arguments
-      if (auto name = is_builtin(fn)) {
-        py::list args_vector;
-        std::set<size_t> nested_arg_i;
-        for (size_t i = 0; i < args_.size(); i++) {
-          py::object arg = args_[i];
-          if (py::isinstance<THPNestedTensor>(arg)) {
-            args_vector.append(_get_first_variable(
-                py::cast<THPNestedTensor>(arg).get_structure()));
-            nested_arg_i.insert(i);
-          } else {
-            args_vector.append(arg);
-          }
+      py::list args_vector;
+      std::set<size_t> nested_arg_i;
+      for (size_t i = 0; i < args_.size(); i++) {
+        py::object arg = args_[i];
+        if (py::isinstance<THPNestedTensor>(arg)) {
+          args_vector.append(_get_first_variable(
+              py::cast<THPNestedTensor>(arg).get_structure()));
+          nested_arg_i.insert(i);
+        } else {
+          args_vector.append(arg);
+        }
+      }
+      py::args args = py::args(args_vector);
+
+      std::vector<TensorNode> nested_nodes;
+      for (const auto& arg : args_) {
+        if (py::isinstance<THPNestedTensor>(arg)) {
+          nested_nodes.push_back(
+              py::cast<THPNestedTensor>(arg).get_structure());
         }
-        py::args args = py::args(args_vector);
+      }
+
+      if (py::isinstance<StrongFunctionPtr>(fn)) {
+        auto sfn = py::cast<StrongFunctionPtr>(fn);
+        Function& operation = *sfn.function_;
+        Stack stack = createStackForSchema(
+            operation.getSchema(), args, kwargs_, c10::nullopt);
+        py::gil_scoped_release release;
+        return THPNestedTensor(_ListNestedTensor(
+            apply_jit_function(nested_nodes, nested_arg_i, stack, operation)));
+      }
+      if (auto name = is_builtin(fn)) {
         Stack stack;
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
           try {
@@ -136,15 +146,10 @@ py::cpp_function jit_tensorwise() {
           } catch (std::exception& e) {
             continue;
           }
-          std::vector<TensorNode> nested_nodes;
-          for (const auto& arg : args_) {
-            if (py::isinstance<THPNestedTensor>(arg)) {
-              nested_nodes.push_back(
-                  py::cast<THPNestedTensor>(arg).get_structure());
-            }
-          }
           auto operation = op->getOperation();
-          return THPNestedTensor(_ListNestedTensor(apply_jit_function(nested_nodes, nested_arg_i, stack, operation)));
+          py::gil_scoped_release release;
+          return THPNestedTensor(_ListNestedTensor(apply_jit_function(
+              nested_nodes, nested_arg_i, stack, operation)));
         }
       }
       // TODO: Need implementation of generic python version.
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 1ca8f9d2..62f243fd 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev2019122721+7ce1e24'
-git_version = '7ce1e243f4f622c674fce0dbae37d3b0f9b2a34f'
+__version__ = '0.0.1.dev2019122723+a1305f1'
+git_version = 'a1305f1706cef8c32d52ca02c026e644c538f493'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From b41a1cd2c48475def3bc0cb5b3bf676444bde481 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 27 Dec 2019 18:20:51 -0800
Subject: [PATCH 39/49] Checkpoint

---
 benchmarks/jit_tensorwise.py         | 23 ++++++++++------
 nestedtensor/csrc/jit_list_apply.cpp | 41 ++++++++++++----------------
 nestedtensor/version.py              |  4 +--
 3 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/benchmarks/jit_tensorwise.py b/benchmarks/jit_tensorwise.py
index cc9160dc..c18fc677 100644
--- a/benchmarks/jit_tensorwise.py
+++ b/benchmarks/jit_tensorwise.py
@@ -9,30 +9,35 @@
 def f(i, w):
     return torch.conv2d(i, w)
 
+def loop_f(inp1, w):
+    for inp in inp1:
+        torch.conv2d(inp, w)
+
 
 if __name__ == "__main__":
-    w = torch.randn(128, 3, 7, 7).cuda()
-    inp1 = list(torch.randn(1024, 1, 3, 56, 56).cuda().unbind())
+    w = torch.randn(64, 3, 9, 9).cuda()
+    inp1 = list(torch.randn(128, 1, 3, 16, 16).cuda().unbind())
     inp3 = nestedtensor.as_nested_tensor(inp1)._impl
     # print(sum(inp.numel() for inp in inp1))
     # print(inp3.numel())
 
+    fc = nestedtensor._C.jit_tensorwise()(torch.conv2d)
+
     t0 = time.time()
     count = 0
-    while(time.time() - t0 < 10.0):
-        for inp in inp1:
-            r1 = torch.conv2d(inp, w)
+    while(time.time() - t0 < 5.0):
+        r2 = fc(inp3, w)
         torch.cuda.synchronize()
         count += 1
-    print(count)
+    print("jit: " + str(count))
 
     t0 = time.time()
     count = 0
-    while(time.time() - t0 < 10.0):
-        r2 = f(inp3, w)
+    while(time.time() - t0 < 5.0):
+        loop_f(inp1, w)
         torch.cuda.synchronize()
         count += 1
-    print(count)
+    print("for loop: " + str(count))
 
     
     # print(r.nested_size())
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 21bdadb1..86b12b76 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -1,5 +1,6 @@
 #include <jit_list_apply.h>
 #include <torch/csrc/jit/script/builtin_functions.h>
+#include <torch/extension.h>
 
 namespace torch {
 namespace nested_tensor {
@@ -11,14 +12,12 @@ using namespace torch::jit::script;
 
 // TODO Expand to IValues to support generic lists?
 at::Tensor run_function(Stack&& stack, Function& fn) {
-  c10::IValue result = fn(stack);
-  return result.toTensor();
+  return std::move(fn(stack).toTensor());
 }
 
 at::Tensor run_function(Stack&& stack, Operation& fn) {
   fn(stack);
-  c10::IValue result = stack.front();
-  return result.toTensor();
+  return std::move(stack.front().toTensor());
 }
 
 // TODO: Assert that one arg must be a nestedtensor?
@@ -39,17 +38,19 @@ static TensorNode apply_jit_function(
     // a NestedTensor function.
     size_t leaf_size = nested_nodes[0].size();
     c10::List<at::Tensor> results;
+    results.reserve(leaf_size);
     for (size_t j = 0; j < leaf_size; j++) {
       Stack stack(stack_template);
       size_t ni = 0;
       for (size_t i = 0; i < stack.size(); i++) {
         if (nested_arg_i.count(i)) {
           stack[i] = nested_nodes[ni].payload(j);
+          ni++;
         }
       }
-      results.emplace_back(run_function(std::move(stack), fn));
+      results.push_back(run_function(std::move(stack), fn));
     }
-    return TensorNode(results);
+    return TensorNode(std::move(results));
   } else {
     bool broadcastable = true;
     size_t num_children = 0;
@@ -81,9 +82,6 @@ static TensorNode apply_jit_function(
   }
 }
 
-// TODO: Write comparison operation based on a subset of Argument comparison
-// TODO: Move this into jit_tensorwise and add support for all 3 cases.
-// TODO: Template apply_jit_function to work with Operation and Function.
 c10::optional<Symbol> is_builtin(py::object fn) {
   py::object builtin_name =
       py::module::import("torch.jit").attr("_find_builtin")(fn);
@@ -108,34 +106,29 @@ py::cpp_function jit_tensorwise() {
       // TODO: Support for no NestedTensor arguments
       py::list args_vector;
       std::set<size_t> nested_arg_i;
+      std::vector<TensorNode> nested_nodes;
       for (size_t i = 0; i < args_.size(); i++) {
         py::object arg = args_[i];
         if (py::isinstance<THPNestedTensor>(arg)) {
-          args_vector.append(_get_first_variable(
-              py::cast<THPNestedTensor>(arg).get_structure()));
+          TensorNode nested_node =
+              py::cast<THPNestedTensor>(arg).get_structure();
+          args_vector.append(_get_first_variable(nested_node));
+          nested_nodes.emplace_back(std::move(nested_node));
           nested_arg_i.insert(i);
         } else {
           args_vector.append(arg);
         }
       }
       py::args args = py::args(args_vector);
-
-      std::vector<TensorNode> nested_nodes;
-      for (const auto& arg : args_) {
-        if (py::isinstance<THPNestedTensor>(arg)) {
-          nested_nodes.push_back(
-              py::cast<THPNestedTensor>(arg).get_structure());
-        }
-      }
-
       if (py::isinstance<StrongFunctionPtr>(fn)) {
         auto sfn = py::cast<StrongFunctionPtr>(fn);
         Function& operation = *sfn.function_;
         Stack stack = createStackForSchema(
             operation.getSchema(), args, kwargs_, c10::nullopt);
         py::gil_scoped_release release;
-        return THPNestedTensor(_ListNestedTensor(
+        THPNestedTensor result = THPNestedTensor(_ListNestedTensor(
             apply_jit_function(nested_nodes, nested_arg_i, stack, operation)));
+        return result;
       }
       if (auto name = is_builtin(fn)) {
         Stack stack;
@@ -148,8 +141,10 @@ py::cpp_function jit_tensorwise() {
           }
           auto operation = op->getOperation();
           py::gil_scoped_release release;
-          return THPNestedTensor(_ListNestedTensor(apply_jit_function(
-              nested_nodes, nested_arg_i, stack, operation)));
+          THPNestedTensor result =
+              THPNestedTensor(_ListNestedTensor(apply_jit_function(
+                  nested_nodes, nested_arg_i, stack, operation)));
+          return result;
         }
       }
       // TODO: Need implementation of generic python version.
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 62f243fd..ab846dd7 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev2019122723+a1305f1'
-git_version = 'a1305f1706cef8c32d52ca02c026e644c538f493'
+__version__ = '0.0.1.dev201912282+a2ca671'
+git_version = 'a2ca671ba806919c8744dd944e6f34e2ec4bf1c0'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From b980a10fe92c5d2619d1c7572fc59e008e731b83 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 9 Jan 2020 11:40:55 -0800
Subject: [PATCH 40/49] Checkpoint

---
 nestedtensor/version.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 44ec9fff..7a94571f 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev20201919+d8734b8'
-git_version = 'd8734b84bcdd5dd1c74b2c1f48f8c890c783925a'
+__version__ = '0.0.1.dev20201919+d20dbe9'
+git_version = 'd20dbe9111f682dca1aef880ea380317b2e2e8fe'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From 6d9caa62ca19f0cc1616ee3991f56e36167c6870 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 9 Jan 2020 18:58:58 -0800
Subject: [PATCH 41/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 86 +++++++++++++++++++---------
 nestedtensor/nested/monkey_patch.py  |  2 +-
 nestedtensor/nested/nested.py        |  3 +
 nestedtensor/version.py              |  4 +-
 test/test_nested_tensor_nary.py      | 20 +++++--
 5 files changed, 80 insertions(+), 35 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 86b12b76..cbdc88b7 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -23,30 +23,35 @@ at::Tensor run_function(Stack&& stack, Operation& fn) {
 // TODO: Assert that one arg must be a nestedtensor?
 template <class F>
 static TensorNode apply_jit_function(
-    const std::vector<TensorNode>& nested_nodes,
-    const std::set<size_t>& nested_arg_i,
+    const std::map<void*, TensorNode>& nested_nodes_map,
     Stack& stack_template,
     F& fn) {
   bool all_leaf = true;
-  for (const auto& nested_node : nested_nodes) {
-    all_leaf = all_leaf && nested_node.is_leaf();
+  for (const auto& entry : nested_nodes_map) {
+    all_leaf = all_leaf && entry.second.is_leaf();
   }
   if (all_leaf) {
     // NOTE: We assume no named tensors and no sparse variables as
     // appropriate for TorchScript.
     // TODO: Assert leaf sizes match and are non-zero, otherwise this isn't
     // a NestedTensor function.
-    size_t leaf_size = nested_nodes[0].size();
+    size_t leaf_size = nested_nodes_map.begin()->second.size();
     c10::List<at::Tensor> results;
     results.reserve(leaf_size);
     for (size_t j = 0; j < leaf_size; j++) {
       Stack stack(stack_template);
-      size_t ni = 0;
+      // size_t ni = 0;
       for (size_t i = 0; i < stack.size(); i++) {
-        if (nested_arg_i.count(i)) {
-          stack[i] = nested_nodes[ni].payload(j);
-          ni++;
+        if (stack[i].isTensor()) {
+          void* candidate_key = stack[i].toTensor().data_ptr();
+          if (nested_nodes_map.count(candidate_key)) {
+            stack[i] = nested_nodes_map.find(candidate_key)->second.payload(j);
+          }
         }
+        // if (nested_arg_i.count(i)) {
+        //   stack[i] = nested_nodes[ni].payload(j);
+        //   ni++;
+        // }
       }
       results.push_back(run_function(std::move(stack), fn));
     }
@@ -54,29 +59,30 @@ static TensorNode apply_jit_function(
   } else {
     bool broadcastable = true;
     size_t num_children = 0;
-    for (const auto& nested_node : nested_nodes) {
-      if (!nested_node.is_leaf()) {
+    for (const auto& entry : nested_nodes_map) {
+      if (!entry.second.is_leaf()) {
         if (num_children > 0) {
           broadcastable =
-              broadcastable && (num_children == nested_node.degree());
+              broadcastable && (num_children == entry.second.degree());
         } else {
-          num_children = nested_node.degree();
+          num_children = entry.second.degree();
         }
       }
     }
     TORCH_CHECK(broadcastable, "Can't broadcast given nested tensors");
     std::vector<TensorNode> result;
     for (size_t i = 0; i < num_children; i++) {
-      std::vector<TensorNode> local_args;
-      for (const auto& nested_node : nested_nodes) {
-        if (nested_node.is_leaf()) {
-          local_args.push_back(nested_node);
+      std::map<void*, TensorNode> local_args;
+      for (const auto& entry : nested_nodes_map) {
+        if (entry.second.is_leaf()) {
+          local_args.insert(entry);
+          // local_args[entry.first] = entry.second;
         } else {
-          local_args.push_back(nested_node.children(i));
+          local_args.insert({entry.first, entry.second.children(i)});
         }
       }
       result.push_back(
-          apply_jit_function<F>(local_args, nested_arg_i, stack_template, fn));
+          apply_jit_function<F>(local_args, stack_template, fn));
     }
     return TensorNode(result);
   }
@@ -104,38 +110,62 @@ py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
     return py::cpp_function([fn](py::args args_, py::kwargs kwargs_) {
       // TODO: Support for no NestedTensor arguments
+      std::map<void*, TensorNode> nested_nodes_map;
+
+      // std::cout << "processing args" << std::endl;
       py::list args_vector;
-      std::set<size_t> nested_arg_i;
-      std::vector<TensorNode> nested_nodes;
+      // std::set<size_t> nested_arg_i;
+      // std::vector<TensorNode> nested_nodes;
       for (size_t i = 0; i < args_.size(); i++) {
         py::object arg = args_[i];
         if (py::isinstance<THPNestedTensor>(arg)) {
           TensorNode nested_node =
               py::cast<THPNestedTensor>(arg).get_structure();
-          args_vector.append(_get_first_variable(nested_node));
-          nested_nodes.emplace_back(std::move(nested_node));
-          nested_arg_i.insert(i);
+          at::Tensor first_tensor = _get_first_variable(nested_node);
+          args_vector.append(first_tensor);
+          // nested_nodes.emplace_back(std::move(nested_node));
+          // nested_arg_i.insert(i);
+          nested_nodes_map.insert({first_tensor.data_ptr(), nested_node});
         } else {
           args_vector.append(arg);
         }
       }
       py::args args = py::args(args_vector);
+
+      // std::cout << "processing kwargs" << std::endl;
+      py::dict kwargs_dict;
+      for (const auto& kwarg : kwargs_) {
+        // std::cout << "kwarg.first: " << kwarg.first << std::endl;
+        py::handle arg = kwarg.second;
+        if (py::isinstance<THPNestedTensor>(arg)) {
+          TensorNode nested_node =
+              py::cast<THPNestedTensor>(arg).get_structure();
+          at::Tensor first_tensor = _get_first_variable(nested_node);
+          kwargs_dict[kwarg.first] = first_tensor;
+          nested_nodes_map.insert({first_tensor.data_ptr(), nested_node});
+        } else {
+          kwargs_dict[kwarg.first] = kwarg.second;
+        }
+      }
+      py::kwargs kwargs = py::kwargs(kwargs_dict);
+
       if (py::isinstance<StrongFunctionPtr>(fn)) {
         auto sfn = py::cast<StrongFunctionPtr>(fn);
         Function& operation = *sfn.function_;
         Stack stack = createStackForSchema(
-            operation.getSchema(), args, kwargs_, c10::nullopt);
+            operation.getSchema(), args, kwargs, c10::nullopt);
         py::gil_scoped_release release;
         THPNestedTensor result = THPNestedTensor(_ListNestedTensor(
-            apply_jit_function(nested_nodes, nested_arg_i, stack, operation)));
+            apply_jit_function(nested_nodes_map, stack, operation)));
         return result;
       }
       if (auto name = is_builtin(fn)) {
         Stack stack;
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
           try {
+            // std::cout << "op->schema(): " << op->schema() << std::endl;
             stack =
-                createStackForSchema(op->schema(), args, kwargs_, c10::nullopt);
+                createStackForSchema(op->schema(), args, kwargs, c10::nullopt);
           } catch (std::exception& e) {
             continue;
           }
@@ -143,7 +173,7 @@ py::cpp_function jit_tensorwise() {
           py::gil_scoped_release release;
           THPNestedTensor result =
               THPNestedTensor(_ListNestedTensor(apply_jit_function(
-                  nested_nodes, nested_arg_i, stack, operation)));
+                  nested_nodes_map, stack, operation)));
           return result;
         }
       }
diff --git a/nestedtensor/nested/monkey_patch.py b/nestedtensor/nested/monkey_patch.py
index 5f4168ba..f1136e39 100644
--- a/nestedtensor/nested/monkey_patch.py
+++ b/nestedtensor/nested/monkey_patch.py
@@ -88,7 +88,7 @@ def set_function(key, function):
         set_nt_method(function_name + '_', utils.tensorwise())
         if function_name in ['fill']:
             continue
-        if function_name in ['mvlgamma', 'clamp', 'clamp_min', 'clamp_max']:
+        if function_name in ['mvlgamma', 'clamp', 'clamp_min', 'clamp_max', 'fmod']:
             set_wrapped_torch_function(function_name, utils.tensorwise())
         else:
             set_wrapped_jit_torch_function(function_name, _C.jit_tensorwise())
diff --git a/nestedtensor/nested/nested.py b/nestedtensor/nested/nested.py
index c86906f0..dd7fa4e6 100644
--- a/nestedtensor/nested/nested.py
+++ b/nestedtensor/nested/nested.py
@@ -324,6 +324,9 @@ def __torch_function__(self, func, args=(), kwargs=None):
         if func in NestedTensor.__jit_function_dispatch:
             _jit_local_func = NestedTensor.__jit_function_dispatch[func]
             impl_args = [a._impl if isinstance(a, NestedTensor) else a for a in args]
+            if kwargs is not None:
+                impl_kwargs = {k: v._impl if isinstance(v, NestedTensor) else v for (k, v) in kwargs.items()}
+                return NestedTensor(_jit_local_func(*impl_args, **impl_kwargs))
             return NestedTensor(_jit_local_func(*impl_args))
         if func in NestedTensor.__function_dispatch:
             _local_func = NestedTensor.__function_dispatch[func]
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 7a94571f..8167f1e1 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev20201919+d20dbe9'
-git_version = 'd20dbe9111f682dca1aef880ea380317b2e2e8fe'
+__version__ = '0.0.1.dev20201102+b980a10'
+git_version = 'b980a10fe92c5d2619d1c7572fc59e008e731b83'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION
diff --git a/test/test_nested_tensor_nary.py b/test/test_nested_tensor_nary.py
index 69fa4b43..dd0ae7c6 100644
--- a/test/test_nested_tensor_nary.py
+++ b/test/test_nested_tensor_nary.py
@@ -93,10 +93,22 @@ def method_inplace(x): return method_inplace_(x, 0.3)
         def _close(t1, t2):
             self.assertTrue(((t1 - t2).abs() < 1e-6).all())
 
-        # if func__ not in ['mvlgamma']:
-        #     func(a1, out=a3)
-        #     # TODO: Abstract this
-        #     _close(func(a1), a3)
+        if func__ not in ['mvlgamma']:
+            # print("func__")
+            # print(func__)
+            # print('1 a1')
+            # print(a1)
+            # print('1 a3')
+            # print(a3)
+            func(a1, out=a3)
+            # print('2 a1')
+            # print(a1)
+            # print('2 a3')
+            # print(a3)
+            # print('func(a1)')
+            # print(func(a1))
+            # TODO: Abstract this
+            _close(func(a1), a3)
         _close(func(a1), a2)
         _close(method(a1), a2)
         _close(method_inplace(a1), a2)

From 7f813f11bb46fbcc88414c423c2d7def24837751 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Thu, 9 Jan 2020 19:02:51 -0800
Subject: [PATCH 42/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index cbdc88b7..4f2ab0d7 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -81,8 +81,7 @@ static TensorNode apply_jit_function(
           local_args.insert({entry.first, entry.second.children(i)});
         }
       }
-      result.push_back(
-          apply_jit_function<F>(local_args, stack_template, fn));
+      result.push_back(apply_jit_function<F>(local_args, stack_template, fn));
     }
     return TensorNode(result);
   }
@@ -142,6 +141,14 @@ py::cpp_function jit_tensorwise() {
               py::cast<THPNestedTensor>(arg).get_structure();
           at::Tensor first_tensor = _get_first_variable(nested_node);
           kwargs_dict[kwarg.first] = first_tensor;
+          // TODO: This is a terrible way of identifying a NestedTensor,
+          // because it doesn't work with duplicates or partial overlap.
+          // However we need the tensor args for overload resolution
+          // and to build the stack. Further we need a way of figuring
+          // out which entries of the stack correspond to what input
+          // arguments. The more rigorous approach would be to add
+          // IValue support for NestedTensor via TorchBind or similar,
+          // but that piece is not mature enough yet.
           nested_nodes_map.insert({first_tensor.data_ptr(), nested_node});
         } else {
           kwargs_dict[kwarg.first] = kwarg.second;
@@ -171,9 +178,8 @@ py::cpp_function jit_tensorwise() {
           }
           auto operation = op->getOperation();
           py::gil_scoped_release release;
-          THPNestedTensor result =
-              THPNestedTensor(_ListNestedTensor(apply_jit_function(
-                  nested_nodes_map, stack, operation)));
+          THPNestedTensor result = THPNestedTensor(_ListNestedTensor(
+              apply_jit_function(nested_nodes_map, stack, operation)));
           return result;
         }
       }

From b94b880b892625bf6940b0ad2a4cbeb00fb95d0e Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 10 Jan 2020 15:16:21 -0800
Subject: [PATCH 43/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 188 ++++++++++++++++++++-------
 nestedtensor/version.py              |   4 +-
 2 files changed, 146 insertions(+), 46 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 4f2ab0d7..6d27a200 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -23,35 +23,30 @@ at::Tensor run_function(Stack&& stack, Operation& fn) {
 // TODO: Assert that one arg must be a nestedtensor?
 template <class F>
 static TensorNode apply_jit_function(
-    const std::map<void*, TensorNode>& nested_nodes_map,
     Stack& stack_template,
+    const std::set<size_t>& tensor_node_i,
+    const std::vector<TensorNode>& tensor_nodes,
     F& fn) {
   bool all_leaf = true;
-  for (const auto& entry : nested_nodes_map) {
-    all_leaf = all_leaf && entry.second.is_leaf();
+  for (const auto& node : tensor_nodes) {
+    all_leaf = all_leaf && node.is_leaf();
   }
   if (all_leaf) {
     // NOTE: We assume no named tensors and no sparse variables as
     // appropriate for TorchScript.
     // TODO: Assert leaf sizes match and are non-zero, otherwise this isn't
     // a NestedTensor function.
-    size_t leaf_size = nested_nodes_map.begin()->second.size();
+    size_t leaf_size = tensor_nodes[0].size();
     c10::List<at::Tensor> results;
     results.reserve(leaf_size);
     for (size_t j = 0; j < leaf_size; j++) {
       Stack stack(stack_template);
-      // size_t ni = 0;
+      size_t ni = 0;
       for (size_t i = 0; i < stack.size(); i++) {
-        if (stack[i].isTensor()) {
-          void* candidate_key = stack[i].toTensor().data_ptr();
-          if (nested_nodes_map.count(candidate_key)) {
-            stack[i] = nested_nodes_map.find(candidate_key)->second.payload(j);
-          }
+        if (tensor_node_i.count(i)) {
+          stack[i] = tensor_nodes[ni].payload(j);
+          ni++;
         }
-        // if (nested_arg_i.count(i)) {
-        //   stack[i] = nested_nodes[ni].payload(j);
-        //   ni++;
-        // }
       }
       results.push_back(run_function(std::move(stack), fn));
     }
@@ -59,29 +54,28 @@ static TensorNode apply_jit_function(
   } else {
     bool broadcastable = true;
     size_t num_children = 0;
-    for (const auto& entry : nested_nodes_map) {
-      if (!entry.second.is_leaf()) {
+    for (const auto& node : tensor_nodes) {
+      if (!node.is_leaf()) {
         if (num_children > 0) {
-          broadcastable =
-              broadcastable && (num_children == entry.second.degree());
+          broadcastable = broadcastable && (num_children == node.degree());
         } else {
-          num_children = entry.second.degree();
+          num_children = node.degree();
         }
       }
     }
     TORCH_CHECK(broadcastable, "Can't broadcast given nested tensors");
     std::vector<TensorNode> result;
     for (size_t i = 0; i < num_children; i++) {
-      std::map<void*, TensorNode> local_args;
-      for (const auto& entry : nested_nodes_map) {
-        if (entry.second.is_leaf()) {
-          local_args.insert(entry);
-          // local_args[entry.first] = entry.second;
+      std::vector<TensorNode> local_args;
+      for (const auto& node : local_args) {
+        if (node.is_leaf()) {
+          local_args.push_back(node);
         } else {
-          local_args.insert({entry.first, entry.second.children(i)});
+          local_args.push_back(node.children(i));
         }
       }
-      result.push_back(apply_jit_function<F>(local_args, stack_template, fn));
+      result.push_back(
+          apply_jit_function<F>(stack_template, tensor_node_i, local_args, fn));
     }
     return TensorNode(result);
   }
@@ -100,6 +94,108 @@ c10::optional<Symbol> is_builtin(py::object fn) {
   return name;
 }
 
+c10::optional<TensorNode> try_nested_node(
+    Argument argument,
+    py::object py_arg) {
+  InferredType inferred_type = tryToInferType(py_arg);
+  if (!inferred_type.success()) {
+    return c10::nullopt;
+  }
+  if (inferred_type.type()->kind() == TypeKind::TensorType &&
+      py::isinstance<THPNestedTensor>(py_arg)) {
+    return py::cast<THPNestedTensor>(py_arg).get_structure();
+  }
+  return c10::nullopt;
+}
+
+// TODO: Add support for NestedTEnsor
+// TODO: Replace throw with nullopt returns
+inline c10::optional<
+    std::tuple<Stack, std::set<size_t>, std::vector<TensorNode>>>
+my_createStackForSchema(
+    const FunctionSchema& schema,
+    const tuple_slice& args,
+    const py::kwargs& kwargs,
+    c10::optional<IValue> self) {
+  size_t all_arguments = (self ? 1 : 0) + args.size() + kwargs.size();
+  if (all_arguments > schema.arguments().size()) {
+    // throw std::runtime_error(c10::str(
+    //     schema.name(),
+    //     "() expected at most ",
+    //     schema.arguments().size(),
+    //     " argument(s) but received ",
+    //     all_arguments,
+    //     " argument(s). Declaration: ",
+    //     schema));
+    return c10::nullopt;
+  }
+  Stack stack;
+  stack.reserve(schema.arguments().size());
+
+  std::set<size_t> tensor_node_i;
+  std::vector<TensorNode> tensor_nodes;
+
+  if (self) {
+    // NOTE: self cannot be a NestedTensor because it cannot be an ivalue.
+    push(stack, std::move(*self));
+  }
+  // First push all positional args.
+  for (size_t i = 0; i < args.size(); i++) {
+    // Use the type information from the schema to convert the PyObject.
+    const auto& schema_arg = schema.arguments()[i];
+    if (auto tensor_node = try_nested_node(schema_arg, args[i])) {
+      tensor_nodes.push_back(*tensor_node);
+      tensor_node_i.insert(i);
+      push(stack, torch::jit::IValue(torch::zeros({})));
+    } else {
+      push(stack, argumentToIValue(schema, stack.size(), args[i]));
+    }
+  }
+
+  // Now for every remaining non-positional argument in the schema, look for it
+  // in the kwargs dict and push it if found, or use its default value if it
+  // has one.
+  size_t consumed_kwargs = 0;
+  for (size_t i = stack.size(); i < schema.arguments().size(); ++i) {
+    const auto& schema_arg = schema.arguments()[i];
+    if (kwargs.contains(schema_arg.name().c_str())) {
+      auto kwarg = kwargs[schema_arg.name().c_str()];
+      if (auto tensor_node = try_nested_node(schema_arg, kwarg)) {
+        tensor_nodes.push_back(*tensor_node);
+        tensor_node_i.insert(i);
+        push(stack, torch::jit::IValue(torch::zeros({})));
+      } else {
+        // TODO: Should this throw an error?
+        if (schema_arg.type()->kind() != tryToInferType(kwarg).type()->kind()) {
+          return c10::nullopt;
+        }
+        push(stack, argumentToIValue(schema, stack.size(), args[i]));
+      }
+      consumed_kwargs += 1;
+    } else if (schema_arg.default_value()) {
+      push(stack, *schema_arg.default_value());
+    } else {
+      // throw std::runtime_error(c10::str(
+      //     schema.name(),
+      //     "() is missing value for argument '",
+      //     arg.name(),
+      //     "'. Declaration: ",
+      //     schema));
+      return c10::nullopt;
+    }
+  }
+
+  if (consumed_kwargs != kwargs.size()) {
+    std::vector<std::string> names;
+    for (const auto& kwarg : kwargs) {
+      names.emplace_back(py::cast<std::string>(kwarg.first));
+    }
+    schema.findErrorInKwargs(names);
+  }
+
+  return std::make_tuple(stack, tensor_node_i, tensor_nodes);
+}
+
 // TODO: This should support 3 types of functions
 // fn might be scripted (i.e. StrongFunctionPtr)
 // fn might be a builtin (need to resolve!)
@@ -159,28 +255,32 @@ py::cpp_function jit_tensorwise() {
       if (py::isinstance<StrongFunctionPtr>(fn)) {
         auto sfn = py::cast<StrongFunctionPtr>(fn);
         Function& operation = *sfn.function_;
-        Stack stack = createStackForSchema(
-            operation.getSchema(), args, kwargs, c10::nullopt);
-        py::gil_scoped_release release;
-        THPNestedTensor result = THPNestedTensor(_ListNestedTensor(
-            apply_jit_function(nested_nodes_map, stack, operation)));
-        return result;
+        if (auto pack = my_createStackForSchema(
+                operation.getSchema(), args, kwargs, c10::nullopt)) {
+          py::gil_scoped_release release;
+          THPNestedTensor result =
+              THPNestedTensor(_ListNestedTensor(apply_jit_function(
+                  std::get<0>(*pack),
+                  std::get<1>(*pack),
+                  std::get<2>(*pack),
+                  operation)));
+          return result;
+        }
       }
       if (auto name = is_builtin(fn)) {
-        Stack stack;
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
-          try {
-            // std::cout << "op->schema(): " << op->schema() << std::endl;
-            stack =
-                createStackForSchema(op->schema(), args, kwargs, c10::nullopt);
-          } catch (std::exception& e) {
-            continue;
+          if (auto pack = my_createStackForSchema(
+                  op->schema(), args, kwargs, c10::nullopt)) {
+            auto operation = op->getOperation();
+            py::gil_scoped_release release;
+            THPNestedTensor result =
+                THPNestedTensor(_ListNestedTensor(apply_jit_function(
+                    std::get<0>(*pack),
+                    std::get<1>(*pack),
+                    std::get<2>(*pack),
+                    operation)));
+            return result;
           }
-          auto operation = op->getOperation();
-          py::gil_scoped_release release;
-          THPNestedTensor result = THPNestedTensor(_ListNestedTensor(
-              apply_jit_function(nested_nodes_map, stack, operation)));
-          return result;
         }
       }
       // TODO: Need implementation of generic python version.
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 8167f1e1..c08b7c50 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev20201102+b980a10'
-git_version = 'b980a10fe92c5d2619d1c7572fc59e008e731b83'
+__version__ = '0.0.1.dev202011023+7f813f1'
+git_version = '7f813f11bb46fbcc88414c423c2d7def24837751'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From d040bd11803cae8469c4ef70090d34850ea6c615 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 10 Jan 2020 17:15:08 -0800
Subject: [PATCH 44/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 90 +++++++++++-----------------
 nestedtensor/version.py              |  4 +-
 2 files changed, 38 insertions(+), 56 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 6d27a200..1f2dd1fb 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -27,6 +27,11 @@ static TensorNode apply_jit_function(
     const std::set<size_t>& tensor_node_i,
     const std::vector<TensorNode>& tensor_nodes,
     F& fn) {
+  // std::cout << "t0" << std::endl;
+  // for (size_t i = 0; i < tensor_nodes.size(); i++) {
+  //   std::cout << "tensor_nodes[" << i << "]" << std::endl;
+  // }
+  // std::cout << "t1" << std::endl;
   bool all_leaf = true;
   for (const auto& node : tensor_nodes) {
     all_leaf = all_leaf && node.is_leaf();
@@ -67,7 +72,7 @@ static TensorNode apply_jit_function(
     std::vector<TensorNode> result;
     for (size_t i = 0; i < num_children; i++) {
       std::vector<TensorNode> local_args;
-      for (const auto& node : local_args) {
+      for (const auto& node : tensor_nodes) {
         if (node.is_leaf()) {
           local_args.push_back(node);
         } else {
@@ -97,14 +102,23 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 c10::optional<TensorNode> try_nested_node(
     Argument argument,
     py::object py_arg) {
+  // std::cout << "1" << std::endl;
   InferredType inferred_type = tryToInferType(py_arg);
-  if (!inferred_type.success()) {
+  // std::cout << "2" << std::endl;
+  // Nestedtensor must not be a valid IValue
+  if (inferred_type.success()) {
+    // std::cout << "3" << std::endl;
     return c10::nullopt;
   }
-  if (inferred_type.type()->kind() == TypeKind::TensorType &&
+  // std::cout << "4" << std::endl;
+  if (argument.type()->kind() == TypeKind::TensorType &&
       py::isinstance<THPNestedTensor>(py_arg)) {
-    return py::cast<THPNestedTensor>(py_arg).get_structure();
+    // std::cout << "5" << std::endl;
+    TensorNode node = py::cast<THPNestedTensor>(py_arg).get_structure();
+    // std::cout << "51" << std::endl;
+    return node;
   }
+  // std::cout << "6" << std::endl;
   return c10::nullopt;
 }
 
@@ -143,11 +157,20 @@ my_createStackForSchema(
   for (size_t i = 0; i < args.size(); i++) {
     // Use the type information from the schema to convert the PyObject.
     const auto& schema_arg = schema.arguments()[i];
+    // std::cout << "schema_arg 0: " << schema_arg << std::endl;
+    // std::cout << "schema_arg.type(): " << schema_arg.type() << std::endl;
+    // std::cout << "schema_arg.type()->kind(): "
+    //           << typeKindToString(schema_arg.type()->kind()) << std::endl;
     if (auto tensor_node = try_nested_node(schema_arg, args[i])) {
+      // std::cout << "found nested tensor" << std::endl;
       tensor_nodes.push_back(*tensor_node);
       tensor_node_i.insert(i);
       push(stack, torch::jit::IValue(torch::zeros({})));
     } else {
+      // TODO: Should this throw an error?
+      if (schema_arg.type()->kind() != tryToInferType(args[i]).type()->kind()) {
+        return c10::nullopt;
+      }
       push(stack, argumentToIValue(schema, stack.size(), args[i]));
     }
   }
@@ -158,6 +181,7 @@ my_createStackForSchema(
   size_t consumed_kwargs = 0;
   for (size_t i = stack.size(); i < schema.arguments().size(); ++i) {
     const auto& schema_arg = schema.arguments()[i];
+    // std::cout << "schema_arg 1: " << schema_arg << std::endl;
     if (kwargs.contains(schema_arg.name().c_str())) {
       auto kwarg = kwargs[schema_arg.name().c_str()];
       if (auto tensor_node = try_nested_node(schema_arg, kwarg)) {
@@ -203,67 +227,23 @@ my_createStackForSchema(
 // (not fast!)
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
-    return py::cpp_function([fn](py::args args_, py::kwargs kwargs_) {
-      // TODO: Support for no NestedTensor arguments
-      std::map<void*, TensorNode> nested_nodes_map;
-
-      // std::cout << "processing args" << std::endl;
-      py::list args_vector;
-      // std::set<size_t> nested_arg_i;
-      // std::vector<TensorNode> nested_nodes;
-      for (size_t i = 0; i < args_.size(); i++) {
-        py::object arg = args_[i];
-        if (py::isinstance<THPNestedTensor>(arg)) {
-          TensorNode nested_node =
-              py::cast<THPNestedTensor>(arg).get_structure();
-          at::Tensor first_tensor = _get_first_variable(nested_node);
-          args_vector.append(first_tensor);
-          // nested_nodes.emplace_back(std::move(nested_node));
-          // nested_arg_i.insert(i);
-          nested_nodes_map.insert({first_tensor.data_ptr(), nested_node});
-        } else {
-          args_vector.append(arg);
-        }
-      }
-      py::args args = py::args(args_vector);
-
-      // std::cout << "processing kwargs" << std::endl;
-      py::dict kwargs_dict;
-      for (const auto& kwarg : kwargs_) {
-        // std::cout << "kwarg.first: " << kwarg.first << std::endl;
-        py::handle arg = kwarg.second;
-        if (py::isinstance<THPNestedTensor>(arg)) {
-          TensorNode nested_node =
-              py::cast<THPNestedTensor>(arg).get_structure();
-          at::Tensor first_tensor = _get_first_variable(nested_node);
-          kwargs_dict[kwarg.first] = first_tensor;
-          // TODO: This is a terrible way of identifying a NestedTensor,
-          // because it doesn't work with duplicates or partial overlap.
-          // However we need the tensor args for overload resolution
-          // and to build the stack. Further we need a way of figuring
-          // out which entries of the stack correspond to what input
-          // arguments. The more rigorous approach would be to add
-          // IValue support for NestedTensor via TorchBind or similar,
-          // but that piece is not mature enough yet.
-          nested_nodes_map.insert({first_tensor.data_ptr(), nested_node});
-        } else {
-          kwargs_dict[kwarg.first] = kwarg.second;
-        }
-      }
-      py::kwargs kwargs = py::kwargs(kwargs_dict);
+    return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
+      // // TODO: Support for no NestedTensor arguments
 
       if (py::isinstance<StrongFunctionPtr>(fn)) {
         auto sfn = py::cast<StrongFunctionPtr>(fn);
         Function& operation = *sfn.function_;
         if (auto pack = my_createStackForSchema(
                 operation.getSchema(), args, kwargs, c10::nullopt)) {
-          py::gil_scoped_release release;
+        // std::cout << "GOT ONE 0" << std::endl;
+          // py::gil_scoped_release release;
           THPNestedTensor result =
               THPNestedTensor(_ListNestedTensor(apply_jit_function(
                   std::get<0>(*pack),
                   std::get<1>(*pack),
                   std::get<2>(*pack),
                   operation)));
+            // std::cout << "done 0" << std::endl;
           return result;
         }
       }
@@ -271,14 +251,16 @@ py::cpp_function jit_tensorwise() {
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
           if (auto pack = my_createStackForSchema(
                   op->schema(), args, kwargs, c10::nullopt)) {
+        // std::cout << "GOT ONE 1" << std::endl;
             auto operation = op->getOperation();
-            py::gil_scoped_release release;
+            // py::gil_scoped_release release;
             THPNestedTensor result =
                 THPNestedTensor(_ListNestedTensor(apply_jit_function(
                     std::get<0>(*pack),
                     std::get<1>(*pack),
                     std::get<2>(*pack),
                     operation)));
+            // std::cout << "done 1" << std::endl;
             return result;
           }
         }
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index c08b7c50..3028277d 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev202011023+7f813f1'
-git_version = '7f813f11bb46fbcc88414c423c2d7def24837751'
+__version__ = '0.0.1.dev20201111+b94b880'
+git_version = 'b94b880b892625bf6940b0ad2a4cbeb00fb95d0e'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From ca117cc21eae925a93b343a3e5beeca843ff35cc Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 10 Jan 2020 17:23:23 -0800
Subject: [PATCH 45/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 30 ++++------------------------
 nestedtensor/csrc/py_init.cpp        |  3 ++-
 nestedtensor/nested/monkey_patch.py  |  2 +-
 nestedtensor/version.py              |  4 ++--
 test/test_nested_tensor_nary.py      | 12 -----------
 test/utils.py                        |  2 +-
 6 files changed, 10 insertions(+), 43 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 1f2dd1fb..abf80501 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -27,11 +27,6 @@ static TensorNode apply_jit_function(
     const std::set<size_t>& tensor_node_i,
     const std::vector<TensorNode>& tensor_nodes,
     F& fn) {
-  // std::cout << "t0" << std::endl;
-  // for (size_t i = 0; i < tensor_nodes.size(); i++) {
-  //   std::cout << "tensor_nodes[" << i << "]" << std::endl;
-  // }
-  // std::cout << "t1" << std::endl;
   bool all_leaf = true;
   for (const auto& node : tensor_nodes) {
     all_leaf = all_leaf && node.is_leaf();
@@ -102,23 +97,16 @@ c10::optional<Symbol> is_builtin(py::object fn) {
 c10::optional<TensorNode> try_nested_node(
     Argument argument,
     py::object py_arg) {
-  // std::cout << "1" << std::endl;
   InferredType inferred_type = tryToInferType(py_arg);
-  // std::cout << "2" << std::endl;
   // Nestedtensor must not be a valid IValue
   if (inferred_type.success()) {
-    // std::cout << "3" << std::endl;
     return c10::nullopt;
   }
-  // std::cout << "4" << std::endl;
   if (argument.type()->kind() == TypeKind::TensorType &&
       py::isinstance<THPNestedTensor>(py_arg)) {
-    // std::cout << "5" << std::endl;
     TensorNode node = py::cast<THPNestedTensor>(py_arg).get_structure();
-    // std::cout << "51" << std::endl;
     return node;
   }
-  // std::cout << "6" << std::endl;
   return c10::nullopt;
 }
 
@@ -157,12 +145,7 @@ my_createStackForSchema(
   for (size_t i = 0; i < args.size(); i++) {
     // Use the type information from the schema to convert the PyObject.
     const auto& schema_arg = schema.arguments()[i];
-    // std::cout << "schema_arg 0: " << schema_arg << std::endl;
-    // std::cout << "schema_arg.type(): " << schema_arg.type() << std::endl;
-    // std::cout << "schema_arg.type()->kind(): "
-    //           << typeKindToString(schema_arg.type()->kind()) << std::endl;
     if (auto tensor_node = try_nested_node(schema_arg, args[i])) {
-      // std::cout << "found nested tensor" << std::endl;
       tensor_nodes.push_back(*tensor_node);
       tensor_node_i.insert(i);
       push(stack, torch::jit::IValue(torch::zeros({})));
@@ -181,7 +164,6 @@ my_createStackForSchema(
   size_t consumed_kwargs = 0;
   for (size_t i = stack.size(); i < schema.arguments().size(); ++i) {
     const auto& schema_arg = schema.arguments()[i];
-    // std::cout << "schema_arg 1: " << schema_arg << std::endl;
     if (kwargs.contains(schema_arg.name().c_str())) {
       auto kwarg = kwargs[schema_arg.name().c_str()];
       if (auto tensor_node = try_nested_node(schema_arg, kwarg)) {
@@ -225,25 +207,23 @@ my_createStackForSchema(
 // fn might be a builtin (need to resolve!)
 // fn might be neither, so we just dispatch to some regular python for-loops
 // (not fast!)
+// TODO: Support for no NestedTensor arguments
+// NOTE: For now this is a private function
 py::cpp_function jit_tensorwise() {
   return py::cpp_function([](py::object fn) {
     return py::cpp_function([fn](py::args args, py::kwargs kwargs) {
-      // // TODO: Support for no NestedTensor arguments
-
       if (py::isinstance<StrongFunctionPtr>(fn)) {
         auto sfn = py::cast<StrongFunctionPtr>(fn);
         Function& operation = *sfn.function_;
         if (auto pack = my_createStackForSchema(
                 operation.getSchema(), args, kwargs, c10::nullopt)) {
-        // std::cout << "GOT ONE 0" << std::endl;
-          // py::gil_scoped_release release;
+          py::gil_scoped_release release;
           THPNestedTensor result =
               THPNestedTensor(_ListNestedTensor(apply_jit_function(
                   std::get<0>(*pack),
                   std::get<1>(*pack),
                   std::get<2>(*pack),
                   operation)));
-            // std::cout << "done 0" << std::endl;
           return result;
         }
       }
@@ -251,16 +231,14 @@ py::cpp_function jit_tensorwise() {
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
           if (auto pack = my_createStackForSchema(
                   op->schema(), args, kwargs, c10::nullopt)) {
-        // std::cout << "GOT ONE 1" << std::endl;
             auto operation = op->getOperation();
-            // py::gil_scoped_release release;
+            py::gil_scoped_release release;
             THPNestedTensor result =
                 THPNestedTensor(_ListNestedTensor(apply_jit_function(
                     std::get<0>(*pack),
                     std::get<1>(*pack),
                     std::get<2>(*pack),
                     operation)));
-            // std::cout << "done 1" << std::endl;
             return result;
           }
         }
diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp
index a8d5aa2a..90db5b07 100644
--- a/nestedtensor/csrc/py_init.cpp
+++ b/nestedtensor/csrc/py_init.cpp
@@ -110,7 +110,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
       .def("__str__", &torch::nested_tensor::THPNestedTensor::str)
       .def("__repr__", &torch::nested_tensor::THPNestedTensor::str);
 
-  m.def("jit_tensorwise", &torch::nested_tensor::jit_tensorwise);
+  //NOTE: This is a private function until it is feature complete
+  m.def("_jit_tensorwise", &torch::nested_tensor::jit_tensorwise);
   m.def("as_nested_tensor", &torch::nested_tensor::as_nested_tensor);
   m.def("nested_tensor", &torch::nested_tensor::nested_tensor);
 }
diff --git a/nestedtensor/nested/monkey_patch.py b/nestedtensor/nested/monkey_patch.py
index f1136e39..dd13b622 100644
--- a/nestedtensor/nested/monkey_patch.py
+++ b/nestedtensor/nested/monkey_patch.py
@@ -91,7 +91,7 @@ def set_function(key, function):
         if function_name in ['mvlgamma', 'clamp', 'clamp_min', 'clamp_max', 'fmod']:
             set_wrapped_torch_function(function_name, utils.tensorwise())
         else:
-            set_wrapped_jit_torch_function(function_name, _C.jit_tensorwise())
+            set_wrapped_jit_torch_function(function_name, _C._jit_tensorwise())
         set_nt_method(function_name, utils.tensorwise())
     # <
 
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 3028277d..1b88a78c 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev20201111+b94b880'
-git_version = 'b94b880b892625bf6940b0ad2a4cbeb00fb95d0e'
+__version__ = '0.0.1.dev20201111+d040bd1'
+git_version = 'd040bd11803cae8469c4ef70090d34850ea6c615'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION
diff --git a/test/test_nested_tensor_nary.py b/test/test_nested_tensor_nary.py
index dd0ae7c6..0d6644f5 100644
--- a/test/test_nested_tensor_nary.py
+++ b/test/test_nested_tensor_nary.py
@@ -94,19 +94,7 @@ def _close(t1, t2):
             self.assertTrue(((t1 - t2).abs() < 1e-6).all())
 
         if func__ not in ['mvlgamma']:
-            # print("func__")
-            # print(func__)
-            # print('1 a1')
-            # print(a1)
-            # print('1 a3')
-            # print(a3)
             func(a1, out=a3)
-            # print('2 a1')
-            # print(a1)
-            # print('2 a3')
-            # print(a3)
-            # print('func(a1)')
-            # print(func(a1))
             # TODO: Abstract this
             _close(func(a1), a3)
         _close(func(a1), a2)
diff --git a/test/utils.py b/test/utils.py
index e050babf..f73916cc 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -75,7 +75,7 @@ def gen_random_int(seed, low=0, high=2 ** 32):
 
 
 # TODO: Something occasionally causes a NaN here...
-def gen_nested_list(seed, nested_dim, tensor_dim, size_low=1, size_high=2):
+def gen_nested_list(seed, nested_dim, tensor_dim, size_low=1, size_high=10):
     tensors = []
     num_tensors = gen_random_int(
         (seed * nested_dim + seed) * 1024, low=size_low, high=size_high)

From 7f14ac17cec7453607dbfa9534e1bd45f60aba35 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Fri, 10 Jan 2020 17:25:31 -0800
Subject: [PATCH 46/49] Checkpoint

---
 nestedtensor/csrc/buffer_nested_tensor.h | 1 -
 nestedtensor/csrc/jit_list_apply.cpp     | 2 --
 nestedtensor/version.py                  | 4 ++--
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/nestedtensor/csrc/buffer_nested_tensor.h b/nestedtensor/csrc/buffer_nested_tensor.h
index 28953c4d..62820cec 100644
--- a/nestedtensor/csrc/buffer_nested_tensor.h
+++ b/nestedtensor/csrc/buffer_nested_tensor.h
@@ -131,7 +131,6 @@ struct TORCH_API _BufferNestedTensor {
         new_size.push_back(start->payload(0)[i]);
       }
     }
-    std::cout << "new_size: " << new_size << std::endl;
     return _buffer.reshape(at::IntArrayRef(new_size));
   }
 
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index abf80501..64728df1 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -110,8 +110,6 @@ c10::optional<TensorNode> try_nested_node(
   return c10::nullopt;
 }
 
-// TODO: Add support for NestedTEnsor
-// TODO: Replace throw with nullopt returns
 inline c10::optional<
     std::tuple<Stack, std::set<size_t>, std::vector<TensorNode>>>
 my_createStackForSchema(
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 1b88a78c..dac4e0dd 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev20201111+d040bd1'
-git_version = 'd040bd11803cae8469c4ef70090d34850ea6c615'
+__version__ = '0.0.1.dev20201111+ca117cc'
+git_version = 'ca117cc21eae925a93b343a3e5beeca843ff35cc'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

From f439e4eac6ff826b7c9951ae45706863be0dc82a Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 11 Jan 2020 18:35:33 -0800
Subject: [PATCH 47/49] Checkpoint

---
 nestedtensor/csrc/buffer_nested_tensor.cpp |  2 +-
 nestedtensor/csrc/jit_list_apply.cpp       | 94 +++++++++++++++++++---
 nestedtensor/csrc/py_init.cpp              |  2 +-
 nestedtensor/nested/monkey_patch.py        | 10 ++-
 nestedtensor/nested/utils.py               |  1 -
 nestedtensor/version.py                    |  4 +-
 setup.py                                   |  2 +-
 test/test_nested_tensor_nary.py            |  9 +++
 test/utils.py                              |  3 +-
 9 files changed, 106 insertions(+), 21 deletions(-)

diff --git a/nestedtensor/csrc/buffer_nested_tensor.cpp b/nestedtensor/csrc/buffer_nested_tensor.cpp
index 6026a3c1..45123a05 100644
--- a/nestedtensor/csrc/buffer_nested_tensor.cpp
+++ b/nestedtensor/csrc/buffer_nested_tensor.cpp
@@ -71,8 +71,8 @@ std::pair<int64_t, TensorNode> _build_structure(
     for (size_t i = 0; i < nested_size.degree(); i++) {
       std::pair<int64_t, TensorNode> result_i = _build_structure(
           index, buffers, nested_size.children(i), nested_stride.children(i));
+      index = std::get<0>(result_i);
       result.push_back(std::get<1>(result_i));
-      index++;
     }
     return std::pair<int64_t, TensorNode>(index, TensorNode(result));
   }
diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 64728df1..ba04d6c2 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -12,7 +12,8 @@ using namespace torch::jit::script;
 
 // TODO Expand to IValues to support generic lists?
 at::Tensor run_function(Stack&& stack, Function& fn) {
-  return std::move(fn(stack).toTensor());
+  fn(stack);
+  return std::move(stack.front().toTensor());
 }
 
 at::Tensor run_function(Stack&& stack, Operation& fn) {
@@ -50,7 +51,7 @@ static TensorNode apply_jit_function(
       }
       results.push_back(run_function(std::move(stack), fn));
     }
-    return TensorNode(std::move(results));
+    return TensorNode(results);
   } else {
     bool broadcastable = true;
     size_t num_children = 0;
@@ -118,6 +119,9 @@ my_createStackForSchema(
     const py::kwargs& kwargs,
     c10::optional<IValue> self) {
   size_t all_arguments = (self ? 1 : 0) + args.size() + kwargs.size();
+  // std::cout << "all_arguments: " << all_arguments << std::endl;
+  // std::cout << "schema.arguments().size(): " << schema.arguments().size()
+  //           << std::endl;
   if (all_arguments > schema.arguments().size()) {
     // throw std::runtime_error(c10::str(
     //     schema.name(),
@@ -144,17 +148,35 @@ my_createStackForSchema(
     // Use the type information from the schema to convert the PyObject.
     const auto& schema_arg = schema.arguments()[i];
     if (auto tensor_node = try_nested_node(schema_arg, args[i])) {
+      // std::cout << i << " is a nested tensor" << std::endl;
       tensor_nodes.push_back(*tensor_node);
-      tensor_node_i.insert(i);
+      tensor_node_i.insert(stack.size());
       push(stack, torch::jit::IValue(torch::zeros({})));
     } else {
-      // TODO: Should this throw an error?
-      if (schema_arg.type()->kind() != tryToInferType(args[i]).type()->kind()) {
+      // auto inferred_type = tryToInferType(args[i]);
+      // if (inferred_type.success()) {
+      //   std::cout << "i: " << i << " - "
+      //             << typeKindToString(inferred_type.type()->kind())
+      //             << std::endl;
+      // } else {
+      //   std::cout << "No success of getting type for " << i << std::endl;
+      // }
+      // TODO: This is expensive because argumentToIValue constructs an error
+      // message.
+      try {
+        IValue ivalue_arg = argumentToIValue(schema, i, args[i]);
+        // std::cout << "i: " << i << " - "
+        //           << typeKindToString(ivalue_arg.type()->kind()) << std::endl;
+        push(stack, ivalue_arg);
+        // std::cout << "001" << std::endl;
+      } catch (const std::runtime_error& e) {
+        // std::cout << "002 = " << e.what() << std::endl;
         return c10::nullopt;
       }
-      push(stack, argumentToIValue(schema, stack.size(), args[i]));
     }
+    // std::cout << "11: " << i << std::endl;
   }
+  // std::cout << "Looking at kwargs" << std::endl;
 
   // Now for every remaining non-positional argument in the schema, look for it
   // in the kwargs dict and push it if found, or use its default value if it
@@ -166,19 +188,41 @@ my_createStackForSchema(
       auto kwarg = kwargs[schema_arg.name().c_str()];
       if (auto tensor_node = try_nested_node(schema_arg, kwarg)) {
         tensor_nodes.push_back(*tensor_node);
-        tensor_node_i.insert(i);
+        tensor_node_i.insert(stack.size());
         push(stack, torch::jit::IValue(torch::zeros({})));
       } else {
         // TODO: Should this throw an error?
-        if (schema_arg.type()->kind() != tryToInferType(kwarg).type()->kind()) {
+        // auto inferred_type = tryToInferType(kwarg);
+        // if (inferred_type.success()) {
+        //   std::cout << "i: " << i << " - "
+        //             << typeKindToString(inferred_type.type()->kind())
+        //             << std::endl;
+        // } else {
+        //   std::cout << "No success of getting type for " << i << std::endl;
+        // }
+        // TODO: This is expensive because argumentToIValue constructs an error
+        // message.
+        IValue ivalue_arg;
+        try {
+          ivalue_arg = argumentToIValue(schema, i, kwarg);
+          // std::cout << "i: " << i << " - "
+          //           << typeKindToString(ivalue_arg.type()->kind()) << std::endl;
+          push(stack, ivalue_arg);
+          // std::cout << "001" << std::endl;
+        } catch (const std::runtime_error& e) {
+          // std::cout << "002 = " << e.what() << std::endl;
           return c10::nullopt;
         }
-        push(stack, argumentToIValue(schema, stack.size(), args[i]));
+        // return c10::nullopt;
       }
       consumed_kwargs += 1;
     } else if (schema_arg.default_value()) {
+      // std::cout << "Getting defautl value" << *schema_arg.default_value()
+      //           << std::endl;
       push(stack, *schema_arg.default_value());
     } else {
+      // std::cout << "Missing value for argument " << schema_arg.name()
+      //           << std::endl;
       // throw std::runtime_error(c10::str(
       //     schema.name(),
       //     "() is missing value for argument '",
@@ -194,7 +238,12 @@ my_createStackForSchema(
     for (const auto& kwarg : kwargs) {
       names.emplace_back(py::cast<std::string>(kwarg.first));
     }
-    schema.findErrorInKwargs(names);
+    try {
+      schema.findErrorInKwargs(names);
+    } catch (const std::runtime_error& e) {
+      // std::cout << "022 = " << e.what() << std::endl;
+      return c10::nullopt;
+    }
   }
 
   return std::make_tuple(stack, tensor_node_i, tensor_nodes);
@@ -226,7 +275,32 @@ py::cpp_function jit_tensorwise() {
         }
       }
       if (auto name = is_builtin(fn)) {
+        // TODO: Why doesn't argumentToIValue deal with NoneType for a kwarg?
+        // See also
+        // https://github.com/pytorch/pytorch/blob/7d630278daee00ea2db6bc01e8a2a5f160bd8e81/torch/csrc/jit/pybind_utils.h#L778
+        // If out is NoneType for a builtin we'll simply remove it.
+        bool out_is_none = false;
+        for (const auto& kwarg : kwargs) {
+          if (py::cast<std::string>(kwarg.first) == "out") {
+            auto inferred_type = tryToInferType(kwarg.second);
+            if (inferred_type.success() &&
+                inferred_type.type()->kind() == TypeKind::NoneType) {
+              out_is_none = true;
+            }
+          }
+        }
+        if (out_is_none) {
+          py::dict new_kwargs;
+          for (const auto& kwarg : kwargs) {
+            if (py::cast<std::string>(kwarg.first) == "out") {
+              continue;
+            }
+            new_kwargs[kwarg.first] = kwarg.second;
+          }
+          kwargs = py::kwargs(new_kwargs);
+        }
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
+          // std::cout << "op->schema(): " << op->schema() << std::endl;
           if (auto pack = my_createStackForSchema(
                   op->schema(), args, kwargs, c10::nullopt)) {
             auto operation = op->getOperation();
diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp
index 90db5b07..c7bc4f46 100644
--- a/nestedtensor/csrc/py_init.cpp
+++ b/nestedtensor/csrc/py_init.cpp
@@ -110,7 +110,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
       .def("__str__", &torch::nested_tensor::THPNestedTensor::str)
       .def("__repr__", &torch::nested_tensor::THPNestedTensor::str);
 
-  //NOTE: This is a private function until it is feature complete
+  // NOTE: This is a private function until it is feature complete
   m.def("_jit_tensorwise", &torch::nested_tensor::jit_tensorwise);
   m.def("as_nested_tensor", &torch::nested_tensor::as_nested_tensor);
   m.def("nested_tensor", &torch::nested_tensor::nested_tensor);
diff --git a/nestedtensor/nested/monkey_patch.py b/nestedtensor/nested/monkey_patch.py
index dd13b622..e1b572fe 100644
--- a/nestedtensor/nested/monkey_patch.py
+++ b/nestedtensor/nested/monkey_patch.py
@@ -88,10 +88,12 @@ def set_function(key, function):
         set_nt_method(function_name + '_', utils.tensorwise())
         if function_name in ['fill']:
             continue
-        if function_name in ['mvlgamma', 'clamp', 'clamp_min', 'clamp_max', 'fmod']:
-            set_wrapped_torch_function(function_name, utils.tensorwise())
-        else:
-            set_wrapped_jit_torch_function(function_name, _C._jit_tensorwise())
+        # NOTE: jit_tensorwise doesn't support clamp_max, clamp_min, clamp, fmod, mvlgamma, 
+        # if function_name in ['mvlgamma', 'clamp', 'clamp_min', 'clamp_max']:
+        #     set_wrapped_torch_function(function_name, utils.tensorwise())
+        # else:
+        #     set_wrapped_jit_torch_function(function_name, _C._jit_tensorwise())
+        set_wrapped_jit_torch_function(function_name, _C._jit_tensorwise())
         set_nt_method(function_name, utils.tensorwise())
     # <
 
diff --git a/nestedtensor/nested/utils.py b/nestedtensor/nested/utils.py
index c02371a0..5e971c9f 100644
--- a/nestedtensor/nested/utils.py
+++ b/nestedtensor/nested/utils.py
@@ -175,7 +175,6 @@ def wrapper(f):
         def decorator(*_args, **_kwargs):
             def _func(*args, **kwargs):
                 if find_nested_tensor_dispatch_key(*args) is None:
-                    # import pdb; pdb.set_trace()
                     result = f(*args, **kwargs)
                     if not torch.is_tensor(result):
                         return tuple(result)
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index dac4e0dd..66512921 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev20201111+ca117cc'
-git_version = 'ca117cc21eae925a93b343a3e5beeca843ff35cc'
+__version__ = '0.0.1.dev20201122+7f14ac1'
+git_version = '7f14ac17cec7453607dbfa9534e1bd45f60aba35'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION
diff --git a/setup.py b/setup.py
index a0cf5e29..19b48027 100644
--- a/setup.py
+++ b/setup.py
@@ -85,7 +85,7 @@ def get_extensions():
 
     define_macros = []
 
-    extra_compile_args = {'cxx': ['-O3', '-g']}
+    extra_compile_args = {'cxx': ['-g']}
     if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv('FORCE_CUDA', '0') == '1':
         extension = CUDAExtension
         define_macros += [('WITH_CUDA', None)]
diff --git a/test/test_nested_tensor_nary.py b/test/test_nested_tensor_nary.py
index 0d6644f5..fa59b1bf 100644
--- a/test/test_nested_tensor_nary.py
+++ b/test/test_nested_tensor_nary.py
@@ -75,6 +75,15 @@ def method_inplace(x): return method_inplace_(x, 2, 0, 1.0)
         elif func__ in ['fmod']:
 
             def func(x, out=None):
+                # print('x')
+                # print(x)
+                # print('out')
+                # print(out)
+                # print('func_')
+                # print(func_)
+                # # if out is None:
+                # #     return func_(x, 0.3)
+                # print("HEEEEEEEEEEEEEEEEEE")
                 return func_(x, 0.3, out=out)
 
             def method(x): return method_(x, 0.3)
diff --git a/test/utils.py b/test/utils.py
index f73916cc..bffd3ecc 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -87,7 +87,8 @@ def gen_nested_list(seed, nested_dim, tensor_dim, size_low=1, size_high=10):
             ran_size = ()
             for _ in range(tensor_dim):
                 ran = gen_random_int(ran * 1024, low=size_low, high=size_high)
-                ran_size = ran_size + (ran,)
+                # ran_size = ran_size + (ran,)
+                ran_size = ran_size + (1,)
 
             tensors.append(gen_float_tensor(ran, ran_size))
     else:

From 00d579661b2e93046c7666d752ca8e0e063e2f9d Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 11 Jan 2020 18:41:49 -0800
Subject: [PATCH 48/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 52 ----------------------------
 nestedtensor/nested/nested.py        | 13 ++++---
 nestedtensor/version.py              |  4 +--
 setup.py                             |  2 +-
 test/test_nested_tensor_nary.py      |  9 -----
 test/utils.py                        |  3 +-
 6 files changed, 10 insertions(+), 73 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index ba04d6c2..20256dff 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -119,18 +119,7 @@ my_createStackForSchema(
     const py::kwargs& kwargs,
     c10::optional<IValue> self) {
   size_t all_arguments = (self ? 1 : 0) + args.size() + kwargs.size();
-  // std::cout << "all_arguments: " << all_arguments << std::endl;
-  // std::cout << "schema.arguments().size(): " << schema.arguments().size()
-  //           << std::endl;
   if (all_arguments > schema.arguments().size()) {
-    // throw std::runtime_error(c10::str(
-    //     schema.name(),
-    //     "() expected at most ",
-    //     schema.arguments().size(),
-    //     " argument(s) but received ",
-    //     all_arguments,
-    //     " argument(s). Declaration: ",
-    //     schema));
     return c10::nullopt;
   }
   Stack stack;
@@ -148,35 +137,20 @@ my_createStackForSchema(
     // Use the type information from the schema to convert the PyObject.
     const auto& schema_arg = schema.arguments()[i];
     if (auto tensor_node = try_nested_node(schema_arg, args[i])) {
-      // std::cout << i << " is a nested tensor" << std::endl;
       tensor_nodes.push_back(*tensor_node);
       tensor_node_i.insert(stack.size());
       push(stack, torch::jit::IValue(torch::zeros({})));
     } else {
-      // auto inferred_type = tryToInferType(args[i]);
-      // if (inferred_type.success()) {
-      //   std::cout << "i: " << i << " - "
-      //             << typeKindToString(inferred_type.type()->kind())
-      //             << std::endl;
-      // } else {
-      //   std::cout << "No success of getting type for " << i << std::endl;
-      // }
       // TODO: This is expensive because argumentToIValue constructs an error
       // message.
       try {
         IValue ivalue_arg = argumentToIValue(schema, i, args[i]);
-        // std::cout << "i: " << i << " - "
-        //           << typeKindToString(ivalue_arg.type()->kind()) << std::endl;
         push(stack, ivalue_arg);
-        // std::cout << "001" << std::endl;
       } catch (const std::runtime_error& e) {
-        // std::cout << "002 = " << e.what() << std::endl;
         return c10::nullopt;
       }
     }
-    // std::cout << "11: " << i << std::endl;
   }
-  // std::cout << "Looking at kwargs" << std::endl;
 
   // Now for every remaining non-positional argument in the schema, look for it
   // in the kwargs dict and push it if found, or use its default value if it
@@ -191,44 +165,20 @@ my_createStackForSchema(
         tensor_node_i.insert(stack.size());
         push(stack, torch::jit::IValue(torch::zeros({})));
       } else {
-        // TODO: Should this throw an error?
-        // auto inferred_type = tryToInferType(kwarg);
-        // if (inferred_type.success()) {
-        //   std::cout << "i: " << i << " - "
-        //             << typeKindToString(inferred_type.type()->kind())
-        //             << std::endl;
-        // } else {
-        //   std::cout << "No success of getting type for " << i << std::endl;
-        // }
         // TODO: This is expensive because argumentToIValue constructs an error
         // message.
         IValue ivalue_arg;
         try {
           ivalue_arg = argumentToIValue(schema, i, kwarg);
-          // std::cout << "i: " << i << " - "
-          //           << typeKindToString(ivalue_arg.type()->kind()) << std::endl;
           push(stack, ivalue_arg);
-          // std::cout << "001" << std::endl;
         } catch (const std::runtime_error& e) {
-          // std::cout << "002 = " << e.what() << std::endl;
           return c10::nullopt;
         }
-        // return c10::nullopt;
       }
       consumed_kwargs += 1;
     } else if (schema_arg.default_value()) {
-      // std::cout << "Getting defautl value" << *schema_arg.default_value()
-      //           << std::endl;
       push(stack, *schema_arg.default_value());
     } else {
-      // std::cout << "Missing value for argument " << schema_arg.name()
-      //           << std::endl;
-      // throw std::runtime_error(c10::str(
-      //     schema.name(),
-      //     "() is missing value for argument '",
-      //     arg.name(),
-      //     "'. Declaration: ",
-      //     schema));
       return c10::nullopt;
     }
   }
@@ -241,7 +191,6 @@ my_createStackForSchema(
     try {
       schema.findErrorInKwargs(names);
     } catch (const std::runtime_error& e) {
-      // std::cout << "022 = " << e.what() << std::endl;
       return c10::nullopt;
     }
   }
@@ -300,7 +249,6 @@ py::cpp_function jit_tensorwise() {
           kwargs = py::kwargs(new_kwargs);
         }
         for (std::shared_ptr<Operator> op : getAllOperatorsFor(*name)) {
-          // std::cout << "op->schema(): " << op->schema() << std::endl;
           if (auto pack = my_createStackForSchema(
                   op->schema(), args, kwargs, c10::nullopt)) {
             auto operation = op->getOperation();
diff --git a/nestedtensor/nested/nested.py b/nestedtensor/nested/nested.py
index dd7fa4e6..d1ffbbdc 100644
--- a/nestedtensor/nested/nested.py
+++ b/nestedtensor/nested/nested.py
@@ -265,8 +265,7 @@ def to_tensor(self, dim=0):
         if dim == 0:
             if None in self.size():
                 raise ValueError("Shape not Tensor compliant")
-            result = self._impl.to_tensor()
-            return result
+            return self._impl.to_tensor()
         # If dim is bigger than nested_dim the NestedTensor is already
         # of Tensor for dimensions bigger than the given.
         if self.nested_dim() == 1:
@@ -321,16 +320,16 @@ def nested_stride(self, dim=None):
 
     def __torch_function__(self, func, args=(), kwargs=None):
         _local_func = None
+        if kwargs is None:
+            kwargs = {}
         if func in NestedTensor.__jit_function_dispatch:
             _jit_local_func = NestedTensor.__jit_function_dispatch[func]
             impl_args = [a._impl if isinstance(a, NestedTensor) else a for a in args]
-            if kwargs is not None:
-                impl_kwargs = {k: v._impl if isinstance(v, NestedTensor) else v for (k, v) in kwargs.items()}
-                return NestedTensor(_jit_local_func(*impl_args, **impl_kwargs))
-            return NestedTensor(_jit_local_func(*impl_args))
+            impl_kwargs = {k: v._impl if isinstance(v, NestedTensor) else v for (k, v) in kwargs.items()}
+            return NestedTensor(_jit_local_func(*impl_args, **impl_kwargs))
         if func in NestedTensor.__function_dispatch:
             _local_func = NestedTensor.__function_dispatch[func]
-            return _local_func(*args) if kwargs is None else _local_func(*args, **kwargs)
+            return _local_func(*args, **kwargs)
         raise NotImplementedError("NestedTensor doesn't support function {}".format(func))
 
     def __bool__(self):
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index 66512921..d6c752d8 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev20201122+7f14ac1'
-git_version = '7f14ac17cec7453607dbfa9534e1bd45f60aba35'
+__version__ = '0.0.1.dev20201122+f439e4e'
+git_version = 'f439e4eac6ff826b7c9951ae45706863be0dc82a'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION
diff --git a/setup.py b/setup.py
index 19b48027..a0cf5e29 100644
--- a/setup.py
+++ b/setup.py
@@ -85,7 +85,7 @@ def get_extensions():
 
     define_macros = []
 
-    extra_compile_args = {'cxx': ['-g']}
+    extra_compile_args = {'cxx': ['-O3', '-g']}
     if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv('FORCE_CUDA', '0') == '1':
         extension = CUDAExtension
         define_macros += [('WITH_CUDA', None)]
diff --git a/test/test_nested_tensor_nary.py b/test/test_nested_tensor_nary.py
index fa59b1bf..0d6644f5 100644
--- a/test/test_nested_tensor_nary.py
+++ b/test/test_nested_tensor_nary.py
@@ -75,15 +75,6 @@ def method_inplace(x): return method_inplace_(x, 2, 0, 1.0)
         elif func__ in ['fmod']:
 
             def func(x, out=None):
-                # print('x')
-                # print(x)
-                # print('out')
-                # print(out)
-                # print('func_')
-                # print(func_)
-                # # if out is None:
-                # #     return func_(x, 0.3)
-                # print("HEEEEEEEEEEEEEEEEEE")
                 return func_(x, 0.3, out=out)
 
             def method(x): return method_(x, 0.3)
diff --git a/test/utils.py b/test/utils.py
index bffd3ecc..f73916cc 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -87,8 +87,7 @@ def gen_nested_list(seed, nested_dim, tensor_dim, size_low=1, size_high=10):
             ran_size = ()
             for _ in range(tensor_dim):
                 ran = gen_random_int(ran * 1024, low=size_low, high=size_high)
-                # ran_size = ran_size + (ran,)
-                ran_size = ran_size + (1,)
+                ran_size = ran_size + (ran,)
 
             tensors.append(gen_float_tensor(ran, ran_size))
     else:

From a67b20e8d88dc494de832d42a6dc7684b5b8ea51 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@devfair0129.h2.fair>
Date: Sat, 11 Jan 2020 18:49:45 -0800
Subject: [PATCH 49/49] Checkpoint

---
 nestedtensor/csrc/jit_list_apply.cpp | 7 ++-----
 nestedtensor/nested/monkey_patch.py  | 5 -----
 nestedtensor/version.py              | 4 ++--
 3 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/nestedtensor/csrc/jit_list_apply.cpp b/nestedtensor/csrc/jit_list_apply.cpp
index 20256dff..cf7f3431 100644
--- a/nestedtensor/csrc/jit_list_apply.cpp
+++ b/nestedtensor/csrc/jit_list_apply.cpp
@@ -144,8 +144,7 @@ my_createStackForSchema(
       // TODO: This is expensive because argumentToIValue constructs an error
       // message.
       try {
-        IValue ivalue_arg = argumentToIValue(schema, i, args[i]);
-        push(stack, ivalue_arg);
+        push(stack, argumentToIValue(schema, i, args[i]));
       } catch (const std::runtime_error& e) {
         return c10::nullopt;
       }
@@ -167,10 +166,8 @@ my_createStackForSchema(
       } else {
         // TODO: This is expensive because argumentToIValue constructs an error
         // message.
-        IValue ivalue_arg;
         try {
-          ivalue_arg = argumentToIValue(schema, i, kwarg);
-          push(stack, ivalue_arg);
+          push(stack, argumentToIValue(schema, i, kwarg));
         } catch (const std::runtime_error& e) {
           return c10::nullopt;
         }
diff --git a/nestedtensor/nested/monkey_patch.py b/nestedtensor/nested/monkey_patch.py
index e1b572fe..036a5c56 100644
--- a/nestedtensor/nested/monkey_patch.py
+++ b/nestedtensor/nested/monkey_patch.py
@@ -88,11 +88,6 @@ def set_function(key, function):
         set_nt_method(function_name + '_', utils.tensorwise())
         if function_name in ['fill']:
             continue
-        # NOTE: jit_tensorwise doesn't support clamp_max, clamp_min, clamp, fmod, mvlgamma, 
-        # if function_name in ['mvlgamma', 'clamp', 'clamp_min', 'clamp_max']:
-        #     set_wrapped_torch_function(function_name, utils.tensorwise())
-        # else:
-        #     set_wrapped_jit_torch_function(function_name, _C._jit_tensorwise())
         set_wrapped_jit_torch_function(function_name, _C._jit_tensorwise())
         set_nt_method(function_name, utils.tensorwise())
     # <
diff --git a/nestedtensor/version.py b/nestedtensor/version.py
index d6c752d8..9eefa823 100644
--- a/nestedtensor/version.py
+++ b/nestedtensor/version.py
@@ -1,5 +1,5 @@
-__version__ = '0.0.1.dev20201122+f439e4e'
-git_version = 'f439e4eac6ff826b7c9951ae45706863be0dc82a'
+__version__ = '0.0.1.dev20201122+00d5796'
+git_version = '00d579661b2e93046c7666d752ca8e0e063e2f9d'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION