diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3c5ecf..8321b35 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,15 +1,16 @@
 cmake_minimum_required(VERSION 3.0)
 project(apfp)
-
+ 
 set(CMAKE_CXX_STANDARD 17)
 
 # Target options 
 set(APFP_PLATFORM "xilinx_u280_xdma_201920_3" CACHE STRING "Platform string for Vitis.")
 set(APFP_BITS 1024 CACHE STRING "Number of bits to use for a floating point number, including mantissa, exponent, and sign.")
-set(APFP_MULT_BASE_BITS 16 CACHE STRING "Number of bits to bottom out the multiplication at and use native multiplication.")
+set(APFP_MULT_BASE_BITS 18 CACHE STRING "Number of bits to bottom out the multiplication at and use native multiplication.")
 set(APFP_TILE_SIZE_N 32 CACHE STRING "Tile size in the N-dimension when running matrix-matrix multiplication.")
 set(APFP_TILE_SIZE_M 32 CACHE STRING "Tile size in the M-dimension when running matrix-matrix multiplication.")
 set(APFP_SEMANTICS "MPFR" CACHE STRING "Which semantics to use for floating point operations [GMP/MPFR].")
+set(APFP_PROFILING OFF CACHE BOOL "Enable profiling in the generated kernel.")
 set_property(CACHE APFP_SEMANTICS PROPERTY STRINGS GMP MPFR)
 
 # Validation and derived numbers
@@ -26,7 +27,7 @@ find_package(MPFR REQUIRED)
 find_package(GMP REQUIRED)
 find_package(Threads REQUIRED)
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -DAPFP_${APFP_SEMANTICS}_SEMANTICS")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Wno-unused-label -DAPFP_${APFP_SEMANTICS}_SEMANTICS")
 include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} )
 
 configure_file(include/Config.h.in Config.h)
@@ -49,7 +50,7 @@ add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES}
                              m_axi_b:DDR[1]
                              m_axi_c_read:DDR[1]
                              m_axi_c_write:DDR[1])
-add_vitis_program(MatrixMultiplication ${APFP_PLATFORM})
+add_vitis_program(MatrixMultiplication ${APFP_PLATFORM} PROFILING ${APFP_PROFILING})
 
 # Internal library 
 add_library(apfp host/Random.cpp host/MatrixMultiplicationReference.cpp)
diff --git a/device/ArithmeticOperations.cpp b/device/ArithmeticOperations.cpp
index 066c23c..5558fdd 100644
--- a/device/ArithmeticOperations.cpp
+++ b/device/ArithmeticOperations.cpp
@@ -54,6 +54,7 @@ PackedFloat Add(PackedFloat const &a, PackedFloat const &b) {
     // Optionally shift by 1, 2, 4, 8, 16... log2(B), such that all bits have eventually traveled to
     // their designated position.
     const int kNumStages = hlslib::ConstLog2(kBits);
+ShiftStages:
     for (int i = 0; i < kNumStages; ++i) {
 #pragma HLS UNROLL
         a_mantissa = ((shift_c & (1 << i)) == 0) ? a_mantissa : (a_mantissa >> (1 << i));
diff --git a/device/Karatsuba.cpp b/device/Karatsuba.cpp
index 1a9b4a4..1b4724f 100644
--- a/device/Karatsuba.cpp
+++ b/device/Karatsuba.cpp
@@ -2,6 +2,11 @@
 
 #include <type_traits>  // std::enable_if
 
+constexpr int AddLatency(int bits) {  // Latency passed to the BIND_OP pragmas on the adds in _Karatsuba
+    // Grows by 1 per 256 bits, capped at 4: the maximum supported latency of integer adds using the BIND_OP pragma
+    return (bits >= 1024) ? 4 : (bits >= 768) ? 3 : (bits >= 512) ? 2 : (bits >= 256) ? 1 : 0;
+}
+
 template <int bits>
 auto _Karatsuba(ap_uint<bits> const &a, ap_uint<bits> const &b) ->
     typename std::enable_if<(bits > kMultBaseBits), ap_uint<2 * bits>>::type {
@@ -33,6 +38,7 @@ auto _Karatsuba(ap_uint<bits> const &a, ap_uint<bits> const &b) ->
     Full a0a1b0b1 = _Karatsuba<bits / 2>(a0a1, b0b1);
     ap_int<bits + 2> a0a1b0b1_signed = a0a1b0b1_is_neg ? -ap_int<bits + 1>(a0a1b0b1) : ap_int<bits + 2>(a0a1b0b1);
     ap_uint<bits + 2> z1 = ap_uint<bits + 2>(a0a1b0b1_signed) + z0 + z2;
+#pragma HLS BIND_OP variable = z1 op = add impl = fabric latency = AddLatency(bits)
 
     // Align everything and combine
     ap_uint<(2 * bits)> z0z2 = z0 | (ap_uint<(2 * bits)>(z2) << bits);
@@ -41,6 +47,7 @@ auto _Karatsuba(ap_uint<bits> const &a, ap_uint<bits> const &b) ->
     // Workaround to avoid HLS padding an extra bit for the add. This is necessary to support 2048 bit multiplication,
     // which required adding two 4096 numbers, because 4096 bits is the maximum width support by the ap_uint type.
     z.V = z1_aligned.V + z0z2.V;
+#pragma HLS BIND_OP variable = z.V op = add impl = fabric latency = AddLatency(bits)
 
     return z;
 }
diff --git a/device/MatrixMultiplication.cpp b/device/MatrixMultiplication.cpp
index b1a8851..0dabc64 100644
--- a/device/MatrixMultiplication.cpp
+++ b/device/MatrixMultiplication.cpp
@@ -9,42 +9,78 @@
 // Annoyingly we have to specialize the innermost loop on whether multiple DRAM flits per number are required or not,
 // because HLS otherwise gets confused by pragmas applied to a loop of size 1 in the latter case.
 template <int lines_per_number>
-void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, const int size_k, const int n0,
+void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_k, const int n0,
                 const int k) {
 #pragma HLS INLINE
+    DramLine num[kLinesPerNumber];
+ReadA_N:
     for (int n1 = 0; n1 < kTileSizeN; ++n1) {
-        DramLine num[kLinesPerNumber];
+    ReadA_Flits:
         for (int i = 0; i < kLinesPerNumber; ++i) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
             num[i] = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber + i];
             if (i == kLinesPerNumber - 1) {
-                to_kernel.Push(*reinterpret_cast<PackedFloat const *>(num));
+                a_to_feeder.Push(*reinterpret_cast<PackedFloat const *>(num));
             }
         }
     }
 }
 
 template <>
-void ReadAInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, const int size_k, const int n0,
+void ReadAInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_k, const int n0,
                    const int k) {
 #pragma HLS INLINE
+ReadA_N:
     for (int n1 = 0; n1 < kTileSizeN; ++n1) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
         const auto num = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber];
-        to_kernel.Push(*reinterpret_cast<PackedFloat const *>(&num));
+        a_to_feeder.Push(*reinterpret_cast<PackedFloat const *>(&num));
     }
 }
 
-void ReadA(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, const int size_n, const int size_k,
+void ReadA(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_n, const int size_k,
            const int size_m) {
     const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
     const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+ReadA_TilesN:
     for (int n0 = 0; n0 < tiles_n; ++n0) {
+    ReadA_TilesM:
         for (int m0 = 0; m0 < tiles_m; ++m0) {
+        ReadA_K:
             for (int k = 0; k < size_k; ++k) {
-                ReadAInner<kLinesPerNumber>(mem, to_kernel, size_k, n0, k);
+                ReadAInner<kLinesPerNumber>(mem, a_to_feeder, size_k, n0, k);
+            }
+        }
+    }
+}
+
+// To eliminate control logic in the compute function, we introduce extra feeders that run in the same iteration
+// space as the compute module but write to the kernel on every iteration, absorbing the conditional pipeline reads.
+void FeedA(hlslib::Stream<PackedFloat> &a_to_feeder, hlslib::Stream<PackedFloat> &a_to_kernel, const int size_n,
+           const int size_k, const int size_m) {
+    const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
+    const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+    PackedFloat a;
+FeedA_TilesN:
+    for (int n0 = 0; n0 < tiles_n; ++n0) {
+    FeedA_TilesM:
+        for (int m0 = 0; m0 < tiles_m; ++m0) {
+        FeedA_K:
+            for (int k = 0; k < size_k; ++k) {
+            FeedA_N:
+                for (int n1 = 0; n1 < kTileSizeN; ++n1) {
+                FeedA_M:
+                    for (int m1 = 0; m1 < kTileSizeM; ++m1) {
+#pragma HLS PIPELINE II = 1
+#pragma HLS LOOP_FLATTEN
+                        if (m1 == 0) {
+                            a = a_to_feeder.Pop();
+                        }
+                        a_to_kernel.Push(a);
+                    }
+                }
             }
         }
     }
@@ -53,42 +89,76 @@ void ReadA(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, co
 ////////////////////////////////////////////////////////////////////////////////
 
 template <int lines_per_number>
-void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, const int size_m, const int m0,
+void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_m, const int m0,
                 const int k) {
 #pragma HLS INLINE
+    DramLine num[kLinesPerNumber];
+ReadB_M:
     for (int m1 = 0; m1 < kTileSizeM; ++m1) {
-        DramLine num[kLinesPerNumber];
+    ReadB_Flits:
         for (int i = 0; i < kLinesPerNumber; ++i) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
             num[i] = mem[(k * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i];
             if (i == kLinesPerNumber - 1) {
-                to_kernel.Push(*reinterpret_cast<PackedFloat const *>(num));
+                b_to_feeder.Push(*reinterpret_cast<PackedFloat const *>(num));
             }
         }
     }
 }
 
 template <>
-void ReadBInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, const int size_m, const int m0,
+void ReadBInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_m, const int m0,
                    const int k) {
 #pragma HLS INLINE
+ReadB_M:
     for (int m1 = 0; m1 < kTileSizeM; ++m1) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
         const auto num = mem[(k * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber];
-        to_kernel.Push(*reinterpret_cast<PackedFloat const *>(&num));
+        b_to_feeder.Push(*reinterpret_cast<PackedFloat const *>(&num));
     }
 }
 
-void ReadB(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, const int size_n, const int size_k,
+void ReadB(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_n, const int size_k,
            const int size_m) {
     const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
     const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+ReadB_TilesN:
     for (int n0 = 0; n0 < tiles_n; ++n0) {
+    ReadB_TilesM:
         for (int m0 = 0; m0 < tiles_m; ++m0) {
+        ReadB_K:
             for (int k = 0; k < size_k; ++k) {
-                ReadBInner<kLinesPerNumber>(mem, to_kernel, size_m, m0, k);
+                ReadBInner<kLinesPerNumber>(mem, b_to_feeder, size_m, m0, k);
+            }
+        }
+    }
+}
+
+void FeedB(hlslib::Stream<PackedFloat> &b_to_feeder, hlslib::Stream<PackedFloat> &b_to_kernel, const int size_n,
+           const int size_k, const int size_m) {
+    const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
+    const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+    PackedFloat b;
+FeedB_TilesN:
+    for (int n0 = 0; n0 < tiles_n; ++n0) {
+    FeedB_TilesM:
+        for (int m0 = 0; m0 < tiles_m; ++m0) {
+        FeedB_K:
+            for (int k = 0; k < size_k; ++k) {
+            FeedB_N:
+                for (int n1 = 0; n1 < kTileSizeN; ++n1) {
+                FeedB_M:
+                    for (int m1 = 0; m1 < kTileSizeM; ++m1) {
+#pragma HLS PIPELINE II = 1
+#pragma HLS LOOP_FLATTEN
+                        if (n1 == 0) {
+                            b = b_to_feeder.Pop();
+                        }
+                        b_to_kernel.Push(b);
+                    }
+                }
             }
         }
     }
@@ -97,41 +167,75 @@ void ReadB(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, co
 ////////////////////////////////////////////////////////////////////////////////
 
 template <int lines_per_number>
-void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, const int size_m, const int n0,
+void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_m, const int n0,
                 const int m0, const int n1) {
 #pragma HLS INLINE
+ReadC_M:
     for (int m1 = 0; m1 < kTileSizeM; ++m1) {
         DramLine num[kLinesPerNumber];
+    ReadC_Flits:
         for (int i = 0; i < kLinesPerNumber; ++i) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
             num[i] = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i];
             if (i == kLinesPerNumber - 1) {
-                to_kernel.Push(*reinterpret_cast<PackedFloat const *>(num));
+                c_to_feeder.Push(*reinterpret_cast<PackedFloat const *>(num));
             }
         }
     }
 }
 
 template <>
-void ReadCInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, const int size_m, const int n0,
+void ReadCInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_m, const int n0,
                    const int m0, const int n1) {
 #pragma HLS INLINE
+ReadC_M:
     for (int m1 = 0; m1 < kTileSizeM; ++m1) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
         const auto num = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber];
-        to_kernel.Push(*reinterpret_cast<PackedFloat const *>(&num));
+        c_to_feeder.Push(*reinterpret_cast<PackedFloat const *>(&num));
     }
 }
 
-void ReadC(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, const int size_n, const int size_m) {
+void ReadC(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_n, const int size_m) {
     const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
     const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+ReadC_TilesN:
     for (int n0 = 0; n0 < tiles_n; ++n0) {
+    ReadC_TilesM:
         for (int m0 = 0; m0 < tiles_m; ++m0) {
+        ReadC_N:
             for (int n1 = 0; n1 < kTileSizeN; ++n1) {
-                ReadCInner<kLinesPerNumber>(mem, to_kernel, size_m, n0, m0, n1);
+                ReadCInner<kLinesPerNumber>(mem, c_to_feeder, size_m, n0, m0, n1);
+            }
+        }
+    }
+}
+
+void FeedC(hlslib::Stream<PackedFloat> &c_to_feeder, hlslib::Stream<PackedFloat> &c_to_kernel, const int size_n,
+           const int size_k, const int size_m) {
+    const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
+    const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+    PackedFloat c;
+FeedC_TilesN:
+    for (int n0 = 0; n0 < tiles_n; ++n0) {
+    FeedC_TilesM:
+        for (int m0 = 0; m0 < tiles_m; ++m0) {
+        FeedC_K:
+            for (int k = 0; k < size_k; ++k) {
+            FeedC_N:
+                for (int n1 = 0; n1 < kTileSizeN; ++n1) {
+                FeedC_M:
+                    for (int m1 = 0; m1 < kTileSizeM; ++m1) {
+#pragma HLS PIPELINE II = 1
+#pragma HLS LOOP_FLATTEN
+                        if (k == 0) {
+                            c = c_to_feeder.Pop();
+                        }
+                        c_to_kernel.Push(c);
+                    }
+                }
             }
         }
     }
@@ -139,13 +243,42 @@ void ReadC(DramLine const *const mem, hlslib::Stream<PackedFloat> &to_kernel, co
 
 ////////////////////////////////////////////////////////////////////////////////
 
+void DrainC(hlslib::Stream<PackedFloat> &c_to_drainer, hlslib::Stream<PackedFloat> &drainer_to_c, const int size_n,
+            const int size_k, const int size_m) {  // Pops every iteration, forwards only the last k-slice
+    const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
+    const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+DrainC_TilesN:
+    for (int n0 = 0; n0 < tiles_n; ++n0) {
+    DrainC_TilesM:
+        for (int m0 = 0; m0 < tiles_m; ++m0) {
+        DrainC_K:
+            for (int k = 0; k < size_k; ++k) {
+            DrainC_N:
+                for (int n1 = 0; n1 < kTileSizeN; ++n1) {
+                DrainC_M:
+                    for (int m1 = 0; m1 < kTileSizeM; ++m1) {
+#pragma HLS PIPELINE II = 1
+#pragma HLS LOOP_FLATTEN
+                        const auto c = c_to_drainer.Pop();  // Consumed unconditionally, once per innermost iteration
+                        if (k == size_k - 1) {  // Only values read during the last k iteration are passed on
+                            drainer_to_c.Push(c);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 template <int lines_per_number>
 void WriteCInner(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const mem, const int size_m, const int n0,
                  const int m0, const int n1) {
 #pragma HLS INLINE
+WriteC_M:
     for (int m1 = 0; m1 < kTileSizeM; ++m1) {
         DramLine num[kLinesPerNumber];
 #pragma HLS ARRAY_PARTITION variable = num complete
+    WriteC_Flits:
         for (int i = 0; i < kLinesPerNumber; ++i) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
@@ -161,6 +294,7 @@ template <>
 void WriteCInner<1>(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const mem, const int size_m, const int n0,
                     const int m0, const int n1) {
 #pragma HLS INLINE
+WriteC_M:
     for (int m1 = 0; m1 < kTileSizeM; ++m1) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
@@ -171,8 +305,13 @@ void WriteCInner<1>(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const me
 }
 
 void WriteC(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const mem, const int size_n, int const size_m) {
-    for (int n0 = 0; n0 < hlslib::CeilDivide(size_n, kTileSizeN); ++n0) {
-        for (int m0 = 0; m0 < hlslib::CeilDivide(size_m, kTileSizeM); ++m0) {
+    const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
+    const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+WriteC_TilesN:
+    for (int n0 = 0; n0 < tiles_n; ++n0) {
+    WriteC_TilesM:
+        for (int m0 = 0; m0 < tiles_m; ++m0) {
+        WriteC_N:
             for (int n1 = 0; n1 < kTileSizeN; ++n1) {
                 WriteCInner<kLinesPerNumber>(from_kernel, mem, size_m, n0, m0, n1);
             }
@@ -187,41 +326,35 @@ void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_i
     PackedFloat a_buffer;  // Just to make A symmetric to B and C
     PackedFloat b_buffer[kTileSizeM];
     PackedFloat c_buffer[kTileSizeN * kTileSizeM];
-    for (int n0 = 0; n0 < hlslib::CeilDivide(size_n, kTileSizeN); ++n0) {
-        for (int m0 = 0; m0 < hlslib::CeilDivide(size_m, kTileSizeM); ++m0) {
+    const int tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
+    const int tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+Compute_TilesN:
+    for (int n0 = 0; n0 < tiles_n; ++n0) {
+    Compute_TilesM:
+        for (int m0 = 0; m0 < tiles_m; ++m0) {
+        Compute_K:
             for (int k = 0; k < size_k; ++k) {
+            Compute_N:
                 for (int n1 = 0; n1 < kTileSizeN; ++n1) {
+                Compute_M:
                     for (int m1 = 0; m1 < kTileSizeM; ++m1) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
-                        PackedFloat a, b, c;
-                        if (m1 == 0) {
-                            a = a_in.Pop();
-                            a_buffer = a;
-                        } else {
-                            a = a_buffer;
-                        }
-                        if (n1 == 0) {
-                            b = b_in.Pop();
-                            b_buffer[m1] = b;
-                        } else {
-                            b = b_buffer[m1];
-                        }
-                        if (k == 0) {
-                            c = c_in.Pop();
-                        } else {
-                            c = c_buffer[n1 * kTileSizeM + m1];
-                        }
+                        const PackedFloat a_read = a_in.Pop();
+                        const PackedFloat b_read = b_in.Pop();
+                        const PackedFloat c_read = c_in.Pop();
+                        const PackedFloat a = (m1 == 0) ? a_read : a_buffer;
+                        const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1];
+                        const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1];
+                        a_buffer = a;
+                        b_buffer[m1] = b;
                         // Ignore contributions from out-of-bound indices
                         const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m);
                         // Meat of the computation
                         const auto res = in_bounds ? MultiplyAccumulate(a, b, c) : c;
                         // Write back to buffer
                         c_buffer[n1 * kTileSizeM + m1] = res;
-                        // Write out on last slice
-                        if (k == size_k - 1) {
-                            c_out.Push(res);
-                        }
+                        c_out.Push(res);
                     }
                 }
             }
@@ -229,6 +362,8 @@ void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_i
     }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+
 void MatrixMultiplication(DramLine const *const a, DramLine const *const b, DramLine const *const c_read,
                           DramLine *const c_write, const int size_n, const int size_k, int const size_m) {
 #pragma HLS INTERFACE m_axi offset = slave port = a bundle = a
@@ -237,16 +372,38 @@ void MatrixMultiplication(DramLine const *const a, DramLine const *const b, Dram
 // C, to make sure that the compiler doesn't try to look for dependencies/conflicts
 #pragma HLS INTERFACE m_axi offset = slave port = c_read bundle = c_read
 #pragma HLS INTERFACE m_axi offset = slave port = c_write bundle = c_write
+#pragma HLS INTERFACE s_axilite port = a
+#pragma HLS INTERFACE s_axilite port = b
+#pragma HLS INTERFACE s_axilite port = c_read
+#pragma HLS INTERFACE s_axilite port = c_write
+#pragma HLS INTERFACE s_axilite port = size_n
+#pragma HLS INTERFACE s_axilite port = size_k
+#pragma HLS INTERFACE s_axilite port = size_m
+#pragma HLS STABLE variable = a
+#pragma HLS STABLE variable = b
+#pragma HLS STABLE variable = c_read
+#pragma HLS STABLE variable = c_write
+#pragma HLS STABLE variable = size_n
+#pragma HLS STABLE variable = size_k
+#pragma HLS STABLE variable = size_m
 #pragma HLS DATAFLOW
-    hlslib::Stream<PackedFloat> a_to_kernel("a_to_kernel");
-    hlslib::Stream<PackedFloat> b_to_kernel("b_to_kernel");
-    hlslib::Stream<PackedFloat> c_to_kernel("c_to_kernel");
-    hlslib::Stream<PackedFloat> kernel_to_c("kernel_to_c");
+    hlslib::Stream<PackedFloat, 16> a_to_feeder("a_to_feeder");
+    hlslib::Stream<PackedFloat, 16> a_to_kernel("a_to_kernel");
+    hlslib::Stream<PackedFloat, 16> b_to_feeder("b_to_feeder");
+    hlslib::Stream<PackedFloat, 16> b_to_kernel("b_to_kernel");
+    hlslib::Stream<PackedFloat, 16> c_to_feeder("c_to_feeder");
+    hlslib::Stream<PackedFloat, 16> c_to_kernel("c_to_kernel");
+    hlslib::Stream<PackedFloat, 16> c_from_kernel("c_from_kernel");
+    hlslib::Stream<PackedFloat, 16> c_from_drainer("c_from_drainer");
     HLSLIB_DATAFLOW_INIT();
-    HLSLIB_DATAFLOW_FUNCTION(ReadA, a, a_to_kernel, size_n, size_k, size_m);
-    HLSLIB_DATAFLOW_FUNCTION(ReadB, b, b_to_kernel, size_n, size_k, size_m);
-    HLSLIB_DATAFLOW_FUNCTION(ReadC, c_read, c_to_kernel, size_n, size_m);
-    HLSLIB_DATAFLOW_FUNCTION(Compute, a_to_kernel, b_to_kernel, c_to_kernel, kernel_to_c, size_n, size_k, size_m);
-    HLSLIB_DATAFLOW_FUNCTION(WriteC, kernel_to_c, c_write, size_n, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(ReadA, a, a_to_feeder, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(FeedA, a_to_feeder, a_to_kernel, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(ReadB, b, b_to_feeder, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(FeedB, b_to_feeder, b_to_kernel, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(ReadC, c_read, c_to_feeder, size_n, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(FeedC, c_to_feeder, c_to_kernel, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(Compute, a_to_kernel, b_to_kernel, c_to_kernel, c_from_kernel, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(DrainC, c_from_kernel, c_from_drainer, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(WriteC, c_from_drainer, c_write, size_n, size_m);
     HLSLIB_DATAFLOW_FINALIZE();
 }
diff --git a/hlslib b/hlslib
index bdc88d0..7e2aacd 160000
--- a/hlslib
+++ b/hlslib
@@ -1 +1 @@
-Subproject commit bdc88d044c62a722d4e1a21b439b1bb002864368
+Subproject commit 7e2aacd0d6352c963931efea46f140ef499e4b9b
diff --git a/host/Random.cpp b/host/Random.cpp
index c5f58c4..a534b59 100644
--- a/host/Random.cpp
+++ b/host/Random.cpp
@@ -24,7 +24,7 @@ __mpf_struct RandomNumberGenerator::GenerateGmp() {
 
 __mpfr_struct RandomNumberGenerator::GenerateMpfr() {
     mpfr_t num;
-    mpfr_init2(num, kBits);
+    mpfr_init2(num, kMantissaBits);
     Generate(num);
     return num[0];
 }
diff --git a/host/TestProgram.cpp b/host/TestProgram.cpp
index 3b44916..efc0967 100644
--- a/host/TestProgram.cpp
+++ b/host/TestProgram.cpp
@@ -17,8 +17,11 @@ bool RunTestSimulation(int size_n, int size_k, int size_m) {
 bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m) {
 #endif
     hlslib::ocl::Context context;
+    std::cout << "Configuring the device..." << std::flush;
     auto program = context.MakeProgram(kernel_path);
+    std::cout << " Done.\n";
     // Initialize some random data
+    std::cout << "Initializing input data..." << std::flush;
     std::vector<__mpfr_struct> a_mpfr, b_mpfr, c_mpfr;
     RandomNumberGenerator rng;
     for (int n = 0; n < size_n; ++n) {
@@ -47,7 +50,9 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m)
     for (auto &x : c_mpfr) {
         c_host.emplace_back(&x);
     }
+    std::cout << " Done.\n";
     // Allocate device memory, padding each buffer to the tile size
+    std::cout << "Copying data to the device..." << std::flush;
     auto a_device = context.MakeBuffer<DramLine, hlslib::ocl::Access::read>(
         hlslib::ocl::StorageType::DDR, 1,
         kLinesPerNumber * (hlslib::CeilDivide(size_n, kTileSizeN) * kTileSizeN) * size_k);
@@ -62,16 +67,23 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m)
     a_device.CopyFromHost(0, kLinesPerNumber * size_n * size_k, reinterpret_cast<DramLine const *>(&a_host[0]));
     b_device.CopyFromHost(0, kLinesPerNumber * size_k * size_m, reinterpret_cast<DramLine const *>(&b_host[0]));
     c_device.CopyFromHost(0, kLinesPerNumber * size_n * size_m, reinterpret_cast<DramLine const *>(&c_host[0]));
+    std::cout << " Done.\n";
     // In simulation mode, this will call the function "MatrixMultiplication" and run it in software.
     // Otherwise, the provided path to a kernel binary will be loaded and executed.
     auto kernel = program.MakeKernel(MatrixMultiplication, "MatrixMultiplication", a_device, b_device, c_device,
                                      c_device, size_n, size_k, size_m);
+    const unsigned long expected_cycles = hlslib::CeilDivide(size_n, kTileSizeN) *
+                                          hlslib::CeilDivide(size_m, kTileSizeM) * kTileSizeN * kTileSizeM * size_k;
+    const float expected_runtime = expected_cycles / 0.3e9;
+    std::cout << "The expected number of cycles to completion is " << expected_cycles << ", which is "
+              << expected_runtime << " seconds at 300 MHz.\n";
+    const auto communication_volume = hlslib::CeilDivide(size_n, kTileSizeN) * hlslib::CeilDivide(size_m, kTileSizeM) *
+                                      ((kTileSizeN + kTileSizeM) * size_k + 2 * kTileSizeN * kTileSizeM);
+    std::cout << "This communicates " << 1e-6 * kBytes * communication_volume << " MB, requiring a bandwidth of "
+              << 1e-9 * kBytes * communication_volume / expected_runtime << " GB/s.\n";
     std::cout << "Executing kernel...\n";
     const auto elapsed = kernel.ExecuteTask();
     std::cout << "Ran in " << elapsed.first << " seconds.\n";
-    const unsigned long ideal_cycles =
-        hlslib::CeilDivide(size_n, kTileSizeN) * hlslib::CeilDivide(size_m, kTileSizeM) * kTileSizeN * kTileSizeM;
-    std::cout << "The ideal number of cycles to completion is " << ideal_cycles << ".\n";
     // Copy back result
     c_device.CopyToHost(0, kLinesPerNumber * size_n * size_m, reinterpret_cast<DramLine *>(&c_host[0]));
     // Run reference implementation. Because of GMP's "clever" way of wrapping their struct in an array of size 1,
@@ -95,6 +107,7 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m)
             }
         }
     }
+    std::cout << "Results successfully verified against MPFR.\n";
 
     // Clean up
     for (int n = 0; n < size_n; ++n) {