diff --git a/c10/util/llvmMathExtras.h b/c10/util/llvmMathExtras.h new file mode 100644 index 0000000000000..76ae3b26a29ba --- /dev/null +++ b/c10/util/llvmMathExtras.h @@ -0,0 +1,854 @@ +//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// + // + // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + // See https://llvm.org/LICENSE.txt for license information. + // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // + //===----------------------------------------------------------------------===// + // + // This file contains some functions that are useful for math stuff. + // + //===----------------------------------------------------------------------===// + + #ifndef LLVM_SUPPORT_MATHEXTRAS_H + #define LLVM_SUPPORT_MATHEXTRAS_H + + #include + #include + #include + #include + #include + #include + #include + + #ifdef __ANDROID_NDK__ + #include + #endif + + #ifndef __has_builtin + # define __has_builtin(x) 0 + #endif + + #ifndef LLVM_GNUC_PREREQ + # if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) + # define LLVM_GNUC_PREREQ(maj, min, patch) \ + ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) + __GNUC_PATCHLEVEL__ >= \ + ((maj) << 20) + ((min) << 10) + (patch)) + # elif defined(__GNUC__) && defined(__GNUC_MINOR__) + # define LLVM_GNUC_PREREQ(maj, min, patch) \ + ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) >= ((maj) << 20) + ((min) << 10)) + # else + # define LLVM_GNUC_PREREQ(maj, min, patch) 0 + # endif + #endif + + #ifdef _MSC_VER + // Declare these intrinsics manually rather including intrin.h. It's very + // expensive, and MathExtras.h is popular. + // #include + extern "C" { + unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); + unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); + unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); + unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); + } + #endif + + namespace llvm { + /// The behavior an operation has on an input of 0. + enum ZeroBehavior { + /// The returned value is undefined. + ZB_Undefined, + /// The returned value is numeric_limits::max() + ZB_Max, + /// The returned value is numeric_limits::digits + ZB_Width + }; + + namespace detail { + template struct TrailingZerosCounter { + static std::size_t count(T Val, ZeroBehavior) { + if (!Val) + return std::numeric_limits::digits; + if (Val & 0x1) + return 0; + + // Bisection method. 
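+ // Each pass tests whether the low `Shift` bits of Val are all zero; if so,
+ // they are trailing zeros: shift them out, record them in ZeroBits, then
+ // halve the window (Shift) and narrow the mask.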
+ std::size_t ZeroBits = 0; + T Shift = std::numeric_limits::digits >> 1; + T Mask = std::numeric_limits::max() >> Shift; + while (Shift) { + if ((Val & Mask) == 0) { + Val >>= Shift; + ZeroBits |= Shift; + } + Shift >>= 1; + Mask >>= Shift; + } + return ZeroBits; + } + }; + + #if __GNUC__ >= 4 || defined(_MSC_VER) + template struct TrailingZerosCounter { + static std::size_t count(T Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 32; + + #if __has_builtin(__builtin_ctz) || LLVM_GNUC_PREREQ(4, 0, 0) + return __builtin_ctz(Val); + #elif defined(_MSC_VER) + unsigned long Index; + _BitScanForward(&Index, Val); + return Index; + #endif + } + }; + + #if !defined(_MSC_VER) || defined(_M_X64) + template struct TrailingZerosCounter { + static std::size_t count(T Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 64; + + #if __has_builtin(__builtin_ctzll) || LLVM_GNUC_PREREQ(4, 0, 0) + return __builtin_ctzll(Val); + #elif defined(_MSC_VER) + unsigned long Index; + _BitScanForward64(&Index, Val); + return Index; + #endif + } + }; + #endif + #endif + } // namespace detail + + /// Count number of 0's from the least significant bit to the most + /// stopping at the first 1. + /// + /// Only unsigned integral types are allowed. + /// + /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are + /// valid arguments. + template + std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { + static_assert(std::numeric_limits::is_integer && + !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return llvm::detail::TrailingZerosCounter::count(Val, ZB); + } + + namespace detail { + template struct LeadingZerosCounter { + static std::size_t count(T Val, ZeroBehavior) { + if (!Val) + return std::numeric_limits::digits; + + // Bisection method. + std::size_t ZeroBits = 0; + for (T Shift = std::numeric_limits::digits >> 1; Shift; Shift >>= 1) { + T Tmp = Val >> Shift; + if (Tmp) + Val = Tmp; + else + ZeroBits |= Shift; + } + return ZeroBits; + } + }; + + #if __GNUC__ >= 4 || defined(_MSC_VER) + template struct LeadingZerosCounter { + static std::size_t count(T Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 32; + + #if __has_builtin(__builtin_clz) || LLVM_GNUC_PREREQ(4, 0, 0) + return __builtin_clz(Val); + #elif defined(_MSC_VER) + unsigned long Index; + _BitScanReverse(&Index, Val); + return Index ^ 31; + #endif + } + }; + + #if !defined(_MSC_VER) || defined(_M_X64) + template struct LeadingZerosCounter { + static std::size_t count(T Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 64; + + #if __has_builtin(__builtin_clzll) || LLVM_GNUC_PREREQ(4, 0, 0) + return __builtin_clzll(Val); + #elif defined(_MSC_VER) + unsigned long Index; + _BitScanReverse64(&Index, Val); + return Index ^ 63; + #endif + } + }; + #endif + #endif + } // namespace detail + + /// Count number of 0's from the most significant bit to the least + /// stopping at the first 1. + /// + /// Only unsigned integral types are allowed. + /// + /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are + /// valid arguments. + template + std::size_t countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { + static_assert(std::numeric_limits::is_integer && + !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return llvm::detail::LeadingZerosCounter::count(Val, ZB); + } + + /// Get the index of the first set bit starting from the least + /// significant bit. 
+ /// + /// Only unsigned integral types are allowed. + /// + /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are + /// valid arguments. + template T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { + if (ZB == ZB_Max && Val == 0) + return std::numeric_limits::max(); + + return countTrailingZeros(Val, ZB_Undefined); + } + + /// Create a bitmask with the N right-most bits set to 1, and all other + /// bits set to 0. Only unsigned types are allowed. + template T maskTrailingOnes(unsigned N) { + static_assert(std::is_unsigned::value, "Invalid type!"); + const unsigned Bits = CHAR_BIT * sizeof(T); + assert(N <= Bits && "Invalid bit index"); + return N == 0 ? 0 : (T(-1) >> (Bits - N)); + } + + /// Create a bitmask with the N left-most bits set to 1, and all other + /// bits set to 0. Only unsigned types are allowed. + template T maskLeadingOnes(unsigned N) { + return ~maskTrailingOnes(CHAR_BIT * sizeof(T) - N); + } + + /// Create a bitmask with the N right-most bits set to 0, and all other + /// bits set to 1. Only unsigned types are allowed. + template T maskTrailingZeros(unsigned N) { + return maskLeadingOnes(CHAR_BIT * sizeof(T) - N); + } + + /// Create a bitmask with the N left-most bits set to 0, and all other + /// bits set to 1. Only unsigned types are allowed. + template T maskLeadingZeros(unsigned N) { + return maskTrailingOnes(CHAR_BIT * sizeof(T) - N); + } + + /// Get the index of the last set bit starting from the least + /// significant bit. + /// + /// Only unsigned integral types are allowed. + /// + /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are + /// valid arguments. + template T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { + if (ZB == ZB_Max && Val == 0) + return std::numeric_limits::max(); + + // Use ^ instead of - because both gcc and llvm can remove the associated ^ + // in the __builtin_clz intrinsic on x86. + return countLeadingZeros(Val, ZB_Undefined) ^ + (std::numeric_limits::digits - 1); + } + + /// Macro compressed bit reversal table for 256 bits. + /// + /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable + static const unsigned char BitReverseTable256[256] = { + #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 + #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) + #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) + R6(0), R6(2), R6(1), R6(3) + #undef R2 + #undef R4 + #undef R6 + }; + + /// Reverse the bits in \p Val. + template + T reverseBits(T Val) { + unsigned char in[sizeof(Val)]; + unsigned char out[sizeof(Val)]; + std::memcpy(in, &Val, sizeof(Val)); + for (unsigned i = 0; i < sizeof(Val); ++i) + out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; + std::memcpy(&Val, out, sizeof(Val)); + return Val; + } + + // NOTE: The following support functions use the _32/_64 extensions instead of + // type overloading so that signed and unsigned integers can be used without + // ambiguity. + + /// Return the high 32 bits of a 64 bit value. + constexpr inline uint32_t Hi_32(uint64_t Value) { + return static_cast(Value >> 32); + } + + /// Return the low 32 bits of a 64 bit value. + constexpr inline uint32_t Lo_32(uint64_t Value) { + return static_cast(Value); + } + + /// Make a 64-bit integer from a high / low pair of 32-bit integers. + constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { + return ((uint64_t)High << 32) | (uint64_t)Low; + } + + /// Checks if an integer fits into the given bit width. 
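+ /// Ex. isInt<8>(127) == true, isInt<8>(128) == false.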
+ template constexpr inline bool isInt(int64_t x) { + return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1))); + } + // Template specializations to get better code for common cases. + template <> constexpr inline bool isInt<8>(int64_t x) { + return static_cast(x) == x; + } + template <> constexpr inline bool isInt<16>(int64_t x) { + return static_cast(x) == x; + } + template <> constexpr inline bool isInt<32>(int64_t x) { + return static_cast(x) == x; + } + + /// Checks if a signed integer is an N bit number shifted left by S. + template + constexpr inline bool isShiftedInt(int64_t x) { + static_assert( + N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); + static_assert(N + S <= 64, "isShiftedInt with N + S > 64 is too wide."); + return isInt(x) && (x % (UINT64_C(1) << S) == 0); + } + + /// Checks if an unsigned integer fits into the given bit width. + /// + /// This is written as two functions rather than as simply + /// + /// return N >= 64 || X < (UINT64_C(1) << N); + /// + /// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting + /// left too many places. + template + constexpr inline typename std::enable_if<(N < 64), bool>::type + isUInt(uint64_t X) { + static_assert(N > 0, "isUInt<0> doesn't make sense"); + return X < (UINT64_C(1) << (N)); + } + template + constexpr inline typename std::enable_if= 64, bool>::type + isUInt(uint64_t X) { + return true; + } + + // Template specializations to get better code for common cases. + template <> constexpr inline bool isUInt<8>(uint64_t x) { + return static_cast(x) == x; + } + template <> constexpr inline bool isUInt<16>(uint64_t x) { + return static_cast(x) == x; + } + template <> constexpr inline bool isUInt<32>(uint64_t x) { + return static_cast(x) == x; + } + + /// Checks if a unsigned integer is an N bit number shifted left by S. + template + constexpr inline bool isShiftedUInt(uint64_t x) { + static_assert( + N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); + static_assert(N + S <= 64, + "isShiftedUInt with N + S > 64 is too wide."); + // Per the two static_asserts above, S must be strictly less than 64. So + // 1 << S is not undefined behavior. + return isUInt(x) && (x % (UINT64_C(1) << S) == 0); + } + + /// Gets the maximum value for a N-bit unsigned integer. + inline uint64_t maxUIntN(uint64_t N) { + assert(N > 0 && N <= 64 && "integer width out of range"); + + // uint64_t(1) << 64 is undefined behavior, so we can't do + // (uint64_t(1) << N) - 1 + // without checking first that N != 64. But this works and doesn't have a + // branch. + return UINT64_MAX >> (64 - N); + } + + /// Gets the minimum value for a N-bit signed integer. + inline int64_t minIntN(int64_t N) { + assert(N > 0 && N <= 64 && "integer width out of range"); + + return -(UINT64_C(1)<<(N-1)); + } + + /// Gets the maximum value for a N-bit signed integer. + inline int64_t maxIntN(int64_t N) { + assert(N > 0 && N <= 64 && "integer width out of range"); + + // This relies on two's complement wraparound when N == 64, so we convert to + // int64_t only at the very end to avoid UB. + return (UINT64_C(1) << (N - 1)) - 1; + } + + /// Checks if an unsigned integer fits into the given (dynamic) bit width. + inline bool isUIntN(unsigned N, uint64_t x) { + return N >= 64 || x <= maxUIntN(N); + } + + /// Checks if an signed integer fits into the given (dynamic) bit width. 
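+ /// Ex. isIntN(8, 127) == true, isIntN(8, 128) == false.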
+ inline bool isIntN(unsigned N, int64_t x) { + return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); + } + + /// Return true if the argument is a non-empty sequence of ones starting at the + /// least significant bit with the remainder zero (32 bit version). + /// Ex. isMask_32(0x0000FFFFU) == true. + constexpr inline bool isMask_32(uint32_t Value) { + return Value && ((Value + 1) & Value) == 0; + } + + /// Return true if the argument is a non-empty sequence of ones starting at the + /// least significant bit with the remainder zero (64 bit version). + constexpr inline bool isMask_64(uint64_t Value) { + return Value && ((Value + 1) & Value) == 0; + } + + /// Return true if the argument contains a non-empty sequence of ones with the + /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. + constexpr inline bool isShiftedMask_32(uint32_t Value) { + return Value && isMask_32((Value - 1) | Value); + } + + /// Return true if the argument contains a non-empty sequence of ones with the + /// remainder zero (64 bit version.) + constexpr inline bool isShiftedMask_64(uint64_t Value) { + return Value && isMask_64((Value - 1) | Value); + } + + /// Return true if the argument is a power of two > 0. + /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) + constexpr inline bool isPowerOf2_32(uint32_t Value) { + return Value && !(Value & (Value - 1)); + } + + /// Return true if the argument is a power of two > 0 (64 bit edition.) + constexpr inline bool isPowerOf2_64(uint64_t Value) { + return Value && !(Value & (Value - 1)); + } + + /// Count the number of ones from the most significant bit to the first + /// zero bit. + /// + /// Ex. countLeadingOnes(0xFF0FFF00) == 8. + /// Only unsigned integral types are allowed. + /// + /// \param ZB the behavior on an input of all ones. Only ZB_Width and + /// ZB_Undefined are valid arguments. + template + std::size_t countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { + static_assert(std::numeric_limits::is_integer && + !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return countLeadingZeros(~Value, ZB); + } + + /// Count the number of ones from the least significant bit to the first + /// zero bit. + /// + /// Ex. countTrailingOnes(0x00FF00FF) == 8. + /// Only unsigned integral types are allowed. + /// + /// \param ZB the behavior on an input of all ones. Only ZB_Width and + /// ZB_Undefined are valid arguments. + template + std::size_t countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { + static_assert(std::numeric_limits::is_integer && + !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return countTrailingZeros(~Value, ZB); + } + + namespace detail { + template struct PopulationCounter { + static unsigned count(T Value) { + // Generic version, forward to 32 bits. 
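+ // The non-builtin branch below is the classic SWAR popcount: sum bits in
+ // pairs, then in nibbles, then multiply by 0x01010101 so the per-byte
+ // counts accumulate into the top byte.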
+ static_assert(SizeOfT <= 4, "Not implemented!"); + #if __GNUC__ >= 4 + return __builtin_popcount(Value); + #else + uint32_t v = Value; + v = v - ((v >> 1) & 0x55555555); + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; + #endif + } + }; + + template struct PopulationCounter { + static unsigned count(T Value) { + #if __GNUC__ >= 4 + return __builtin_popcountll(Value); + #else + uint64_t v = Value; + v = v - ((v >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); + v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); + #endif + } + }; + } // namespace detail + + /// Count the number of set bits in a value. + /// Ex. countPopulation(0xF000F000) = 8 + /// Returns 0 if the word is zero. + template + inline unsigned countPopulation(T Value) { + static_assert(std::numeric_limits::is_integer && + !std::numeric_limits::is_signed, + "Only unsigned integral types are allowed."); + return detail::PopulationCounter::count(Value); + } + + /// Return the log base 2 of the specified value. + inline double Log2(double Value) { + #if defined(__ANDROID_API__) && __ANDROID_API__ < 18 + return __builtin_log(Value) / __builtin_log(2.0); + #else + return log2(Value); + #endif + } + + /// Return the floor log base 2 of the specified value, -1 if the value is zero. + /// (32 bit edition.) + /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 + inline unsigned Log2_32(uint32_t Value) { + return 31 - countLeadingZeros(Value); + } + + /// Return the floor log base 2 of the specified value, -1 if the value is zero. + /// (64 bit edition.) + inline unsigned Log2_64(uint64_t Value) { + return 63 - countLeadingZeros(Value); + } + + /// Return the ceil log base 2 of the specified value, 32 if the value is zero. + /// (32 bit edition). + /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 + inline unsigned Log2_32_Ceil(uint32_t Value) { + return 32 - countLeadingZeros(Value - 1); + } + + /// Return the ceil log base 2 of the specified value, 64 if the value is zero. + /// (64 bit edition.) + inline unsigned Log2_64_Ceil(uint64_t Value) { + return 64 - countLeadingZeros(Value - 1); + } + + /// Return the greatest common divisor of the values using Euclid's algorithm. + inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { + while (B) { + uint64_t T = B; + B = A % B; + A = T; + } + return A; + } + + /// This function takes a 64-bit integer and returns the bit equivalent double. + inline double BitsToDouble(uint64_t Bits) { + double D; + static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); + memcpy(&D, &Bits, sizeof(Bits)); + return D; + } + + /// This function takes a 32-bit integer and returns the bit equivalent float. + inline float BitsToFloat(uint32_t Bits) { + float F; + static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); + memcpy(&F, &Bits, sizeof(Bits)); + return F; + } + + /// This function takes a double and returns the bit equivalent 64-bit integer. + /// Note that copying doubles around changes the bits of NaNs on some hosts, + /// notably x86, so this routine cannot be used if these bits are needed. 
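+ /// Ex. DoubleToBits(1.0) == 0x3FF0000000000000ULL.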
+ inline uint64_t DoubleToBits(double Double) { + uint64_t Bits; + static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); + memcpy(&Bits, &Double, sizeof(Double)); + return Bits; + } + + /// This function takes a float and returns the bit equivalent 32-bit integer. + /// Note that copying floats around changes the bits of NaNs on some hosts, + /// notably x86, so this routine cannot be used if these bits are needed. + inline uint32_t FloatToBits(float Float) { + uint32_t Bits; + static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); + memcpy(&Bits, &Float, sizeof(Float)); + return Bits; + } + + /// A and B are either alignments or offsets. Return the minimum alignment that + /// may be assumed after adding the two together. + constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { + // The largest power of 2 that divides both A and B. + // + // Replace "-Value" by "1+~Value" in the following commented code to avoid + // MSVC warning C4146 + // return (A | B) & -(A | B); + return (A | B) & (1 + ~(A | B)); + } + + /// Aligns \c Addr to \c Alignment bytes, rounding up. + /// + /// Alignment should be a power of two. This method rounds up, so + /// alignAddr(7, 4) == 8 and alignAddr(8, 4) == 8. + inline uintptr_t alignAddr(const void *Addr, size_t Alignment) { + assert(Alignment && isPowerOf2_64((uint64_t)Alignment) && + "Alignment is not a power of two!"); + + assert((uintptr_t)Addr + Alignment - 1 >= (uintptr_t)Addr); + + return (((uintptr_t)Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1)); + } + + /// Returns the necessary adjustment for aligning \c Ptr to \c Alignment + /// bytes, rounding up. + inline size_t alignmentAdjustment(const void *Ptr, size_t Alignment) { + return alignAddr(Ptr, Alignment) - (uintptr_t)Ptr; + } + + /// Returns the next power of two (in 64-bits) that is strictly greater than A. + /// Returns zero on overflow. + inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; + } + + /// Returns the power of two which is less than or equal to the given value. + /// Essentially, it is a floor operation across the domain of powers of two. + inline uint64_t PowerOf2Floor(uint64_t A) { + if (!A) return 0; + return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); + } + + /// Returns the power of two which is greater than or equal to the given value. + /// Essentially, it is a ceil operation across the domain of powers of two. + inline uint64_t PowerOf2Ceil(uint64_t A) { + if (!A) + return 0; + return NextPowerOf2(A - 1); + } + + /// Returns the next integer (mod 2**64) that is greater than or equal to + /// \p Value and is a multiple of \p Align. \p Align must be non-zero. + /// + /// If non-zero \p Skew is specified, the return value will be a minimal + /// integer that is greater than or equal to \p Value and equal to + /// \p Align * N + \p Skew for some integer N. If \p Skew is larger than + /// \p Align, its value is adjusted to '\p Skew mod \p Align'. 
+ /// + /// Examples: + /// \code + /// alignTo(5, 8) = 8 + /// alignTo(17, 8) = 24 + /// alignTo(~0LL, 8) = 0 + /// alignTo(321, 255) = 510 + /// + /// alignTo(5, 8, 7) = 7 + /// alignTo(17, 8, 1) = 17 + /// alignTo(~0LL, 8, 3) = 3 + /// alignTo(321, 255, 42) = 552 + /// \endcode + inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { + assert(Align != 0u && "Align can't be 0."); + Skew %= Align; + return (Value + Align - 1 - Skew) / Align * Align + Skew; + } + + /// Returns the next integer (mod 2**64) that is greater than or equal to + /// \p Value and is a multiple of \c Align. \c Align must be non-zero. + template constexpr inline uint64_t alignTo(uint64_t Value) { + static_assert(Align != 0u, "Align must be non-zero"); + return (Value + Align - 1) / Align * Align; + } + + /// Returns the integer ceil(Numerator / Denominator). + inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { + return alignTo(Numerator, Denominator) / Denominator; + } + + /// \c alignTo for contexts where a constant expression is required. + /// \sa alignTo + /// + /// \todo FIXME: remove when \c constexpr becomes really \c constexpr + template + struct AlignTo { + static_assert(Align != 0u, "Align must be non-zero"); + template + struct from_value { + static const uint64_t value = (Value + Align - 1) / Align * Align; + }; + }; + + /// Returns the largest uint64_t less than or equal to \p Value and is + /// \p Skew mod \p Align. \p Align must be non-zero + inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { + assert(Align != 0u && "Align can't be 0."); + Skew %= Align; + return (Value - Skew) / Align * Align + Skew; + } + + /// Returns the offset to the next integer (mod 2**64) that is greater than + /// or equal to \p Value and is a multiple of \p Align. \p Align must be + /// non-zero. + inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) { + return alignTo(Value, Align) - Value; + } + + /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. + /// Requires 0 < B <= 32. + template constexpr inline int32_t SignExtend32(uint32_t X) { + static_assert(B > 0, "Bit width can't be 0."); + static_assert(B <= 32, "Bit width out of range."); + return int32_t(X << (32 - B)) >> (32 - B); + } + + /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. + /// Requires 0 < B < 32. + inline int32_t SignExtend32(uint32_t X, unsigned B) { + assert(B > 0 && "Bit width can't be 0."); + assert(B <= 32 && "Bit width out of range."); + return int32_t(X << (32 - B)) >> (32 - B); + } + + /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. + /// Requires 0 < B < 64. + template constexpr inline int64_t SignExtend64(uint64_t x) { + static_assert(B > 0, "Bit width can't be 0."); + static_assert(B <= 64, "Bit width out of range."); + return int64_t(x << (64 - B)) >> (64 - B); + } + + /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. + /// Requires 0 < B < 64. + inline int64_t SignExtend64(uint64_t X, unsigned B) { + assert(B > 0 && "Bit width can't be 0."); + assert(B <= 64 && "Bit width out of range."); + return int64_t(X << (64 - B)) >> (64 - B); + } + + /// Subtract two unsigned integers, X and Y, of type T and return the absolute + /// value of the result. + template + typename std::enable_if::value, T>::type + AbsoluteDifference(T X, T Y) { + return std::max(X, Y) - std::min(X, Y); + } + + /// Add two unsigned integers, X and Y, of type T. 
Clamp the result to the + /// maximum representable value of T on overflow. ResultOverflowed indicates if + /// the result is larger than the maximum representable value of type T. + template + typename std::enable_if::value, T>::type + SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { + bool Dummy; + bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; + // Hacker's Delight, p. 29 + T Z = X + Y; + Overflowed = (Z < X || Z < Y); + if (Overflowed) + return std::numeric_limits::max(); + else + return Z; + } + + /// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the + /// maximum representable value of T on overflow. ResultOverflowed indicates if + /// the result is larger than the maximum representable value of type T. + template + typename std::enable_if::value, T>::type + SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { + bool Dummy; + bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; + + // Hacker's Delight, p. 30 has a different algorithm, but we don't use that + // because it fails for uint16_t (where multiplication can have undefined + // behavior due to promotion to int), and requires a division in addition + // to the multiplication. + + Overflowed = false; + + // Log2(Z) would be either Log2Z or Log2Z + 1. + // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z + // will necessarily be less than Log2Max as desired. + int Log2Z = Log2_64(X) + Log2_64(Y); + const T Max = std::numeric_limits::max(); + int Log2Max = Log2_64(Max); + if (Log2Z < Log2Max) { + return X * Y; + } + if (Log2Z > Log2Max) { + Overflowed = true; + return Max; + } + + // We're going to use the top bit, and maybe overflow one + // bit past it. Multiply all but the bottom bit then add + // that on at the end. + T Z = (X >> 1) * Y; + if (Z & ~(Max >> 1)) { + Overflowed = true; + return Max; + } + Z <<= 1; + if (X & 1) + return SaturatingAdd(Z, Y, ResultOverflowed); + + return Z; + } + + /// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to + /// the product. Clamp the result to the maximum representable value of T on + /// overflow. ResultOverflowed indicates if the result is larger than the + /// maximum representable value of type T. + template + typename std::enable_if::value, T>::type + SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) { + bool Dummy; + bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; + + T Product = SaturatingMultiply(X, Y, &Overflowed); + if (Overflowed) + return Product; + + return SaturatingAdd(A, Product, &Overflowed); + } + + /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. + extern const float huge_valf; + } // End llvm namespace + + #endif diff --git a/c10/util/sparse_bitset.h b/c10/util/sparse_bitset.h new file mode 100644 index 0000000000000..150e0454ca4a2 --- /dev/null +++ b/c10/util/sparse_bitset.h @@ -0,0 +1,871 @@ +//===- llvm/ADT/SparseBitVector.h - Efficient Sparse BitVector --*- C++ -*-===// + // + // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + // See https://llvm.org/LICENSE.txt for license information. + // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // + //===----------------------------------------------------------------------===// + // + // This file defines the SparseBitVector class. See the doxygen comment for + // SparseBitVector for more details on the algorithm used. 
+ // + //===----------------------------------------------------------------------===// + +#pragma once + #include + #include + #include + #include + #include + #include "./llvmMathExtras.h" + + namespace c10 { + + /// SparseBitVector is an implementation of a bitvector that is sparse by only + /// storing the elements that have non-zero bits set. In order to make this + /// fast for the most common cases, SparseBitVector is implemented as a linked + /// list of SparseBitVectorElements. We maintain a pointer to the last + /// SparseBitVectorElement accessed (in the form of a list iterator), in order + /// to make multiple in-order test/set constant time after the first one is + /// executed. Note that using vectors to store SparseBitVectorElement's does + /// not work out very well because it causes insertion in the middle to take + /// enormous amounts of time with a large amount of bits. Other structures that + /// have better worst cases for insertion in the middle (various balanced trees, + /// etc) do not perform as well in practice as a linked list with this iterator + /// kept up to date. They are also significantly more memory intensive. + + template struct SparseBitVectorElement { + public: + using BitWord = unsigned long; + using size_type = unsigned; + enum { + BITWORD_SIZE = sizeof(BitWord) * CHAR_BIT, + BITWORDS_PER_ELEMENT = (ElementSize + BITWORD_SIZE - 1) / BITWORD_SIZE, + BITS_PER_ELEMENT = ElementSize + }; + + private: + // Index of Element in terms of where first bit starts. + unsigned ElementIndex; + BitWord Bits[BITWORDS_PER_ELEMENT]; + + SparseBitVectorElement() { + ElementIndex = ~0U; + memset(&Bits[0], 0, sizeof (BitWord) * BITWORDS_PER_ELEMENT); + } + + public: + explicit SparseBitVectorElement(unsigned Idx) { + ElementIndex = Idx; + memset(&Bits[0], 0, sizeof (BitWord) * BITWORDS_PER_ELEMENT); + } + + // Comparison. + bool operator==(const SparseBitVectorElement &RHS) const { + if (ElementIndex != RHS.ElementIndex) + return false; + for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) + if (Bits[i] != RHS.Bits[i]) + return false; + return true; + } + + bool operator!=(const SparseBitVectorElement &RHS) const { + return !(*this == RHS); + } + + // Return the bits that make up word Idx in our element. + BitWord word(unsigned Idx) const { + assert(Idx < BITWORDS_PER_ELEMENT); + return Bits[Idx]; + } + + unsigned index() const { + return ElementIndex; + } + + bool empty() const { + for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) + if (Bits[i]) + return false; + return true; + } + + void set(unsigned Idx) { + Bits[Idx / BITWORD_SIZE] |= 1L << (Idx % BITWORD_SIZE); + } + + bool test_and_set(unsigned Idx) { + bool old = test(Idx); + if (!old) { + set(Idx); + return true; + } + return false; + } + + void reset(unsigned Idx) { + Bits[Idx / BITWORD_SIZE] &= ~(1L << (Idx % BITWORD_SIZE)); + } + + bool test(unsigned Idx) const { + return Bits[Idx / BITWORD_SIZE] & (1L << (Idx % BITWORD_SIZE)); + } + + size_type count() const { + unsigned NumBits = 0; + for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) + NumBits += llvm::countPopulation(Bits[i]); + return NumBits; + } + + /// find_first - Returns the index of the first set bit. + int find_first() const { + for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) + if (Bits[i] != 0) + return i * BITWORD_SIZE + llvm::countTrailingZeros(Bits[i]); + throw std::runtime_error("Illegal empty element"); + } + + /// find_last - Returns the index of the last set bit. 
+ int find_last() const { + for (unsigned I = 0; I < BITWORDS_PER_ELEMENT; ++I) { + unsigned Idx = BITWORDS_PER_ELEMENT - I - 1; + if (Bits[Idx] != 0) + return Idx * BITWORD_SIZE + BITWORD_SIZE - + llvm::countLeadingZeros(Bits[Idx]); + } + throw std::runtime_error("Illegal empty element"); + } + + /// find_next - Returns the index of the next set bit starting from the + /// "Curr" bit. Returns -1 if the next set bit is not found. + int find_next(unsigned Curr) const { + if (Curr >= BITS_PER_ELEMENT) + return -1; + + unsigned WordPos = Curr / BITWORD_SIZE; + unsigned BitPos = Curr % BITWORD_SIZE; + BitWord Copy = Bits[WordPos]; + assert(WordPos <= BITWORDS_PER_ELEMENT + && "Word Position outside of element"); + + // Mask off previous bits. + Copy &= ~0UL << BitPos; + + if (Copy != 0) + return WordPos * BITWORD_SIZE + llvm::countTrailingZeros(Copy); + + // Check subsequent words. + for (unsigned i = WordPos+1; i < BITWORDS_PER_ELEMENT; ++i) + if (Bits[i] != 0) + return i * BITWORD_SIZE + llvm::countTrailingZeros(Bits[i]); + return -1; + } + + // Union this element with RHS and return true if this one changed. + bool unionWith(const SparseBitVectorElement &RHS) { + bool changed = false; + for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) { + BitWord old = changed ? 0 : Bits[i]; + + Bits[i] |= RHS.Bits[i]; + if (!changed && old != Bits[i]) + changed = true; + } + return changed; + } + + // Return true if we have any bits in common with RHS + bool intersects(const SparseBitVectorElement &RHS) const { + for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) { + if (RHS.Bits[i] & Bits[i]) + return true; + } + return false; + } + + // Intersect this Element with RHS and return true if this one changed. + // BecameZero is set to true if this element became all-zero bits. + bool intersectWith(const SparseBitVectorElement &RHS, + bool &BecameZero) { + bool changed = false; + bool allzero = true; + + BecameZero = false; + for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) { + BitWord old = changed ? 0 : Bits[i]; + + Bits[i] &= RHS.Bits[i]; + if (Bits[i] != 0) + allzero = false; + + if (!changed && old != Bits[i]) + changed = true; + } + BecameZero = allzero; + return changed; + } + + // Intersect this Element with the complement of RHS and return true if this + // one changed. BecameZero is set to true if this element became all-zero + // bits. + bool intersectWithComplement(const SparseBitVectorElement &RHS, + bool &BecameZero) { + bool changed = false; + bool allzero = true; + + BecameZero = false; + for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) { + BitWord old = changed ? 
0 : Bits[i]; + + Bits[i] &= ~RHS.Bits[i]; + if (Bits[i] != 0) + allzero = false; + + if (!changed && old != Bits[i]) + changed = true; + } + BecameZero = allzero; + return changed; + } + + // Three argument version of intersectWithComplement that intersects + // RHS1 & ~RHS2 into this element + void intersectWithComplement(const SparseBitVectorElement &RHS1, + const SparseBitVectorElement &RHS2, + bool &BecameZero) { + bool allzero = true; + + BecameZero = false; + for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) { + Bits[i] = RHS1.Bits[i] & ~RHS2.Bits[i]; + if (Bits[i] != 0) + allzero = false; + } + BecameZero = allzero; + } + }; + + template + class SparseBitVector { + using ElementList = std::list>; + using ElementListIter = typename ElementList::iterator; + using ElementListConstIter = typename ElementList::const_iterator; + enum { + BITWORD_SIZE = SparseBitVectorElement::BITWORD_SIZE + }; + + ElementList Elements; + // Pointer to our current Element. This has no visible effect on the external + // state of a SparseBitVector, it's just used to improve performance in the + // common case of testing/modifying bits with similar indices. + mutable ElementListIter CurrElementIter; + + // This is like std::lower_bound, except we do linear searching from the + // current position. + ElementListIter FindLowerBoundImpl(unsigned ElementIndex) const { + + // We cache a non-const iterator so we're forced to resort to const_cast to + // get the begin/end in the case where 'this' is const. To avoid duplication + // of code with the only difference being whether the const cast is present + // 'this' is always const in this particular function and we sort out the + // difference in FindLowerBound and FindLowerBoundConst. + ElementListIter Begin = + const_cast *>(this)->Elements.begin(); + ElementListIter End = + const_cast *>(this)->Elements.end(); + + if (Elements.empty()) { + CurrElementIter = Begin; + return CurrElementIter; + } + + // Make sure our current iterator is valid. + if (CurrElementIter == End) + --CurrElementIter; + + // Search from our current iterator, either backwards or forwards, + // depending on what element we are looking for. + ElementListIter ElementIter = CurrElementIter; + if (CurrElementIter->index() == ElementIndex) { + return ElementIter; + } else if (CurrElementIter->index() > ElementIndex) { + while (ElementIter != Begin + && ElementIter->index() > ElementIndex) + --ElementIter; + } else { + while (ElementIter != End && + ElementIter->index() < ElementIndex) + ++ElementIter; + } + CurrElementIter = ElementIter; + return ElementIter; + } + ElementListConstIter FindLowerBoundConst(unsigned ElementIndex) const { + return FindLowerBoundImpl(ElementIndex); + } + ElementListIter FindLowerBound(unsigned ElementIndex) { + return FindLowerBoundImpl(ElementIndex); + } + + // Iterator to walk set bits in the bitmap. This iterator is a lot uglier + // than it would be, in order to be efficient. + class SparseBitVectorIterator { + private: + bool AtEnd; + + const SparseBitVector *BitVector = nullptr; + + // Current element inside of bitmap. + ElementListConstIter Iter; + + // Current bit number inside of our bitmap. + unsigned BitNumber; + + // Current word number inside of our element. + unsigned WordNumber; + + // Current bits from the element. + typename SparseBitVectorElement::BitWord Bits; + + // Move our iterator to the first non-zero bit in the bitmap. 
+ void AdvanceToFirstNonZero() { + if (AtEnd) + return; + if (BitVector->Elements.empty()) { + AtEnd = true; + return; + } + Iter = BitVector->Elements.begin(); + BitNumber = Iter->index() * ElementSize; + unsigned BitPos = Iter->find_first(); + BitNumber += BitPos; + WordNumber = (BitNumber % ElementSize) / BITWORD_SIZE; + Bits = Iter->word(WordNumber); + Bits >>= BitPos % BITWORD_SIZE; + } + + // Move our iterator to the next non-zero bit. + void AdvanceToNextNonZero() { + if (AtEnd) + return; + + while (Bits && !(Bits & 1)) { + Bits >>= 1; + BitNumber += 1; + } + + // See if we ran out of Bits in this word. + if (!Bits) { + int NextSetBitNumber = Iter->find_next(BitNumber % ElementSize) ; + // If we ran out of set bits in this element, move to next element. + if (NextSetBitNumber == -1 || (BitNumber % ElementSize == 0)) { + ++Iter; + WordNumber = 0; + + // We may run out of elements in the bitmap. + if (Iter == BitVector->Elements.end()) { + AtEnd = true; + return; + } + // Set up for next non-zero word in bitmap. + BitNumber = Iter->index() * ElementSize; + NextSetBitNumber = Iter->find_first(); + BitNumber += NextSetBitNumber; + WordNumber = (BitNumber % ElementSize) / BITWORD_SIZE; + Bits = Iter->word(WordNumber); + Bits >>= NextSetBitNumber % BITWORD_SIZE; + } else { + WordNumber = (NextSetBitNumber % ElementSize) / BITWORD_SIZE; + Bits = Iter->word(WordNumber); + Bits >>= NextSetBitNumber % BITWORD_SIZE; + BitNumber = Iter->index() * ElementSize; + BitNumber += NextSetBitNumber; + } + } + } + + public: + SparseBitVectorIterator() = default; + + SparseBitVectorIterator(const SparseBitVector *RHS, + bool end = false):BitVector(RHS) { + Iter = BitVector->Elements.begin(); + BitNumber = 0; + Bits = 0; + WordNumber = ~0; + AtEnd = end; + AdvanceToFirstNonZero(); + } + + // Preincrement. + inline SparseBitVectorIterator& operator++() { + ++BitNumber; + Bits >>= 1; + AdvanceToNextNonZero(); + return *this; + } + + // Postincrement. + inline SparseBitVectorIterator operator++(int) { + SparseBitVectorIterator tmp = *this; + ++*this; + return tmp; + } + + // Return the current set bit number. + unsigned operator*() const { + return BitNumber; + } + + bool operator==(const SparseBitVectorIterator &RHS) const { + // If they are both at the end, ignore the rest of the fields. + if (AtEnd && RHS.AtEnd) + return true; + // Otherwise they are the same if they have the same bit number and + // bitmap. + return AtEnd == RHS.AtEnd && RHS.BitNumber == BitNumber; + } + + bool operator!=(const SparseBitVectorIterator &RHS) const { + return !(*this == RHS); + } + }; + + public: + using iterator = SparseBitVectorIterator; + + SparseBitVector() : Elements(), CurrElementIter(Elements.begin()) {} + + SparseBitVector(const SparseBitVector &RHS) + : Elements(RHS.Elements), CurrElementIter(Elements.begin()) {} + SparseBitVector(SparseBitVector &&RHS) + : Elements(std::move(RHS.Elements)), CurrElementIter(Elements.begin()) {} + + // Clear. + void clear() { + Elements.clear(); + } + + // Assignment + SparseBitVector& operator=(const SparseBitVector& RHS) { + if (this == &RHS) + return *this; + + Elements = RHS.Elements; + CurrElementIter = Elements.begin(); + return *this; + } + SparseBitVector &operator=(SparseBitVector &&RHS) { + Elements = std::move(RHS.Elements); + CurrElementIter = Elements.begin(); + return *this; + } + + // Test, Reset, and Set a bit in the bitmap. 
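+ // e.g. with SparseBitVector<128> V: V.set(200) stores bit 72 of the element
+ // with index 1; V.test(200) is then true, and V.reset(200) erases that
+ // element again once it becomes empty.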
+ bool test(unsigned Idx) const { + if (Elements.empty()) + return false; + + unsigned ElementIndex = Idx / ElementSize; + ElementListConstIter ElementIter = FindLowerBoundConst(ElementIndex); + + // If we can't find an element that is supposed to contain this bit, there + // is nothing more to do. + if (ElementIter == Elements.end() || + ElementIter->index() != ElementIndex) + return false; + return ElementIter->test(Idx % ElementSize); + } + + void reset(unsigned Idx) { + if (Elements.empty()) + return; + + unsigned ElementIndex = Idx / ElementSize; + ElementListIter ElementIter = FindLowerBound(ElementIndex); + + // If we can't find an element that is supposed to contain this bit, there + // is nothing more to do. + if (ElementIter == Elements.end() || + ElementIter->index() != ElementIndex) + return; + ElementIter->reset(Idx % ElementSize); + + // When the element is zeroed out, delete it. + if (ElementIter->empty()) { + ++CurrElementIter; + Elements.erase(ElementIter); + } + } + + void set(unsigned Idx) { + unsigned ElementIndex = Idx / ElementSize; + ElementListIter ElementIter; + if (Elements.empty()) { + ElementIter = Elements.emplace(Elements.end(), ElementIndex); + } else { + ElementIter = FindLowerBound(ElementIndex); + + if (ElementIter == Elements.end() || + ElementIter->index() != ElementIndex) { + // We may have hit the beginning of our SparseBitVector, in which case, + // we may need to insert right after this element, which requires moving + // the current iterator forward one, because insert does insert before. + if (ElementIter != Elements.end() && + ElementIter->index() < ElementIndex) + ++ElementIter; + ElementIter = Elements.emplace(ElementIter, ElementIndex); + } + } + CurrElementIter = ElementIter; + + ElementIter->set(Idx % ElementSize); + } + + bool test_and_set(unsigned Idx) { + bool old = test(Idx); + if (!old) { + set(Idx); + return true; + } + return false; + } + + bool operator!=(const SparseBitVector &RHS) const { + return !(*this == RHS); + } + + bool operator==(const SparseBitVector &RHS) const { + ElementListConstIter Iter1 = Elements.begin(); + ElementListConstIter Iter2 = RHS.Elements.begin(); + + for (; Iter1 != Elements.end() && Iter2 != RHS.Elements.end(); + ++Iter1, ++Iter2) { + if (*Iter1 != *Iter2) + return false; + } + return Iter1 == Elements.end() && Iter2 == RHS.Elements.end(); + } + + // Union our bitmap with the RHS and return true if we changed. + bool operator|=(const SparseBitVector &RHS) { + if (this == &RHS) + return false; + + bool changed = false; + ElementListIter Iter1 = Elements.begin(); + ElementListConstIter Iter2 = RHS.Elements.begin(); + + // If RHS is empty, we are done + if (RHS.Elements.empty()) + return false; + + while (Iter2 != RHS.Elements.end()) { + if (Iter1 == Elements.end() || Iter1->index() > Iter2->index()) { + Elements.insert(Iter1, *Iter2); + ++Iter2; + changed = true; + } else if (Iter1->index() == Iter2->index()) { + changed |= Iter1->unionWith(*Iter2); + ++Iter1; + ++Iter2; + } else { + ++Iter1; + } + } + CurrElementIter = Elements.begin(); + return changed; + } + + // Intersect our bitmap with the RHS and return true if ours changed. + bool operator&=(const SparseBitVector &RHS) { + if (this == &RHS) + return false; + + bool changed = false; + ElementListIter Iter1 = Elements.begin(); + ElementListConstIter Iter2 = RHS.Elements.begin(); + + // Check if both bitmaps are empty. 
+ if (Elements.empty() && RHS.Elements.empty()) + return false; + + // Loop through, intersecting as we go, erasing elements when necessary. + while (Iter2 != RHS.Elements.end()) { + if (Iter1 == Elements.end()) { + CurrElementIter = Elements.begin(); + return changed; + } + + if (Iter1->index() > Iter2->index()) { + ++Iter2; + } else if (Iter1->index() == Iter2->index()) { + bool BecameZero; + changed |= Iter1->intersectWith(*Iter2, BecameZero); + if (BecameZero) { + ElementListIter IterTmp = Iter1; + ++Iter1; + Elements.erase(IterTmp); + } else { + ++Iter1; + } + ++Iter2; + } else { + ElementListIter IterTmp = Iter1; + ++Iter1; + Elements.erase(IterTmp); + changed = true; + } + } + if (Iter1 != Elements.end()) { + Elements.erase(Iter1, Elements.end()); + changed = true; + } + CurrElementIter = Elements.begin(); + return changed; + } + + // Intersect our bitmap with the complement of the RHS and return true + // if ours changed. + bool intersectWithComplement(const SparseBitVector &RHS) { + if (this == &RHS) { + if (!empty()) { + clear(); + return true; + } + return false; + } + + bool changed = false; + ElementListIter Iter1 = Elements.begin(); + ElementListConstIter Iter2 = RHS.Elements.begin(); + + // If either our bitmap or RHS is empty, we are done + if (Elements.empty() || RHS.Elements.empty()) + return false; + + // Loop through, intersecting as we go, erasing elements when necessary. + while (Iter2 != RHS.Elements.end()) { + if (Iter1 == Elements.end()) { + CurrElementIter = Elements.begin(); + return changed; + } + + if (Iter1->index() > Iter2->index()) { + ++Iter2; + } else if (Iter1->index() == Iter2->index()) { + bool BecameZero; + changed |= Iter1->intersectWithComplement(*Iter2, BecameZero); + if (BecameZero) { + ElementListIter IterTmp = Iter1; + ++Iter1; + Elements.erase(IterTmp); + } else { + ++Iter1; + } + ++Iter2; + } else { + ++Iter1; + } + } + CurrElementIter = Elements.begin(); + return changed; + } + + bool intersectWithComplement(const SparseBitVector *RHS) const { + return intersectWithComplement(*RHS); + } + + // Three argument version of intersectWithComplement. + // Result of RHS1 & ~RHS2 is stored into this bitmap. + void intersectWithComplement(const SparseBitVector &RHS1, + const SparseBitVector &RHS2) + { + if (this == &RHS1) { + intersectWithComplement(RHS2); + return; + } else if (this == &RHS2) { + SparseBitVector RHS2Copy(RHS2); + intersectWithComplement(RHS1, RHS2Copy); + return; + } + + Elements.clear(); + CurrElementIter = Elements.begin(); + ElementListConstIter Iter1 = RHS1.Elements.begin(); + ElementListConstIter Iter2 = RHS2.Elements.begin(); + + // If RHS1 is empty, we are done + // If RHS2 is empty, we still have to copy RHS1 + if (RHS1.Elements.empty()) + return; + + // Loop through, intersecting as we go, erasing elements when necessary. 
+ while (Iter2 != RHS2.Elements.end()) { + if (Iter1 == RHS1.Elements.end()) + return; + + if (Iter1->index() > Iter2->index()) { + ++Iter2; + } else if (Iter1->index() == Iter2->index()) { + bool BecameZero = false; + Elements.emplace_back(Iter1->index()); + Elements.back().intersectWithComplement(*Iter1, *Iter2, BecameZero); + if (BecameZero) + Elements.pop_back(); + ++Iter1; + ++Iter2; + } else { + Elements.push_back(*Iter1++); + } + } + + // copy the remaining elements + std::copy(Iter1, RHS1.Elements.end(), std::back_inserter(Elements)); + } + + void intersectWithComplement(const SparseBitVector *RHS1, + const SparseBitVector *RHS2) { + intersectWithComplement(*RHS1, *RHS2); + } + + bool intersects(const SparseBitVector *RHS) const { + return intersects(*RHS); + } + + // Return true if we share any bits in common with RHS + bool intersects(const SparseBitVector &RHS) const { + ElementListConstIter Iter1 = Elements.begin(); + ElementListConstIter Iter2 = RHS.Elements.begin(); + + // Check if both bitmaps are empty. + if (Elements.empty() && RHS.Elements.empty()) + return false; + + // Loop through, intersecting stopping when we hit bits in common. + while (Iter2 != RHS.Elements.end()) { + if (Iter1 == Elements.end()) + return false; + + if (Iter1->index() > Iter2->index()) { + ++Iter2; + } else if (Iter1->index() == Iter2->index()) { + if (Iter1->intersects(*Iter2)) + return true; + ++Iter1; + ++Iter2; + } else { + ++Iter1; + } + } + return false; + } + + // Return true iff all bits set in this SparseBitVector are + // also set in RHS. + bool contains(const SparseBitVector &RHS) const { + SparseBitVector Result(*this); + Result &= RHS; + return (Result == RHS); + } + + // Return the first set bit in the bitmap. Return -1 if no bits are set. + int find_first() const { + if (Elements.empty()) + return -1; + const SparseBitVectorElement &First = *(Elements.begin()); + return (First.index() * ElementSize) + First.find_first(); + } + + // Return the last set bit in the bitmap. Return -1 if no bits are set. + int find_last() const { + if (Elements.empty()) + return -1; + const SparseBitVectorElement &Last = *(Elements.rbegin()); + return (Last.index() * ElementSize) + Last.find_last(); + } + + // Return true if the SparseBitVector is empty + bool empty() const { + return Elements.empty(); + } + + unsigned count() const { + unsigned BitCount = 0; + for (ElementListConstIter Iter = Elements.begin(); + Iter != Elements.end(); + ++Iter) + BitCount += Iter->count(); + + return BitCount; + } + + iterator begin() const { + return iterator(this); + } + + iterator end() const { + return iterator(this, true); + } + }; + + // Convenience functions to allow Or and And without dereferencing in the user + // code. + + template + inline bool operator |=(SparseBitVector &LHS, + const SparseBitVector *RHS) { + return LHS |= *RHS; + } + + template + inline bool operator |=(SparseBitVector *LHS, + const SparseBitVector &RHS) { + return LHS->operator|=(RHS); + } + + template + inline bool operator &=(SparseBitVector *LHS, + const SparseBitVector &RHS) { + return LHS->operator&=(RHS); + } + + template + inline bool operator &=(SparseBitVector &LHS, + const SparseBitVector *RHS) { + return LHS &= *RHS; + } + + // Convenience functions for infix union, intersection, difference operators. 
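+ // e.g. SparseBitVector<128> C = A | B; // union
+ //      SparseBitVector<128> D = A - B; // A & ~B via intersectWithComplement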
+ + template + inline SparseBitVector + operator|(const SparseBitVector &LHS, + const SparseBitVector &RHS) { + SparseBitVector Result(LHS); + Result |= RHS; + return Result; + } + + template + inline SparseBitVector + operator&(const SparseBitVector &LHS, + const SparseBitVector &RHS) { + SparseBitVector Result(LHS); + Result &= RHS; + return Result; + } + + template + inline SparseBitVector + operator-(const SparseBitVector &LHS, + const SparseBitVector &RHS) { + SparseBitVector Result; + Result.intersectWithComplement(LHS, RHS); + return Result; + } + + + } // end namespace llvm \ No newline at end of file diff --git a/torch/csrc/jit/passes/alias_analysis.cpp b/torch/csrc/jit/passes/alias_analysis.cpp index 5555cacfe20aa..b6a6cc403c304 100644 --- a/torch/csrc/jit/passes/alias_analysis.cpp +++ b/torch/csrc/jit/passes/alias_analysis.cpp @@ -83,14 +83,7 @@ bool AliasDb::hasWriters(const Value* v) const { if (isWriteCacheStale_) { rebuildWriteCache(); } - - for (const auto loc : elementMap_.at(v)->getMemoryLocations()) { - if (writeCache_.count(loc)) { - return true; - } - } - - return false; + return writeCache_.intersects(elementMap_.at(v)->getMemoryLocations()); } void AliasDb::getWritesImpl(Block* b, ValueSet& ret, bool recurseBlocks) const { @@ -166,17 +159,17 @@ void AliasDb::dump() const { std::cout << "\n===2. ALIAS DB===\n"; for (const auto& ptrPair : elementMap_) { const auto element = ptrPair.second; - if (element->pointsTo.size() > 0) { + if (!element->pointsTo.empty()) { std::cout << getElementName(element) << " points to: "; for (const auto pointedTo : element->pointsTo) { - std::cout << getElementName(pointedTo) << ", "; + std::cout << getElementName(Element::fromIndex(pointedTo)) << ", "; } std::cout << "\n"; } - if (element->contained_elements.size() > 0) { + if (!element->contained_elements.empty()) { std::cout << getElementName(element) << " contains: "; for (const auto contained : element->contained_elements) { - std::cout << getElementName(contained) << ", "; + std::cout << getElementName(Element::fromIndex(contained)) << ", "; } std::cout << "\n"; } @@ -547,7 +540,7 @@ void AliasDb::analyzeWait(Node* node) { const auto el = pr.second; const auto& pointedFrom = el->pointedFrom; TORCH_INTERNAL_ASSERT(!pointedFrom.empty()); - const auto wildcardValue = (*pointedFrom.begin())->value; + const auto wildcardValue = Element::fromIndex(*pointedFrom.begin())->value; TORCH_INTERNAL_ASSERT(wildcardValue); registerWrite(wildcardValue, node); } @@ -1154,9 +1147,7 @@ void AliasDb::rebuildWriteCache() const { const auto& writtenValues = pr.second; for (const auto value : writtenValues) { - for (const auto loc : elementMap_.at(value)->getMemoryLocations()) { - writeCache_.insert(loc); - } + writeCache_ |= elementMap_.at(value)->getMemoryLocations(); } } isWriteCacheStale_ = false; diff --git a/torch/csrc/jit/passes/alias_analysis.h b/torch/csrc/jit/passes/alias_analysis.h index 66774be2cfbb2..4058507570f9b 100644 --- a/torch/csrc/jit/passes/alias_analysis.h +++ b/torch/csrc/jit/passes/alias_analysis.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -200,7 +201,7 @@ class AliasDb { // The points-to graph that stores aliasing relationships std::unique_ptr memoryDAG_; // Mapping of values to MemoryDAG elements - std::unordered_map elementMap_; + ska::flat_hash_map elementMap_; // All wildcard elements (one for each unique mutable type). 
std::map wildcardIndex_; Element* getWildcard(const TypePtr& type) const; @@ -211,9 +212,9 @@ class AliasDb { * State for tracking write info. */ // Map of nodes to the values that they write to - std::unordered_map writeIndex_; + ska::flat_hash_map writeIndex_; // Set of all memory locations that may have been written to. - mutable std::unordered_set writeCache_; + mutable MemoryLocations writeCache_; mutable bool isWriteCacheStale_ = true; void rebuildWriteCache() const; }; diff --git a/torch/csrc/jit/passes/utils/memory_dag.cpp b/torch/csrc/jit/passes/utils/memory_dag.cpp index 1dc74c55c1c6f..0ed9c5151077d 100644 --- a/torch/csrc/jit/passes/utils/memory_dag.cpp +++ b/torch/csrc/jit/passes/utils/memory_dag.cpp @@ -1,11 +1,25 @@ #include "memory_dag.h" +#include #include #include #include namespace torch { namespace jit { +namespace { +std::vector indexToElementMap; +} // namespace +unsigned Element::indexCount = 0; +Element::Element(const Value* value_) : value(value_), index(indexCount++) { + indexToElementMap.push_back(this); +} + +const Element* Element::fromIndex(unsigned x) { + TORCH_INTERNAL_ASSERT(x < indexToElementMap.size()); + auto res = indexToElementMap[x]; + return res; +} bool MemoryDAG::mayAlias(Element* a, Element* b) const { return mayAliasImpl(a, b); @@ -15,28 +29,11 @@ bool MemoryDAG::mayAlias(const Element* a, const Element* b) const { return mayAliasImpl(a, b); } -bool MemoryDAG::memoryLocationOverlap( - const std::unordered_set& aMemLoc, - const std::unordered_set& bMemLoc) const { - // XXX: This could be more efficiently done as a bitwise AND on two bitfields - // that represent memory location membership. If these comparisons end up - // being a bottleneck, consider implementing it that way. - for (const auto aLoc : aMemLoc) { - for (const auto bLoc : bMemLoc) { - if (aLoc == bLoc) { - return true; - } - } - } - - return false; -} - bool MemoryDAG::mayAliasImpl(const Element* a, const Element* b) const { const auto aMemLoc = a->getMemoryLocations(); const auto bMemLoc = b->getMemoryLocations(); - return memoryLocationOverlap(aMemLoc, bMemLoc); + return aMemLoc.intersects(bMemLoc); } bool MemoryDAG::mayContainAlias(const Element* a, const Element* b) const { @@ -49,31 +46,31 @@ bool MemoryDAG::mayContainAlias(Element* a, Element* b) const { void collectAllContainedMemoryLocations( const Element* elem, - std::unordered_set& cont) { + MemoryLocations& cont) { // we have already recursed on this element - if (cont.count(elem)) { + unsigned compIdx = elem->index; + if (cont.test(compIdx)) { return; } - - cont.insert(elem); + cont.set(compIdx); for (const auto& mem_loc : elem->getMemoryLocations()) { - collectAllContainedMemoryLocations(mem_loc, cont); + collectAllContainedMemoryLocations(Element::fromIndex(mem_loc), cont); } for (const auto& contained : elem->contained_elements) { - collectAllContainedMemoryLocations(contained, cont); + collectAllContainedMemoryLocations(Element::fromIndex(contained), cont); } } bool MemoryDAG::mayContainAliasImpl(const Element* a, const Element* b) const { - std::unordered_set all_a_mlocs; - std::unordered_set all_b_mlocs; + MemoryLocations all_a_mlocs; + MemoryLocations all_b_mlocs; collectAllContainedMemoryLocations(a, all_a_mlocs); collectAllContainedMemoryLocations(b, all_b_mlocs); - return memoryLocationOverlap(all_a_mlocs, all_b_mlocs); + return all_a_mlocs.intersects(all_b_mlocs); } bool MemoryDAG::mayContainAlias( @@ -83,72 +80,64 @@ bool MemoryDAG::mayContainAlias( return false; } - std::unordered_set all_a_mlocs; + 
   for (const auto& elem : a) {
     collectAllContainedMemoryLocations(elem, all_a_mlocs);
   }
 
-  std::unordered_set<const Element*> all_b_mlocs;
+  MemoryLocations all_b_mlocs;
   for (const auto& elem : b) {
     collectAllContainedMemoryLocations(elem, all_b_mlocs);
   }
 
-  return memoryLocationOverlap(all_a_mlocs, all_b_mlocs);
+  return all_a_mlocs.intersects(all_b_mlocs);
 }
 
 // Make `v` point at `to`.
 void MemoryDAG::makePointerTo(Element* from, Element* to) {
-  from->pointsTo.insert(to);
-  to->pointedFrom.insert(from);
+  from->pointsTo.set(to->index);
+  to->pointedFrom.set(from->index);
 }
 
 void MemoryDAG::addToContainedElements(Element* elem, Element* container) {
-  container->contained_elements.insert(elem);
+  container->contained_elements.set(elem->index);
 }
 
 // Give `v` a fresh alias (i.e. it does not point to any value)
 Element* MemoryDAG::makeFreshValue(const Value* v) {
-  auto el = torch::make_unique<Element>();
-  el->value = v;
+  auto el = torch::make_unique<Element>(v);
   auto rawPtr = el.get();
   elements_.emplace(rawPtr, std::move(el));
   return rawPtr;
 }
 
-std::unordered_set<const Element*> Element::getMemoryLocations() const {
+const MemoryLocations& Element::getMemoryLocations() const {
   if (!cachedMemoryLocations_.empty()) {
     return cachedMemoryLocations_;
   }
 
   // Do a BFS in the `points-to` direction, collecting all memory locations
-  std::unordered_set<const Element*> ret;
-  this->bfs(
-      [&](const Element* el) {
-        if (el->pointsTo.empty()) {
-          ret.insert(el);
-        }
-      },
-      BfsDirection::POINTS_TO);
-
+  MemoryLocations ret;
+  this->bfs(BfsDirection::POINTS_TO, ret);
   cachedMemoryLocations_ = ret;
-  return ret;
+  return cachedMemoryLocations_;
 }
 
 // Do a breadth-first search over the graph, starting at `this` and
 // traversing in the direction `dir`.`fn` will be run on each element.
-template <typename Fn>
-bool Element::bfs(Fn fn, BfsDirection dir) const {
-  std::queue<const Element*> queue;
-  std::unordered_set<const Element*> seen;
-
-  queue.push(this);
+void Element::bfs(BfsDirection dir, MemoryLocations& res) const {
+  std::queue<unsigned> queue;
+  ska::flat_hash_set<unsigned> seen;
+  queue.push(this->index);
 
   while (!queue.empty()) {
-    const auto el = queue.front();
+    const auto index = queue.front();
     queue.pop();
-    seen.insert(el);
-
-    fn(el);
+    seen.insert(index);
+    auto el = Element::fromIndex(index);
+    if (el->pointsTo.empty()) {
+      res.set(index);
+    }
 
     switch (dir) {
       case BfsDirection::POINTS_TO: {
@@ -168,7 +157,6 @@ bool Element::bfs(Fn fn, BfsDirection dir) const {
       } break;
     }
   }
-  return false;
 }
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/utils/memory_dag.h b/torch/csrc/jit/passes/utils/memory_dag.h
index 373503d9a8a14..ae8a2e1ec24fe 100644
--- a/torch/csrc/jit/passes/utils/memory_dag.h
+++ b/torch/csrc/jit/passes/utils/memory_dag.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include
+#include <c10/util/sparse_bitset.h>
 #include
 #include
 #include
@@ -8,6 +9,8 @@
 
 #include
 
+// Uses a compressed index representation for faster comparisons
+typedef c10::SparseBitVector<128> MemoryLocations;
 namespace torch {
 namespace jit {
 
@@ -31,12 +34,12 @@ struct Value;
 // which memory locations an element may point to.
 class TORCH_API MemoryDAG {
  public:
-
-  // explicitly delete copy constructor because otherwise windows build is confused for an exported class
-  // see https://stackoverflow.com/a/51033485/105137
+  // explicitly delete copy constructor because otherwise windows build is
+  // confused for an exported class see
+  // https://stackoverflow.com/a/51033485/105137
   MemoryDAG() {}
-  MemoryDAG(const MemoryDAG&)=delete;
-  MemoryDAG& operator=(const MemoryDAG&)=delete;
+  MemoryDAG(const MemoryDAG&) = delete;
+  MemoryDAG& operator=(const MemoryDAG&) = delete;
 
   // Make `from` point at `to`.
   void makePointerTo(Element* from, Element* to);
@@ -70,13 +73,11 @@ class TORCH_API MemoryDAG {
     }
 
     // Record all memory locations from group `a`
-    std::unordered_set<const Element*> memoryLocations;
+    MemoryLocations memoryLocations;
     for (auto it = a.cbegin(); it != a.cend();) {
       const auto element = *it;
 
-      for (const auto loc : element->getMemoryLocations()) {
-        memoryLocations.insert(loc);
-      }
+      memoryLocations |= element->getMemoryLocations();
 
       const auto cnt = a.count(*it);
       std::advance(it, cnt);
@@ -85,11 +86,8 @@ class TORCH_API MemoryDAG {
     // If any of group `b`s memory locations overlap, return true.
     for (auto it = b.cbegin(); it != b.cend();) {
       const auto element = *it;
-
-      for (const auto loc : element->getMemoryLocations()) {
-        if (memoryLocations.count(loc)) {
-          return true;
-        }
+      if (memoryLocations.intersects(element->getMemoryLocations())) {
+        return true;
       }
 
       const auto cnt = b.count(*it);
@@ -100,9 +98,6 @@ class TORCH_API MemoryDAG {
   }
 
  private:
-  bool memoryLocationOverlap(
-      const std::unordered_set<const Element*>& a,
-      const std::unordered_set<const Element*>& b) const;
   bool mayAliasImpl(const Element* a, const Element* b) const;
   bool mayContainAliasImpl(const Element* contained, const Element* container)
       const;
@@ -126,23 +121,28 @@ struct Element {
   // All elements that this element *may* point to. It's possible to have
   // multiple elements that you might point to due to control flow/complex ops
-  std::unordered_set<Element*> pointsTo;
+  MemoryLocations pointsTo;
   // Backreference for points-to.
-  std::unordered_set<Element*> pointedFrom;
+  MemoryLocations pointedFrom;
 
-  std::unordered_set<Element*> contained_elements;
+  MemoryLocations contained_elements;
+  static unsigned indexCount;
+  signed index;
+  Element(const Value* value_);
 
   // Return the unique memory locations that `Element` might represent.
-  TORCH_API std::unordered_set<const Element*> getMemoryLocations() const;
+  TORCH_API const MemoryLocations& getMemoryLocations() const;
 
   // We do path compression to make repeated memory location queries faster.
   // An empty cache means it is invalidated (it can never be empty otherwise,
   // since every element must point to at least one memory location).
-  mutable std::unordered_set<const Element*> cachedMemoryLocations_;
+  mutable MemoryLocations cachedMemoryLocations_;
 
   // Do a breadth-first search over the graph, starting at `this` and
   // traversing in the direction `dir`.`fn` will be run on each element.
-  template <typename Fn>
-  bool bfs(Fn fn, BfsDirection dir) const;
+  void bfs(BfsDirection dir, MemoryLocations& res) const;
+
+  // Converts from the compressed index representation
+  static const Element* fromIndex(unsigned x);
 };
 } // namespace jit
 } // namespace torch
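
Reviewer note: as a quick orientation, the sketch below shows the idea the patch is built around. Every Element is handed a small integer index at construction, the points-to and contained-in sets become bitsets keyed by those indices, and "may alias" reduces to a single bitset intersection instead of nested set lookups. This is a self-contained toy, not the real MemoryDAG API: it uses a fixed-width std::bitset where the patch uses c10::SparseBitVector<128>, and the names ToyElement, ToyDAG, memoryLocations, and mayAlias are invented for illustration only.

// Sketch only (assumptions): a dense std::bitset stands in for
// c10::SparseBitVector<128>; ToyElement/ToyDAG are hypothetical names.
#include <bitset>
#include <cassert>
#include <cstddef>
#include <memory>
#include <queue>
#include <vector>

constexpr std::size_t kMaxElements = 256;
using Locations = std::bitset<kMaxElements>; // plays the role of MemoryLocations

struct ToyElement {
  unsigned index = 0;  // compressed id, handed out at creation time
  Locations pointsTo;  // indices of elements this one may point to
};

struct ToyDAG {
  std::vector<std::unique_ptr<ToyElement>> elements;

  ToyElement* makeFreshValue() {
    auto el = std::make_unique<ToyElement>();
    el->index = static_cast<unsigned>(elements.size());
    assert(el->index < kMaxElements);
    elements.push_back(std::move(el));
    return elements.back().get();
  }

  void makePointerTo(ToyElement* from, ToyElement* to) {
    from->pointsTo.set(to->index);
  }

  // BFS toward the leaves; elements that point to nothing are the
  // "memory locations" an element may represent.
  Locations memoryLocations(const ToyElement* start) const {
    Locations result;
    Locations seen;
    std::queue<unsigned> worklist;
    worklist.push(start->index);
    while (!worklist.empty()) {
      const unsigned idx = worklist.front();
      worklist.pop();
      if (seen.test(idx)) {
        continue;
      }
      seen.set(idx);
      const ToyElement* el = elements[idx].get();
      if (el->pointsTo.none()) {
        result.set(idx); // a leaf is a memory location
      }
      for (unsigned i = 0; i < kMaxElements; ++i) {
        if (el->pointsTo.test(i)) {
          worklist.push(i);
        }
      }
    }
    return result;
  }

  // Overlap of the two bitsets replaces the old pairwise set comparison.
  bool mayAlias(const ToyElement* a, const ToyElement* b) const {
    return (memoryLocations(a) & memoryLocations(b)).any();
  }
};

int main() {
  ToyDAG dag;
  ToyElement* loc = dag.makeFreshValue(); // a fresh memory location
  ToyElement* x = dag.makeFreshValue();
  ToyElement* y = dag.makeFreshValue();
  dag.makePointerTo(x, loc); // x may point to loc
  dag.makePointerTo(y, loc); // y may point to loc
  assert(dag.mayAlias(x, y)); // shared leaf => may alias

  ToyElement* z = dag.makeFreshValue(); // unrelated fresh value
  assert(!dag.mayAlias(x, z));
  return 0;
}

Unlike the dense bitset above, the MemoryLocations type in this patch is iterable over its set bits and supports operator|= and intersects directly, which is what lets rebuildWriteCache and mayAliasImpl in the diff avoid the per-index scan the toy performs.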