diff --git a/torch/lib/TH/THAllocator.c b/torch/lib/TH/THAllocator.c index e69b3cc9c8a9..5225edefcc87 100644 --- a/torch/lib/TH/THAllocator.c +++ b/torch/lib/TH/THAllocator.c @@ -39,7 +39,11 @@ struct THMapAllocatorContext_ { char *filename; /* file name */ int flags; ptrdiff_t size; /* mapped size */ +#ifdef _WIN32 + HANDLE handle; +#else int fd; +#endif }; #define TH_ALLOC_ALIGNMENT 64 @@ -68,17 +72,25 @@ THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags } ctx->flags = flags; ctx->size = 0; +#ifdef _WIN32 + ctx->handle = INVALID_HANDLE_VALUE; +#else ctx->fd = -1; +#endif return ctx; } THMapAllocatorContext *THMapAllocatorContext_newWithFd(const char *filename, int fd, int flags) { +#ifdef _WIN32 + THError("THMapAllocatorContext_newWithFd is unsupported on Windows"); +#else THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, flags); ctx->fd = fd; return ctx; +#endif } char * THMapAllocatorContext_filename(THMapAllocatorContext *ctx) @@ -88,7 +100,11 @@ char * THMapAllocatorContext_filename(THMapAllocatorContext *ctx) int THMapAllocatorContext_fd(THMapAllocatorContext *ctx) { +#ifdef _WIN32 + THError("THMapAllocatorContext_fd is unsupported on Windows"); +#else return ctx->fd; +#endif } ptrdiff_t THMapAllocatorContext_size(THMapAllocatorContext *ctx) @@ -105,15 +121,49 @@ void THMapAllocatorContext_free(THMapAllocatorContext *ctx) static void *_map_alloc(void* ctx_, ptrdiff_t size) { - if (size == 0) { + if (size == 0) return NULL; - } THMapAllocatorContext *ctx = ctx_; void *data = NULL; #ifdef _WIN32 + if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM) + { + char *filename; + LARGE_INTEGER hfilesz; + + if (ctx->filename[0] == '/') + filename = ctx->filename + 1; + else + filename = ctx->filename; + + hfilesz.QuadPart = size; + + if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE) + { + ctx->handle = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, filename); + } + else if (ctx->flags & TH_ALLOCATOR_MAPPED_NOCREATE) + { + ctx->handle = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, filename); + } + else + { + THError("Expected either TH_ALLOCATOR_MAPPED_EXCLUSIVE or TH_ALLOCATOR_MAPPED_NOCREATE"); + } + + if (ctx->handle == NULL) + THError("Couldn't open shared file mapping: <%s>, error code: <%d>", filename, GetLastError()); + + ctx->size = size; + data = MapViewOfFile(ctx->handle, FILE_MAP_ALL_ACCESS, 0, 0, size); + if (!data) + THError("Couldn't map view of shared file <%s>, error code: <%d>", filename, GetLastError()); + } + else { + HANDLE hfile; HANDLE hmfile; LARGE_INTEGER hfilesz; @@ -143,9 +193,7 @@ static void *_map_alloc(void* ctx_, ptrdiff_t size) } if (GetFileSizeEx(hfile, &hfilesz) == 0) - { THError("could not get file size: <%s>; error code: <%d>", ctx->filename, GetLastError()); - } if(size > 0) { @@ -342,6 +390,8 @@ static void THMapAllocator_free(void* ctx_, void* data) { THMapAllocatorContext *ctx = ctx_; #ifdef _WIN32 + if ((ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) || (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)) + CloseHandle(ctx->handle); if(UnmapViewOfFile(data) == 0) THError("could not unmap the shared memory file"); #else /* _WIN32 */ diff --git a/torch/lib/TH/THAllocator.h b/torch/lib/TH/THAllocator.h index 18fc9ec0a270..b4e8ca7b93ad 100644 --- a/torch/lib/TH/THAllocator.h +++ b/torch/lib/TH/THAllocator.h @@ -22,7 +22,7 @@ typedef struct THAllocator { /* default malloc/free allocator. malloc and realloc raise an error (using * THError) on allocation failure.
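   A minimal usage sketch (assuming only the existing THStorage constructors,
   which take an allocator plus an opaque allocator context):

     #include "TH.h"

     int main(void)
     {
       THFloatStorage *s =
         THFloatStorage_newWithAllocator(128, &THDefaultAllocator, NULL);
       s->data[0] = 1.0f;
       THFloatStorage_free(s);
       return 0;
     }

   A failed malloc inside the constructor raises through THError rather than
   returning NULL, which is why the sketch does no NULL check.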
*/ -extern THAllocator THDefaultAllocator; +TH_API THAllocator THDefaultAllocator; /* file map allocator */ @@ -37,7 +37,7 @@ TH_API void THMapAllocatorContext_free(THMapAllocatorContext *ctx); TH_API void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data); TH_API int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data); -extern THAllocator THMapAllocator; -extern THAllocator THRefcountedMapAllocator; +TH_API THAllocator THMapAllocator; +TH_API THAllocator THRefcountedMapAllocator; #endif diff --git a/torch/lib/TH/THAtomic.c b/torch/lib/TH/THAtomic.c index 714fc52dbcfa..16f0ddb48010 100644 --- a/torch/lib/TH/THAtomic.c +++ b/torch/lib/TH/THAtomic.c @@ -19,29 +19,29 @@ static pthread_mutex_t ptm = PTHREAD_MUTEX_INITIALIZER; #endif -void THAtomicSet(int volatile *a, int newvalue) +void THAtomicSet(int32_t volatile *a, int32_t newvalue) { #if defined(USE_C11_ATOMICS) atomic_store(a, newvalue); #elif defined(USE_MSC_ATOMICS) - assert(sizeof(int) == sizeof(long)); - _InterlockedExchange((long*)a, newvalue); + assert(sizeof(int) == sizeof(int32_t)); + _InterlockedExchange((int32_t*)a, newvalue); #elif defined(USE_GCC_ATOMICS) __sync_lock_test_and_set(a, newvalue); #else - int oldvalue; + int32_t oldvalue; do { oldvalue = *a; } while (!THAtomicCompareAndSwap(a, oldvalue, newvalue)); #endif } -int THAtomicGet(int volatile *a) +int THAtomicGet(int32_t volatile *a) { #if defined(USE_C11_ATOMICS) return atomic_load(a); #else - int value; + int32_t value; do { value = *a; } while (!THAtomicCompareAndSwap(a, value, value)); @@ -49,17 +49,16 @@ int THAtomicGet(int volatile *a) #endif } -int THAtomicAdd(int volatile *a, int value) +int THAtomicAdd(int32_t volatile *a, int32_t value) { #if defined(USE_C11_ATOMICS) return atomic_fetch_add(a, value); #elif defined(USE_MSC_ATOMICS) - assert(sizeof(int) == sizeof(long)); - return _InterlockedExchangeAdd((long*)a, value); + return _InterlockedExchangeAdd((int32_t*)a, value); #elif defined(USE_GCC_ATOMICS) return __sync_fetch_and_add(a, value); #else - int oldvalue; + int32_t oldvalue; do { oldvalue = *a; } while (!THAtomicCompareAndSwap(a, oldvalue, (oldvalue + value))); @@ -67,27 +66,26 @@ int THAtomicAdd(int volatile *a, int value) #endif } -void THAtomicIncrementRef(int volatile *a) +void THAtomicIncrementRef(int32_t volatile *a) { THAtomicAdd(a, 1); } -int THAtomicDecrementRef(int volatile *a) +int THAtomicDecrementRef(int32_t volatile *a) { return (THAtomicAdd(a, -1) == 1); } -int THAtomicCompareAndSwap(int volatile *a, int oldvalue, int newvalue) +int THAtomicCompareAndSwap(int32_t volatile *a, int32_t oldvalue, int32_t newvalue) { #if defined(USE_C11_ATOMICS) return atomic_compare_exchange_strong(a, &oldvalue, newvalue); #elif defined(USE_MSC_ATOMICS) - assert(sizeof(int) == sizeof(long)); - return (_InterlockedCompareExchange((long*)a, (long)newvalue, (long)oldvalue) == (long)oldvalue); + return (_InterlockedCompareExchange((int32_t*)a, (int32_t)newvalue, (int32_t)oldvalue) == (int32_t)oldvalue); #elif defined(USE_GCC_ATOMICS) return __sync_bool_compare_and_swap(a, oldvalue, newvalue); #elif defined(USE_PTHREAD_ATOMICS) - int ret = 0; + int32_t ret = 0; pthread_mutex_lock(&ptm); if(*a == oldvalue) { *a = newvalue; @@ -106,28 +104,28 @@ int THAtomicCompareAndSwap(int volatile *a, int oldvalue, int newvalue) #endif } -void THAtomicSetLong(long volatile *a, long newvalue) +void THAtomicSetLong(int64_t volatile *a, int64_t newvalue) { #if defined(USE_C11_ATOMICS) atomic_store(a, newvalue); #elif 
defined(USE_MSC_ATOMICS) - _InterlockedExchange(a, newvalue); + _InterlockedExchange64(a, newvalue); #elif defined(USE_GCC_ATOMICS) __sync_lock_test_and_set(a, newvalue); #else - long oldvalue; + int64_t oldvalue; do { oldvalue = *a; } while (!THAtomicCompareAndSwapLong(a, oldvalue, newvalue)); #endif } -long THAtomicGetLong(long volatile *a) +int64_t THAtomicGetLong(int64_t volatile *a) { #if defined(USE_C11_ATOMICS) return atomic_load(a); #else - long value; + int64_t value; do { value = *a; } while (!THAtomicCompareAndSwapLong(a, value, value)); @@ -135,16 +133,16 @@ long THAtomicGetLong(long volatile *a) #endif } -long THAtomicAddLong(long volatile *a, long value) +int64_t THAtomicAddLong(int64_t volatile *a, int64_t value) { #if defined(USE_C11_ATOMICS) return atomic_fetch_add(a, value); #elif defined(USE_MSC_ATOMICS) - return _InterlockedExchangeAdd(a, value); + return _InterlockedExchangeAdd64(a, value); #elif defined(USE_GCC_ATOMICS) return __sync_fetch_and_add(a, value); #else - long oldvalue; + int64_t oldvalue; do { oldvalue = *a; } while (!THAtomicCompareAndSwapLong(a, oldvalue, (oldvalue + value))); @@ -152,16 +150,16 @@ long THAtomicAddLong(long volatile *a, long value) #endif } -long THAtomicCompareAndSwapLong(long volatile *a, long oldvalue, long newvalue) +int64_t THAtomicCompareAndSwapLong(int64_t volatile *a, int64_t oldvalue, int64_t newvalue) { #if defined(USE_C11_ATOMICS) return atomic_compare_exchange_strong(a, &oldvalue, newvalue); #elif defined(USE_MSC_ATOMICS) - return (_InterlockedCompareExchange(a, newvalue, oldvalue) == oldvalue); + return (_InterlockedCompareExchange64(a, newvalue, oldvalue) == oldvalue); #elif defined(USE_GCC_ATOMICS) return __sync_bool_compare_and_swap(a, oldvalue, newvalue); #elif defined(USE_PTHREAD_ATOMICS) - long ret = 0; + int64_t ret = 0; pthread_mutex_lock(&ptm); if(*a == oldvalue) { *a = newvalue;
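A minimal sketch (assuming only the THAtomic API retyped above) of the reference-counting pattern the 32-bit helpers exist for; THAtomicDecrementRef returns 1 exactly when the count it decremented reaches zero:

  #include "THAtomic.h"
  #include <stdlib.h>

  typedef struct {
    int32_t refcount;   /* managed by the THAtomic*Ref helpers */
    /* ... payload ... */
  } Obj;

  static Obj *Obj_new(void)
  {
    Obj *o = malloc(sizeof(Obj));   /* error handling elided */
    o->refcount = 1;
    return o;
  }

  static void Obj_retain(Obj *o)
  {
    THAtomicIncrementRef(&o->refcount);
  }

  static void Obj_release(Obj *o)
  {
    if (THAtomicDecrementRef(&o->refcount))  /* we dropped the last reference */
      free(o);
  }

diff --git a/torch/lib/TH/THAtomic.h index d77b20b24032..24c43d3776f5 100644 --- a/torch/lib/TH/THAtomic.h +++ b/torch/lib/TH/THAtomic.h @@ -21,25 +21,25 @@ /* * *a = newvalue */ -TH_API void THAtomicSet(int volatile *a, int newvalue); +TH_API void THAtomicSet(int32_t volatile *a, int32_t newvalue); /* * return *a */ -TH_API int THAtomicGet(int volatile *a); +TH_API int32_t THAtomicGet(int32_t volatile *a); /* * *a += value, * return previous *a */ -TH_API int THAtomicAdd(int volatile *a, int value); +TH_API int32_t THAtomicAdd(int32_t volatile *a, int32_t value); /* * check if (*a == oldvalue) * if true: set *a to newvalue, return 1 * if false: return 0 */ -TH_API int THAtomicCompareAndSwap(int volatile *a, int oldvalue, int newvalue); +TH_API int32_t THAtomicCompareAndSwap(int32_t volatile *a, int32_t oldvalue, int32_t newvalue); /****************************************************************************** @@ -49,13 +49,13 @@ TH_API int THAtomicCompareAndSwap(int volatile *a, int oldvalue, int newvalue); /* * *a++ */ -TH_API void THAtomicIncrementRef(int volatile *a); +TH_API void THAtomicIncrementRef(int32_t volatile *a); /* * *a--, * return 1 if *a == 0 after the operation, 0 otherwise */ -TH_API int THAtomicDecrementRef(int volatile *a); +TH_API int32_t THAtomicDecrementRef(int32_t volatile *a); @@ -66,25 +66,25 @@ TH_API int THAtomicDecrementRef(int volatile *a); /* * *a = newvalue */ -TH_API void THAtomicSetLong(long volatile *a, long newvalue); +TH_API void THAtomicSetLong(int64_t volatile *a, int64_t newvalue); /* * return *a */ -TH_API long THAtomicGetLong(long volatile *a); +TH_API int64_t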
THAtomicGetLong(int64_t volatile *a); /* * *a += value, * return previous *a */ -TH_API long THAtomicAddLong(long volatile *a, long value); +TH_API int64_t THAtomicAddLong(int64_t volatile *a, int64_t value); /* * check if (*a == oldvalue) * if true: set *a to newvalue, return 1 * if false: return 0 */ -TH_API long THAtomicCompareAndSwapLong(long volatile *a, long oldvalue, long newvalue); +TH_API int64_t THAtomicCompareAndSwapLong(int64_t volatile *a, int64_t oldvalue, int64_t newvalue); diff --git a/torch/lib/TH/THDiskFile.c b/torch/lib/TH/THDiskFile.c index 3f57b3b35cd9..0b9023534c20 100644 --- a/torch/lib/TH/THDiskFile.c +++ b/torch/lib/TH/THDiskFile.c @@ -177,10 +177,10 @@ static void THDiskFile_seek(THFile *self, size_t position) #if defined(_WIN64) THArgCheck(position <= (size_t)INT64_MAX, 2, "position must be smaller than INT64_MAX"); - if(_fseeki64(dfself->handle, (__int64)position, SEEK_SET) < 0) + if(_fseeki64(dfself->handle, (int64_t)position, SEEK_SET) < 0) #elif defined(_WIN32) THArgCheck(position <= (size_t)LONG_MAX, 2, "position must be smaller than LONG_MAX"); - if(fseek(dfself->handle, (long)position, SEEK_SET) < 0) + if(fseek(dfself->handle, (int32_t)position, SEEK_SET) < 0) #else THArgCheck(position <= (size_t)LLONG_MAX, 2, "position must be smaller than LLONG_MAX"); if(fseeko(dfself->handle, (off_t)position, SEEK_SET) < 0) @@ -218,9 +218,9 @@ static size_t THDiskFile_position(THFile *self) THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); #if defined(_WIN64) - __int64 offset = _ftelli64(dfself->handle); + int64_t offset = _ftelli64(dfself->handle); #elif defined(_WIN32) - long offset = ftell(dfself->handle); + int32_t offset = ftell(dfself->handle); #else off_t offset = ftello(dfself->handle); #endif @@ -365,7 +365,7 @@ READ_WRITE_METHODS(double, Double, /* For Long we need to rewrite everything, because of the special management of longSize */ -static size_t THDiskFile_readLong(THFile *self, long *data, size_t n) +static size_t THDiskFile_readLong(THFile *self, int64_t *data, size_t n) { THDiskFile *dfself = (THDiskFile*)(self); size_t nread = 0L; @@ -375,11 +375,11 @@ static size_t THDiskFile_readLong(THFile *self, long *data, size_t n) if(dfself->file.isBinary) { - if(dfself->longSize == 0 || dfself->longSize == sizeof(long)) + if(dfself->longSize == 0 || dfself->longSize == sizeof(int64_t)) { - nread = fread__(data, sizeof(long), n, dfself->handle); - if(!dfself->isNativeEncoding && (sizeof(long) > 1) && (nread > 0)) - THDiskFile_reverseMemory(data, data, sizeof(long), nread); + nread = fread__(data, sizeof(int64_t), n, dfself->handle); + if(!dfself->isNativeEncoding && (sizeof(int64_t) > 1) && (nread > 0)) + THDiskFile_reverseMemory(data, data, sizeof(int64_t), nread); } else if(dfself->longSize == 4) { nread = fread__(data, 4, n, dfself->handle); @@ -407,7 +407,7 @@ static size_t THDiskFile_readLong(THFile *self, long *data, size_t n) size_t i; for(i = 0; i < n; i++) { - int ret = fscanf(dfself->handle, "%ld", &data[i]); if(ret <= 0) break; else nread++; + int ret = fscanf(dfself->handle, "%" PRId64, &data[i]); if(ret <= 0) break; else nread++; } if(dfself->file.isAutoSpacing && (n > 0)) { @@ -427,7 +427,7 @@ static size_t THDiskFile_readLong(THFile *self, long *data, size_t n) return nread; } -static size_t THDiskFile_writeLong(THFile *self, long *data, size_t n) +static size_t THDiskFile_writeLong(THFile *self, int64_t *data, size_t n) { THDiskFile *dfself = (THDiskFile*)(self); size_t nwrite = 0L; @@ -437,17 +437,17 @@ static size_t 
THDiskFile_writeLong(THFile *self, long *data, size_t n) if(dfself->file.isBinary) { - if(dfself->longSize == 0 || dfself->longSize == sizeof(long)) + if(dfself->longSize == 0 || dfself->longSize == sizeof(int64_t)) { if(dfself->isNativeEncoding) { - nwrite = fwrite(data, sizeof(long), n, dfself->handle); + nwrite = fwrite(data, sizeof(int64_t), n, dfself->handle); } else { - char *buffer = THAlloc(sizeof(long)*n); - THDiskFile_reverseMemory(buffer, data, sizeof(long), n); - nwrite = fwrite(buffer, sizeof(long), n, dfself->handle); + char *buffer = THAlloc(sizeof(int64_t)*n); + THDiskFile_reverseMemory(buffer, data, sizeof(int64_t), n); + nwrite = fwrite(buffer, sizeof(int64_t), n, dfself->handle); THFree(buffer); } } else if(dfself->longSize == 4) @@ -455,7 +455,7 @@ static size_t THDiskFile_writeLong(THFile *self, long *data, size_t n) int32_t *buffer = THAlloc(4*n); size_t i; for(i = 0; i < n; i++) - buffer[i] = data[i]; + buffer[i] = (int32_t) data[i]; if(!dfself->isNativeEncoding) THDiskFile_reverseMemory(buffer, buffer, 4, n); nwrite = fwrite(buffer, 4, n, dfself->handle); @@ -469,7 +469,7 @@ static size_t THDiskFile_writeLong(THFile *self, long *data, size_t n) for(i = 0; i < n; i++) { buffer[2*i + !big_endian] = 0; - buffer[2*i + big_endian] = data[i]; + buffer[2*i + big_endian] = (int32_t) data[i]; } if(!dfself->isNativeEncoding) THDiskFile_reverseMemory(buffer, buffer, 8, n); @@ -482,7 +482,7 @@ static size_t THDiskFile_writeLong(THFile *self, long *data, size_t n) size_t i; for(i = 0; i < n; i++) { - int ret = fprintf(dfself->handle, "%ld", data[i]); if(ret <= 0) break; else nwrite++; + int ret = fprintf(dfself->handle, "%" PRId64, data[i]); if(ret <= 0) break; else nwrite++; if( dfself->file.isAutoSpacing && (i < n-1) ) fprintf(dfself->handle, " "); } @@ -556,7 +556,7 @@ static size_t THDiskFile_readString(THFile *self, const char *format, char **str total += TBRS_BSZ; p = THRealloc(p, total); } - if (fgets(p+pos, total-pos, dfself->handle) == NULL) /* eof? */ + if (fgets(p+pos, (int) (total-pos), dfself->handle) == NULL) /* eof? 
*/ { if(pos == 0) { diff --git a/torch/lib/TH/THFile.c b/torch/lib/TH/THFile.c index 3717b7b5cbd7..649c8543dabb 100644 --- a/torch/lib/TH/THFile.c +++ b/torch/lib/TH/THFile.c @@ -12,11 +12,11 @@ return (*self->vtable->write##TYPEC)(self, data, n); \ } -IMPLEMENT_THFILE_RW(Byte, unsigned char) -IMPLEMENT_THFILE_RW(Char, char) -IMPLEMENT_THFILE_RW(Short, short) -IMPLEMENT_THFILE_RW(Int, int) -IMPLEMENT_THFILE_RW(Long, long) +IMPLEMENT_THFILE_RW(Byte, uint8_t) +IMPLEMENT_THFILE_RW(Char, int8_t) +IMPLEMENT_THFILE_RW(Short, int16_t) +IMPLEMENT_THFILE_RW(Int, int32_t) +IMPLEMENT_THFILE_RW(Long, int64_t) IMPLEMENT_THFILE_RW(Float, float) IMPLEMENT_THFILE_RW(Double, double) IMPLEMENT_THFILE_RW(Half, THHalf) @@ -127,11 +127,11 @@ void THFile_clearError(THFile *self) THFile_write##TYPEC##Raw(self, &scalar, 1); \ } -IMPLEMENT_THFILE_SCALAR(Byte, unsigned char) -IMPLEMENT_THFILE_SCALAR(Char, char) -IMPLEMENT_THFILE_SCALAR(Short, short) -IMPLEMENT_THFILE_SCALAR(Int, int) -IMPLEMENT_THFILE_SCALAR(Long, long) +IMPLEMENT_THFILE_SCALAR(Byte, uint8_t) +IMPLEMENT_THFILE_SCALAR(Char, int8_t) +IMPLEMENT_THFILE_SCALAR(Short, int16_t) +IMPLEMENT_THFILE_SCALAR(Int, int32_t) +IMPLEMENT_THFILE_SCALAR(Long, int64_t) IMPLEMENT_THFILE_SCALAR(Float, float) IMPLEMENT_THFILE_SCALAR(Double, double) IMPLEMENT_THFILE_SCALAR(Half, THHalf) @@ -147,11 +147,11 @@ IMPLEMENT_THFILE_SCALAR(Half, THHalf) return THFile_write##TYPEC##Raw(self, storage->data, storage->size); \ } -IMPLEMENT_THFILE_STORAGE(Byte, unsigned char) -IMPLEMENT_THFILE_STORAGE(Char, char) -IMPLEMENT_THFILE_STORAGE(Short, short) -IMPLEMENT_THFILE_STORAGE(Int, int) -IMPLEMENT_THFILE_STORAGE(Long, long) +IMPLEMENT_THFILE_STORAGE(Byte, uint8_t) +IMPLEMENT_THFILE_STORAGE(Char, int8_t) +IMPLEMENT_THFILE_STORAGE(Short, int16_t) +IMPLEMENT_THFILE_STORAGE(Int, int32_t) +IMPLEMENT_THFILE_STORAGE(Long, int64_t) IMPLEMENT_THFILE_STORAGE(Float, float) IMPLEMENT_THFILE_STORAGE(Double, double) IMPLEMENT_THFILE_STORAGE(Half, THHalf) diff --git a/torch/lib/TH/THFile.h b/torch/lib/TH/THFile.h index e097bdf3436d..27041f51c709 100644 --- a/torch/lib/TH/THFile.h +++ b/torch/lib/TH/THFile.h @@ -22,19 +22,19 @@ TH_API void THFile_pedantic(THFile *self); TH_API void THFile_clearError(THFile *self); /* scalar */ -TH_API unsigned char THFile_readByteScalar(THFile *self); -TH_API char THFile_readCharScalar(THFile *self); -TH_API short THFile_readShortScalar(THFile *self); -TH_API int THFile_readIntScalar(THFile *self); -TH_API long THFile_readLongScalar(THFile *self); +TH_API uint8_t THFile_readByteScalar(THFile *self); +TH_API int8_t THFile_readCharScalar(THFile *self); +TH_API int16_t THFile_readShortScalar(THFile *self); +TH_API int32_t THFile_readIntScalar(THFile *self); +TH_API int64_t THFile_readLongScalar(THFile *self); TH_API float THFile_readFloatScalar(THFile *self); TH_API double THFile_readDoubleScalar(THFile *self); -TH_API void THFile_writeByteScalar(THFile *self, unsigned char scalar); -TH_API void THFile_writeCharScalar(THFile *self, char scalar); -TH_API void THFile_writeShortScalar(THFile *self, short scalar); -TH_API void THFile_writeIntScalar(THFile *self, int scalar); -TH_API void THFile_writeLongScalar(THFile *self, long scalar); +TH_API void THFile_writeByteScalar(THFile *self, uint8_t scalar); +TH_API void THFile_writeCharScalar(THFile *self, int8_t scalar); +TH_API void THFile_writeShortScalar(THFile *self, int16_t scalar); +TH_API void THFile_writeIntScalar(THFile *self, int32_t scalar); +TH_API void THFile_writeLongScalar(THFile *self, int64_t scalar); TH_API 
void THFile_writeFloatScalar(THFile *self, float scalar); TH_API void THFile_writeDoubleScalar(THFile *self, double scalar); @@ -56,20 +56,20 @@ TH_API size_t THFile_writeFloat(THFile *self, THFloatStorage *storage); TH_API size_t THFile_writeDouble(THFile *self, THDoubleStorage *storage); /* raw */ -TH_API size_t THFile_readByteRaw(THFile *self, unsigned char *data, size_t n); -TH_API size_t THFile_readCharRaw(THFile *self, char *data, size_t n); -TH_API size_t THFile_readShortRaw(THFile *self, short *data, size_t n); -TH_API size_t THFile_readIntRaw(THFile *self, int *data, size_t n); -TH_API size_t THFile_readLongRaw(THFile *self, long *data, size_t n); +TH_API size_t THFile_readByteRaw(THFile *self, uint8_t *data, size_t n); +TH_API size_t THFile_readCharRaw(THFile *self, int8_t *data, size_t n); +TH_API size_t THFile_readShortRaw(THFile *self, int16_t *data, size_t n); +TH_API size_t THFile_readIntRaw(THFile *self, int32_t *data, size_t n); +TH_API size_t THFile_readLongRaw(THFile *self, int64_t *data, size_t n); TH_API size_t THFile_readFloatRaw(THFile *self, float *data, size_t n); TH_API size_t THFile_readDoubleRaw(THFile *self, double *data, size_t n); TH_API size_t THFile_readStringRaw(THFile *self, const char *format, char **str_); /* you must deallocate str_ */ -TH_API size_t THFile_writeByteRaw(THFile *self, unsigned char *data, size_t n); -TH_API size_t THFile_writeCharRaw(THFile *self, char *data, size_t n); -TH_API size_t THFile_writeShortRaw(THFile *self, short *data, size_t n); -TH_API size_t THFile_writeIntRaw(THFile *self, int *data, size_t n); -TH_API size_t THFile_writeLongRaw(THFile *self, long *data, size_t n); +TH_API size_t THFile_writeByteRaw(THFile *self, uint8_t *data, size_t n); +TH_API size_t THFile_writeCharRaw(THFile *self, int8_t *data, size_t n); +TH_API size_t THFile_writeShortRaw(THFile *self, int16_t *data, size_t n); +TH_API size_t THFile_writeIntRaw(THFile *self, int32_t *data, size_t n); +TH_API size_t THFile_writeLongRaw(THFile *self, int64_t *data, size_t n); TH_API size_t THFile_writeFloatRaw(THFile *self, float *data, size_t n); TH_API size_t THFile_writeDoubleRaw(THFile *self, double *data, size_t n); TH_API size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size); diff --git a/torch/lib/TH/THFilePrivate.h b/torch/lib/TH/THFilePrivate.h index 55169c3bc75a..1bbd03cf0903 100644 --- a/torch/lib/TH/THFilePrivate.h +++ b/torch/lib/TH/THFilePrivate.h @@ -21,21 +21,21 @@ struct THFileVTable { int (*isOpened)(THFile *self); - size_t (*readByte)(THFile *self, unsigned char *data, size_t n); + size_t (*readByte)(THFile *self, uint8_t *data, size_t n); size_t (*readChar)(THFile *self, char *data, size_t n); - size_t (*readShort)(THFile *self, short *data, size_t n); - size_t (*readInt)(THFile *self, int *data, size_t n); - size_t (*readLong)(THFile *self, long *data, size_t n); + size_t (*readShort)(THFile *self, int16_t *data, size_t n); + size_t (*readInt)(THFile *self, int32_t *data, size_t n); + size_t (*readLong)(THFile *self, int64_t *data, size_t n); size_t (*readFloat)(THFile *self, float *data, size_t n); size_t (*readDouble)(THFile *self, double *data, size_t n); size_t (*readHalf)(THFile *self, THHalf *data, size_t n); size_t (*readString)(THFile *self, const char *format, char **str_); - size_t (*writeByte)(THFile *self, unsigned char *data, size_t n); + size_t (*writeByte)(THFile *self, uint8_t *data, size_t n); size_t (*writeChar)(THFile *self, char *data, size_t n); - size_t (*writeShort)(THFile *self, short *data, size_t 
n); - size_t (*writeInt)(THFile *self, int *data, size_t n); - size_t (*writeLong)(THFile *self, long *data, size_t n); + size_t (*writeShort)(THFile *self, int16_t *data, size_t n); + size_t (*writeInt)(THFile *self, int32_t *data, size_t n); + size_t (*writeLong)(THFile *self, int64_t *data, size_t n); size_t (*writeFloat)(THFile *self, float *data, size_t n); size_t (*writeDouble)(THFile *self, double *data, size_t n); size_t (*writeHalf)(THFile *self, THHalf *data, size_t n); diff --git a/torch/lib/TH/THGeneral.c b/torch/lib/TH/THGeneral.c index ac032b992e3b..08fb92e6968c 100644 --- a/torch/lib/TH/THGeneral.c +++ b/torch/lib/TH/THGeneral.c @@ -359,25 +359,27 @@ TH_API void THInferNumThreads(void) #endif } -TH_API THDescBuff _THSizeDesc(const long *size, const long ndim) { +TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim) { const int L = TH_DESC_BUFF_LEN; THDescBuff buf; char *str = buf.str; - int n = 0; + int i, n = 0; n += snprintf(str, L-n, "["); - int i; - for(i = 0; i < ndim; i++) { - if(n >= L) break; - n += snprintf(str+n, L-n, "%ld", size[i]); - if(i < ndim-1) { + + for (i = 0; i < ndim; i++) { + if (n >= L) break; + n += snprintf(str+n, L-n, "%" PRId64, size[i]); + if (i < ndim-1) { n += snprintf(str+n, L-n, " x "); } } - if(n < L - 2) { + + if (n < L - 2) { snprintf(str+n, L-n, "]"); } else { snprintf(str+L-5, 5, "...]"); } + return buf; }
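The switch from "%ld" to PRId64 above is the portability core of this patch: on LLP64 targets (64-bit Windows) long is 32-bit, so "%ld" no longer matches the 64-bit sizes. A minimal self-contained sketch of the idiom:

  #include <stdio.h>
  #include <inttypes.h>

  int main(void)
  {
    int64_t size[2] = {1024, 768};
    /* "%ld" would be wrong where long is 32-bit; PRId64 always matches int64_t */
    printf("[%" PRId64 " x %" PRId64 "]\n", size[0], size[1]);
    return 0;
  }

diff --git a/torch/lib/TH/THGeneral.h.in b/torch/lib/TH/THGeneral.h.in index 0e1cb5dead3e..4aee7b8e25bf 100644 --- a/torch/lib/TH/THGeneral.h.in +++ b/torch/lib/TH/THGeneral.h.in @@ -1,6 +1,7 @@ #ifndef TH_GENERAL_INC #define TH_GENERAL_INC +#include <stdint.h> #include <stdlib.h> #include <stdio.h> #include <math.h> @@ -10,6 +11,7 @@ #include <float.h> #include <time.h> #include <string.h> +#include <inttypes.h> #cmakedefine USE_BLAS #cmakedefine USE_LAPACK @@ -50,7 +52,7 @@ typedef struct { TH_API double THLog1p(const double x); -TH_API THDescBuff _THSizeDesc(const long *size, const long ndim); +TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim); TH_API void _THError(const char *file, const int line, const char *fmt, ...); TH_API void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...); TH_API void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data); @@ -120,7 +122,12 @@ do { \ #define THMax(X, Y) ((X) > (Y) ?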
(X) : (Y)) #if (defined(_MSC_VER) || defined(__MINGW32__)) -# define log1p(x) THLog1p(x) +#if defined(_MSC_VER) +__inline double log1p(double x) { return THLog1p(x); } +#else +inline double log1p(double x) { return THLog1p(x); } +#endif + #define snprintf _snprintf #define popen _popen #define pclose _pclose diff --git a/torch/lib/TH/THGenerateByteType.h b/torch/lib/TH/THGenerateByteType.h index 71ce7c405c12..0ec234de4d36 100644 --- a/torch/lib/TH/THGenerateByteType.h +++ b/torch/lib/TH/THGenerateByteType.h @@ -2,8 +2,9 @@ #error "You must define TH_GENERIC_FILE before including THGenerateByteType.h" #endif -#define real unsigned char -#define accreal long +#define real uint8_t +#define ureal uint8_t +#define accreal int64_t #define Real Byte #define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) #define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) @@ -12,6 +13,7 @@ #line 1 TH_GENERIC_FILE #include TH_GENERIC_FILE #undef real +#undef ureal #undef accreal #undef Real #undef THInf diff --git a/torch/lib/TH/THGenerateCharType.h b/torch/lib/TH/THGenerateCharType.h index 158dd0e8039f..349a86f40c6c 100644 --- a/torch/lib/TH/THGenerateCharType.h +++ b/torch/lib/TH/THGenerateCharType.h @@ -2,8 +2,9 @@ #error "You must define TH_GENERIC_FILE before including THGenerateCharType.h" #endif -#define real char -#define accreal long +#define real int8_t +#define ureal uint8_t +#define accreal int64_t #define Real Char #define THInf CHAR_MAX #define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) @@ -12,6 +13,7 @@ #line 1 TH_GENERIC_FILE #include TH_GENERIC_FILE #undef real +#undef ureal #undef accreal #undef Real #undef THInf diff --git a/torch/lib/TH/THGenerateIntType.h b/torch/lib/TH/THGenerateIntType.h index 1562b9e98600..5135bc5b6d82 100644 --- a/torch/lib/TH/THGenerateIntType.h +++ b/torch/lib/TH/THGenerateIntType.h @@ -2,8 +2,9 @@ #error "You must define TH_GENERIC_FILE before including THGenerateIntType.h" #endif -#define real int -#define accreal long +#define real int32_t +#define ureal uint32_t +#define accreal int64_t #define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) #define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) #define Real Int @@ -12,6 +13,7 @@ #line 1 TH_GENERIC_FILE #include TH_GENERIC_FILE #undef real +#undef ureal #undef accreal #undef Real #undef THInf diff --git a/torch/lib/TH/THGenerateLongType.h b/torch/lib/TH/THGenerateLongType.h index 75f90e1a609d..d2b9af077658 100644 --- a/torch/lib/TH/THGenerateLongType.h +++ b/torch/lib/TH/THGenerateLongType.h @@ -2,8 +2,9 @@ #error "You must define TH_GENERIC_FILE before including THGenerateLongType.h" #endif -#define real long -#define accreal long +#define real int64_t +#define ureal uint64_t +#define accreal int64_t #define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) #define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) #define Real Long @@ -12,6 +13,7 @@ #line 1 TH_GENERIC_FILE #include TH_GENERIC_FILE #undef real +#undef ureal #undef accreal #undef Real #undef THInf diff --git a/torch/lib/TH/THGenerateShortType.h b/torch/lib/TH/THGenerateShortType.h index 047e51a8d75f..5b83c476335f 100644 --- a/torch/lib/TH/THGenerateShortType.h +++ b/torch/lib/TH/THGenerateShortType.h @@ -2,8 +2,9 @@ #error "You must define TH_GENERIC_FILE before including THGenerateShortType.h" #endif -#define real short -#define accreal long +#define real int16_t +#define ureal uint16_t +#define accreal int64_t #define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) #define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) #define Real Short @@ 
-12,6 +13,7 @@ #line 1 TH_GENERIC_FILE #include TH_GENERIC_FILE #undef real +#undef ureal #undef accreal #undef Real #undef THInf diff --git a/torch/lib/TH/THMemoryFile.c b/torch/lib/TH/THMemoryFile.c index ecce6e1b1f3c..c4c5a3405b61 100644 --- a/torch/lib/TH/THMemoryFile.c +++ b/torch/lib/TH/THMemoryFile.c @@ -351,7 +351,7 @@ READ_WRITE_METHODS(double, Double, int THDiskFile_isLittleEndianCPU(void); -static size_t THMemoryFile_readLong(THFile *self, long *data, size_t n) +static size_t THMemoryFile_readLong(THFile *self, int64_t *data, size_t n) { THMemoryFile *mfself = (THMemoryFile*)self; size_t nread = 0L; @@ -364,13 +364,13 @@ static size_t THMemoryFile_readLong(THFile *self, long *data, size_t n) if(mfself->file.isBinary) { - if(mfself->longSize == 0 || mfself->longSize == sizeof(long)) + if(mfself->longSize == 0 || mfself->longSize == sizeof(int64_t)) { - size_t nByte = sizeof(long)*n; + size_t nByte = sizeof(int64_t)*n; size_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); - nread = nByteRemaining/sizeof(long); - memmove(data, mfself->storage->data+mfself->position, nread*sizeof(long)); - mfself->position += nread*sizeof(long); + nread = nByteRemaining/sizeof(int64_t); + memmove(data, mfself->storage->data+mfself->position, nread*sizeof(int64_t)); + mfself->position += nread*sizeof(int64_t); } else if(mfself->longSize == 4) { size_t nByte = 4*n; @@ -403,7 +403,7 @@ static size_t THMemoryFile_readLong(THFile *self, long *data, size_t n) size_t nByteRead = 0; char spaceChar = 0; char *spacePtr = THMemoryFile_strnextspace(mfself->storage->data+mfself->position, &spaceChar); - int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%ld%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++; + int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%" PRId64 "%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++; if(ret == EOF) { while(mfself->storage->data[mfself->position]) @@ -431,7 +431,7 @@ static size_t THMemoryFile_readLong(THFile *self, long *data, size_t n) return nread; } -static size_t THMemoryFile_writeLong(THFile *self, long *data, size_t n) +static size_t THMemoryFile_writeLong(THFile *self, int64_t *data, size_t n) { THMemoryFile *mfself = (THMemoryFile*)self; @@ -443,9 +443,9 @@ static size_t THMemoryFile_writeLong(THFile *self, long *data, size_t n) if(mfself->file.isBinary) { - if(mfself->longSize == 0 || mfself->longSize == sizeof(long)) + if(mfself->longSize == 0 || mfself->longSize == sizeof(int64_t)) { - size_t nByte = sizeof(long)*n; + size_t nByte = sizeof(int64_t)*n; THMemoryFile_grow(mfself, mfself->position+nByte); memmove(mfself->storage->data+mfself->position, data, nByte); mfself->position += nByte; @@ -456,7 +456,7 @@ static size_t THMemoryFile_writeLong(THFile *self, long *data, size_t n) int32_t *storage = (int32_t *)(mfself->storage->data + mfself->position); size_t i; for(i = 0; i < n; i++) - storage[i] = data[i]; + storage[i] = (int32_t) data[i]; mfself->position += nByte; } else /* if(mfself->longSize == 8) */ @@ -469,7 +469,7 @@ static size_t THMemoryFile_writeLong(THFile *self, long *data, size_t n) for(i = 0; i < n; i++) { storage[2*i + !big_endian] = 0; - storage[2*i + big_endian] = data[i]; + storage[2*i + big_endian] = (int32_t) data[i]; } mfself->position += nByte; } @@ -487,7 +487,7 @@ static size_t THMemoryFile_writeLong(THFile *self, long *data, size_t n) ssize_t nByteWritten; while (1) { - 
nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%ld", data[i]); + nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%" PRId64, data[i]); if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size-mfself->position) ) { mfself->position += nByteWritten; diff --git a/torch/lib/TH/THRandom.c b/torch/lib/TH/THRandom.c index fbaf2820cdc3..4b8832669116 100644 --- a/torch/lib/TH/THRandom.c +++ b/torch/lib/TH/THRandom.c @@ -50,10 +50,10 @@ int THGenerator_isValid(THGenerator *_generator) } #ifndef _WIN32 -static unsigned long readURandomLong() +static uint64_t readURandomLong() { int randDev = open("/dev/urandom", O_RDONLY); - unsigned long randValue; + uint64_t randValue; if (randDev < 0) { THError("Unable to open /dev/urandom"); } @@ -66,12 +66,12 @@ static unsigned long readURandomLong() } #endif // _WIN32 -unsigned long THRandom_seed(THGenerator *_generator) +uint64_t THRandom_seed(THGenerator *_generator) { #ifdef _WIN32 - unsigned long s = (unsigned long)time(0); + uint64_t s = (uint64_t)time(0); #else - unsigned long s = readURandomLong(); + uint64_t s = readURandomLong(); #endif THRandom_manualSeed(_generator, s); return s; @@ -137,7 +137,7 @@ unsigned long THRandom_seed(THGenerator *_generator) #define TWIST(u,v) ((MIXBITS(u,v) >> 1) ^ ((v)&1UL ? MATRIX_A : 0UL)) /*********************************************************** That's it. */ -void THRandom_manualSeed(THGenerator *_generator, unsigned long the_seed_) +void THRandom_manualSeed(THGenerator *_generator, uint64_t the_seed_) { int j; @@ -161,14 +161,14 @@ void THRandom_manualSeed(THGenerator *_generator, unsigned long the_seed_) _generator->seeded = 1; } -unsigned long THRandom_initialSeed(THGenerator *_generator) +uint64_t THRandom_initialSeed(THGenerator *_generator) { return _generator->the_initial_seed; } void THRandom_nextState(THGenerator *_generator) { - unsigned long *p = _generator->state; + uint64_t *p = _generator->state; int j; _generator->left = n; @@ -183,9 +183,9 @@ void THRandom_nextState(THGenerator *_generator) *p = p[m-n] ^ TWIST(p[0], _generator->state[0]); } -unsigned long THRandom_random(THGenerator *_generator) +uint64_t THRandom_random(THGenerator *_generator) { - unsigned long y; + uint64_t y; if (--(_generator->left) == 0) THRandom_nextState(_generator); diff --git a/torch/lib/TH/THRandom.h b/torch/lib/TH/THRandom.h index 28a14c0d7f78..f147b1686625 100644 --- a/torch/lib/TH/THRandom.h +++ b/torch/lib/TH/THRandom.h @@ -8,11 +8,11 @@ /* A THGenerator contains all the state required for a single random number stream */ typedef struct THGenerator { /* The initial seed. */ - unsigned long the_initial_seed; + uint64_t the_initial_seed; int left; /* = 1; */ int seeded; /* = 0; */ - unsigned long next; - unsigned long state[_MERSENNE_STATE_N]; /* the array for the state vector */ + uint64_t next; + uint64_t state[_MERSENNE_STATE_N]; /* the array for the state vector */ /********************************/ /* For normal distribution */ @@ -34,16 +34,16 @@ TH_API int THGenerator_isValid(THGenerator *_generator); /* Initializes the random number generator from /dev/urandom (or on Windows platforms with the current time (granularity: seconds)) and returns the seed. */ -TH_API unsigned long THRandom_seed(THGenerator *_generator); +TH_API uint64_t THRandom_seed(THGenerator *_generator); -/* Initializes the random number generator with the given long "the_seed_". 
*/ -TH_API void THRandom_manualSeed(THGenerator *_generator, unsigned long the_seed_); +/* Initializes the random number generator with the given uint64_t "the_seed_". */ +TH_API void THRandom_manualSeed(THGenerator *_generator, uint64_t the_seed_); /* Returns the starting seed used. */ -TH_API unsigned long THRandom_initialSeed(THGenerator *_generator); +TH_API uint64_t THRandom_initialSeed(THGenerator *_generator); /* Generates a uniform 32 bits integer. */ -TH_API unsigned long THRandom_random(THGenerator *_generator); +TH_API uint64_t THRandom_random(THGenerator *_generator); /* Generates a uniform random number on [0,1[. */ TH_API double THRandom_uniform(THGenerator *_generator, double a, double b);
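A minimal seeding sketch (assuming only the THRandom API above; THGenerator_new and THGenerator_free are the existing constructors declared in the same header):

  #include "THRandom.h"
  #include <inttypes.h>
  #include <stdio.h>

  int main(void)
  {
    THGenerator *gen = THGenerator_new();
    THRandom_manualSeed(gen, 0x123456789abcULL);  /* seeds wider than 32 bits now fit */
    uint64_t r = THRandom_random(gen);            /* uniform integer draw */
    double u = THRandom_uniform(gen, 0.0, 1.0);   /* uniform double in [0,1) */
    printf("%" PRIu64 " %f\n", r, u);
    THGenerator_free(gen);
    return 0;
  }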
diff --git a/torch/lib/TH/THSize.c index ccf1f61ddf76..2eb00393a763 100644 --- a/torch/lib/TH/THSize.c +++ b/torch/lib/TH/THSize.c @@ -1,6 +1,6 @@ #include "THSize.h" -int THSize_isSameSizeAs(const long *sizeA, long dimsA, const long *sizeB, long dimsB) { +int THSize_isSameSizeAs(const int64_t *sizeA, int64_t dimsA, const int64_t *sizeB, int64_t dimsB) { int d; if (dimsA != dimsB) return 0; @@ -12,7 +12,7 @@ int THSize_isSameSizeAs(const long *sizeA, long dimsA, const long *sizeB, long d return 1; } -ptrdiff_t THSize_nElement(long dims, long *size) { +ptrdiff_t THSize_nElement(int64_t dims, int64_t *size) { if(dims == 0) return 0; else diff --git a/torch/lib/TH/THSize.h index 3d39696f6b37..2927f21d5126 100644 --- a/torch/lib/TH/THSize.h +++ b/torch/lib/TH/THSize.h @@ -7,7 +7,7 @@ // THTensor functions that would work on a THSize if we had such a class in C++, // i.e. THTensor functions that depend only on the shape of the tensor, not the type. -TH_API int THSize_isSameSizeAs(const long *sizeA, long dimsA, const long *sizeB, long dimsB); -TH_API ptrdiff_t THSize_nElement(long dims, long *size); +TH_API int THSize_isSameSizeAs(const int64_t *sizeA, int64_t dimsA, const int64_t *sizeB, int64_t dimsB); +TH_API ptrdiff_t THSize_nElement(int64_t dims, int64_t *size); #endif diff --git a/torch/lib/TH/THStorage.c index 9f86eb6965f5..00ea326a7d2a 100644 --- a/torch/lib/TH/THStorage.c +++ b/torch/lib/TH/THStorage.c @@ -48,7 +48,7 @@ THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElemen return copy; } -int THLongStorage_inferSize2(THLongStorage *output, long *sizesA, long dimsA, long *sizesB, long dimsB, +int THLongStorage_inferSize2(THLongStorage *output, int64_t *sizesA, int64_t dimsA, int64_t *sizesB, int64_t dimsB, char *error_buffer, int buffer_len) { THArgCheck(sizesA != NULL, 1, "sizesA must not be null"); THArgCheck(sizesB != NULL, 2, "sizesB must not be null"); @@ -56,30 +56,30 @@ int THLongStorage_inferSize2(THLongStorage *output, long *sizesA, long dimsA, lo THArgCheck(dimsB, 1, "Can't expand empty tensor b"); ptrdiff_t ndim = dimsA > dimsB ? dimsA : dimsB; - long *expandedSizes = THAlloc(sizeof(long)*ndim); + int64_t *expandedSizes = THAlloc(sizeof(int64_t)*ndim); - for (long i = ndim - 1; i >= 0; --i) { - long offset = ndim - 1 - i; - long dimA = dimsA - 1 - offset; - long dimB = dimsB - 1 - offset; - long sizeA = (dimA >= 0) ? sizesA[dimA] : 1; - long sizeB = (dimB >= 0) ? sizesB[dimB] : 1; + for (int64_t i = ndim - 1; i >= 0; --i) { + int64_t offset = ndim - 1 - i; + int64_t dimA = dimsA - 1 - offset; + int64_t dimB = dimsB - 1 - offset; + int64_t sizeA = (dimA >= 0) ? sizesA[dimA] : 1; + int64_t sizeB = (dimB >= 0) ? sizesB[dimB] : 1; if (sizeA == sizeB || sizeA == 1 || sizeB == 1) { expandedSizes[i] = THMax(sizeA, sizeB); } else { THFree(expandedSizes); - snprintf(error_buffer, buffer_len, "The size of tensor a (%ld) must match the size of tensor b (%ld) at " - "non-singleton dimension %ld.", sizeA, sizeB, i); + snprintf(error_buffer, buffer_len, "The size of tensor a (%" PRId64 ") must match the size of tensor b (%" PRId64 ") at " + "non-singleton dimension %" PRId64 ".", sizeA, sizeB, i); return -1; } } THLongStorage_resize(output, ndim); - memcpy(THLongStorage_data(output), expandedSizes, sizeof(long)*ndim); + memcpy(THLongStorage_data(output), expandedSizes, sizeof(int64_t)*ndim); THFree(expandedSizes); return 0; } -int THLongStorage_inferSizeN(THLongStorage *output, int n, long **sizes, long *dims, +int THLongStorage_inferSizeN(THLongStorage *output, int n, int64_t **sizes, int64_t *dims, char *error_buffer, int buffer_len) { THArgCheck(n > 0, 2, "n must be greater than 0"); THArgCheck(sizes != NULL, 1, "sizes must not be null"); @@ -92,51 +92,51 @@ int THLongStorage_inferSizeN(THLongStorage *output, int n, long **sizes, long *d ndim = dims[ j ] > ndim ? dims[ j ] : ndim; } - long *expandedSizes = THAlloc(sizeof(long)*ndim); + int64_t *expandedSizes = THAlloc(sizeof(int64_t)*ndim); - for (long i = ndim - 1; i >= 0; --i) { + for (int64_t i = ndim - 1; i >= 0; --i) { expandedSizes[ i ] = 1; - long offset = ndim - 1 - i; + int64_t offset = ndim - 1 - i; for (int j = 0; j < n; ++j) { - long dim = dims[ j ] - 1 - offset; - long size = (dim >= 0) ? sizes[ j ][ dim ] : 1; + int64_t dim = dims[ j ] - 1 - offset; + int64_t size = (dim >= 0) ? sizes[ j ][ dim ] : 1; if (size == expandedSizes[ i ] || size == 1 || expandedSizes[ i ] == 1) { expandedSizes[ i ] = THMax(expandedSizes[ i ], size); } else { THFree(expandedSizes); - snprintf(error_buffer, buffer_len, "The size of tensor %i (%ld) must match the expanded size" - "of tensor (%ld) at non-singleton dimension %ld.", j, size, expandedSizes[ i ], i); + snprintf(error_buffer, buffer_len, "The size of tensor %i (%" PRId64 ") must match the expanded size " + "of tensor (%" PRId64 ") at non-singleton dimension %" PRId64 ".", j, size, expandedSizes[ i ], i); return -1; } } } THLongStorage_resize(output, ndim); - memcpy(THLongStorage_data(output), expandedSizes, sizeof(long)*ndim); + memcpy(THLongStorage_data(output), expandedSizes, sizeof(int64_t)*ndim); THFree(expandedSizes); return 0; }
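A minimal sketch of the broadcasting contract THLongStorage_inferSize2 implements (assuming only the TH API above): trailing dimensions are aligned and size-1 dimensions stretch, so {8,1,6} and {7,1} expand to {8,7,6}:

  #include "TH.h"
  #include <inttypes.h>
  #include <stdio.h>

  int main(void)
  {
    int64_t a[3] = {8, 1, 6};
    int64_t b[2] = {7, 1};
    char err[256];
    THLongStorage *out = THLongStorage_new();
    if (THLongStorage_inferSize2(out, a, 3, b, 2, err, 256) == 0)
      printf("%" PRId64 " x %" PRId64 " x %" PRId64 "\n",  /* prints 8 x 7 x 6 */
             THLongStorage_data(out)[0],
             THLongStorage_data(out)[1],
             THLongStorage_data(out)[2]);
    else
      printf("broadcast failed: %s\n", err);
    THLongStorage_free(out);
    return 0;
  }

-int THLongStorage_inferExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim, - THLongStorage *sizes, long **expandedSizes, long **expandedStrides, +int THLongStorage_inferExpandGeometry(int64_t *tensorSizes, int64_t *tensorStrides, int64_t tensorDim, + THLongStorage *sizes, int64_t **expandedSizes, int64_t **expandedStrides, char *error_buffer, int buffer_len) { ptrdiff_t ndim = THLongStorage_size(sizes); - long *expandedSizesCalc = THAlloc(sizeof(long)*ndim); - long *expandedStridesCalc = THAlloc(sizeof(long)*ndim); + int64_t *expandedSizesCalc = THAlloc(sizeof(int64_t)*ndim); + int64_t *expandedStridesCalc = THAlloc(sizeof(int64_t)*ndim); // create a new geometry for the tensors - for (long i = ndim - 1; i >= 0; --i) { - long offset = ndim - 1 - i; - long dim = tensorDim - 1 - offset; - long size = (dim >= 0) ? tensorSizes[dim] : 1; - long stride = (dim >= 0) ? + for (int64_t i = ndim - 1; i >= 0; --i) { + int64_t offset = ndim - 1 - i; + int64_t dim = tensorDim - 1 - offset; + int64_t size = (dim >= 0) ?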
tensorSizes[dim] : 1; + int64_t stride = (dim >= 0) ? tensorStrides[dim] : expandedSizesCalc[i + 1] * expandedStridesCalc[i+1]; - long targetSize = THLongStorage_data(sizes)[i]; + int64_t targetSize = THLongStorage_data(sizes)[i]; if (targetSize == -1) { if (dim < 0) { THFree(expandedSizesCalc); THFree(expandedStridesCalc); - snprintf(error_buffer, buffer_len, "The expanded size of the tensor (%ld) isn't allowed in a leading, non-existing dimension %ld.", targetSize, i); + snprintf(error_buffer, buffer_len, "The expanded size of the tensor (%" PRId64 ") isn't allowed in a leading, non-existing dimension %" PRId64 ".", targetSize, i); return -1; } else { targetSize = size; @@ -149,8 +149,8 @@ int THLongStorage_inferExpandGeometry(long *tensorSizes, long *tensorStrides, lo } else { THFree(expandedSizesCalc); THFree(expandedStridesCalc); - snprintf(error_buffer, buffer_len, "The expanded size of the tensor (%ld) must match the existing size (%ld) at " - "non-singleton dimension %ld.", targetSize, size, i); + snprintf(error_buffer, buffer_len, "The expanded size of the tensor (%" PRId64 ") must match the existing size (%" PRId64 ") at " + "non-singleton dimension %" PRId64 ".", targetSize, size, i); return -1; } } diff --git a/torch/lib/TH/THStorage.h b/torch/lib/TH/THStorage.h index fb7946bd98d9..49a74538e3ca 100644 --- a/torch/lib/TH/THStorage.h +++ b/torch/lib/TH/THStorage.h @@ -27,13 +27,13 @@ TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size); TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement); // Given the sizes of {2,N} tensors, write out the size when the tensors are expanded together. -TH_API int THLongStorage_inferSize2(THLongStorage *output, long *sizesA, long dimsA, - long *sizesB, long dimsB, char *error_buffer, int buffer_len); -TH_API int THLongStorage_inferSizeN(THLongStorage *output, int n, long **sizes, long *dims, +TH_API int THLongStorage_inferSize2(THLongStorage *output, int64_t *sizesA, int64_t dimsA, + int64_t *sizesB, int64_t dimsB, char *error_buffer, int buffer_len); +TH_API int THLongStorage_inferSizeN(THLongStorage *output, int n, int64_t **sizes, int64_t *dims, char *error_buffer, int buffer_len); -TH_API int THLongStorage_inferExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim, - THLongStorage *sizes, long **expandedSizes, long **expandedStrides, +TH_API int THLongStorage_inferExpandGeometry(int64_t *tensorSizes, int64_t *tensorStrides, int64_t tensorDim, + THLongStorage *sizes, int64_t **expandedSizes, int64_t **expandedStrides, char *error_buffer, int buffer_len); #endif diff --git a/torch/lib/TH/THTensorApply.h b/torch/lib/TH/THTensorApply.h index 7f48da47ec5a..d17c0423733b 100644 --- a/torch/lib/TH/THTensorApply.h +++ b/torch/lib/TH/THTensorApply.h @@ -32,8 +32,8 @@ #define __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, ALLOW_CONTIGUOUS) \ TYPE *TENSOR##_data = NULL; \ - long *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \ - long TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \ + int64_t *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \ + int64_t TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \ int TENSOR##_contiguous = ALLOW_CONTIGUOUS && DIM < 0; \ TENSOR##_n = (TENSOR->nDimension ? 
1 : 0); \ for(TENSOR##_i = 0; TENSOR##_i < TENSOR->nDimension; TENSOR##_i++) \ @@ -65,7 +65,7 @@ TENSOR##_dim++; \ } \ /* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \ - TENSOR##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR##_dim)); \ + TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*(3*TENSOR##_dim)); \ TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \ TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \ TH_TENSOR_dim_index = TENSOR##_dim-1; \ @@ -137,7 +137,7 @@ #define TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \ { \ int TH_TENSOR_APPLY_hasFinished = 0; \ - long TH_TENSOR_dim_index = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \ @@ -184,7 +184,7 @@ #define TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, DIM, CODE) \ { \ int TH_TENSOR_APPLY_hasFinished = 0; \ - long TH_TENSOR_dim_index = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ \ @@ -217,7 +217,7 @@ #define TH_TENSOR_APPLY_D(TYPE, TENSOR, DIM, CODE) \ { \ int TH_TENSOR_APPLY_hasFinished = 0; \ - long TH_TENSOR_dim_index = 0; \ + int64_t TH_TENSOR_dim_index = 0; \ __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, 0) \ \ while(!TH_TENSOR_APPLY_hasFinished) \ diff --git a/torch/lib/TH/THTensorDimApply.h b/torch/lib/TH/THTensorDimApply.h index 6727e1f7f05d..aba6f9303231 100644 --- a/torch/lib/TH/THTensorDimApply.h +++ b/torch/lib/TH/THTensorDimApply.h @@ -4,12 +4,12 @@ #define TH_TENSOR_DIM_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIMENSION, CODE) \ { \ TYPE1 *TENSOR1##_data = NULL; \ - long TENSOR1##_stride = 0, TENSOR1##_size = 0; \ + int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \ TYPE2 *TENSOR2##_data = NULL; \ - long TENSOR2##_stride = 0, TENSOR2##_size = 0; \ + int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \ TYPE3 *TENSOR3##_data = NULL; \ - long TENSOR3##_stride = 0, TENSOR3##_size = 0; \ - long *TH_TENSOR_DIM_APPLY_counter = NULL; \ + int64_t TENSOR3##_stride = 0, TENSOR3##_size = 0; \ + int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ @@ -48,7 +48,7 @@ #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, DIMENSION); \ } \ \ - TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR1->nDimension)); \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->nDimension)); \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ @@ -116,7 +116,7 @@ * specified DIMENSION. This function makes it easy to store the output from reducing the * TENSOR at index. 
For example, in the sum example described below, we could instead do: * - * long i = 0; + * int64_t i = 0; * TYPE1 sum; * * for (i = 0; i < TENSOR1##_size; ++i) { @@ -130,10 +130,10 @@ #define TH_TENSOR_DIM_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, DIMENSION, CODE) \ { \ TYPE1 *TENSOR1##_data = NULL; \ - long TENSOR1##_stride = 0, TENSOR1##_size = 0; \ + int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \ TYPE2 *TENSOR2##_data = NULL; \ - long TENSOR2##_stride = 0, TENSOR2##_size = 0; \ - long *TH_TENSOR_DIM_APPLY_counter = NULL; \ + int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \ + int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ @@ -158,7 +158,7 @@ } \ } \ \ - TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR1->nDimension)); \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->nDimension)); \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ @@ -232,7 +232,7 @@ * And at each point, we can access the data for each of the four elements of the Tensor via * TENSOR##_stride. So for example, if we wanted to sum the elements there, we could do: * - * long i = 0; + * int64_t i = 0; * TYPE sum; * for (i = 0; i < TENSOR##_size; i++) { * sum += TENSOR##_data[i * TENSOR##_stride] @@ -256,8 +256,8 @@ #define TH_TENSOR_DIM_APPLY(TYPE, TENSOR, DIMENSION, CODE) \ { \ TYPE *TENSOR##_data = NULL; \ - long TENSOR##_stride = 0, TENSOR##_size = 0; \ - long *TH_TENSOR_DIM_APPLY_counter = NULL; \ + int64_t TENSOR##_stride = 0, TENSOR##_size = 0; \ + int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ @@ -268,7 +268,7 @@ TENSOR##_stride = (TENSOR)->stride[DIMENSION]; \ TENSOR##_size = TENSOR->size[DIMENSION]; \ /* Counter stores the indices into the Tensor at any time */ \ - TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR->nDimension)); \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR->nDimension)); \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->nDimension; TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ diff --git a/torch/lib/TH/generic/THBlas.c b/torch/lib/TH/generic/THBlas.c index bcd2a65b3d5e..a54ac54f89e8 100644 --- a/torch/lib/TH/generic/THBlas.c +++ b/torch/lib/TH/generic/THBlas.c @@ -39,7 +39,7 @@ TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float -void THBlas_(swap)(long n, real *x, long incx, real *y, long incy) +void THBlas_(swap)(int64_t n, real *x, int64_t incx, real *y, int64_t incy) { if(n == 1) { @@ -63,7 +63,7 @@ void THBlas_(swap)(long n, real *x, long incx, real *y, long incy) } #endif { - long i; + int64_t i; for(i = 0; i < n; i++) { real z = x[i*incx]; @@ -73,7 +73,7 @@ void THBlas_(swap)(long n, real *x, long incx, real *y, long incy) } } -void THBlas_(scal)(long n, real a, real *x, long incx) +void THBlas_(scal)(int64_t n, real a, real *x, int64_t incx) { if(n == 1) incx = 1; @@ -93,7 +93,7 @@ void THBlas_(scal)(long n, real a, real *x, long incx) } #endif { - long i; + int64_t i; for(i = 0; i < n; i++) { if (a == 0) { x[i*incx] = 0; @@ -104,7 +104,7 @@ void THBlas_(scal)(long n, real a, real *x, long incx) } } -void THBlas_(copy)(long n, real *x, long incx, real *y, long incy) +void THBlas_(copy)(int64_t n, real *x, int64_t incx, real *y, int64_t incy) { if(n == 1) { 
@@ -128,13 +128,13 @@ void THBlas_(copy)(long n, real *x, long incx, real *y, long incy) } #endif { - long i; + int64_t i; for(i = 0; i < n; i++) y[i*incy] = x[i*incx]; } } -void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy) +void THBlas_(axpy)(int64_t n, real a, real *x, int64_t incx, real *y, int64_t incy) { if(n == 1) { @@ -158,13 +158,13 @@ void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy) } #endif { - long i; + int64_t i; for(i = 0; i < n; i++) y[i*incy] += a*x[i*incx]; } } -real THBlas_(dot)(long n, real *x, long incx, real *y, long incy) +real THBlas_(dot)(int64_t n, real *x, int64_t incx, real *y, int64_t incy) { if(n == 1) { @@ -187,7 +187,7 @@ real THBlas_(dot)(long n, real *x, long incx, real *y, long incy) } #endif { - long i; + int64_t i; real sum = 0; for(i = 0; i < n; i++) sum += x[i*incx]*y[i*incy]; @@ -195,7 +195,7 @@ real THBlas_(dot)(long n, real *x, long incx, real *y, long incy) } } -void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, real *x, long incx, real beta, real *y, long incy) +void THBlas_(gemv)(char trans, int64_t m, int64_t n, real alpha, real *a, int64_t lda, real *x, int64_t incx, real beta, real *y, int64_t incy) { if(n == 1) lda = m; @@ -221,7 +221,7 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re } #endif { - long i, j; + int64_t i, j; if( (trans == 'T') || (trans == 't') ) { @@ -253,7 +253,7 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re } } -void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long incy, real *a, long lda) +void THBlas_(ger)(int64_t m, int64_t n, real alpha, real *x, int64_t incx, real *y, int64_t incy, real *a, int64_t lda) { if(n == 1) lda = m; @@ -276,7 +276,7 @@ void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long } #endif { - long i, j; + int64_t i, j; for(j = 0; j < n; j++) { real *column_ = a+j*lda; @@ -287,7 +287,7 @@ void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long } } -void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha, real *a, long lda, real *b, long ldb, real beta, real *c, long ldc) +void THBlas_(gemm)(char transa, char transb, int64_t m, int64_t n, int64_t k, real alpha, real *a, int64_t lda, real *b, int64_t ldb, real beta, real *c, int64_t ldc) { int transa_ = ((transa == 't') || (transa == 'T')); int transb_ = ((transb == 't') || (transb == 'T')); @@ -336,7 +336,7 @@ void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha, } #endif { - long i, j, l; + int64_t i, j, l; if(!transa_ && !transb_) { real *a_ = a;
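A minimal calling sketch (assuming the generic header is instantiated as usual through TH.h, so Real=Float yields THFloatBlas_axpy):

  #include "TH.h"

  int main(void)
  {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    /* y := 2*x + y; n, incx and incy are int64_t after this change */
    THFloatBlas_axpy(4, 2.0f, x, 1, y, 1);
    return 0;
  }

diff --git a/torch/lib/TH/THBlas.h index 9e14f5a844d5..c36e796a0cb5 100644 --- a/torch/lib/TH/THBlas.h +++ b/torch/lib/TH/THBlas.h @@ -3,17 +3,17 @@ #else /* Level 1 */ -TH_API void THBlas_(swap)(long n, real *x, long incx, real *y, long incy); -TH_API void THBlas_(scal)(long n, real a, real *x, long incx); -TH_API void THBlas_(copy)(long n, real *x, long incx, real *y, long incy); -TH_API void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy); -TH_API real THBlas_(dot)(long n, real *x, long incx, real *y, long incy); +TH_API void THBlas_(swap)(int64_t n, real *x, int64_t incx, real *y, int64_t incy); +TH_API void THBlas_(scal)(int64_t n, real a, real *x, int64_t incx); +TH_API void THBlas_(copy)(int64_t n, real *x, int64_t incx, real *y, int64_t incy); +TH_API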
void THBlas_(axpy)(int64_t n, real a, real *x, int64_t incx, real *y, int64_t incy); +TH_API real THBlas_(dot)(int64_t n, real *x, int64_t incx, real *y, int64_t incy); /* Level 2 */ -TH_API void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, real *x, long incx, real beta, real *y, long incy); -TH_API void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long incy, real *a, long lda); +TH_API void THBlas_(gemv)(char trans, int64_t m, int64_t n, real alpha, real *a, int64_t lda, real *x, int64_t incx, real beta, real *y, int64_t incy); +TH_API void THBlas_(ger)(int64_t m, int64_t n, real alpha, real *x, int64_t incx, real *y, int64_t incy, real *a, int64_t lda); /* Level 3 */ -TH_API void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha, real *a, long lda, real *b, long ldb, real beta, real *c, long ldc); +TH_API void THBlas_(gemm)(char transa, char transb, int64_t m, int64_t n, int64_t k, real alpha, real *a, int64_t lda, real *b, int64_t ldb, real beta, real *c, int64_t ldc); #endif diff --git a/torch/lib/TH/generic/THTensor.c b/torch/lib/TH/generic/THTensor.c index 8ebec674a07a..b6db905f60db 100644 --- a/torch/lib/TH/generic/THTensor.c +++ b/torch/lib/TH/generic/THTensor.c @@ -18,14 +18,14 @@ int THTensor_(nDimension)(const THTensor *self) return self->nDimension; } -long THTensor_(size)(const THTensor *self, int dim) +int64_t THTensor_(size)(const THTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "dimension %d out of range of %dD tensor", dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); return self->size[dim]; } -long THTensor_(stride)(const THTensor *self, int dim) +int64_t THTensor_(stride)(const THTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "dimension %d out of range of %dD tensor", dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); @@ -112,34 +112,34 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, return self; } THTensor *THTensor_(newWithStorage1d)(THStorage *storage, ptrdiff_t storageOffset, - long size0, long stride0) + int64_t size0, int64_t stride0) { return THTensor_(newWithStorage4d)(storage, storageOffset, size0, stride0, -1, -1, -1, -1, -1, -1); } THTensor *THTensor_(newWithStorage2d)(THStorage *storage, ptrdiff_t storageOffset, - long size0, long stride0, - long size1, long stride1) + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1) { return THTensor_(newWithStorage4d)(storage, storageOffset, size0, stride0, size1, stride1, -1, -1, -1, -1); } THTensor *THTensor_(newWithStorage3d)(THStorage *storage, ptrdiff_t storageOffset, - long size0, long stride0, - long size1, long stride1, - long size2, long stride2) + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2) { return THTensor_(newWithStorage4d)(storage, storageOffset, size0, stride0, size1, stride1, size2, stride2, -1, -1); } THTensor *THTensor_(newWithStorage4d)(THStorage *storage, ptrdiff_t storageOffset, - long size0, long stride0, - long size1, long stride1, - long size2, long stride2, - long size3, long stride3) + int64_t size0, int64_t stride0, + int64_t size1, int64_t stride1, + int64_t size2, int64_t stride2, + int64_t size3, int64_t stride3) { - long size[4] = {size0, size1, size2, size3}; - long stride[4] = {stride0, stride1, stride2, stride3}; + int64_t size[4] = {size0, size1, size2, size3}; + int64_t stride[4] = {stride0, stride1, stride2, stride3}; THTensor *self = 
THAlloc(sizeof(THTensor)); THTensor_(rawInit)(self); @@ -153,24 +153,24 @@ THTensor *THTensor_(newWithSize)(THLongStorage *size, THLongStorage *stride) return THTensor_(newWithStorage)(NULL, 0, size, stride); } -THTensor *THTensor_(newWithSize1d)(long size0) +THTensor *THTensor_(newWithSize1d)(int64_t size0) { return THTensor_(newWithSize4d)(size0, -1, -1, -1); } -THTensor *THTensor_(newWithSize2d)(long size0, long size1) +THTensor *THTensor_(newWithSize2d)(int64_t size0, int64_t size1) { return THTensor_(newWithSize4d)(size0, size1, -1, -1); } -THTensor *THTensor_(newWithSize3d)(long size0, long size1, long size2) +THTensor *THTensor_(newWithSize3d)(int64_t size0, int64_t size1, int64_t size2) { return THTensor_(newWithSize4d)(size0, size1, size2, -1); } -THTensor *THTensor_(newWithSize4d)(long size0, long size1, long size2, long size3) +THTensor *THTensor_(newWithSize4d)(int64_t size0, int64_t size1, int64_t size2, int64_t size3) { - long size[4] = {size0, size1, size2, size3}; + int64_t size[4] = {size0, size1, size2, size3}; THTensor *self = THAlloc(sizeof(THTensor)); THTensor_(rawInit)(self); @@ -198,14 +198,14 @@ THTensor *THTensor_(newContiguous)(THTensor *self) } } -THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, long sliceIndex_) +THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, int64_t sliceIndex_) { THTensor *self = THTensor_(newWithTensor)(tensor); THTensor_(select)(self, NULL, dimension_, sliceIndex_); return self; } -THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long firstIndex_, long size_) +THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_) { THTensor *self = THTensor_(newWithTensor)(tensor); THTensor_(narrow)(self, NULL, dimension_, firstIndex_, size_); @@ -219,7 +219,7 @@ THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimensi return self; } -THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long step_) +THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, int64_t size_, int64_t step_) { THTensor *self = THTensor_(newWithTensor)(tensor); THTensor_(unfold)(self, NULL, dimension_, size_, step_); @@ -256,31 +256,31 @@ void THTensor_(resizeAs)(THTensor *self, THTensor *src) THTensor_(resizeNd)(self, src->nDimension, src->size, NULL); } -void THTensor_(resize1d)(THTensor *tensor, long size0) +void THTensor_(resize1d)(THTensor *tensor, int64_t size0) { THTensor_(resize4d)(tensor, size0, -1, -1, -1); } -void THTensor_(resize2d)(THTensor *tensor, long size0, long size1) +void THTensor_(resize2d)(THTensor *tensor, int64_t size0, int64_t size1) { THTensor_(resize4d)(tensor, size0, size1, -1, -1); } -void THTensor_(resize3d)(THTensor *tensor, long size0, long size1, long size2) +void THTensor_(resize3d)(THTensor *tensor, int64_t size0, int64_t size1, int64_t size2) { THTensor_(resize4d)(tensor, size0, size1, size2, -1); } -void THTensor_(resize4d)(THTensor *self, long size0, long size1, long size2, long size3) +void THTensor_(resize4d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3) { - long size[4] = {size0, size1, size2, size3}; + int64_t size[4] = {size0, size1, size2, size3}; THTensor_(resizeNd)(self, 4, size, NULL); } -void THTensor_(resize5d)(THTensor *self, long size0, long size1, long size2, long size3, long size4) +void THTensor_(resize5d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3, int64_t size4) { - long size[5] = {size0, size1, size2, size3, size4}; 
+ int64_t size[5] = {size0, size1, size2, size3, size4}; THTensor_(resizeNd)(self, 5, size, NULL); } @@ -297,8 +297,8 @@ void THTensor_(expand)(THTensor *r, THTensor *tensor, THLongStorage *sizes) { "the number of sizes provided must be greater or equal to the " "number of dimensions in the tensor"); - long *expandedSizes; - long *expandedStrides; + int64_t *expandedSizes; + int64_t *expandedStrides; char error_buffer[1024]; int ret = THLongStorage_inferExpandGeometry(tensor->size, tensor->stride, THTensor_(nDimension)(tensor), @@ -321,8 +321,8 @@ void THTensor_(expandNd)(THTensor **rets, THTensor **ops, int count) { THArgCheck(THTensor_(nDimension)(ops[i]) > 0, i, "can't expand empty tensor %d", i); } - long **op_sizes = THAlloc(sizeof(long*) * count); - long *op_dims = THAlloc(sizeof(long) * count); + int64_t **op_sizes = THAlloc(sizeof(int64_t*) * count); + int64_t *op_dims = THAlloc(sizeof(int64_t) * count); for (int i = 0; i < count; ++i) { op_sizes[i] = ops[i]->size; @@ -383,7 +383,7 @@ void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storag } void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_) + int64_t size0_, int64_t stride0_) { THTensor_(setStorage4d)(self, storage_, storageOffset_, size0_, stride0_, @@ -393,8 +393,8 @@ void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t stor } void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_) + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_) { THTensor_(setStorage4d)(self, storage_, storageOffset_, size0_, stride0_, @@ -404,9 +404,9 @@ void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t stor } void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_) + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_) { THTensor_(setStorage4d)(self, storage_, storageOffset_, size0_, stride0_, @@ -416,20 +416,20 @@ void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, ptrdiff_t stor } void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_, - long size3_, long stride3_) + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_) { - long size[4] = {size0_, size1_, size2_, size3_}; - long stride[4] = {stride0_, stride1_, stride2_, stride3_}; + int64_t size[4] = {size0_, size1_, size2_, size3_}; + int64_t stride[4] = {stride0_, stride1_, stride2_, stride3_}; THTensor_(setStorageNd)(self, storage_, storageOffset_, 4, size, stride); } -void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, long firstIndex, long size) +void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t firstIndex, int64_t size) { if(!src) src = self; @@ -446,7 +446,7 @@ void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, long firstI self->size[dimension] = size; } -void THTensor_(select)(THTensor *self, THTensor *src, int dimension, long sliceIndex) +void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sliceIndex) { int d; @@ -469,7 +469,7 @@ void 
THTensor_(select)(THTensor *self, THTensor *src, int dimension, long sliceI void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dimension2) { - long z; + int64_t z; if(!src) src = self; @@ -490,10 +490,10 @@ void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dim self->size[dimension2] = z; } -void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, long size, long step) +void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t size, int64_t step) { - long *newSize; - long *newStride; + int64_t *newSize; + int64_t *newStride; int d; if(!src) @@ -506,8 +506,8 @@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, long size, THTensor_(set)(self, src); - newSize = THAlloc(sizeof(long)*(self->nDimension+1)); - newStride = THAlloc(sizeof(long)*(self->nDimension+1)); + newSize = THAlloc(sizeof(int64_t)*(self->nDimension+1)); + newStride = THAlloc(sizeof(int64_t)*(self->nDimension+1)); newSize[self->nDimension] = size; newStride[self->nDimension] = self->stride[dimension]; @@ -601,8 +601,8 @@ void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) THTensor_(set)(self, src); - self->size = (long*)THRealloc(self->size, sizeof(long)*(self->nDimension+1)); - self->stride = (long*)THRealloc(self->stride, sizeof(long)*(self->nDimension+1)); + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->nDimension+1)); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->nDimension+1)); self->nDimension++; for (d = self->nDimension-1; d > dimension; d--) { self->size[d] = self->size[d-1]; @@ -621,9 +621,9 @@ int THTensor_(isTransposed)(const THTensor *self) if (THTensor_(isContiguous)(self)) { return 0; } - long max_stride = 1; - long size_max_stride = 1; - long z = 1; + int64_t max_stride = 1; + int64_t size_max_stride = 1; + int64_t z = 1; int d; for (d = 0; d < self->nDimension; ++d) { if (self->stride[d] == 0 && self->size[d] != 1) @@ -642,7 +642,7 @@ int THTensor_(isTransposed)(const THTensor *self) int THTensor_(isContiguous)(const THTensor *self) { - long z = 1; + int64_t z = 1; int d; for(d = self->nDimension-1; d >= 0; d--) { @@ -762,7 +762,7 @@ static void THTensor_(rawInit)(THTensor *self) self->flag = TH_TENSOR_REFCOUNTED; } -void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride) +void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { /* storage */ if(self->storage != storage) @@ -788,7 +788,7 @@ void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t stora THTensor_(resizeNd)(self, nDimension, size, stride); } -void THTensor_(resizeNd)(THTensor *self, int nDimension, long *size, long *stride) +void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t *stride) { int d; int nDimension_; @@ -822,8 +822,8 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, long *size, long *strid { if(nDimension != self->nDimension) { - self->size = THRealloc(self->size, sizeof(long)*nDimension); - self->stride = THRealloc(self->stride, sizeof(long)*nDimension); + self->size = THRealloc(self->size, sizeof(int64_t)*nDimension); + self->stride = THRealloc(self->stride, sizeof(int64_t)*nDimension); self->nDimension = nDimension; } @@ -855,56 +855,56 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, long *size, long *strid self->nDimension = 0; } -void 
THTensor_(set1d)(THTensor *tensor, long x0, real value) +void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) { THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); } -real THTensor_(get1d)(const THTensor *tensor, long x0) +real THTensor_(get1d)(const THTensor *tensor, int64_t x0) { THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); } -void THTensor_(set2d)(THTensor *tensor, long x0, long x1, real value) +void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value) { THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); } -real THTensor_(get2d)(const THTensor *tensor, long x0, long x1) +real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1) { THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); } -void THTensor_(set3d)(THTensor *tensor, long x0, long x1, long x2, real value) +void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) { THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); } -real THTensor_(get3d)(const THTensor *tensor, long x0, long x1, long x2) +real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2) { THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); } -void THTensor_(set4d)(THTensor *tensor, long x0, long x1, long x2, long x3, real value) +void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); } -real THTensor_(get4d)(const THTensor *tensor, long x0, long x1, long x2, long x3) +real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && 
(x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); @@ -922,7 +922,7 @@ THDescBuff THTensor_(desc)(const THTensor *tensor) { int i; for(i = 0; i < tensor->nDimension; i++) { if(n >= L) break; - n += snprintf(str+n, L-n, "%ld", tensor->size[i]); + n += snprintf(str+n, L-n, "%" PRId64, tensor->size[i]); if(i < tensor->nDimension-1) { n += snprintf(str+n, L-n, "x"); } diff --git a/torch/lib/TH/generic/THTensor.h b/torch/lib/TH/generic/THTensor.h index 2c2aeba21692..c51c21b0ecc6 100644 --- a/torch/lib/TH/generic/THTensor.h +++ b/torch/lib/TH/generic/THTensor.h @@ -8,8 +8,8 @@ typedef struct THTensor { - long *size; - long *stride; + int64_t *size; + int64_t *stride; int nDimension; // Note: storage->size may be greater than the recorded size @@ -27,8 +27,8 @@ typedef struct THTensor TH_API THStorage* THTensor_(storage)(const THTensor *self); TH_API ptrdiff_t THTensor_(storageOffset)(const THTensor *self); TH_API int THTensor_(nDimension)(const THTensor *self); -TH_API long THTensor_(size)(const THTensor *self, int dim); -TH_API long THTensor_(stride)(const THTensor *self, int dim); +TH_API int64_t THTensor_(size)(const THTensor *self, int dim); +TH_API int64_t THTensor_(stride)(const THTensor *self, int dim); TH_API THLongStorage *THTensor_(newSizeOf)(THTensor *self); TH_API THLongStorage *THTensor_(newStrideOf)(THTensor *self); TH_API real *THTensor_(data)(const THTensor *self); @@ -43,33 +43,33 @@ TH_API THTensor *THTensor_(newWithTensor)(THTensor *tensor); /* stride might be NULL */ TH_API THTensor *THTensor_(newWithStorage)(THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); TH_API THTensor *THTensor_(newWithStorage1d)(THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_); + int64_t size0_, int64_t stride0_); TH_API THTensor *THTensor_(newWithStorage2d)(THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_); + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); TH_API THTensor *THTensor_(newWithStorage3d)(THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_); + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); TH_API THTensor *THTensor_(newWithStorage4d)(THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_, - long size3_, long stride3_); + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); /* stride might be NULL */ TH_API THTensor *THTensor_(newWithSize)(THLongStorage *size_, THLongStorage *stride_); -TH_API THTensor *THTensor_(newWithSize1d)(long size0_); -TH_API THTensor *THTensor_(newWithSize2d)(long size0_, long size1_); -TH_API THTensor *THTensor_(newWithSize3d)(long size0_, long size1_, long size2_); -TH_API THTensor *THTensor_(newWithSize4d)(long size0_, long size1_, long size2_, long size3_); +TH_API THTensor *THTensor_(newWithSize1d)(int64_t size0_); +TH_API THTensor *THTensor_(newWithSize2d)(int64_t size0_, int64_t size1_); +TH_API THTensor *THTensor_(newWithSize3d)(int64_t size0_, int64_t size1_, int64_t size2_); +TH_API THTensor *THTensor_(newWithSize4d)(int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); TH_API THTensor *THTensor_(newClone)(THTensor *self); TH_API THTensor 
*THTensor_(newContiguous)(THTensor *tensor); -TH_API THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, long sliceIndex_); -TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long firstIndex_, long size_); +TH_API THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, int64_t sliceIndex_); +TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_); TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_); -TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long step_); +TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, int64_t size_, int64_t step_); TH_API THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size); TH_API THTensor *THTensor_(newExpand)(THTensor *tensor, THLongStorage *size); @@ -78,35 +78,35 @@ TH_API void THTensor_(expandNd)(THTensor **rets, THTensor **ops, int count); TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride); TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src); -TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, long *size, long *stride); -TH_API void THTensor_(resize1d)(THTensor *tensor, long size0_); -TH_API void THTensor_(resize2d)(THTensor *tensor, long size0_, long size1_); -TH_API void THTensor_(resize3d)(THTensor *tensor, long size0_, long size1_, long size2_); -TH_API void THTensor_(resize4d)(THTensor *tensor, long size0_, long size1_, long size2_, long size3_); -TH_API void THTensor_(resize5d)(THTensor *tensor, long size0_, long size1_, long size2_, long size3_, long size4_); +TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, int64_t *size, int64_t *stride); +TH_API void THTensor_(resize1d)(THTensor *tensor, int64_t size0_); +TH_API void THTensor_(resize2d)(THTensor *tensor, int64_t size0_, int64_t size1_); +TH_API void THTensor_(resize3d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_); +TH_API void THTensor_(resize4d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_); +TH_API void THTensor_(resize5d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_, int64_t size4_); TH_API void THTensor_(set)(THTensor *self, THTensor *src); TH_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); -TH_API void THTensor_(setStorageNd)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, int nDimension, long *size, long *stride); +TH_API void THTensor_(setStorageNd)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, int nDimension, int64_t *size, int64_t *stride); TH_API void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_); + int64_t size0_, int64_t stride0_); TH_API void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_); + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_); TH_API void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_); + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_); TH_API void THTensor_(setStorage4d)(THTensor *self, 
THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_, - long size3_, long stride3_); + int64_t size0_, int64_t stride0_, + int64_t size1_, int64_t stride1_, + int64_t size2_, int64_t stride2_, + int64_t size3_, int64_t stride3_); -TH_API void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension_, long firstIndex_, long size_); -TH_API void THTensor_(select)(THTensor *self, THTensor *src, int dimension_, long sliceIndex_); +TH_API void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension_, int64_t firstIndex_, int64_t size_); +TH_API void THTensor_(select)(THTensor *self, THTensor *src, int dimension_, int64_t sliceIndex_); TH_API void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1_, int dimension2_); -TH_API void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension_, long size_, long step_); +TH_API void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension_, int64_t size_, int64_t step_); TH_API void THTensor_(squeeze)(THTensor *self, THTensor *src); TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_); @@ -123,15 +123,15 @@ TH_API void THTensor_(free)(THTensor *self); TH_API void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst); /* Slow access methods [check everything] */ -TH_API void THTensor_(set1d)(THTensor *tensor, long x0, real value); -TH_API void THTensor_(set2d)(THTensor *tensor, long x0, long x1, real value); -TH_API void THTensor_(set3d)(THTensor *tensor, long x0, long x1, long x2, real value); -TH_API void THTensor_(set4d)(THTensor *tensor, long x0, long x1, long x2, long x3, real value); - -TH_API real THTensor_(get1d)(const THTensor *tensor, long x0); -TH_API real THTensor_(get2d)(const THTensor *tensor, long x0, long x1); -TH_API real THTensor_(get3d)(const THTensor *tensor, long x0, long x1, long x2); -TH_API real THTensor_(get4d)(const THTensor *tensor, long x0, long x1, long x2, long x3); +TH_API void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value); +TH_API void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value); +TH_API void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value); +TH_API void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value); + +TH_API real THTensor_(get1d)(const THTensor *tensor, int64_t x0); +TH_API real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1); +TH_API real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2); +TH_API real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3); /* Debug methods */ TH_API THDescBuff THTensor_(desc)(const THTensor *tensor); diff --git a/torch/lib/TH/generic/THTensorConv.c b/torch/lib/TH/generic/THTensorConv.c index 684ff9db5f63..706999acec0b 100644 --- a/torch/lib/TH/generic/THTensorConv.c +++ b/torch/lib/TH/generic/THTensorConv.c @@ -7,14 +7,14 @@ */ void THTensor_(validXCorr2Dptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) { - long or = (ir - kr) / sr + 1; - long oc = (ic - kc) / sc + 1; + int64_t or = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; - long xx, yy, kx, ky; + int64_t xx, yy, kx, ky; if ((sc != 1) || (oc < 4)) { /* regular convolution */ @@ -60,14 +60,14 @@ void 
THTensor_(validXCorr2Dptr)(real *r_, */ void THTensor_(validConv2Dptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) { - long or = (ir - kr) / sr + 1; - long oc = (ic - kc) / sc + 1; + int64_t or = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; - long xx, yy, kx, ky; + int64_t xx, yy, kx, ky; if ((sc != 1) || (oc < 4)) { /* regular convolution */ @@ -113,13 +113,13 @@ void THTensor_(validConv2Dptr)(real *r_, */ void THTensor_(fullConv2Dptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) { - long oc = (ic - 1) * sc + kc; + int64_t oc = (ic - 1) * sc + kc; - long xx, yy, kx, ky; + int64_t xx, yy, kx, ky; if ((sc != 1) || (ic < 4)) { /* regular convolution */ @@ -165,13 +165,13 @@ void THTensor_(fullConv2Dptr)(real *r_, */ void THTensor_(fullXCorr2Dptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) { - long oc = (ic - 1) * sc + kc; + int64_t oc = (ic - 1) * sc + kc; - long xx, yy, kx, ky; + int64_t xx, yy, kx, ky; if ((sc != 1) || (ic < 4)) { /* regular convolution */ @@ -180,7 +180,7 @@ void THTensor_(fullXCorr2Dptr)(real *r_, /* Outer product in two dimensions... (between input image and the mask) */ real *po_ = r_ + yy*sr*oc + xx*sc; real *pw_ = k_ + kr*kc -1; - long kx, ky; + int64_t kx, ky; for(ky = 0; ky < kr; ky++) { real z = *t_ * alpha; @@ -220,14 +220,14 @@ void THTensor_(fullXCorr2Dptr)(real *r_, */ void THTensor_(validXCorr2DRevptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc) { - long or = ir - (kr - 1) * sr; - long oc = ic - (kc - 1) * sc; + int64_t or = ir - (kr - 1) * sr; + int64_t oc = ic - (kc - 1) * sc; - long xx, yy, kx, ky; + int64_t xx, yy, kx, ky; if ((sc != 1) || (kc < 4)) { /* regular convolution */ @@ -268,15 +268,15 @@ void THTensor_(validXCorr2DRevptr)(real *r_, */ void THTensor_(validXCorr3Dptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) { - long ot = (it - kt) / st + 1; - long or = (ir - kr) / sr + 1; - long oc = (ic - kc) / sc + 1; + int64_t ot = (it - kt) / st + 1; + int64_t or = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; - long zz, xx, yy; + int64_t zz, xx, yy; for (zz = 0; zz < ot; zz++) { @@ -288,7 +288,7 @@ void THTensor_(validXCorr3Dptr)(real *r_, real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; real *pw_ = k_; real sum = 0; - long kz, kx, ky; + int64_t kz, kx, ky; for(kz = 0; kz < kt; kz++) { for(ky = 0; ky < kr; ky++) @@ -313,15 +313,15 @@ void THTensor_(validXCorr3Dptr)(real *r_, */ void THTensor_(validConv3Dptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) { - long ot = (it - kt) / st + 1; - long or 
= (ir - kr) / sr + 1; - long oc = (ic - kc) / sc + 1; + int64_t ot = (it - kt) / st + 1; + int64_t or = (ir - kr) / sr + 1; + int64_t oc = (ic - kc) / sc + 1; - long zz, xx, yy; + int64_t zz, xx, yy; for(zz = 0; zz < ot; zz++) { @@ -333,7 +333,7 @@ void THTensor_(validConv3Dptr)(real *r_, real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; real *pw_ = k_ + kt*kr*kc - 1; real sum = 0; - long kz, kx, ky; + int64_t kz, kx, ky; for(kz = 0; kz < kt; kz++) { for(ky = 0; ky < kr; ky++) @@ -359,14 +359,14 @@ void THTensor_(validConv3Dptr)(real *r_, */ void THTensor_(fullConv3Dptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) { - long or = (ir - 1) * sr + kr; - long oc = (ic - 1) * sc + kc; + int64_t or = (ir - 1) * sr + kr; + int64_t oc = (ic - 1) * sc + kc; - long zz, xx, yy; + int64_t zz, xx, yy; for(zz = 0; zz < it; zz++) { @@ -377,7 +377,7 @@ void THTensor_(fullConv3Dptr)(real *r_, /* Outer product in two dimensions... (between input image and the mask) */ real *po_ = r_ + zz*st*or*oc + yy*sr*oc + xx*sc; real *pw_ = k_; - long kz, kx, ky; + int64_t kz, kx, ky; /* printf("Output Plane : %ld,%ld,%ld, input val=%g\n",zz,yy,xx,*t_); */ for(kz = 0; kz < kt; kz++) { @@ -407,14 +407,14 @@ void THTensor_(fullConv3Dptr)(real *r_, */ void THTensor_(fullXCorr3Dptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) { - long or = (ir - 1) * sr + kr; - long oc = (ic - 1) * sc + kc; + int64_t or = (ir - 1) * sr + kr; + int64_t oc = (ic - 1) * sc + kc; - long zz, xx, yy; + int64_t zz, xx, yy; for(zz = 0; zz < it; zz++) { @@ -425,7 +425,7 @@ void THTensor_(fullXCorr3Dptr)(real *r_, /* Outer product in two dimensions... 
(between input image and the mask) */ real *po_ = r_ + zz*st*or*oc + yy*sr*oc + xx*sc; real *pw_ = k_ + kt*kr*kc -1; - long kz, kx, ky; + int64_t kz, kx, ky; for(kz = 0; kz < kt; kz++) { for(ky = 0; ky < kr; ky++) @@ -452,15 +452,15 @@ void THTensor_(fullXCorr3Dptr)(real *r_, */ void THTensor_(validXCorr3DRevptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc) { - long ot = it - (kt - 1) * st; - long or = ir - (kr - 1) * sr; - long oc = ic - (kc - 1) * sc; + int64_t ot = it - (kt - 1) * st; + int64_t or = ir - (kr - 1) * sr; + int64_t oc = ic - (kc - 1) * sc; - long zz, xx, yy; + int64_t zz, xx, yy; for(zz = 0; zz < kt; zz++) { for(yy = 0; yy < kr; yy++) @@ -470,7 +470,7 @@ void THTensor_(validXCorr3DRevptr)(real *r_, real *po_ = r_; real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; real z = *k_++ * alpha; - long kz, kx, ky; + int64_t kz, kx, ky; for(kz = 0; kz < ot; kz++) { for(ky = 0; ky < or; ky++) @@ -489,9 +489,9 @@ void THTensor_(validXCorr3DRevptr)(real *r_, void THTensor_(conv2d)(real* output_data, real alpha, - real* ptr_input, long nInputRows, long nInputCols, - real* ptr_weight, long nKernelRows, long nKernelCols, - long srow, long scol, + real* ptr_input, int64_t nInputRows, int64_t nInputCols, + real* ptr_weight, int64_t nKernelRows, int64_t nKernelCols, + int64_t srow, int64_t scol, const char *vf, const char *xc) { THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'"); @@ -526,9 +526,9 @@ void THTensor_(conv2d)(real* output_data, void THTensor_(conv3d)(real* output_data, real alpha, - real* ptr_input, long nInputDepth, long nInputRows, long nInputCols, - real* ptr_weight, long nKernelDepth, long nKernelRows, long nKernelCols, - long sdepth, long srow, long scol, + real* ptr_input, int64_t nInputDepth, int64_t nInputRows, int64_t nInputCols, + real* ptr_weight, int64_t nKernelDepth, int64_t nKernelRows, int64_t nKernelCols, + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) { THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'"); @@ -561,7 +561,7 @@ void THTensor_(conv3d)(real* output_data, sdepth, srow, scol); } -long THTensor_(convsize)(long x, long k, long s, const char* vf) +int64_t THTensor_(convsize)(int64_t x, int64_t k, int64_t s, const char* vf) { THArgCheck(*vf == 'V' || *vf == 'F', 1, "type of convolution can be 'V' or 'F'"); if (*vf == 'V') @@ -578,19 +578,19 @@ long THTensor_(convsize)(long x, long k, long s, const char* vf) for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 */ -void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol) +void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol) { - long nInputPlane, nInputRows, nInputCols; - long nKernelPlane, nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long istride0, kstride0; + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t istride0, kstride0; THTensor *input; THTensor *kernel; real *input_data; real *weight_data; real *output_data; ptrdiff_t nelem; - long k; + 
int64_t k; THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected"); @@ -631,7 +631,7 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, for (k = 0; k < r_->size[0]*r_->size[1]; k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] = 0.0; } @@ -643,7 +643,7 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, for (k = 0; k < r_->size[0]*r_->size[1]; k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] *= beta; } @@ -652,7 +652,7 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, #pragma omp parallel for private(k) for(k = 0; k < nKernelPlane; k++) { - long i; + int64_t i; /* get kernel */ real *ptr_weight = weight_data+k*kstride0; @@ -685,19 +685,19 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 */ -void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol) +void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol) { - long nbatch, nInputPlane, nInputRows, nInputCols; - long nKernelPlane, nKernelRows, nKernelCols; - long nOutputRows, nOutputCols; - long istride0, kstride0, istride1, kstride1; + int64_t nbatch, nInputPlane, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelRows, nKernelCols; + int64_t nOutputRows, nOutputCols; + int64_t istride0, kstride0, istride1, kstride1; THTensor *input; THTensor *kernel; real *input_data; real *weight_data; real *output_data; ptrdiff_t nelem; - long k; + int64_t k; THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); @@ -741,7 +741,7 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, for (k = 0; k < r_->size[0]*r_->size[1]; k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] = 0.0; } @@ -753,7 +753,7 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, for (k = 0; k < r_->size[0]*r_->size[1]; k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] *= beta; } @@ -762,10 +762,10 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, #pragma omp parallel for private(k) for(k = 0; k < nKernelPlane; k++) { - long i; + int64_t i; for(i = 0; i < nInputPlane; i++) { - long p; + int64_t p; for(p = 0; p < nbatch; p++) { /* get kernel */ @@ -796,12 +796,12 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, like rank1 update A <- xx' + beta*A */ -void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) +void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) { - long nInputPlane, nInputRows, nInputCols; - long nKernelPlane, 
nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long istride0, kstride0; + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t istride0, kstride0; THTensor *input; THTensor *kernel; @@ -809,7 +809,7 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT real *weight_data; real *output_data; ptrdiff_t nelem; - long k; + int64_t k; THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected"); @@ -856,7 +856,7 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT for (k = 0; k < r_->size[0]*r_->size[1]; k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] = 0.0; } @@ -868,7 +868,7 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT for (k = 0; k < r_->size[0]*r_->size[1]; k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] *= beta; } @@ -877,7 +877,7 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT #pragma omp parallel for private(k) for(k = 0; k < nKernelPlane; k++) { - long i; + int64_t i; /* get kernel */ real *ptr_weight = weight_data+k*kstride0; @@ -929,19 +929,19 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT matrix vector product like y <- Ax + beta*y */ -void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) +void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) { - long nInputPlane, nInputRows, nInputCols; - long nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long istride0, kstride0, kstride1; + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t istride0, kstride0, kstride1; THTensor *input; THTensor* kernel; real *input_data; real *weight_data; real *output_data; ptrdiff_t nelem; - long k; + int64_t k; THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); @@ -994,7 +994,7 @@ void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe for (k = 0; k < r_->size[0]; k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] = 0.0; } @@ -1006,7 +1006,7 @@ void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe for (k = 0; k < r_->size[0]; k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] *= beta; } @@ -1015,7 +1015,7 @@ void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe #pragma omp parallel for private(k) for(k = 0; k < nOutputPlane; k++) { - long i; + int64_t i; /* get output */ real *ptr_output = output_data + k*nOutputCols*nOutputRows; for(i = 0; i < nInputPlane; i++) @@ -1066,20 +1066,20 @@ void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe matrix vector product like y 
<- Ax + beta*y */ -void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) +void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) { - long nInputPlane, nInputRows, nInputCols; - long nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long kstride0, kstride1; + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t kstride0, kstride1; THTensor *input; THTensor* kernel; - long nbatch; + int64_t nbatch; ptrdiff_t nelem; real *input_data; real *weight_data; real *output_data; - long p; + int64_t p; THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); @@ -1131,11 +1131,11 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe #pragma omp parallel for private(p) for (p=0; p < r_->size[0]; p++) { - long k; + int64_t k; for (k = 0; k < r_->size[1]; k++) { real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] = 0.0; } @@ -1147,11 +1147,11 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe #pragma omp parallel for private(p) for(p=0; p < r_->size[0]; p++) { - long k; + int64_t k; for (k = 0; k < r_->size[1]; k++) { real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; - long l; + int64_t l; for (l = 0; l < nOutputRows*nOutputCols; l++) ptr_output[l] *= beta; } @@ -1161,10 +1161,10 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe #pragma omp parallel for private(p) for(p=0; p < nbatch; p++) { - long k; + int64_t k; for(k = 0; k < nOutputPlane; k++) { - long i; + int64_t i; /* get output */ real *ptr_output = output_data + p*nOutputPlane*nOutputCols*nOutputRows + k*nOutputCols*nOutputRows; for(i = 0; i < nInputPlane; i++) @@ -1216,15 +1216,15 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe scalar multiplication like y <- x*y + beta*y */ -void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) +void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) { THTensor *input; THTensor* kernel; - long nInputRows; - long nInputCols; - long nKernelRows; - long nKernelCols; - long nOutputRows, nOutputCols; + int64_t nInputRows; + int64_t nInputCols; + int64_t nKernelRows; + int64_t nKernelCols; + int64_t nOutputRows, nOutputCols; real *ptr_input; real *ptr_weight; real *output_data; @@ -1275,19 +1275,19 @@ void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THT component wise multiplication like y <- y.*x + beta*y */ -void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) +void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc) { - long nInputPlane, nInputRows, nInputCols; - long nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - 
long istride0, kstride0; + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t istride0, kstride0; THTensor *input; THTensor *kernel; real *input_data; real *weight_data; real *output_data; ptrdiff_t nelem; - long k; + int64_t k; THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected"); @@ -1352,20 +1352,20 @@ void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, TH component wise multiplication like with a permutation map y <- y.*x + beta*y */ -void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, long srow, long scol, const char *vf, const char *xc) +void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, int64_t srow, int64_t scol, const char *vf, const char *xc) { - long nInputPlane, nInputRows, nInputCols; - long nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long istride0, kstride0; + int64_t nInputPlane, nInputRows, nInputCols; + int64_t nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputRows, nOutputCols; + int64_t istride0, kstride0; THTensor *input; THTensor* kernel; real *input_data; real *weight_data; real *output_data; - long nmaps; + int64_t nmaps; ptrdiff_t nelem; - long k; + int64_t k; THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected"); @@ -1412,8 +1412,8 @@ void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT for(k = 0; k < nmaps; k++) { /* get indices */ - long from = (long)THTensor_(get2d)(map,k,0)-1; - long to = (long)THTensor_(get2d)(map,k,1)-1; + int64_t from = (int64_t)THTensor_(get2d)(map,k,0)-1; + int64_t to = (int64_t)THTensor_(get2d)(map,k,1)-1; /* get kernel */ real *ptr_weight = weight_data + k*kstride0; @@ -1441,19 +1441,19 @@ void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 */ void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol) + int64_t sdepth, int64_t srow, int64_t scol) { - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; - long istride0, kstride0; + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; THTensor *input; THTensor *kernel; real *input_data; real *weight_data; real *output_data; ptrdiff_t nelem; - long k, i; + int64_t k, i; THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); @@ -1528,19 +1528,19 @@ void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, A <- xx' + beta*A */ void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol, const char *vf, const char *xc) + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) { - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, 
nOutputDepth, nOutputRows, nOutputCols; - long istride0, kstride0; + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; THTensor *input; THTensor *kernel; real *input_data; real *weight_data; real *output_data; ptrdiff_t nelem; - long k, i; + int64_t k, i; THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); @@ -1620,19 +1620,19 @@ void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT y <- Ax + beta*y */ void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol, const char *vf, const char *xc) + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) { - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; - long istride0, kstride0, kstride1; + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0, kstride1; THTensor *input; THTensor *kernel; real *input_data; real *weight_data; real *output_data; ptrdiff_t nelem; - long k, i; + int64_t k, i; THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); THArgCheck(k_->nDimension == 5 , 4, "kernel: 5D Tensor expected"); @@ -1713,17 +1713,17 @@ void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe y <- x*y + beta*y */ void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol, const char *vf, const char *xc) + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) { THTensor *input; THTensor* kernel; - long nInputDepth; - long nInputRows; - long nInputCols; - long nKernelDepth; - long nKernelRows; - long nKernelCols; - long nOutputDepth, nOutputRows, nOutputCols; + int64_t nInputDepth; + int64_t nInputRows; + int64_t nInputCols; + int64_t nKernelDepth; + int64_t nKernelRows; + int64_t nKernelCols; + int64_t nOutputDepth, nOutputRows, nOutputCols; real *ptr_input; real *ptr_weight; real *output_data; @@ -1781,12 +1781,12 @@ void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THT y <- y.*x + beta*y */ void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol, const char *vf, const char *xc) + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) { - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; - long istride0, kstride0; + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; THTensor *input; THTensor *kernel; @@ -1794,7 +1794,7 @@ void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, TH real *weight_data; real *output_data; ptrdiff_t nelem; - long k; + int64_t k; THArgCheck(t_->nDimension == 4 , 3, "input: 3D Tensor expected"); THArgCheck(k_->nDimension == 4 , 4, "kernel: 3D Tensor expected"); @@ -1866,12 +1866,12 @@ void 
THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, TH y <- y.*x + beta*y */ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, - long sdepth, long srow, long scol, const char *vf, const char *xc) + int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc) { - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; - long istride0, kstride0; + int64_t nInputPlane, nInputDepth, nInputRows, nInputCols; + int64_t nKernelDepth, nKernelRows, nKernelCols; + int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; + int64_t istride0, kstride0; THTensor *input; THTensor *kernel; @@ -1879,8 +1879,8 @@ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT real *input_data; real *weight_data; real *output_data; - long nmaps; - long k; + int64_t nmaps; + int64_t k; THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); @@ -1934,8 +1934,8 @@ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT for(k = 0; k < nmaps; k++) { /* get indices */ - long from = (long)THTensor_(get2d)(map,k,0)-1; - long to = (long)THTensor_(get2d)(map,k,1)-1; + int64_t from = (int64_t)THTensor_(get2d)(map,k,0)-1; + int64_t to = (int64_t)THTensor_(get2d)(map,k,1)-1; /* get kernel */ real *ptr_weight = weight_data + k*kstride0; diff --git a/torch/lib/TH/generic/THTensorConv.h b/torch/lib/TH/generic/THTensorConv.h index 79866f3901e3..279ece636281 100644 --- a/torch/lib/TH/generic/THTensorConv.h +++ b/torch/lib/TH/generic/THTensorConv.h @@ -4,76 +4,76 @@ TH_API void THTensor_(validXCorr2Dptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); TH_API void THTensor_(validConv2Dptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); TH_API void THTensor_(fullXCorr2Dptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); TH_API void THTensor_(fullConv2Dptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); TH_API void THTensor_(validXCorr2DRevptr)(real *r_, real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); + real *t_, int64_t ir, int64_t ic, + real *k_, int64_t kr, int64_t kc, + int64_t sr, int64_t sc); -TH_API void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol); -TH_API void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol); -TH_API void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); -TH_API void 
THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol); +TH_API void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol); +TH_API void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc); TH_API void THTensor_(validXCorr3Dptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); TH_API void THTensor_(validConv3Dptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); TH_API void THTensor_(fullXCorr3Dptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); TH_API void THTensor_(fullConv3Dptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); TH_API void THTensor_(validXCorr3DRevptr)(real *r_, real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); + real *t_, int64_t it, int64_t ir, int64_t ic, + real *k_, int64_t kt, int64_t kr, int64_t kc, + int64_t st, int64_t sr, int64_t sc); -TH_API void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol); -TH_API void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv3Dmul)(THTensor *r_, real 
beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol); +TH_API void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); +TH_API void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc); #endif diff --git a/torch/lib/TH/generic/THTensorCopy.c b/torch/lib/TH/generic/THTensorCopy.c index d9cd1c0d501e..265a22de7ddc 100644 --- a/torch/lib/TH/generic/THTensorCopy.c +++ b/torch/lib/TH/generic/THTensorCopy.c @@ -28,10 +28,10 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { real *rp = THTensor_(data)(tensor); real *bp = THTensor_(data)(buf); - long NR = THTensor_(size)(src, 0); - long NC = THTensor_(size)(src, 1); - for (long R = 0; R < NR; R += BLOCK_SZ) { - for (long C = 0; C < NC; C += BLOCK_SZ) { + int64_t NR = THTensor_(size)(src, 0); + int64_t NC = THTensor_(size)(src, 1); + for (int64_t R = 0; R < NR; R += BLOCK_SZ) { + for (int64_t C = 0; C < NC; C += BLOCK_SZ) { real *spo = sp + R + C * NR; real *rpo = rp + C + R * NC; @@ -112,22 +112,22 @@ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src } #ifndef TH_REAL_IS_HALF -IMPLEMENT_THTensor_COPY(Byte, unsigned char) -IMPLEMENT_THTensor_COPY(Char, char) -IMPLEMENT_THTensor_COPY(Short, short) -IMPLEMENT_THTensor_COPY(Int, int) -IMPLEMENT_THTensor_COPY(Long, long) +IMPLEMENT_THTensor_COPY(Byte, uint8_t) +IMPLEMENT_THTensor_COPY(Char, int8_t) +IMPLEMENT_THTensor_COPY(Short, int16_t) +IMPLEMENT_THTensor_COPY(Int, int32_t) +IMPLEMENT_THTensor_COPY(Long, int64_t) IMPLEMENT_THTensor_COPY(Float, float) IMPLEMENT_THTensor_COPY(Double, double) IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf) #else /* only allow pass-through for Half */ IMPLEMENT_THTensor_COPY_TO_FROM_HALF(Half, THHalf) -IMPLEMENT_THTensor_COPY_TO_HALF(Byte, unsigned char) -IMPLEMENT_THTensor_COPY_TO_HALF(Char, char) -IMPLEMENT_THTensor_COPY_TO_HALF(Short, short) -IMPLEMENT_THTensor_COPY_TO_HALF(Int, int) -IMPLEMENT_THTensor_COPY_TO_HALF(Long, long) +IMPLEMENT_THTensor_COPY_TO_HALF(Byte, uint8_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Char, int8_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Short, int16_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Int, int32_t) +IMPLEMENT_THTensor_COPY_TO_HALF(Long, int64_t) IMPLEMENT_THTensor_COPY_TO_HALF(Float, float) IMPLEMENT_THTensor_COPY_TO_HALF(Double, double) diff --git a/torch/lib/TH/generic/THTensorLapack.c b/torch/lib/TH/generic/THTensorLapack.c index d4e52f6d7b3d..d7d2143a954b 100644 --- a/torch/lib/TH/generic/THTensorLapack.c +++ b/torch/lib/TH/generic/THTensorLapack.c @@ -134,7 +134,7 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) lda = n; ldb = n; - ipiv = 
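
Note: the copyTranspose hunk above walks the matrix in BLOCK_SZ x BLOCK_SZ tiles so that both the strided reads and the writes stay cache-resident. A hedged sketch of the same blocking idea, with all names and the tile size chosen here for illustration:

#include <stdint.h>

#define TILE 64  /* illustrative tile size; TH uses its own BLOCK_SZ */

/* dst is nc x nr, src is nr x nc, both row-major */
static void transpose_blocked(float *dst, const float *src,
                              int64_t nr, int64_t nc) {
  for (int64_t R = 0; R < nr; R += TILE)
    for (int64_t C = 0; C < nc; C += TILE)
      for (int64_t r = R; r < nr && r < R + TILE; r++)
        for (int64_t c = C; c < nc && c < C + TILE; c++)
          dst[c * nr + r] = src[r * nc + c];
}
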
THIntTensor_newWithSize1d((long)n); + ipiv = THIntTensor_newWithSize1d((int64_t)n); THLapack_(gesv)(n, nrhs, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), THTensor_(data)(rb__), ldb, &info); @@ -276,7 +276,7 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job THTensor *work, *wi, *wr, *a; real wkopt; real *rv_data; - long i; + int64_t i; THTensor *re__ = NULL; THTensor *rv__ = NULL; @@ -494,7 +494,7 @@ void THTensor_(getri)(THTensor *ra_, THTensor *a) m = ra__->size[0]; n = ra__->size[1]; lda = m; - ipiv = THIntTensor_newWithSize1d((long)m); + ipiv = THIntTensor_newWithSize1d((int64_t)m); /* Run LU */ THLapack_(getrf)(n, n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &info); @@ -530,7 +530,7 @@ void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) /* Build full matrix */ real *p = THTensor_(data)(a); - long i, j; + int64_t i, j; /* Upper Triangular Case */ if (uplo[0] == 'U') @@ -563,7 +563,7 @@ void THTensor_(copyUpLoTriangle)(THTensor *a, const char *uplo) /* Build full matrix */ real *p = THTensor_(data)(a); - long i, j; + int64_t i, j; /* Upper Triangular Case */ if (uplo[0] == 'U') @@ -955,7 +955,7 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf if (m != n) { THError("btrifact is only implemented for square matrices"); } - long num_batches = THTensor_(size)(a, 0); + int64_t num_batches = THTensor_(size)(a, 0); THTensor *ra__; int lda; @@ -985,7 +985,7 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf THIntTensor_resize2d(rpivots_, num_batches, n); - long batch = 0; + int64_t batch = 0; for (; batch < num_batches; ++batch) { THTensor_(select)(ai, a, 0, batch); THTensor_(select)(rai, ra__, 0, batch); @@ -1031,8 +1031,8 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor THTensor_(copy)(rb_, b); } - long num_batches = atf->size[0]; - long n = atf->size[1]; + int64_t num_batches = atf->size[0]; + int64_t n = atf->size[1]; int nrhs = rb_->nDimension > 2 ? 
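
Note: the gesv/getri hunks widen the size cast to (int64_t) for THIntTensor_newWithSize1d, yet the pivot tensor itself stays a THIntTensor. That is deliberate: the conventional LAPACK interface traffics in 32-bit integers, so pivot indices remain int even as tensor sizes become int64_t. A hypothetical guard (not in TH) for narrowing a 64-bit size before handing it to LAPACK:

#include <assert.h>
#include <stdint.h>

static int lapack_int_cast(int64_t v) {
  /* LAPACK's default (LP64) interface takes 32-bit integers */
  assert(v >= INT32_MIN && v <= INT32_MAX);
  return (int)v;
}
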
rb_->size[2] : 1; int lda, ldb; @@ -1087,7 +1087,7 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor THError("Error: rpivots_ is not contiguous."); } - for (long batch = 0; batch < num_batches; ++batch) { + for (int64_t batch = 0; batch < num_batches; ++batch) { THTensor_(select)(ai, atf_, 0, batch); THTensor_(select)(rbi, rb__, 0, batch); THIntTensor_select(pivoti, pivots, 0, batch); diff --git a/torch/lib/TH/generic/THTensorMath.c b/torch/lib/TH/generic/THTensorMath.c index 1e62baea857c..25b02bf6c6fe 100644 --- a/torch/lib/TH/generic/THTensorMath.c +++ b/torch/lib/TH/generic/THTensorMath.c @@ -201,10 +201,10 @@ void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) { ptrdiff_t numel = 0; - long *subscript_data; - long i = 0; - long dim; - long div = 1; + int64_t *subscript_data; + int64_t i = 0; + int64_t dim; + int64_t div = 1; #ifdef TH_REAL_IS_HALF #define IS_NONZERO(val) ((val.x & 0x7fff) != 0) #else @@ -242,7 +242,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens ptrdiff_t i, numel; THLongStorage *newSize; THTensor *tSlice, *sSlice; - long *index_data; + int64_t *index_data; real *tensor_data, *src_data; THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector"); @@ -270,7 +270,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens ptrdiff_t rowsize = THTensor_(nElement)(src) / src->size[0]; // check that the indices are within range - long max = src->size[0] - 1 + TH_INDEX_BASE; + int64_t max = src->size[0] - 1 + TH_INDEX_BASE; for (i=0; i<numel; i++) { if (index_data[i] < TH_INDEX_BASE || index_data[i] > max) { THLongTensor_free(index); @@ -314,7 +314,7 @@ void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTens { ptrdiff_t i, numel; THTensor *tSlice, *sSlice; - long *index_data; + int64_t *index_data; numel = THLongTensor_nElement(index); THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector"); @@ -353,7 +353,7 @@ void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTenso { ptrdiff_t i, numel; THTensor *tSlice, *sSlice; - long *index_data; + int64_t *index_data; numel = THLongTensor_nElement(index); THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector"); @@ -394,7 +394,7 @@ void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real v { ptrdiff_t i, numel; THTensor *tSlice; - long *index_data; + int64_t *index_data; numel = THLongTensor_nElement(index); THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector"); @@ -422,7 +422,7 @@ void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real v void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index) { - long elems_per_row, i, idx; + int64_t elems_per_row, i, idx; THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 2, "Input tensor must have same dimensions as output tensor"); @@ -432,7 +432,7 @@ void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *i elems_per_row = THLongTensor_size(index, dim); - TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim, + TH_TENSOR_DIM_APPLY3(real, tensor, real, src, int64_t, index, dim, for (i = 0; i < elems_per_row; ++i) { idx = *(index_data + i*index_stride); @@ -447,7 +447,7 @@ void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *i void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor
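
Note: the indexSelect/gather hunks above all share the same contract: every index is validated against [TH_INDEX_BASE, size-1+TH_INDEX_BASE] before any data moves. A self-contained sketch of that pattern for a 1-D gather, with zero-based indices and invented names:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* out[i] = src[index[i]], aborting on an out-of-range index */
static void gather1d(float *out, const float *src, int64_t src_n,
                     const int64_t *index, int64_t n) {
  for (int64_t i = 0; i < n; i++) {
    if (index[i] < 0 || index[i] >= src_n) {
      fprintf(stderr, "index out of range\n");
      exit(EXIT_FAILURE);
    }
    out[i] = src[index[i]];
  }
}
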
*src) { - long elems_per_row, i, idx; + int64_t elems_per_row, i, idx; THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, @@ -457,7 +457,7 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor elems_per_row = THLongTensor_size(index, dim); - TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim, + TH_TENSOR_DIM_APPLY3(real, tensor, real, src, int64_t, index, dim, for (i = 0; i < elems_per_row; ++i) { idx = *(index_data + i*index_stride); @@ -472,7 +472,7 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) { - long elems_per_row, i, idx; + int64_t elems_per_row, i, idx; THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, @@ -482,7 +482,7 @@ void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTen elems_per_row = THLongTensor_size(index, dim); - TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim, + TH_TENSOR_DIM_APPLY3(real, tensor, real, src, int64_t, index, dim, for (i = 0; i < elems_per_row; ++i) { idx = *(index_data + i*index_stride); @@ -497,7 +497,7 @@ void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTen void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val) { - long elems_per_row, i, idx; + int64_t elems_per_row, i, idx; THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, @@ -505,7 +505,7 @@ void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real elems_per_row = THLongTensor_size(index, dim); - TH_TENSOR_DIM_APPLY2(real, tensor, long, index, dim, + TH_TENSOR_DIM_APPLY2(real, tensor, int64_t, index, dim, for (i = 0; i < elems_per_row; ++i) { idx = *(index_data + i*index_stride); @@ -523,7 +523,7 @@ accreal THTensor_(dot)(THTensor *tensor, THTensor *src) accreal sum = 0; /* we use a trick here. careful with that. */ TH_TENSOR_APPLY2(real, tensor, real, src, - long sz = (tensor_size-tensor_i < src_size-src_i ? tensor_size-tensor_i : src_size-src_i); + int64_t sz = (tensor_size-tensor_i < src_size-src_i ? 
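
Note on the THTensor_(dot) hunk spanning this point: TH_TENSOR_APPLY2 exposes the length of the current contiguous run of each operand, and the "trick" flagged in the comment is to hand the overlap of the two runs to BLAS, accumulating partial dot products. A flattened sketch with one run per operand; blas_dot is a stand-in for THBlas_(dot):

#include <stdint.h>

extern float blas_dot(int64_t n, const float *x, int64_t incx,
                      const float *y, int64_t incy);  /* stand-in */

static float dot_in_runs(const float *x, int64_t xn, int64_t xinc,
                         const float *y, int64_t yn, int64_t yinc) {
  float sum = 0.f;
  int64_t xi = 0, yi = 0;
  while (xi < xn && yi < yn) {
    int64_t sz = (xn - xi < yn - yi) ? xn - xi : yn - yi;  /* run overlap */
    sum += blas_dot(sz, x + xi * xinc, xinc, y + yi * yinc, yinc);
    xi += sz; yi += sz;
  }
  return sum;
}
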
tensor_size-tensor_i : src_size-src_i); sum += THBlas_(dot)(sz, src_data, src_stride, tensor_data, tensor_stride); tensor_i += sz; src_i += sz; @@ -586,7 +586,7 @@ real THTensor_(maxall)(THTensor *tensor) return theMax; } -static void THTensor_(quickselectnoidx)(real *arr, long k, long elements, long stride); +static void THTensor_(quickselectnoidx)(real *arr, int64_t k, int64_t elements, int64_t stride); real THTensor_(medianall)(THTensor *tensor) { @@ -594,7 +594,7 @@ real THTensor_(medianall)(THTensor *tensor) real theMedian; ptrdiff_t numel; - long k; + int64_t k; THTensor *temp_; real *temp__data; @@ -677,21 +677,21 @@ void THTensor_(lshift)(THTensor *r_, THTensor *t, real value) THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { real *tp = THTensor_(data)(t); real *rp = THTensor_(data)(r_); - long sz = THTensor_(nElement)(t); - long i; + int64_t sz = THTensor_(nElement)(t); + int64_t i; #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) for (i=0; i<sz; i++) { [...] } [...] #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) for (i=0; i<sz; i++) { #if defined(TH_REAL_IS_BYTE) rp[i] = ((real) tp[i]) >> value; #else - rp[i] = ((unsigned real) tp[i]) >> value; + rp[i] = ((ureal) tp[i]) >> value; #endif } } else { #if defined(TH_REAL_IS_BYTE) TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((real) *t_data) >> value);); #else - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((unsigned real) *t_data) >> value);); + TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((ureal) *t_data) >> value);); #endif } #endif @@ -799,8 +799,8 @@ void THTensor_(bitand)(THTensor *r_, THTensor *t, real value) THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { real *tp = THTensor_(data)(t); real *rp = THTensor_(data)(r_); - long sz = THTensor_(nElement)(t); - long i; + int64_t sz = THTensor_(nElement)(t); + int64_t i; #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) for (i=0; i<sz; i++) { [...] } [...] #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) for (i=0; i<sz; i++) { [...] } [...] #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) for (i=0; i<sz; i++) { #if defined(TH_REAL_IS_BYTE) rp[i] = ((real) tp[i]) >> sp[i]; #else - rp[i] = ((unsigned real) tp[i]) >> sp[i]; + rp[i] = ((ureal) tp[i]) >> sp[i]; #endif } } else { @@ -1005,7 +1005,7 @@ void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src) #elif defined(TH_REAL_IS_BYTE) TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((real)*t_data) >> *src_data;); #else - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((unsigned real)*t_data) >> *src_data;); + TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((ureal)*t_data) >> *src_data;); #endif } } @@ -1239,13 +1239,13 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain) { - long N1 = m1->size[0]; - long N2 = m2->size[0]; - long dim; + int64_t N1 = m1->size[0]; + int64_t N2 = m2->size[0]; + int64_t dim; real *m1_p; real *m2_p; real *r_p; - long i; + int64_t i; THTensor_(resize2d)(r_, N1, N2); @@ -1264,7 +1264,7 @@ void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain) #pragma omp parallel for private(i) for (i=0; i<N1; i++) { [...] } [...] if (t->stride[dimension] == 1) { real theMax; real value; - long theIndex; - long i; - TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension, + int64_t theIndex; + int64_t i; + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, theMax = t_data[0]; theIndex = 0; @@ -1612,7 +1612,7 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int tempIndices_->size[dimension] = t->size[dimension]; tempIndices_->stride[dimension] = 0; -
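
Note: a recurring pattern in the shift/bitwise hunks above is OpenMP's if() clause: the loop is only forked across threads when the element count clears TH_OMP_OVERHEAD_THRESHOLD, so small tensors skip the fork/join overhead. Minimal compilable sketch (THRESHOLD is an invented stand-in):

#include <stdint.h>

#define THRESHOLD 100000  /* stand-in for TH_OMP_OVERHEAD_THRESHOLD */

void scale(float *y, const float *x, float c, int64_t n) {
  int64_t i;
  #pragma omp parallel for if(n > THRESHOLD) private(i)
  for (i = 0; i < n; i++)
    y[i] = c * x[i];
}
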
TH_TENSOR_APPLY3_D(real, t, real, tempValues_, long, tempIndices_, dimension, + TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, if(!(*t_data <= *tempValues__data) && !th_isnan(*tempValues__data)) { *tempValues__data = *t_data; *tempIndices__data = *tempIndices__dimOffset; @@ -1645,9 +1645,9 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int if (t->stride[dimension] == 1) { real theMax; real value; - long theIndex; - long i; - TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension, + int64_t theIndex; + int64_t i; + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, theMax = t_data[0]; theIndex = 0; @@ -1692,7 +1692,7 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int tempIndices_->size[dimension] = t->size[dimension]; tempIndices_->stride[dimension] = 0; - TH_TENSOR_APPLY3_D(real, t, real, tempValues_, long, tempIndices_, dimension, + TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, if(!(*t_data >= *tempValues__data) && !th_isnan(*tempValues__data)) { *tempValues__data = *t_data; *tempIndices__data = *tempIndices__dimOffset; @@ -1722,7 +1722,7 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) if (t->stride[dimension] == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal sum = 0; - long i; + int64_t i; for(i = 0; i < t_size; i++) sum += t_data[i*t_stride]; *r__data = (real)sum;); @@ -1758,7 +1758,7 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) if (t->stride[dimension] == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal prod = 1; - long i; + int64_t i; for(i = 0; i < t_size; i++) prod *= t_data[i*t_stride]; *r__data = (real)prod;); @@ -1787,7 +1787,7 @@ void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension) TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal cumsum = 0; - long i; + int64_t i; for(i = 0; i < t_size; i++) { cumsum += t_data[i*t_stride]; @@ -1804,7 +1804,7 @@ void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension) TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal cumprod = 1; - long i; + int64_t i; for(i = 0; i < t_size; i++) { cumprod *= t_data[i*t_stride]; @@ -1834,8 +1834,8 @@ accreal THTensor_(trace)(THTensor *t) { real *t_data = THTensor_(data)(t); accreal sum = 0; - long i = 0; - long t_stride_0, t_stride_1, t_diag_size; + int64_t i = 0; + int64_t t_stride_0, t_stride_1, t_diag_size; THArgCheck(THTensor_(nDimension)(t) == 2, 1, "expected a matrix"); @@ -1952,13 +1952,13 @@ void THTensor_(diag)(THTensor *r_, THTensor *t, int k) if(THTensor_(nDimension)(t) == 1) { real *t_data = THTensor_(data)(t); - long t_stride_0 = THTensor_(stride)(t, 0); - long t_size = THTensor_(size)(t, 0); - long sz = t_size + (k >= 0 ? k : -k); + int64_t t_stride_0 = THTensor_(stride)(t, 0); + int64_t t_size = THTensor_(size)(t, 0); + int64_t sz = t_size + (k >= 0 ? 
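
Note: the odd-looking comparison in the max/min hunks above, !(*t_data <= *tempValues__data) combined with !th_isnan(...), is how TH makes NaN win and then stick: a NaN compares false against everything, so it is adopted as the running extremum and never replaced. Standalone sketch of the same update rule:

#include <math.h>
#include <stdint.h>

static float nan_sticky_max(const float *a, int64_t n) {
  float best = a[0];
  for (int64_t i = 1; i < n; i++)
    if (!(a[i] <= best) && !isnan(best)) /* adopt larger values and NaN; keep NaN */
      best = a[i];
  return best;
}
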
k : -k); real *r__data; - long r__stride_0; - long r__stride_1; - long i; + int64_t r__stride_0; + int64_t r__stride_1; + int64_t i; THTensor_(resize2d)(r_, sz, sz); THTensor_(zero)(r_); @@ -1973,12 +1973,12 @@ void THTensor_(diag)(THTensor *r_, THTensor *t, int k) else { real *t_data = THTensor_(data)(t); - long t_stride_0 = THTensor_(stride)(t, 0); - long t_stride_1 = THTensor_(stride)(t, 1); - long sz; + int64_t t_stride_0 = THTensor_(stride)(t, 0); + int64_t t_stride_1 = THTensor_(stride)(t, 1); + int64_t sz; real *r__data; - long r__stride_0; - long i; + int64_t r__stride_0; + int64_t i; if(k >= 0) sz = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1)-k); @@ -1994,10 +1994,10 @@ void THTensor_(diag)(THTensor *r_, THTensor *t, int k) } } -void THTensor_(eye)(THTensor *r_, long n, long m) +void THTensor_(eye)(THTensor *r_, int64_t n, int64_t m) { real *r__data; - long i, sz; + int64_t i, sz; THArgCheck(n > 0, 1, "invalid argument"); @@ -2035,20 +2035,20 @@ void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step) void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step) { #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - int m = fmod(xmax - xmin,step) == 0; + int m = fmod(xmax - xmin, step) == 0; #else int m = (xmax - xmin) % step == 0; #endif if (m) xmax -= step; - THTensor_(range)(r_,xmin,xmax,step); + THTensor_(range)(r_, xmin, xmax, step); } -void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, long n) +void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n) { real *r__data; - long r__stride_0; - long i; + int64_t r__stride_0; + int64_t i; THArgCheck(n > 0, 1, "must be strictly positive"); @@ -2061,7 +2061,7 @@ void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, long n) for(i = 0; i < n-1; i++) { - long z = THRandom_random(_generator) % (n-i); + int64_t z = THRandom_random(_generator) % (n-i); real sav = r__data[i*r__stride_0]; r__data[i*r__stride_0] = r__data[(z+i)*r__stride_0]; r__data[(z+i)*r__stride_0] = sav; @@ -2104,9 +2104,9 @@ void THTensor_(reshape)(THTensor *r_, THTensor *t, THLongStorage *size) REAL_SWAP(ARR(III), ARR(JJJ)); \ LONG_SWAP(IDX(III), IDX(JJJ)) -static void THTensor_(quicksortascend)(real *arr, long *idx, long elements, long stride) +static void THTensor_(quicksortascend)(real *arr, int64_t *idx, int64_t elements, int64_t stride) { - long beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; + int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; real rswap, piv; unsigned char done = 0; @@ -2193,9 +2193,9 @@ static void THTensor_(quicksortascend)(real *arr, long *idx, long elements, long } } -static void THTensor_(quicksortdescend)(real *arr, long *idx, long elements, long stride) +static void THTensor_(quicksortdescend)(real *arr, int64_t *idx, int64_t elements, int64_t stride) { - long beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; + int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; real rswap, piv; unsigned char done = 0; @@ -2301,16 +2301,16 @@ void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimensio if(descendingOrder) { - TH_TENSOR_DIM_APPLY2(real, rt_, long, ri_, dimension, - long i; + TH_TENSOR_DIM_APPLY2(real, rt_, int64_t, ri_, dimension, + int64_t i; for(i = 0; i < ri__size; i++) ri__data[i*ri__stride] = i; THTensor_(quicksortdescend)(rt__data, ri__data, 
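
Note: the randperm hunk above is a Fisher-Yates shuffle: fill 0..n-1, then swap slot i with a uniformly chosen slot in [i, n). A standalone sketch; rand_u64 is an assumed stand-in for THRandom_random, and the modulo carries a slight bias whenever (n - i) does not divide the generator's range:

#include <stdint.h>

extern uint64_t rand_u64(void);  /* assumed RNG */

static void randperm_sketch(int64_t *p, int64_t n) {
  for (int64_t i = 0; i < n; i++)
    p[i] = i;
  for (int64_t i = 0; i < n - 1; i++) {
    int64_t z = (int64_t)(rand_u64() % (uint64_t)(n - i));
    int64_t tmp = p[i];             /* swap p[i] with p[i+z], z in [0, n-i) */
    p[i] = p[i + z];
    p[i + z] = tmp;
  }
}
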
rt__size, rt__stride);) } else { - TH_TENSOR_DIM_APPLY2(real, rt_, long, ri_, dimension, - long i; + TH_TENSOR_DIM_APPLY2(real, rt_, int64_t, ri_, dimension, + int64_t i; for(i = 0; i < ri__size; i++) ri__data[i*ri__stride] = i; THTensor_(quicksortascend)(rt__data, ri__data, rt__size, rt__stride);) @@ -2321,9 +2321,9 @@ void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimensio public domain implementation at http://ndevilla.free.fr/median/median/ Adapted similarly to the above Quicksort algorithm. This version does not produce indices along with values. */ -static void THTensor_(quickselectnoidx)(real *arr, long k, long elements, long stride) +static void THTensor_(quickselectnoidx)(real *arr, int64_t k, int64_t elements, int64_t stride) { - long P, L, R, i, j, swap; + int64_t P, L, R, i, j, swap; real rswap, piv; L = 0; R = elements-1; @@ -2367,9 +2367,9 @@ static void THTensor_(quickselectnoidx)(real *arr, long k, long elements, long s /* Implementation of the Quickselect algorithm, based on Nicolas Devillard's public domain implementation at http://ndevilla.free.fr/median/median/ Adapted similarly to the above Quicksort algorithm. */ -static void THTensor_(quickselect)(real *arr, long *idx, long k, long elements, long stride) +static void THTensor_(quickselect)(real *arr, int64_t *idx, int64_t k, int64_t elements, int64_t stride) { - long P, L, R, i, j, swap, pid; + int64_t P, L, R, i, j, swap, pid; real rswap, piv; L = 0; R = elements-1; @@ -2423,8 +2423,8 @@ void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THTensor *temp_; THLongTensor *tempi_; real *temp__data; - long *tempi__data; - long t_size_dim; + int64_t *tempi__data; + int64_t t_size_dim; THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "dimension out of range"); @@ -2444,12 +2444,12 @@ void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THLongTensor_resize1d(tempi_, t_size_dim); tempi__data = THLongTensor_data(tempi_); - TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension, - long i; + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + int64_t i; real mode = 0; - long modei = 0; - long temp_freq = 0; - long max_freq = 0; + int64_t modei = 0; + int64_t temp_freq = 0; + int64_t max_freq = 0; for(i = 0; i < t_size_dim; i++) temp__data[i] = t_data[i*t_stride]; for(i = 0; i < t_size_dim; i++) @@ -2481,14 +2481,14 @@ void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int } } -void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension, int keepdim) +void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim) { THLongStorage *dim; THTensor *temp_; THLongTensor *tempi_; real *temp__data; - long *tempi__data; - long t_size_dim; + int64_t *tempi__data; + int64_t t_size_dim; THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "dimension out of range"); THArgCheck(k > 0 && k <= t->size[dimension], 2, "selected index out of range"); @@ -2509,8 +2509,8 @@ void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, THLongTensor_resize1d(tempi_, t_size_dim); tempi__data = THLongTensor_data(tempi_); - TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension, - long i; + TH_TENSOR_DIM_APPLY3(real, t, real, values_, int64_t, indices_, dimension, + int64_t i; for(i = 0; i < t_size_dim; i++) temp__data[i] = 
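
Note: quickselectnoidx/quickselect above are the workhorses behind medianall, kthvalue and topk: they partition until position k is fixed, never fully sorting. A minimal sketch of the same Wirth/Devillard scheme; after the call, a[k] holds the k-th smallest and everything left/right of k is <=/>= it:

#include <stdint.h>

void quickselect_sketch(float *a, int64_t k, int64_t n) {
  int64_t L = 0, R = n - 1;
  while (L < R) {
    float piv = a[k];
    int64_t i = L, j = R;
    do {
      while (a[i] < piv) i++;
      while (piv < a[j]) j--;
      if (i <= j) {
        float t = a[i]; a[i] = a[j]; a[j] = t;
        i++; j--;
      }
    } while (i <= j);
    if (j < k) L = i;   /* k-th element lies in the right part */
    if (k < i) R = j;   /* k-th element lies in the left part */
  }
}
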
t_data[i*t_stride]; for(i = 0; i < t_size_dim; i++) @@ -2529,7 +2529,7 @@ void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) { - long t_size_dim, k; + int64_t t_size_dim, k; THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "dimension out of range"); @@ -2539,12 +2539,12 @@ void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, i THTensor_(kthvalue)(values_, indices_, t, k+1, dimension, keepdim); } -void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int dim, int dir, int sorted) +void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted) { int numDims = THTensor_(nDimension)(t); THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); - long sliceSize = THTensor_(size)(t, dim); + int64_t sliceSize = THTensor_(size)(t, dim); THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); THTensor *tmpResults = THTensor_(new)(); @@ -2553,7 +2553,7 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int THLongTensor *tmpIndices = THLongTensor_new(); THLongTensor_resize1d(tmpIndices, sliceSize); - long *tmpi__data = THLongTensor_data(tmpIndices); + int64_t *tmpi__data = THLongTensor_data(tmpIndices); THLongStorage *topKSize = THTensor_(newSizeOf)(t); THLongStorage_set(topKSize, dim, k); @@ -2563,9 +2563,9 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int if (dir) { /* k largest elements, descending order (optional: see sorted) */ - long K = sliceSize - k; - TH_TENSOR_DIM_APPLY3(real, t, real, rt_, long, ri_, dim, - long i; + int64_t K = sliceSize - k; + TH_TENSOR_DIM_APPLY3(real, t, real, rt_, int64_t, ri_, dim, + int64_t i; for(i = 0; i < sliceSize; i++) { tmp__data[i] = t_data[i*t_stride]; @@ -2583,8 +2583,8 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int } else { /* k smallest elements, ascending order (optional: see sorted) */ - TH_TENSOR_DIM_APPLY3(real, t, real, rt_, long, ri_, dim, - long i; + TH_TENSOR_DIM_APPLY3(real, t, real, rt_, int64_t, ri_, dim, + int64_t i; for(i = 0; i < sliceSize; i++) { tmp__data[i] = t_data[i*t_stride]; @@ -2604,13 +2604,13 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int THLongTensor_free(tmpIndices); } -void THTensor_(tril)(THTensor *r_, THTensor *t, long k) +void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k) { - long t_size_0, t_size_1; - long t_stride_0, t_stride_1; - long r__stride_0, r__stride_1; + int64_t t_size_0, t_size_1; + int64_t t_stride_0, t_stride_1; + int64_t r__stride_0, r__stride_1; real *t_data, *r__data; - long r, c; + int64_t r, c; THArgCheck(THTensor_(nDimension)(t) == 2, 1, "expected a matrix"); @@ -2627,7 +2627,7 @@ void THTensor_(tril)(THTensor *r_, THTensor *t, long k) for(r = 0; r < t_size_0; r++) { - long sz = THMin(r+k+1, t_size_1); + int64_t sz = THMin(r+k+1, t_size_1); for(c = THMax(0, r+k+1); c < t_size_1; c++) r__data[r*r__stride_0+c*r__stride_1] = 0; for(c = 0; c < sz; c++) @@ -2635,13 +2635,13 @@ void THTensor_(tril)(THTensor *r_, THTensor *t, long k) } } -void THTensor_(triu)(THTensor *r_, THTensor *t, long k) +void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k) { - long t_size_0, t_size_1; - long t_stride_0, t_stride_1; - long r__stride_0, r__stride_1; + int64_t t_size_0, t_size_1; + int64_t t_stride_0, t_stride_1; + 
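
Note: the descending branch of topk above uses K = sliceSize - k as the selection point: after quickselecting the K-th smallest, the slice's tail [K, sliceSize) holds exactly the k largest elements, unordered; the optional sorted pass orders them afterwards. Sketch reusing the quickselect sketch from the previous note:

#include <stdint.h>

void quickselect_sketch(float *a, int64_t k, int64_t n);  /* from the note above */

/* copies the k largest of src[0..n-1] into out[0..k-1], unordered */
static void topk_largest(float *out, float *src, int64_t n, int64_t k) {
  int64_t K = n - k;                 /* partition boundary */
  quickselect_sketch(src, K, n);     /* src[K..n-1] now >= src[0..K-1] */
  for (int64_t i = 0; i < k; i++)
    out[i] = src[K + i];
}
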
int64_t r__stride_0, r__stride_1; real *t_data, *r__data; - long r, c; + int64_t r, c; THArgCheck(THTensor_(nDimension)(t) == 2, 1, "expected a matrix"); @@ -2658,7 +2658,7 @@ void THTensor_(triu)(THTensor *r_, THTensor *t, long k) for(r = 0; r < t_size_0; r++) { - long sz = THMin(r+k, t_size_1); + int64_t sz = THMin(r+k, t_size_1); for(c = THMax(0, r+k); c < t_size_1; c++) r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1]; for(c = 0; c < sz; c++) @@ -2678,7 +2678,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int { THLongStorage *size; int i, j; - long offset; + int64_t offset; int maxDim = dimension + 1; int allEmpty = 1; int allContiguous = 1; @@ -2706,7 +2706,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int for(i = 0; i < maxDim; i++) { // dimSize is either the size of the dim if it exists, either 1 if #dim > 0, otherwise 0 - long dimSize = i < inputs[0]->nDimension ? inputs[0]->size[i] : THMin(inputs[0]->nDimension, 1); + int64_t dimSize = i < inputs[0]->nDimension ? inputs[0]->size[i] : THMin(inputs[0]->nDimension, 1); if (i == cat_dimension) { for (j = 1; j < numInputs; j++) @@ -2720,7 +2720,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int { for (j = 1; j < numInputs; j++) { - long sz = (i < inputs[j]->nDimension ? inputs[j]->size[i] : THMin(inputs[j]->nDimension, 1)); + int64_t sz = (i < inputs[j]->nDimension ? inputs[j]->size[i] : THMin(inputs[j]->nDimension, 1)); // If it's a dimension we're not catting on // Then fail if sizes are different AND > 0 if (dimSize != sz && dimSize && sz) @@ -2764,7 +2764,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int { THTensor* input0 = inputs[j]; real* input0_data = input0->storage->data + input0->storageOffset; - long input0_size = THTensor_(nElement)(input0); + int64_t input0_size = THTensor_(nElement)(input0); memcpy(result_data + offset, input0_data, input0_size*sizeof(real)); offset += input0_size; } @@ -2777,7 +2777,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int { if (inputs[j]->nDimension) { - long dimSize = cat_dimension < inputs[j]->nDimension ? inputs[j]->size[cat_dimension] : 1; + int64_t dimSize = cat_dimension < inputs[j]->nDimension ? 
inputs[j]->size[cat_dimension] : 1; THTensor *nt = THTensor_(newWithTensor)(result); THTensor_(narrow)(nt, NULL, cat_dimension, offset, dimSize); THTensor_(copy)(nt, inputs[j]); @@ -2859,7 +2859,7 @@ TENSOR_IMPLEMENT_LOGICAL(ne,!=) #if defined(TH_REAL_IS_LONG) LAB_IMPLEMENT_BASIC_FUNCTION(abs,labs) LAB_IMPLEMENT_BASIC_FUNCTION(neg,-) -#endif /* long only part */ +#endif /* int64_t only part */ #if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs) @@ -2983,7 +2983,7 @@ void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int ke TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal sum = 0; accreal sum2 = 0; - long i; + int64_t i; for(i = 0; i < t_size; i++) { real z = t_data[i*t_stride]; @@ -3028,7 +3028,7 @@ void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int biased, int ke TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal sum = 0; accreal sum2 = 0; - long i; + int64_t i; for(i = 0; i < t_size; i++) { real z = t_data[i*t_stride]; @@ -3073,14 +3073,14 @@ void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int k if(value == 0) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal sum = 0; - long i; + int64_t i; for(i = 0; i < t_size; i++) sum += t_data[i*t_stride] != 0.0; *r__data = sum;) } else { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal sum = 0; - long i; + int64_t i; for(i = 0; i < t_size; i++) { sum += TH_MATH_NAME(pow)( TH_MATH_NAME(fabs)(t_data[i*t_stride]), value); @@ -3190,7 +3190,7 @@ accreal THTensor_(stdall)(THTensor *tensor, int biased) return sqrt(THTensor_(varall)(tensor, biased)); } -void THTensor_(linspace)(THTensor *r_, real a, real b, long n) +void THTensor_(linspace)(THTensor *r_, real a, real b, int64_t n) { real i = 0; @@ -3210,7 +3210,7 @@ void THTensor_(linspace)(THTensor *r_, real a, real b, long n) } } -void THTensor_(logspace)(THTensor *r_, real a, real b, long n) +void THTensor_(logspace)(THTensor *r_, real a, real b, int64_t n) { real i = 0; @@ -3242,7 +3242,7 @@ void THTensor_(randn)(THTensor *r_, THGenerator *_generator, THLongStorage *size THTensor_(normal)(r_, _generator, 0, 1); } -void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue) +void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue) { real minval; real maxval; @@ -3273,7 +3273,7 @@ void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalu ); } -void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue) +void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue) { THArgCheck(THTensor_(nDimension)(tensor) < 3, 2, "invalid dimension %d, the input must be a 2d tensor", THTensor_(nDimension)(tensor)); @@ -3301,7 +3301,7 @@ void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minval maxval = maxval + 1; } - TH_TENSOR_DIM_APPLY2(real, tensor, real, hist, dimension, long i; + TH_TENSOR_DIM_APPLY2(real, tensor, real, hist, dimension, int64_t i; for(i = 0; i < tensor_size; i++) { if(tensor_data[i*tensor_stride] >= minval && tensor_data[i*tensor_stride] <= maxval) { diff --git a/torch/lib/TH/generic/THTensorMath.h b/torch/lib/TH/generic/THTensorMath.h index 5f3870176dd0..186ef58894a4 100644 --- a/torch/lib/TH/generic/THTensorMath.h +++ b/torch/lib/TH/generic/THTensorMath.h @@ -73,7 +73,7 @@ TH_API void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real 
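
Note: the histc/bhistc hunks above map a value into its bin with a single multiply, bin = (x - minval) / (maxval - minval) * nbins, folding x == maxval into the last bin. A self-contained sketch of that binning rule:

#include <stdint.h>

static void histc_sketch(float *hist, int64_t nbins, const float *x, int64_t n,
                         float minval, float maxval) {
  for (int64_t b = 0; b < nbins; b++)
    hist[b] = 0.f;
  for (int64_t i = 0; i < n; i++) {
    if (x[i] >= minval && x[i] <= maxval) {
      int64_t b = (int64_t)((x[i] - minval) / (maxval - minval) * nbins);
      if (b == nbins) b--;           /* x[i] == maxval falls into the last bin */
      hist[b] += 1.f;
    }
  }
}
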
gain TH_API ptrdiff_t THTensor_(numel)(THTensor *t); TH_API void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); TH_API void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); -TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension, int keepdim); +TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim); TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); TH_API void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); TH_API void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim); @@ -94,16 +94,16 @@ TH_API void THTensor_(zerosLike)(THTensor *r_, THTensor *input); TH_API void THTensor_(ones)(THTensor *r_, THLongStorage *size); TH_API void THTensor_(onesLike)(THTensor *r_, THTensor *input); TH_API void THTensor_(diag)(THTensor *r_, THTensor *t, int k); -TH_API void THTensor_(eye)(THTensor *r_, long n, long m); +TH_API void THTensor_(eye)(THTensor *r_, int64_t n, int64_t m); TH_API void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step); TH_API void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step); -TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, long n); +TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n); TH_API void THTensor_(reshape)(THTensor *r_, THTensor *t, THLongStorage *size); TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder); -TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int dim, int dir, int sorted); -TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, long k); -TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, long k); +TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted); +TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k); +TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k); TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension); TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension); @@ -176,16 +176,16 @@ TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int biased, TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim); TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm); TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value); -TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue); -TH_API void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue); +TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue); +TH_API void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, int64_t nbins, real minvalue, real maxvalue); TH_API accreal THTensor_(meanall)(THTensor *self); TH_API accreal THTensor_(varall)(THTensor *self, int biased); TH_API accreal THTensor_(stdall)(THTensor *self, int biased); TH_API accreal THTensor_(normall)(THTensor *t, real value); -TH_API void THTensor_(linspace)(THTensor *r_, real a, 
real b, long n); -TH_API void THTensor_(logspace)(THTensor *r_, real a, real b, long n); +TH_API void THTensor_(linspace)(THTensor *r_, real a, real b, int64_t n); +TH_API void THTensor_(logspace)(THTensor *r_, real a, real b, int64_t n); TH_API void THTensor_(rand)(THTensor *r_, THGenerator *_generator, THLongStorage *size); TH_API void THTensor_(randn)(THTensor *r_, THGenerator *_generator, THLongStorage *size); #endif diff --git a/torch/lib/TH/generic/THTensorRandom.c b/torch/lib/TH/generic/THTensorRandom.c index 21359a160446..072fe9f5e975 100644 --- a/torch/lib/TH/generic/THTensorRandom.c +++ b/torch/lib/TH/generic/THTensorRandom.c @@ -5,15 +5,15 @@ void THTensor_(random)(THTensor *self, THGenerator *_generator) { #if defined(TH_REAL_IS_BYTE) - TH_TENSOR_APPLY(real, self, *self_data = (unsigned char)(THRandom_random(_generator) % (UCHAR_MAX+1));); + TH_TENSOR_APPLY(real, self, *self_data = (uint8_t)(THRandom_random(_generator) % (UINT8_MAX+1));); #elif defined(TH_REAL_IS_CHAR) - TH_TENSOR_APPLY(real, self, *self_data = (char)(THRandom_random(_generator) % (CHAR_MAX+1));); + TH_TENSOR_APPLY(real, self, *self_data = (int8_t)(THRandom_random(_generator) % (INT8_MAX+1));); #elif defined(TH_REAL_IS_SHORT) - TH_TENSOR_APPLY(real, self, *self_data = (short)(THRandom_random(_generator) % (SHRT_MAX+1));); + TH_TENSOR_APPLY(real, self, *self_data = (int16_t)(THRandom_random(_generator) % (INT16_MAX+1));); #elif defined(TH_REAL_IS_INT) - TH_TENSOR_APPLY(real, self, *self_data = (int)(THRandom_random(_generator) % (INT_MAX+1UL));); + TH_TENSOR_APPLY(real, self, *self_data = (int32_t)(THRandom_random(_generator) % (INT32_MAX+1UL));); #elif defined(TH_REAL_IS_LONG) - TH_TENSOR_APPLY(real, self, *self_data = (long)(THRandom_random(_generator) % (LONG_MAX+1UL));); + TH_TENSOR_APPLY(real, self, *self_data = (int64_t)(THRandom_random(_generator) % (LONG_MAX+1UL));); #elif defined(TH_REAL_IS_FLOAT) TH_TENSOR_APPLY(real, self, *self_data = (float)(THRandom_random(_generator) % ((1UL << FLT_MANT_DIG)+1));); #elif defined(TH_REAL_IS_DOUBLE) @@ -23,12 +23,12 @@ void THTensor_(random)(THTensor *self, THGenerator *_generator) #endif } -void THTensor_(clampedRandom)(THTensor *self, THGenerator *_generator, long min, long max) { +void THTensor_(clampedRandom)(THTensor *self, THGenerator *_generator, int64_t min, int64_t max) { THArgCheck(max > min, 2, "max must be greater than min"); TH_TENSOR_APPLY(real, self, *self_data = (real)((THRandom_random(_generator) % (max - min)) + min);) } -void THTensor_(cappedRandom)(THTensor *self, THGenerator *_generator, long max) { +void THTensor_(cappedRandom)(THTensor *self, THGenerator *_generator, int64_t max) { THArgCheck(max > 0, 1, "max must be positive"); THTensor_(clampedRandom)(self, _generator, 0, max); } @@ -106,18 +106,18 @@ void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor *q) { - long inputsize = THTensor_(nElement)(probs); - long i = 0; + int64_t inputsize = THTensor_(nElement)(probs); + int64_t i = 0; THLongTensor *smaller = THLongTensor_newWithSize1d(inputsize); THLongTensor *larger = THLongTensor_newWithSize1d(inputsize); - long small_c = 0; - long large_c = 0; + int64_t small_c = 0; + int64_t large_c = 0; THLongTensor_resize1d(J, inputsize); THTensor_(resize1d)(q, inputsize); real *q_data = THTensor_(data)(q); - long *J_data = THLongTensor_data(J); + int64_t *J_data = THLongTensor_data(J); - for(i = 0; i < inputsize; i++) + for (i = 0; i < 
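
Note: one inconsistency worth flagging in the THTensor_(random) hunk above: the Byte/Char/Short/Int branches move to width-matched UINT8_MAX/INT8_MAX/INT16_MAX/INT32_MAX bounds, but the TH_REAL_IS_LONG branch keeps LONG_MAX+1UL while its cast becomes int64_t. On LLP64 targets LONG_MAX is only 2^31-1, so that branch would not span the full 64-bit range. Assuming the generator can supply 64 random bits, a width-matched bound would look like this hypothetical follow-up (not in this patch):

/* hypothetical: cover the whole non-negative int64_t range */
*self_data = (int64_t)(THRandom_random(_generator) % ((uint64_t)INT64_MAX + 1));
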
inputsize; i++) { THTensor_fastSet1d(J, i, 0L); real val = THTensor_fastGet1d(probs, i); @@ -138,8 +138,8 @@ void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor // Loop through and create little binary mixtures that // appropriately allocate the larger outcomes over the // overall uniform mixture. - long large, small; - while(small_c > 0 && large_c > 0) + int64_t large, small; + while (small_c > 0 && large_c > 0) { large = THTensor_fastGet1d(larger, large_c-1); small = THTensor_fastGet1d(smaller, small_c-1); @@ -162,26 +162,26 @@ void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor real q_min = THTensor_fastGet1d(q, inputsize-1); real q_max = q_min; real q_temp; - for(i=0; i < inputsize; i++) + for (i=0; i < inputsize; i++) { q_temp = THTensor_fastGet1d(q, i); - if(q_temp < q_min) + if (q_temp < q_min) q_min = q_temp; - else if(q_temp > q_max) + else if (q_temp > q_max) q_max = q_temp; } THArgCheckWithCleanup((q_min > 0), THCleanup(THLongTensor_free(smaller); THLongTensor_free(larger);), 2, "q_min is less than 0"); - if(q_max > 1) + if (q_max > 1) { - for(i=0; i < inputsize; i++) + for (i=0; i < inputsize; i++) { q_data[i*q->stride[0]] /= q_max; } } - for(i=0; i < inputsize; i++) + for (i=0; i < inputsize; i++) [...] diff --git a/torch/lib/TH/generic/simd/convolve.c b/torch/lib/TH/generic/simd/convolve.c [...] -void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols) { +void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols); +void convolve_5x5_avx(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols); + +void convolve_5x5(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t inCols) { #if defined(__AVX__) int avx = haveCPUFeature(kCPUFeature_AVX); if (avx) diff --git a/torch/lib/TH/generic/simd/convolve.h b/torch/lib/TH/generic/simd/convolve.h index 7b9b04c50c09..fa04ce9aa70a 100644 --- a/torch/lib/TH/generic/simd/convolve.h +++ b/torch/lib/TH/generic/simd/convolve.h @@ -1 +1 @@ -void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols); \ No newline at end of file +void convolve_5x5(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t inCols); \ No newline at end of file diff --git a/torch/lib/TH/generic/simd/convolve5x5_avx.c b/torch/lib/TH/generic/simd/convolve5x5_avx.c index 52b6d0ffb550..560474ba53f3 100644 --- a/torch/lib/TH/generic/simd/convolve5x5_avx.c +++ b/torch/lib/TH/generic/simd/convolve5x5_avx.c @@ -1,72 +1,74 @@ #include <immintrin.h> #include "common_simd.h" +#include <stdint.h> + #define CLEAR_AVX() _mm256_zeroupper() -void convolve_5x5_1_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; +void convolve_5x5_1_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; DECLARE_OUTPUT_1() for (; i < alignedCount; i+=8) { CONVOLVE_8COLS_XROWS(1, i) } } -void convolve_5x5_2_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; +void convolve_5x5_2_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; DECLARE_OUTPUT_2() for (; i < alignedCount; i+=8) { CONVOLVE_8COLS_XROWS(2, i) }
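
Note: multinomialAliasSetup above is Walker's alias method (in Vose's formulation): scale each probability by n, split outcomes into a "smaller" and a "larger" worklist, and pair one of each until every slot is a two-way mixture; sampling then costs one uniform index plus one uniform threshold test. Standalone sketch with invented names, assuming probs sums to 1:

#include <stdint.h>
#include <stdlib.h>

static void alias_setup(const double *probs, int64_t n, double *q, int64_t *J) {
  int64_t *small = malloc((size_t)n * sizeof *small);
  int64_t *large = malloc((size_t)n * sizeof *large);
  int64_t ns = 0, nl = 0;
  for (int64_t i = 0; i < n; i++) {
    J[i] = 0;
    q[i] = probs[i] * (double)n;          /* the uniform share is exactly 1 */
    if (q[i] < 1.0) small[ns++] = i; else large[nl++] = i;
  }
  while (ns > 0 && nl > 0) {              /* pair a deficit with a surplus */
    int64_t s = small[--ns], l = large[--nl];
    J[s] = l;                             /* outcome s tops up from l */
    q[l] -= 1.0 - q[s];
    if (q[l] < 1.0) small[ns++] = l; else large[nl++] = l;
  }
  free(small); free(large);
  /* to sample: i = uniform in [0,n), u = uniform in [0,1);
     return u < q[i] ? i : J[i]; */
}
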
} -void convolve_5x5_4_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; +void convolve_5x5_4_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; DECLARE_OUTPUT_4() for (; i < alignedCount; i+=8) { CONVOLVE_8COLS_XROWS(4, i) } } -void convolve_5x5_5_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; +void convolve_5x5_5_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; DECLARE_OUTPUT_5() for (; i < alignedCount; i+=8) { CONVOLVE_8COLS_XROWS(5, i) } } -void convolve_5x5_6_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; +void convolve_5x5_6_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; DECLARE_OUTPUT_6() for (; i < alignedCount; i+=8) { CONVOLVE_8COLS_XROWS(6, i) } } -void convolve_5x5_7_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; +void convolve_5x5_7_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; DECLARE_OUTPUT_7() for (; i < alignedCount; i+=8) { CONVOLVE_8COLS_XROWS(7, i) } } -void convolve_5x5_8_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; +void convolve_5x5_8_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount = count & 0xFFFFFFF8; DECLARE_OUTPUT_8() for (; i < alignedCount; i+=8) { CONVOLVE_8COLS_XROWS(8, i) } } -void convolve_5x5_64x64_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { +void convolve_5x5_64x64_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { for(int i = 0; i < 60; i+=6) { DECLARE_OUTPUT_6() @@ -92,7 +94,7 @@ void convolve_5x5_64x64_avx(float* output, float* image, float* weight, long cou CONVOLVE_8COLS_XROWS(4, 56) } -void convolve_5x5_32x32_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { +void convolve_5x5_32x32_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { for(int i = 0; i < 30; i+=6) { DECLARE_OUTPUT_6() @@ -110,7 +112,7 @@ void convolve_5x5_32x32_avx(float* output, float* image, float* weight, long cou CONVOLVE_8COLS_XROWS(2, 24) } -void convolve_5x5_16x16_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { +void convolve_5x5_16x16_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { for(int i = 0; i < 12; i+=6) { DECLARE_OUTPUT_6() @@ -124,16 +126,16 @@ void convolve_5x5_16x16_avx(float* output, float* image, float* weight, long cou 
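
Note: one thing these AVX hunks change only half-way: count becomes int64_t, but the alignment mask stays the 32-bit literal 0xFFFFFFF8, which also clears bits 32-63 of a 64-bit count. Harmless for realistic convolution widths, but the portable way to round down to a multiple of 8 would be:

#include <stdint.h>

static int64_t aligned_count(int64_t count) {
  return count & ~(int64_t)7;   /* == count - (count % 8) for count >= 0 */
}
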
CONVOLVE_8COLS_XROWS(4, 8) } -void convolve_5x5_8x8_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { +void convolve_5x5_8x8_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { DECLARE_OUTPUT_8() CONVOLVE_8COLS_XROWS(8, 0) } -void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols); +void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols); -void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols) { - long ic = inCols; - long yy = 0; +void convolve_5x5_avx(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols) { + int64_t ic = inCols; + int64_t yy = 0; float* t_ = input; float* r_ = output; float* k_ = kernel; @@ -201,8 +203,8 @@ void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, r_ += (outStride * 1); } - long procCols = outCols & 0xFFFFFFF8; // avx version processes 8 cols at a time - long remCols = outCols - procCols; + int64_t procCols = outCols & 0xFFFFFFF8; // avx version processes 8 cols at a time + int64_t remCols = outCols - procCols; //process the rest using sse if( remCols > 0) { diff --git a/torch/lib/TH/generic/simd/convolve5x5_sse.c b/torch/lib/TH/generic/simd/convolve5x5_sse.c index 04dc41b61609..9de9a4a4c084 100644 --- a/torch/lib/TH/generic/simd/convolve5x5_sse.c +++ b/torch/lib/TH/generic/simd/convolve5x5_sse.c @@ -1,11 +1,12 @@ #include [...] #include "common_simd.h" +#include <stdint.h> /* SSE variants */ -void convolve_5x5_1_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count & 0xFFFFFFFC; +void convolve_5x5_1_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; DECLARE_OUTPUT_1() for (; i < alignedCount4; i+=4) { CONVOLVE_4COLS_XROWS(1, i) @@ -23,9 +24,9 @@ void convolve_5x5_1_sse(float* output, float* image, float* weight, long count, } } -void convolve_5x5_2_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count & 0xFFFFFFFC; +void convolve_5x5_2_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; DECLARE_OUTPUT_2() for (; i < alignedCount4; i+=4) { CONVOLVE_4COLS_XROWS(2, i) @@ -46,9 +47,9 @@ void convolve_5x5_2_sse(float* output, float* image, float* weight, long count, } } -void convolve_5x5_4_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count & 0xFFFFFFFC; +void convolve_5x5_4_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; DECLARE_OUTPUT_4() for (; i < alignedCount4; i+=4) { CONVOLVE_4COLS_XROWS(4, i) @@ -75,9 +76,9 @@ void convolve_5x5_4_sse(float* output, float* image, float* weight, long count, } } -void convolve_5x5_6_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count &
0xFFFFFFFC; +void convolve_5x5_6_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; DECLARE_OUTPUT_6() for (; i < alignedCount4; i+=4) { CONVOLVE_4COLS_XROWS(6, i) @@ -110,9 +111,9 @@ void convolve_5x5_6_sse(float* output, float* image, float* weight, long count, } } -void convolve_5x5_8_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count & 0xFFFFFFFC; +void convolve_5x5_8_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { + int64_t i = 0; + int64_t alignedCount4 = count & 0xFFFFFFFC; DECLARE_OUTPUT_8() for (; i < alignedCount4; i+=4) { CONVOLVE_4COLS_XROWS(8, i) @@ -154,7 +155,7 @@ void convolve_5x5_8_sse(float* output, float* image, float* weight, long count, #define UNROLL_SSE_CONVOLUTION 0 #if (UNROLL_SSE_CONVOLUTION) -void convolve_5x5_64x64_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { +void convolve_5x5_64x64_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { for(int i = 0; i < 60; i+=6) { DECLARE_OUTPUT_6() @@ -196,7 +197,7 @@ void convolve_5x5_64x64_sse(float* output, float* image, float* weight, long cou CONVOLVE_4COLS_XROWS(4, 60) } -void convolve_5x5_32x32_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { +void convolve_5x5_32x32_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { for(int i = 0; i < 30; i+=6) { DECLARE_OUTPUT_6() @@ -224,7 +225,7 @@ void convolve_5x5_32x32_sse(float* output, float* image, float* weight, long cou CONVOLVE_4COLS_XROWS(2, 28) } -void convolve_5x5_16x16_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { +void convolve_5x5_16x16_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { for(int i = 0; i < 12; i+=6) { DECLARE_OUTPUT_6() @@ -242,7 +243,7 @@ void convolve_5x5_16x16_sse(float* output, float* image, float* weight, long cou CONVOLVE_4COLS_XROWS(4, 12) } -void convolve_5x5_8x8_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { +void convolve_5x5_8x8_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) { DECLARE_OUTPUT_8() CONVOLVE_4COLS_XROWS(8, 0) CONVOLVE_4COLS_XROWS(8, 4) @@ -250,8 +251,8 @@ void convolve_5x5_8x8_sse(float* output, float* image, float* weight, long count #endif -void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols) { - long yy = 0; +void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols) { + int64_t yy = 0; float* t_ = input; float* r_ = output; float* k_ = kernel; diff --git a/torch/lib/TH/vector/NEON.c b/torch/lib/TH/vector/NEON.c index 7920fb13b142..3966acefa791 100644 --- a/torch/lib/TH/vector/NEON.c +++ b/torch/lib/TH/vector/NEON.c @@ -1,5 +1,5 @@ static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) { - long i = 0; + int64_t i = 0; for(; i < n-4; i += 4) { @@ -15,7 +15,7 @@ static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) } static void 
THFloatVector_cmul_NEON(float *z, const float *x, const float* y, const ptrdiff_t n) { - long i = 0; + int64_t i = 0; for(; i < n-4; i += 4) { @@ -30,7 +30,7 @@ static void THFloatVector_cmul_NEON(float *z, const float *x, const float* y, co } static void THFloatVector_muls_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { - long i = 0; + int64_t i = 0; for(; i < n-4; i += 4) { @@ -45,7 +45,7 @@ static void THFloatVector_muls_NEON(float *y, const float *x, const float c, con } static void THFloatVector_cadd_NEON(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { - long i = 0; + int64_t i = 0; for(;i < n-4; i += 4) { @@ -60,7 +60,7 @@ static void THFloatVector_cadd_NEON(float *z, const float *x, const float *y, co } static void THFloatVector_adds_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { - long i = 0; + int64_t i = 0; for(;i < n-4; i += 4) { @@ -75,7 +75,7 @@ static void THFloatVector_adds_NEON(float *y, const float *x, const float c, con } static void THFloatVector_cdiv_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) { - long i = 0; + int64_t i = 0; for(;i < n-4; i += 4) { @@ -90,7 +90,7 @@ static void THFloatVector_cdiv_NEON(float *z, const float *x, const float *y, co } static void THFloatVector_divs_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { - long i = 0; + int64_t i = 0; for(;i < n-4; i += 4) {
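
Note: the NEON kernels in this last file keep their four-at-a-time loop structure and only swap the induction variable to int64_t. For comparison, a hedged intrinsics version of the fill kernel might look like this sketch (not TH code; it also handles the scalar tail explicitly):

#include <arm_neon.h>
#include <stddef.h>

static void fill_neon_sketch(float *x, const float c, const ptrdiff_t n) {
  ptrdiff_t i = 0;
  float32x4_t cv = vdupq_n_f32(c);   /* broadcast c into all 4 lanes */
  for (; i + 4 <= n; i += 4)
    vst1q_f32(x + i, cv);
  for (; i < n; i++)                 /* scalar tail */
    x[i] = c;
}
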