<a href="https://colab.research.google.com/github/kobi3028/AttacksonImplementationsCourseBook/blob/master/Labs/CacheSizeTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Cache size test

This cache size test is described by [Henry Wong on his blog](https://blog.stuffedcow.net/2013/01/ivb-cache-replacement/).

The access pattern used in the test is a random cyclic permutation, where each cache line (64 bytes) in an array is accessed exactly once in a random order before the sequence repeats. Each access is data-dependent so this measures the full access latency (not bandwidth) of the cache.

The ***cache size test*** can run on Google Colab or on a private computer. When it runs on Google Colab, it measures the cache size of the Google server that hosts the notebook.


In [1]:
# Open the file in google colab https://colab.research.google.com/github/Yossioren/AttacksonImplementationsCourseBook/blob/master/Labs/CacheSizeTest.ipynb
import numpy as np
from scipy import stats

!pip install -q bokeh
from bokeh.plotting import figure, show
from bokeh.models import Range1d
from bokeh.io import output_notebook
from bokeh.models.tickers import FixedTicker
from bokeh.models.formatters import PrintfTickFormatter
# Call once to configure Bokeh to display plots inline in the notebook.
output_notebook()

import mmap
import time
import gc

import ctypes
!pip install -q keystone-engine
from keystone import *

# Keystone assembler instance for x86 in 64-bit mode; the cells below use it
# to turn assembly text into machine-code bytes.
ks = Ks(KS_ARCH_X86, KS_MODE_64)

# Number of stack bytes the generated routine reserves in its prologue
# (sub rsp, ...) and releases again in its epilogue (add rsp, ...).
STACK_SPACE = 0x1000

[K     |████████████████████████████████| 1.8MB 5.7MB/s 
[?25h

## Script parameters
NUM_READ - number of random array accesses  

NUM_TEST - test every array size NUM_TEST times 

ARRAY_SIZE_POWER - number of array sizes to test; the array used for random access doubles in size from 1 KByte up to 2^(ARRAY_SIZE_POWER-1) KBytes

ASM_BLOCK_SIZE - number of data-dependent load instructions unrolled in each iteration of the measurement loop

In [2]:
NUM_READ = (1<<20)    # total number of dependent loads timed in one measurement
NUM_TEST = 0x10       # number of timed repetitions for every array size
ARRAY_SIZE_POWER = 23 # number of array sizes tested: 2^10, 2^11, ..., 2^(10+ARRAY_SIZE_POWER-1) bytes
ASM_BLOCK_SIZE = 128  # dependent loads unrolled per loop iteration of the generated routine

In [3]:
# rdtsc func: a native stub, callable from Python, that returns a 64-bit
# tick count. After rdtsc, rdx is shifted left by 32 bits and OR-ed into rax
# so the whole value comes back in a single register.
rdtsc_asm = '''
rdtsc
shl rdx, 32
or  rax, rdx
ret
'''
byte_code, count = ks.asm(rdtsc_asm)
# NOTE(review): the listing holds four instructions, yet the expected
# statement count is 5 -- presumably keystone counts one extra statement for
# this input; confirm before editing the assembly text.
assert count == 5

#allocate memory
# One anonymous page mapped readable, writable and executable, so the bytes
# written into it can be called directly.
rdtsc_buf = mmap.mmap(-1, mmap.PAGESIZE, prot=mmap.PROT_READ | mmap.PROT_WRITE | mmap.PROT_EXEC)
rdtsc_buf.write(bytes(byte_code))

#convert to function
# from_buffer() yields a ctypes object backed by the mapping, so addressof()
# of that object is the address of the first code byte.
fpointer = ctypes.c_void_p.from_buffer(rdtsc_buf)
func_type = ctypes.CFUNCTYPE(ctypes.c_uint64)  # no arguments, returns an unsigned 64-bit integer
rdtsc = ctypes.cast(ctypes.addressof(fpointer), func_type)

In [4]:
#alloc read write execute memory
# 0x100 pages of anonymous memory mapped readable, writable and executable.
# The measurement cell writes the generated routine into this buffer and
# calls it from there.
fbuf_size = mmap.PAGESIZE*0x100
fbuf = mmap.mmap(-1, fbuf_size, prot=mmap.PROT_READ | mmap.PROT_WRITE | mmap.PROT_EXEC)

# Address of the first byte of the mapping, printed for reference.
fpointer = ctypes.c_void_p.from_buffer(fbuf)
fpointer_address = ctypes.addressof(fpointer)
print('machine code buf address:', hex(fpointer_address))

machine code buf address: 0x7fd57aa57000


In [5]:
def make_routine(dpointer_address, asm_its):
  """Assemble the pointer-chasing timing routine and return its machine code.

  The generated code saves a few registers, loads ``dpointer_address`` into
  rcx and ``asm_its`` into rax, and then runs a loop whose body is
  ASM_BLOCK_SIZE back-to-back ``mov rcx,[rcx]`` loads. Each load takes its
  address from the value produced by the previous load, so the loads form one
  dependency chain. The loop counter is decremented through eax, so eax is 0
  when the routine returns.

  Args:
    dpointer_address: address of the first element of the pointer-chasing
      buffer; every 8-byte element is expected to hold the address of another
      element of the same buffer.
    asm_its: number of loop iterations; the routine performs
      asm_its * ASM_BLOCK_SIZE loads in total.

  Returns:
    bytes: the machine code, padded with nops to a multiple of mmap.PAGESIZE.

  Raises:
    ValueError: if asm_its is outside 1 .. 2**32 - 1. The counter is
      decremented as a 32-bit register and tested for zero only after the
      decrement, so 0 (or any multiple of 2**32) would run the loop about
      2**32 times instead of the requested count.
  """
  if not 0 < asm_its < (1 << 32):
    raise ValueError(f'asm_its must be in 1..2**32-1, got {asm_its}')

  def asm(text):
    """Assemble a single instruction and return its encoding as bytes."""
    return bytes(ks.asm(text)[0])

  p1 = dpointer_address
  saved_registers = ('rbx', 'rbp', 'rsi', 'rdi', 'r8', 'r9')

  #assemble intel assembly to intel x86_64 CPU byte code
  code = b''
  code += asm('xchg   ax,ax')*8

  # Prologue: save registers and reserve STACK_SPACE bytes of stack.
  for reg in saved_registers:
    code += asm(f'push   {reg}')
  code += asm(f'sub    rsp,{STACK_SPACE}')
  code += asm('xor    r8d, r8d')
  code += asm('lea    r9,[rsp]')
  code += asm(f'movabs rcx,{p1}')       #start of the pointer chain
  code += asm(f'movabs rax,{asm_its}')  #counter

  # Subtracting zero leaves every register value unchanged.
  # NOTE(review): the purpose of this filler is not evident from the code --
  # presumably inherited from the benchmark this notebook is based on;
  # confirm before removing it.
  for reg in ('rbx', 'rbp', 'rsi', 'rdi', 'r8', 'r9',
              'r10', 'r11', 'r12', 'r13', 'r14', 'r15'):
    code += asm(f'sub    {reg},0x0')

  # Pad with nops up to the next 16-byte boundary so the loop body starts
  # aligned. (-len) % 16 is the distance to that boundary; the previous
  # len % 16 only produced an aligned start by coincidence.
  code += asm('nop')*((-len(code)) % 0x10)

  loop_start = len(code)
  code += asm('mov    rcx,QWORD PTR [rcx]')*ASM_BLOCK_SIZE

  code += asm('sub   eax, 0x1') #dec counter
  # NOTE(review): the branch operand is computed by hand and handed to
  # keystone as a plain number; it is deliberately left exactly as it was.
  # Confirm against keystone's treatment of numeric branch targets before
  # changing the "- 4" correction.
  code += asm(f'jne   {loop_start - len(code) - 4}') #loop if eax != 0

  code += asm('xchg  ax, ax')*8

  # Epilogue: release the stack space and restore registers in reverse order.
  code += asm(f'add   rsp,{STACK_SPACE}')
  for reg in reversed(saved_registers):
    code += asm(f'pop   {reg}')

  code += asm('emms')
  code += asm('ret')
  code += asm('nop')*((-len(code)) % mmap.PAGESIZE) #pad byte code with nops
  return code

In [6]:
#init data buf
def init_dbuf(dbuf_size):
  """Build a pointer-chasing buffer whose elements form one random cycle.

  The buffer holds dbuf_size // 8 unsigned 64-bit elements. Each element
  stores the absolute address of another element of the same buffer. The
  links form a single cycle through every element, so a chase that starts at
  the returned address (element 0) visits the whole buffer exactly once
  before it repeats. A plain random shuffle does not guarantee this: it
  usually splits into several shorter cycles, and the chase would then touch
  only part of the buffer.

  Args:
    dbuf_size: buffer size in bytes; rounded down to a multiple of 8.

  Returns:
    tuple: (address of element 0 as int, the numpy uint64 array). The caller
    must keep the array alive for as long as the address is in use.

  Raises:
    ValueError: if dbuf_size is smaller than one 8-byte element.
  """
  size = dbuf_size//8
  if size < 1:
    raise ValueError('dbuf_size must be at least 8 bytes')

  dbuf = np.empty(size, dtype=np.uint64)
  dpointer_address = dbuf.ctypes.data

  # Random visiting order of all element indices. Linking every index to the
  # one that follows it in this order, and the last back to the first, gives
  # one cycle that contains every element.
  order = np.random.permutation(size)
  dbuf[order[:-1]] = order[1:]
  dbuf[order[-1]] = order[0]

  # Turn successor indices into absolute addresses, in place to avoid another
  # full-size temporary for large buffers.
  dbuf *= np.uint64(8)
  dbuf += np.uint64(dpointer_address)

  return dpointer_address, dbuf

In [7]:
#LENGTH = 256*1024*1024 #256MB
# mmap flag values; not referenced anywhere in this cell.
MAP_HUGETLB = 0x40000
MAP_POPULATE = 0x08000

# Maps array size in bytes -> list of NUM_TEST measurements, each one the
# number of timer ticks per access.
res_arr = {}

#convert byte code to python function
# The routine is always written at the start of fbuf, so the callable only
# has to be created once; each array size just replaces the bytes behind it.
routine_pointer = ctypes.c_void_p.from_buffer(fbuf)
routine_type = ctypes.CFUNCTYPE(ctypes.c_int)
routine = ctypes.cast(ctypes.addressof(routine_pointer), routine_type)

for size_power in range(ARRAY_SIZE_POWER):
  length = (1<<(size_power+10))
  res_arr[length] = []

  dpointer_address, dbuf = init_dbuf(length)

  routine_code = make_routine(dpointer_address, NUM_READ//ASM_BLOCK_SIZE)

  # Rewind before writing, so the code lands at the address `routine` points
  # to even if an earlier run of this cell was interrupted after a write.
  fbuf.seek(0)
  fbuf.write(routine_code)
  #reset write pointer
  fbuf.seek(0)

  time.sleep(1)
  for _ in range(NUM_TEST):
    start = rdtsc()
    routine()
    elapsed = rdtsc() - start

    #Save the average time
    res_arr[length].append(float(elapsed)/NUM_READ)

  # Release the buffer before the next, larger one is allocated.
  del dbuf

#Print the results
for key in res_arr:
  print('{0:>#12x}'.format(key), res_arr[key])

       0x400 [3.6975955963134766, 4.798613548278809, 4.568370819091797, 3.6765737533569336, 3.6312055587768555, 3.7358531951904297, 3.6349220275878906, 3.67318058013916, 3.825791358947754, 3.447324752807617, 3.4593257904052734, 3.44793701171875, 3.476128578186035, 3.4689292907714844, 3.436182975769043, 3.5118322372436523]
       0x800 [4.135849952697754, 3.9806013107299805, 3.7417526245117188, 3.5844345092773438, 3.6763668060302734, 3.6458969116210938, 3.7222900390625, 3.546555519104004, 3.4617700576782227, 3.5472965240478516, 3.5138769149780273, 3.4543914794921875, 3.456730842590332, 3.4871625900268555, 3.4671974182128906, 3.4515838623046875]
      0x1000 [3.760186195373535, 3.4371566772460938, 3.451420783996582, 3.4406442642211914, 3.466175079345703, 3.5914487838745117, 3.465540885925293, 3.4972362518310547, 3.452704429626465, 3.7353925704956055, 3.4977121353149414, 3.4348344802856445, 6.138422966003418, 4.706808090209961, 3.503408432006836, 3.495327949523926]
      0x2000 [3.6859998

In [8]:
#Plot the results
# One scatter column per array size (all NUM_TEST measurements) plus a line
# through the per-size averages taken after outlier removal.
p = figure(title='Memory Random Access Time Measurement', 
           x_axis_label='Array Size (KBytes)', 
           #y_axis_label='Random Access time [NS]', 
           y_axis_label='CPU ticks', 
           x_axis_type="log")

# res_arr is keyed by size in bytes; the x axis is in KBytes.
keys = [(key>>10) for key in res_arr.keys()]
p.xaxis.ticker = FixedTicker(ticks=keys)
p.xaxis.major_label_orientation = "vertical"
p.xaxis.formatter=PrintfTickFormatter(format="0x%X")
avg = []
for key in res_arr:
  #Plot the results
  p.scatter([key>>10]*len(res_arr[key]), res_arr[key])
  
  #Remove the outlier
  # Keep samples within two standard deviations of the mean. The comparison
  # is inclusive so that identical samples (std == 0) are all kept; a strict
  # "<" would drop every sample and turn the average into NaN.
  data = np.asarray(res_arr[key])
  data = data[abs(data - np.mean(data)) <= 2 * np.std(data)]
  avg.append(np.mean(data))

#Plot the avg line
p.line(keys, avg, line_width=2, line_color='orange')
show(p)
