In [None]:
%load_ext autoreload
%autoreload 2
from notebook import *
# If you get a warning about NUMEXPR_MAX_THREADS being set incorrectly, don't worry. It's not a problem.

#KEY include Namebox.ipynb

<div style=" font-size: 300% !important;
    margin-top: 1.5em;
    margin-bottom: 1.5em;
    font-weight: bold;
    line-height: 1.0;
    text-align:center;">Lab 4: The Memory Hierarchy (Part II) -- Demos</div>


# Temporal Locality

* How much spatial locality is there in this code?
* How much temporal locality is there? 
    * How big is the working set?
    * How much reuse is there?

In [None]:
# Demo "spatial1": build stride() at -O1 and run it with the "moneta" runner.
# The kernel views `data` as a 1024-element uint32 tensor and, inside the
# "init" tag, stores x into element x for every x in [0, 1024).  The outer
# loop repeats that full pass 1024 times, so each element is rewritten on
# every pass.
name = "spatial1"
t = fiddle(
    f"{name}.cpp",
    function="stride",
    name=name,
    run=["moneta"],
    opt="-O1",
    code=r"""
#include"pin_tags.h"
#include"CNN/tensor_t.hpp"
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* stride(uint64_t * data, uint64_t size, uint64_t arg1) {
    tensor_t<uint32_t> t(1024, 1,1,1, (uint32_t *)data);
    TAG_START("init", t.data, &t.as_vector(t.element_count()), true);

    for(uint i = 0; i < 1024; i++) {
        for(uint x = 0; x < 1024; x++) {
            t.get(x,0,0,0) = x;
        }
    }
    
    TAG_STOP("init");
    return data;
}

FUNCTION(one_array_1arg, stride);
""",
)

In [None]:
# Show the "init"-tagged accesses from the trace of the demo built in the
# previous cell (`name` is set there).
# NOTE(review): later cells pass an explicit ".hdf5" suffix -- confirm that
# show_trace accepts the path without it.
show_trace(
    f"./{name}_0",
    show_tag=["init"],
)

* How much spatial locality is there in this code?
* How much temporal locality is there? 
    * How big is the working set?
    * How much reuse is there?

In [None]:
# Demo "spatial2": same tensor as spatial1, but the "init" region is a single
# sequential pass -- element x receives x exactly once for x in [0, 1024).
name = "spatial2"
t = fiddle(
    f"{name}.cpp",
    function="stride",
    name=name,
    run=["moneta"],
    opt="-O1",
    code=r"""
#include"pin_tags.h"
#include"CNN/tensor_t.hpp"
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* stride(uint64_t * data, uint64_t size, uint64_t arg1) {
    tensor_t<uint32_t> t(1024, 1,1,1, (uint32_t *)data);
    TAG_START("init", t.data, &t.as_vector(t.element_count()), true);

    for(uint x = 0; x < 1024; x++) {
        t.get(x,0,0,0) = x;
    }
    
    TAG_STOP("init");
    return data;
}

FUNCTION(one_array_1arg, stride);
""",
)

In [None]:
# Show the "init"-tagged accesses from the trace of the demo built in the
# previous cell (`name` is set there).
# NOTE(review): later cells pass an explicit ".hdf5" suffix -- confirm that
# show_trace accepts the path without it.
show_trace(
    f"./{name}_0",
    show_tag=["init"],
)

* How much spatial locality is there in this code?
* How much temporal locality is there? 
    * How big is the working set?
    * How much reuse is there?

In [None]:
# Demo "spatial3": one pass over the same 1024-element uint32 tensor, but the
# loop index advances by 8, so only every eighth element (x = 0, 8, 16, ...)
# is written inside the "init" tag.
name = "spatial3"
t = fiddle(
    f"{name}.cpp",
    function="stride",
    name=name,
    run=["moneta"],
    opt="-O1",
    code=r"""
#include"pin_tags.h"
#include"CNN/tensor_t.hpp"
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* stride(uint64_t * data, uint64_t size, uint64_t arg1) {
    tensor_t<uint32_t> t(1024, 1,1,1, (uint32_t *)data);
    TAG_START("init", t.data, &t.as_vector(t.element_count()), true);

    for(uint x = 0; x < 1024; x+=8) {
        t.get(x,0,0,0) = x;
    }
    
    TAG_STOP("init");
    return data;
}

FUNCTION(one_array_1arg, stride);
""",
)

In [None]:
# Show the "init"-tagged accesses from the trace of the demo built in the
# previous cell (`name` is set there).
# NOTE(review): later cells pass an explicit ".hdf5" suffix -- confirm that
# show_trace accepts the path without it.
show_trace(
    f"./{name}_0",
    show_tag=["init"],
)

* How much spatial locality is there in this code?
* How much temporal locality is there? 
    * How big is the working set?
    * How much reuse is there?

# Miss Types, Locality, and the Data Structure Zoo

## Set

In [None]:
# Demo "spatial4" (std::set): the kernel inserts `size` values drawn from
# fast_rand (seed 1) under the "build" tag, growing the tagged address range to
# cover each stored element.  It then resets the seed and looks the same values
# up again under the "search" tag, and finally deletes the set under "delete".
# Run with --size 4096 and a single iteration.
name = "spatial4"
t = fiddle(
    f"{name}.cpp",
    function="working",
    analyze=False,
    run=["moneta"],
    name=name,
    opt="-O1",
    code=r"""
#include"pin_tags.h"
#include"function_map.hpp"
#include"archlab.hpp"
#include<set>
#include<cstdint>

extern "C"
uint64_t* working(uint64_t * data, uint64_t size, uint64_t arg1) {
    auto s = new std::set<uint64_t>();
    uint64_t seed = 1;

    TAG_START("build", (void*)-1, 0, true);
    for(uint x = 0; x < size; x++) {
        auto t = fast_rand(&seed);
        s->insert(t);
        auto a = s->find(t);
        TAG_GROW("build",  &(*a), &(*a)+ 1);
    }
    TAG_STOP("build");
    
    seed = 1;
    
    TAG_START("search", (void*)-1, 0, true);
    for(uint x = 0; x < size; x++) {
        auto a = s->find(fast_rand(&seed));
        TAG_GROW("search", &(*a), &(*a)+ 1);
    }
    TAG_STOP("search");

    TAG_START_ALL("delete", false);
    delete s;
    TAG_STOP("delete");
    return data;
}

FUNCTION(one_array_1arg, working);
""",
    cmdline=f"--size {4* 1024}   --iters 1",
)

In [None]:
# Show the "build" and "search" regions of the trace written by the previous
# cell, with the compulsory-miss, all-miss and all-hit layer presets selected.
show_trace(
    f"./{name}_0.hdf5",
    show_tag=['build', 'search'],
    layer_preset=["misses-compulsory-all", "misses-all", "hits-all"],
)

* How much spatial locality is there in this code?
* How much temporal locality is there? 
    * How big is the working set?
    * How much reuse is there?
   

## Unordered Set

In [None]:
# Demo "spatial5" (std::unordered_set): identical driver to the std::set demo
# -- insert `size` fast_rand values under "build", look the same sequence up
# again under "search", delete the container under "delete" -- with only the
# container type changed.  Run with --size 4096 and a single iteration.
name = "spatial5"
t = fiddle(
    f"{name}.cpp",
    function="working",
    analyze=False,
    run=["moneta"],
    name=name,
    opt="-O1",
    code=r"""
#include"pin_tags.h"
#include"function_map.hpp"
#include"archlab.hpp"
#include<unordered_set>
#include<cstdint>

extern "C"
uint64_t* working(uint64_t * data, uint64_t size, uint64_t arg1) {
    auto s = new std::unordered_set<uint64_t>();
    uint64_t seed = 1;

    TAG_START("build", (void*)-1, 0, true);
    for(uint x = 0; x < size; x++) {
        auto t = fast_rand(&seed);
        s->insert(t);
        auto a = s->find(t);
        TAG_GROW("build",  &(*a), &(*a)+ 1);
    }
    TAG_STOP("build");
    
    seed = 1;
    
    TAG_START("search", (void*)-1, 0, true);
    for(uint x = 0; x < size; x++) {
        auto a = s->find(fast_rand(&seed));
        TAG_GROW("search", &(*a), &(*a)+ 1);
    }
    TAG_STOP("search");

    TAG_START_ALL("delete", false);
    delete s;
    TAG_STOP("delete");
    return data;
}

FUNCTION(one_array_1arg, working);
""",
    cmdline=f"--size {4* 1024}   --iters 1",
)

In [None]:
# Show the "build" and "search" regions of the trace written by the previous
# cell, with the compulsory-miss, all-miss and all-hit layer presets selected.
show_trace(
    f"./{name}_0.hdf5",
    show_tag=['build', 'search'],
    layer_preset=["misses-compulsory-all", "misses-all", "hits-all"],
)

## List

In [None]:
# Demo "spatial6" (std::list): under the "build" tag the kernel appends `size`
# fast_rand values, growing the tagged range to cover each new back element,
# prints the list size to stderr, and then walks the whole list ten times
# accumulating a sum.  The list is deleted under the "delete" tag and the sum
# is stored in data[0] so the traversal has an observable result.
# Run with --size 4096 and a single iteration.
#
# Fix: the kernel writes to std::cerr, so it now includes <iostream> itself
# instead of depending on another header to pull it in.
name = "spatial6"
t = fiddle(
    f"{name}.cpp",
    function="working",
    analyze=False,
    run=["moneta"],
    name=name,
    opt="-O1",
    code=r"""
#include"pin_tags.h"
#include"function_map.hpp"
#include"archlab.hpp"
#include<list>
#include<iostream>
#include<cstdint>

extern "C"
uint64_t* working(uint64_t * data, uint64_t size, uint64_t arg1) {
    auto s = new std::list<uint64_t>();
    uint64_t seed = 1;

    TAG_START("build", (void*)-1, 0, true);
    for(uint x = 0; x < size; x++) {
        s->push_back(fast_rand(&seed));
        auto a = &s->back();
        TAG_GROW("build",  &(*a), &(*a)+ 1);
    }
        
    uint64_t sum = 0;
    std::cerr << s->size() << "\n";
    for(int i = 0; i < 10; i++){
        for(auto &a: *s) sum += a;
    }
    TAG_STOP("build");

    TAG_START_ALL("delete", false);
    delete s;
    TAG_STOP("delete");
    data[0] = sum;
    return data;
}

FUNCTION(one_array_1arg, working);
""",
    cmdline=f"--size {4* 1024}   --iters 1",
)

In [None]:
# Show the "build" region of the trace written by the previous cell, with the
# compulsory-miss, all-miss and all-hit layer presets selected.
show_trace(
    f"./{name}_0.hdf5",
    show_tag=['build'],
    layer_preset=["misses-compulsory-all", "misses-all", "hits-all"],
)

## Vector

In [None]:
# Demo "spatial7" (std::vector): same driver as the std::list demo -- append
# `size` fast_rand values under "build" (growing the tagged range to cover each
# new back element), print the size to stderr, sum the contents ten times,
# delete the container under "delete", and store the sum in data[0].
# Run with --size 4096 and a single iteration.
#
# Fix: the kernel writes to std::cerr, so it now includes <iostream> itself
# instead of depending on another header to pull it in.
name = "spatial7"
t = fiddle(
    f"{name}.cpp",
    function="working",
    analyze=False,
    run=["moneta"],
    name=name,
    opt="-O1",
    code=r"""
#include"pin_tags.h"
#include"function_map.hpp"
#include"archlab.hpp"
#include<vector>
#include<iostream>
#include<cstdint>

extern "C"
uint64_t* working(uint64_t * data, uint64_t size, uint64_t arg1) {
    auto s = new std::vector<uint64_t>();
    uint64_t seed = 1;

    TAG_START("build", (void*)-1, 0, true);
    for(uint x = 0; x < size; x++) {
        s->push_back(fast_rand(&seed));
        auto a = &s->back();
        TAG_GROW("build",  &(*a), &(*a)+ 1);
    }
        
    uint64_t sum = 0;
    std::cerr << s->size() << "\n";
    for(int i = 0; i < 10; i++){
        for(auto &a: *s) sum += a;
    }
    TAG_STOP("build");

    TAG_START_ALL("delete", false);
    delete s;
    TAG_STOP("delete");
    data[0] = sum;
    return data;
}

FUNCTION(one_array_1arg, working);
""",
    cmdline=f"--size {4* 1024}   --iters 1",
)

In [None]:
# Show the "build" region of the trace written by the previous cell, with the
# compulsory-miss, all-miss and all-hit layer presets selected.
show_trace(
    f"./{name}_0.hdf5",
    show_tag=['build'],
    layer_preset=["misses-compulsory-all", "misses-all", "hits-all"],
)

# Image Stabilization

# Instruction Latency

## Let's measure the latency of an add instruction

In [None]:
# Integer-add latency experiment.  wide_1 runs `size` loop iterations, each
# containing five `j = j+arg1` statements that all update the same variable,
# and writes the final j to data[0].  Built at -O0, run with the "perf_count"
# runner on 10,000,000 iterations; results are recorded under the name "ilp".
t = fiddle(
    "inst_lat.cpp",
    code="""
#include"pin_tags.h"
#include"function_map.hpp"
#include"archlab.hpp"
#include<vector>
#include<cstdint>

extern "C"
uint64_t* wide_1(uint64_t * data, uint64_t size, register uint64_t arg1) {
	register uint64_t j =0;
	for(register uint64_t i = 0; i < size; i++) {
		j = j+arg1;
		j = j+arg1;       
		j = j+arg1;       
		j = j+arg1;       
		j = j+arg1;
	}
	data[0] = j;
	return data;
}
FUNCTION(one_array_1arg, wide_1);
""",
    function=["wide_1"],
    name="ilp",
    opt="-O0",
    run=["perf_count"],
    cmdline="--size 10000000",
    perf_cmdline="--stat-set PE.cfg --MHz 3500",
)


In [None]:
# Show the `cfg` attribute of the fiddle result, then the raw counters
# loaded from ilp.csv.
display(t.cfg)

df = render_csv("ilp.csv")
display(df)

# `size` is the loop trip count, so cycles/size is cycles per iteration; each
# iteration contains five add statements, hence the extra division by 5.
df["Cycles/iteration"] = df["cycles"]/df["size"]
df["Cycles/inst"] = df["Cycles/iteration"]/5.0
display(df.loc[:, ["Cycles/iteration", "Cycles/inst"]])

## How about a floating-point add?

In [None]:
# Floating-point-add latency experiment.  Same shape as the integer version:
# five `j = j+1` statements per iteration on a single double, with the final
# value written to data[0].  Built at -O1, run with the "perf_count" runner on
# 10,000,000 iterations; results again go to "ilp" (overwriting the previous
# experiment's ilp.csv).
t = fiddle(
    "inst_lat.cpp",
    code="""
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* wide_1(uint64_t * data, uint64_t size, uint64_t arg1) {
	register double j =0;
	for(unsigned i = 0; i < size; i++) {
		j = j+1;
		j = j+1;       
		j = j+1;       
		j = j+1;       
		j = j+1;
	}
	data[0] = j;
	return data;
}
FUNCTION(one_array_1arg, wide_1);
""",
    function=["wide_1"],
    name="ilp",
    opt="-O1",
    run=["perf_count"],
    cmdline="--size 10000000",
    perf_cmdline="--stat-set PE.cfg --MHz 3500",
)
# Last expression of the cell: show the freshly written counters.
render_csv("ilp.csv")

In [None]:
# Show the `cfg` attribute of the fiddle result, then derive cycle counts from
# ilp.csv: cycles/size is cycles per loop iteration, and each iteration holds
# five add statements, hence the division by 5.
display(t.cfg)

df = render_csv("ilp.csv")
df["Cycles/iteration"] = df["cycles"]/df["size"]
df["Cycles/inst"] = df["Cycles/iteration"]/5.0
display(df.loc[:, ["Cycles/iteration", "Cycles/inst"]])

## Integer Divide

In [None]:
# Integer-divide latency experiment.  j starts at the maximum uint64 value and
# is divided by k = arg1 + 7 five times per iteration, each divide consuming
# the previous result; the final j is written to data[0].  Built at -O1, run
# with the "perf_count" runner on 10,000,000 iterations; results go to "ilp".
t = fiddle(
    "inst_lat.cpp",
    code="""
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* wide_1(uint64_t * data, uint64_t size, uint64_t arg1) {
	register uint64_t j = (uint64_t)-1;
    register uint64_t k = arg1 + 7;
    
	for(unsigned i = 0; i < size; i++) {
		j = j/k;
		j = j/k;       
		j = j/k;       
		j = j/k;       
		j = j/k;
	}
	data[0] = j;
	return data;
}
FUNCTION(one_array_1arg, wide_1);
""",
    function=["wide_1"],
    name="ilp",
    opt="-O1",
    run=["perf_count"],
    cmdline="--size 10000000",
    perf_cmdline="--stat-set PE.cfg --MHz 3500",
)
# Last expression of the cell: show the freshly written counters.
render_csv("ilp.csv")

In [None]:
# Show the `cfg` attribute of the fiddle result, then derive cycle counts from
# ilp.csv: cycles/size is cycles per loop iteration, and each iteration holds
# five divide statements, hence the division by 5.
display(t.cfg)

df = render_csv("ilp.csv")
df["Cycles/iteration"] = df["cycles"]/df["size"]
df["Cycles/inst"] = df["Cycles/iteration"]/5.0
display(df.loc[:, ["Cycles/iteration", "Cycles/inst"]])

## Floating point Divide

In [None]:
# Floating-point-divide experiment, one dependency chain.  j is divided by
# k = arg1 + 7.3 five times per iteration, each divide consuming the previous
# result; the final j is written to data[0].  Built at -O1, run with the
# "perf_count" runner on 10,000,000 iterations; results go to "div1".
div1 = fiddle(
    "inst_lat.cpp",
    code="""
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* wide_1(uint64_t * data, uint64_t size, uint64_t arg1) {
	register double j =0;
    register double k = arg1 + 7.3;
    
	for(unsigned i = 0; i < size; i++) {
		j = j/k;
		j = j/k;       
		j = j/k;       
		j = j/k;       
		j = j/k;

	}
	data[0] = j;
	return data;
}
FUNCTION(one_array_1arg, wide_1);
""",
    function=["wide_1"],
    name="div1",
    opt="-O1",
    run=["perf_count"],
    cmdline="--size 10000000",
    perf_cmdline="--stat-set PE.cfg --MHz 3500",
)


In [None]:
# Show the `cfg` attribute of the div1 result, then derive cycle counts from
# div1.csv: cycles/size is cycles per loop iteration, and each iteration holds
# five divides, hence the division by 5.  CPI is shown alongside for comparison.
display(div1.cfg)

df = render_csv("div1.csv")
df["Cycles/iteration"] = df["cycles"]/df["size"]
df["Cycles/inst"] = df["Cycles/iteration"]/5.0
display(df.loc[:, ["Cycles/iteration", "Cycles/inst", "CPI"]])

## Parallel Divides

In [None]:
# Floating-point-divide experiment, two independent chains.  Each iteration
# performs five dependent divides on j and five on jj (ten in total), all by
# the same k; j + jj is written to data[0] so both chains stay live.
# Built at -O1, run with the "perf_count" runner on 10,000,000 iterations;
# results go to "div2".
div2 = fiddle(
    "inst_lat.cpp",
    code="""
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* wide_1(uint64_t * data, uint64_t size, uint64_t arg1) {
	register double j =0;
	register double jj = 8324.93242;
    register double k = arg1 + 7.3;
    
	for(unsigned i = 0; i < size; i++) {
		j = j/k;
		j = j/k;       
		j = j/k;       
		j = j/k;       
		j = j/k;

        jj = jj/k;
		jj = jj/k;       
		jj = jj/k;       
		jj = jj/k;       
		jj = jj/k;
	}
	data[0] = j + jj;
	return data;
}
FUNCTION(one_array_1arg, wide_1);
""",
    function=["wide_1"],
    name="div2",
    opt="-O1",
    run=["perf_count"],
    cmdline="--size 10000000",
    perf_cmdline="--stat-set PE.cfg --MHz 3500",
)


In [None]:
# Show the `cfg` attribute of the div2 result, then derive per-iteration and
# per-divide cycle counts from div2.csv.
display(div2.cfg)

df = render_csv("div2.csv")
df["Cycles/iteration"] = df["cycles"]/df["size"]
# The div2 kernel issues 10 divides per loop iteration (two chains of five),
# so cycles per divide is the per-iteration figure divided by 10, not 5.
df["Cycles/op"] = df["Cycles/iteration"]/10.0
# Display the column that was actually computed above; selecting
# "Cycles/inst" here raised KeyError because this cell never creates it.
display(df[["Cycles/iteration", "Cycles/op", "CPI"]])

In [None]:
# Floating-point-divide experiment, three independent chains.  Each iteration
# performs five dependent divides on each of j, jj and jjj (fifteen in total),
# all by the same k; their sum is written to data[0] so every chain stays live.
# Built at -O1, run with the "perf_count" runner on 10,000,000 iterations;
# results go to "div3".
div3 = fiddle(
    "inst_lat.cpp",
    code="""
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* wide_1(uint64_t * data, uint64_t size, uint64_t arg1) {
	register double j =0;
	register double jj = 8324.93242;
	register double jjj = 83324.93242;
    register double k = arg1 + 7.3;
    
	for(unsigned i = 0; i < size; i++) {
		j = j/k;
		j = j/k;       
		j = j/k;       
		j = j/k;       
		j = j/k;

        jj = jj/k;
		jj = jj/k;       
		jj = jj/k;       
		jj = jj/k;       
		jj = jj/k;

        jjj = jjj/k;
		jjj = jjj/k;       
		jjj = jjj/k;       
		jjj = jjj/k;       
		jjj = jjj/k;
	}
	data[0] = j + jj + jjj;
	return data;
}
FUNCTION(one_array_1arg, wide_1);
""",
    function=["wide_1"],
    name="div3",
    opt="-O1",
    run=["perf_count"],
    cmdline="--size 10000000",
    perf_cmdline="--stat-set PE.cfg --MHz 3500",
)


In [None]:
# Show the `cfg` attribute of the div3 result, then derive per-iteration and
# per-divide cycle counts from div3.csv.
display(div3.cfg)

df = render_csv("div3.csv")
df["Cycles/iteration"] = df["cycles"]/df["size"]
# The div3 kernel issues 15 divides per loop iteration (three chains of five),
# so cycles per divide is the per-iteration figure divided by 15, not 5.
df["Cycles/op"] = df["Cycles/iteration"]/15.0
# Display the column that was actually computed above; selecting
# "Cycles/inst" here raised KeyError because this cell never creates it.
display(df[["Cycles/iteration", "Cycles/op", "CPI"]])

In [None]:
# Floating-point-divide experiment, four independent chains.  Each iteration
# performs five dependent divides on each of j, jj, jjj and jjjj (twenty in
# total), all by the same k; their sum is written to data[0] so every chain
# stays live.  Built at -O1, run with the "perf_count" runner on 10,000,000
# iterations; results go to "div4".
div4 = fiddle(
    "inst_lat.cpp",
    code="""
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* wide_1(uint64_t * data, uint64_t size, uint64_t arg1) {
	register double j =0;
	register double jj = 8324.93242;
	register double jjj = 83324.93242;
	register double jjjj = 833424.93242;
    register double k = arg1 + 7.3;
    
	for(unsigned i = 0; i < size; i++) {
		j = j/k;
		j = j/k;       
		j = j/k;       
		j = j/k;       
		j = j/k;

        jj = jj/k;
		jj = jj/k;       
		jj = jj/k;       
		jj = jj/k;       
		jj = jj/k;

        jjj = jjj/k;
		jjj = jjj/k;       
		jjj = jjj/k;       
		jjj = jjj/k;       
		jjj = jjj/k;

        jjjj = jjjj/k;
		jjjj = jjjj/k;       
		jjjj = jjjj/k;       
		jjjj = jjjj/k;       
		jjjj = jjjj/k;
	}
	data[0] = j + jj + jjj + jjjj;
	return data;
}
FUNCTION(one_array_1arg, wide_1);
""",
    function=["wide_1"],
    name="div4",
    opt="-O1",
    run=["perf_count"],
    cmdline="--size 10000000",
    perf_cmdline="--stat-set PE.cfg --MHz 3500",
)


In [None]:
# Show the `cfg` attribute of the div4 result, then derive per-iteration and
# per-divide cycle counts from div4.csv.
display(div4.cfg)

df = render_csv("div4.csv")
df["Cycles/iteration"] = df["cycles"]/df["size"]
# The div4 kernel issues 20 divides per loop iteration (four chains of five),
# so cycles per divide is the per-iteration figure divided by 20, not 5.
df["Cycles/op"] = df["Cycles/iteration"]/20.0
# Display the column that was actually computed above; selecting
# "Cycles/inst" here raised KeyError because this cell never creates it.
display(df[["Cycles/iteration", "Cycles/op", "CPI"]])

In [None]:
# Floating-point-divide experiment, five independent chains.  Each iteration
# performs five dependent divides on each of j, jj, jjj, jjjj and jjjjj
# (twenty-five in total), all by the same k; their sum is written to data[0]
# so every chain stays live.  Built at -O1, run with the "perf_count" runner on
# 10,000,000 iterations; results go to "div5".
div5 = fiddle(
    "inst_lat.cpp",
    code="""
#include"function_map.hpp"
#include<cstdint>

extern "C"
uint64_t* wide_1(uint64_t * data, uint64_t size, uint64_t arg1) {
	register double j =0;
	register double jj = 8324.93242;
	register double jjj = 83324.93242;
	register double jjjj = 833424.93242;
	register double jjjjj = 833424.93242;
    register double k = arg1 + 7.3;
    
	for(unsigned i = 0; i < size; i++) {
		j = j/k;
		j = j/k;       
		j = j/k;       
		j = j/k;       
		j = j/k;

        jj = jj/k;
		jj = jj/k;       
		jj = jj/k;       
		jj = jj/k;       
		jj = jj/k;

        jjj = jjj/k;
		jjj = jjj/k;       
		jjj = jjj/k;       
		jjj = jjj/k;       
		jjj = jjj/k;

        jjjj = jjjj/k;
		jjjj = jjjj/k;       
		jjjj = jjjj/k;       
		jjjj = jjjj/k;       
		jjjj = jjjj/k;
        
        jjjjj = jjjjj/k;
		jjjjj = jjjjj/k;       
		jjjjj = jjjjj/k;       
		jjjjj = jjjjj/k;       
		jjjjj = jjjjj/k;
	}
	data[0] = j + jj + jjj +jjjj +jjjjj;
	return data;
}
FUNCTION(one_array_1arg, wide_1);
""",
    function=["wide_1"],
    name="div5",
    opt="-O1",
    run=["perf_count"],
    cmdline="--size 10000000",
    perf_cmdline="--stat-set PE.cfg --MHz 3500",
)


In [None]:
# Show the `cfg` attribute of the div5 result, then derive per-iteration and
# per-divide cycle counts from div5.csv.  This cell follows the div5
# experiment, so it reads div5's outputs; it previously re-displayed div4's
# cfg and CSV.
display(div5.cfg)

df = render_csv("div5.csv")
df["Cycles/iteration"] = df["cycles"]/df["size"]
# The div5 kernel issues 25 divides per loop iteration (five chains of five),
# so cycles per divide is the per-iteration figure divided by 25, not 5.
df["Cycles/op"] = df["Cycles/iteration"]/25.0
# Display the column that was actually computed above; selecting
# "Cycles/inst" here raised KeyError because this cell never creates it.
display(df[["Cycles/iteration", "Cycles/op", "CPI"]])

# Floating point

1. Numerical instability -- sort and add
2. FMA speedup
3. Gallery of weird results.

# Vectors

## Floats

In [None]:
# Per-build results for the float vsum experiment; the loop in the next cell
# appends one entry to each list per optimization setting.
# Re-run this cell before re-running that loop, otherwise entries accumulate.
cfgs, asm, files = [], [], []

In [None]:
# Build and measure the float vector-sum kernel once per optimization setting.
# For every entry in the opts list the loop:
#   * compiles inst_lat.cpp with those flags and runs wide_1 with the
#     "perf_count" runner on 10,000,000 elements, recording results under a
#     name derived from the flags (spaces removed);
#   * captures do_cfg / do_render_code output for the `vsum` symbol from the
#     build products before the next iteration overwrites them;
#   * remembers the CSV name for the summary table.
#
# Fix: wide_1 now releases its three new[] buffers before returning; it
# previously leaked 3 * size floats on every call.
os.environ["OPENMP"]="yes"
for opts in ["-O0", "-O1 -fno-openmp", "-O3 -fno-unroll-loops", "-O3 -march=skylake -fno-unroll-loops"]:
    filename = f"vsum-{opts.replace(' ','')}"
    fiddle("inst_lat.cpp", code="""
    #include"function_map.hpp"
    #include<cstdint>
    #include<cmath>

    extern "C"
    void __attribute__((noinline)) vsum(register float *a, register float *b, register float * c, register uint64_t n) {
    #ifdef _OPENMP
    #pragma omp simd
    #endif
        for(unsigned int i = 0; i < n; i++) {
            c[i] = a[i] + b[i];
        }
    }

    extern "C"
    uint64_t* wide_1(uint64_t * data, uint64_t size, uint64_t arg1) {
        auto a = new float[size];
        auto b = new float[size];
        auto c = new float[size];
        vsum(a,b,c, size);
        data[0] = a[4];
        // Release the buffers so repeated calls do not leak memory.
        delete[] a;
        delete[] b;
        delete[] c;
        return data;
    }
    FUNCTION(one_array_1arg, wide_1);
    """,
           function=["wide_1"], name=filename, analyze=False, opt=opts, run=["perf_count"], 
               cmdline=f"--size 10000000", 
               perf_cmdline="--stat-set PE.cfg --MHz 3500")
    cfgs.append(do_cfg("build/inst_lat.so", symbol="vsum"))
    asm.append(do_render_code("build/inst_lat.s", lang="gas", show="vsum"))
    files.append(f"{filename}.csv")

In [None]:
display(asm[0]) # vsum assembly from the first build in the opts list ("-O0", no optimization)

In [None]:
display(cfgs[1]) # vsum do_cfg output from the second build ("-O1 -fno-openmp": ordinary optimization)

In [None]:
display(cfgs[2]) # vsum do_cfg output from the third build ("-O3 -fno-unroll-loops"); presumably OpenMP SIMD is active here -- confirm

In [None]:
display(cfgs[3]) # vsum do_cfg output from the fourth build ("-O3 -march=skylake -fno-unroll-loops"): OpenMP SIMD with AVX

In [None]:

# Summary table for the float vsum builds: one row per CSV collected by the
# build loop, labelled with its file name.
df = render_csv(files)
df["file"] = files

# Derived columns: clock rate in GHz and cycle time in ns, both from CT.
df["GHz"] = 1/df["CT"]/1e9
df["CT_ns"] = df["CT"]*1e9

# NOTE(review): IC*CPI is a cycle count, not a time.  If this column is meant
# to be compared with ET it presumably needs a further factor of CT -- confirm.
df["What we should get"] = df["IC"]*df["CPI"]

df[[
    "file",
    "IC",
    "CPI",
    "CT_ns",
    "ET",
    "GHz",
    "What we should get",
]]


## Integers


In [None]:
# Per-build results for the integer vsum experiment (reset on every run of
# this cell, so re-running does not accumulate duplicates).
icfgs=[]
iasm=[]
ifiles=[]

# Build and measure the integer vector-sum kernel once per optimization
# setting.  For every entry in the opts list the loop compiles inst_lat.cpp
# with those flags, runs wide_1 with the "perf_count" runner on 10,000,000
# elements, captures do_cfg / do_render_code output for the `vsum` symbol, and
# remembers the CSV name for the summary table.
#
# Fixes in the embedded C++:
#   * vsum's third parameter was declared `register  * c` with no type, which
#     does not compile; it is now `register int * c`, matching a and b.
#   * wide_1 now releases its three new[] buffers before returning; it
#     previously leaked 3 * size ints on every call.
os.environ["OPENMP"]="yes"
for opts in ["-O0", "-O1 -fno-openmp", "-O3 -fno-unroll-loops", "-O3 -march=skylake -fno-unroll-loops"]:
    filename = f"vsum-int-{opts.replace(' ','')}"
    fiddle("inst_lat.cpp", code="""
    #include"function_map.hpp"
    #include<cstdint>
    #include<cmath>

    extern "C"
    void __attribute__((noinline)) vsum(register int *a, register int *b, register int * c, register size_t n) {
    #ifdef _OPENMP
    #pragma omp simd
    #endif
        for(unsigned int i = 0; i < n; i++) {
            c[i] = a[i] + b[i];
        }
    }

    extern "C"
    uint64_t* wide_1(uint64_t * data, uint64_t size, uint64_t arg1) {
        auto a = new int[size];
        auto b = new int[size];
        auto c = new int[size];
        vsum(a,b,c, size);
        data[0] = a[4];
        // Release the buffers so repeated calls do not leak memory.
        delete[] a;
        delete[] b;
        delete[] c;
        return data;
    }
    FUNCTION(one_array_1arg, wide_1);
    """,
           function=["wide_1"], name=filename, analyze=False, opt=opts, run=["perf_count"], 
               cmdline=f"--size 10000000", 
               perf_cmdline="--stat-set PE.cfg --MHz 3500")
    icfgs.append(do_cfg("build/inst_lat.so", symbol="vsum"))
    iasm.append(do_render_code("build/inst_lat.s", lang="gas", show="vsum"))
    ifiles.append(f"{filename}.csv")

In [None]:
display(iasm[0]) # integer vsum assembly from the first build in the opts list ("-O0", no optimization)

In [None]:
display(icfgs[1]) # integer vsum do_cfg output from the second build ("-O1 -fno-openmp": ordinary optimization)

In [None]:
display(icfgs[2]) # integer vsum do_cfg output from the third build ("-O3 -fno-unroll-loops"), not the ordinary-optimization build

In [None]:
display(icfgs[3]) # integer vsum do_cfg output from the fourth build ("-O3 -march=skylake -fno-unroll-loops")

In [None]:

# Summary table for the integer vsum builds: one row per CSV collected by the
# build loop, labelled with its file name.
df = render_csv(ifiles)
df["file"] = ifiles

# Derived columns: clock rate in GHz and cycle time in ns, both from CT.
df["GHz"] = 1/df["CT"]/1e9
df["CT_ns"] = df["CT"]*1e9

df[[
    "file",
    "IC",
    "CPI",
    "CT_ns",
    "ET",
    "GHz",
]]