# Memory Fragmentation Causing and Mapping 

This notebook investigates how to emulate memory fragmentation and how to print the fragmentation map.

Currently I'm stuck at emulating fragmentation: https://discuss.pytorch.org/t/gpu-ram-fragmentation-diagnostics/34073/2

In [1]:
import pynvml, torch, gc
from ipyexperiments import IPyExperimentsPytorch

In [2]:
# light weight humanize from https://stackoverflow.com/a/1094933/9201239 w/ tweaks
def hs(num, suffix='B'):
    for unit in ['','K','M','G','T','P','E','Z']:
        if abs(num) < 1024.0: return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Y', suffix)

In [3]:
def globals_unset(var_names):
    " this is useful for re-running the cell, so that it resets the initial state or cleanup at the end of the cell"
    for x in var_names: 
        if x in globals(): 
            del globals()[x]   

In [4]:
pynvml.nvmlInit()
id = torch.cuda.current_device()
def mem_free():
    gc.collect()
    torch.cuda.empty_cache()
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return int( info.free / 2**20 )

In [5]:
def mem_report(): print(f"free mem={mem_free()}")

def mem_allocate_mbs_last(n, fatal=False): 
    " allocate n MBs, return the var holding it on success, None on failure "
    if n < 6: return None # don't try to allocate less than 6MB
    try:
        d = int(2**9*n**0.5)
        return torch.ones((d, d)).cuda().contiguous()
    except Exception as e:
        if not fatal: return None
        print(f"allocated={hs(torch.cuda.memory_allocated())}, max allocated={hs(torch.cuda.max_memory_allocated())}, cached={hs(torch.cuda.memory_cached())}, max cached={hs(torch.cuda.max_memory_cached())} ")
        raise e 

In [6]:
def leave_free_mbs(n):
    " consume whatever memory is needed so that n MBs are left free "
    avail = mem_free()
    assert avail > n, f"already have less available mem than desired {n}MBs"
    consume = avail - n
    print(f"consuming {consume}MB to bring free mem to {n}MBs")
    return mem_allocate_mbs_last(consume, fatal=True)         

In [7]:
def mem_get(n):
    print(f"have {mem_free():4d}, allocating {n}")
    return mem_allocate_mbs(n, fatal=True)

In [8]:
exp = IPyExperimentsPytorch(exp_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, GeForce GTX 1070 Ti (8119 RAM)

･ RAM: △Consumed △Peaked  Used Total | Exec time 0:00:00.000
･ CPU:         0       0     2151 MB |
･ GPU:         0       0     6334 MB |


In [9]:
def mem_allocate_mbs(n, fatal=False): 
    " allocate n MBs, return the var holding it on success, None on failure "
    if n < 6: return None # don't try to allocate less than 6MB
    try:
        return torch.ByteTensor([1]*n*2**20).cuda().contiguous()
    except Exception as e:
        if not fatal: return None
        print(f"allocated={hs(torch.cuda.memory_allocated())}, max allocated={hs(torch.cuda.max_memory_allocated())}, cached={hs(torch.cuda.memory_cached())}, max cached={hs(torch.cuda.max_memory_cached())} ")
        raise e 

･ RAM: △Consumed △Peaked  Used Total | Exec time 0:00:00.001
･ CPU:         0       0     2151 MB |
･ GPU:         0       0     6334 MB |


In [10]:
### test mem_allocate_mbs ###
globals_unset(['x1'])

x1 = mem_allocate_mbs(660)
if x1 is None:
    print("failed to allocate")
else:
    print("yay")
globals_unset(['x1'])

yay
･ RAM: △Consumed △Peaked  Used Total | Exec time 0:00:08.124
･ CPU:         0    5280     2149 MB |
･ GPU:         0     660      503 MB |


In [11]:
exp.cl.data[1].peaked_delta / 2**20

660.0

･ RAM: △Consumed △Peaked  Used Total | Exec time 0:00:00.005
･ CPU:         0       0     2149 MB |
･ GPU:         0       0      503 MB |


In [12]:
avail = mem_free()
print(f"Avail: {avail}")

Avail: 7616
･ RAM: △Consumed △Peaked  Used Total | Exec time 0:00:00.019
･ CPU:         0       0     2149 MB |
･ GPU:         0       0      503 MB |


# Mem fragmentation mapper

In [10]:
threshold = 10

def get(goal, want=0, delta=0, depth=0):
    """ measure what size of a contiguous memory chunk it's possible to allocate up to a `goal`
      return the max size if possible, 0 otherwise.
      this is a recursive implementation.
    """
    if want  == 0: want  = goal        
    if delta == 0: delta = goal
        
    delta = int(delta/2)

    # threshold to stop at
    if want  < threshold  : return 0
    if delta < threshold/2: return 0

    torch.cuda.empty_cache()
    x = mem_allocate_mbs(want)
    if x is not None: # success, try more
        del x
        print(f"yes: {depth} {want}, {delta} ({goal})")
        return max(want, get(goal, want+delta, delta, depth+1))
    else:             # failure, try less
        print(f" no: {depth} {want}, {delta} ({goal})")
        return           get(goal, want-delta, delta, depth+1)   


def memmap():
    """
    this function finds all the blocks of memory that can be allocated, ignoring small blocks < 10MB
    it prints out these blocks from large to small
    """
    last_tried = 0
    blocks = []
    store  = []
    while True:
        avail = mem_free()
        print(f"have {avail}")
        if avail < threshold: break
        #if avail > last_tried: break
        #last_tried = avail
        
        size = get(avail)
        print(f"wanted to get {avail} got {size}")
        if not size: break
        blocks.append(str(size))
        store.append(mem_allocate_mbs(size))
        print(f"got block of size {size}")
        
    # free the tmp memory
    store = []
    print(f"Free blocks in MBs: {', '.join(blocks)}")

･ RAM: △Consumed △Peaked  Used Total | Exec time 0:00:00.009
･ CPU:         0       0     2151 MB |
･ GPU:         0       0     6334 MB |


In [11]:
# XXX! try to check torch.C functions and alloc directly if possible

mem_report()
memmap()
mem_report()

free mem=1785
have 1785
 no: 0 1785, 892 (1785)
yes: 1 893, 446 (1785)
yes: 2 1339, 223 (1785)
yes: 3 1562, 111 (1785)
yes: 4 1673, 55 (1785)
yes: 5 1728, 27 (1785)
yes: 6 1755, 13 (1785)
yes: 7 1768, 6 (1785)
wanted to get 1785 got 1768
got block of size 1768
have 17
 no: 0 17, 8 (17)
wanted to get 17 got 0
Free blocks in MBs: 1768
free mem=1785
･ RAM: △Consumed △Peaked  Used Total | Exec time 0:04:27.356
･ CPU:         0   14280      168 MB |
･ GPU:         0    1768     6334 MB |


In [None]:
### Test the memmapper ###

globals_unset(['x1', 'x2', 'x3'])

mem_report()
          
# 1. first create a hole of 512MB
x1 = mem_allocate_mbs(514, fatal=True)
mem_report()
          
x2 = mem_allocate_mbs(514, fatal=True)    
mem_report()
          
del x1
mem_report()
          
x3 = mem_allocate_mbs(1500, fatal=True)    
mem_report()
          
# 2. detect the hole
memmap()
x = get(10000)
print(x)
mem_report()
          
# cleanup
globals_unset(['x1', 'x2', 'x3'])

free mem=7616
free mem=7102
free mem=6588
free mem=7102


# Attempts to create fragmentation

In [14]:
# 
globals_unset(['buf', 'stack', 'z'])

stack = []

# this ensures we always test the same thing
buf = leave_free_mbs(1500)

# this one tries to create lots of small holes
mem_report()
holes_total_size = 0
for s in range(30):
    size = s*2
    holes_total_size += size
    #print(f"have {mem_free()}, want {size}")
    x1 = mem_allocate_mbs(size, fatal=True)
    #print(f"have {mem_free()}, want {size}")
    stack.append(mem_allocate_mbs(size, fatal=True))
    del x1
print(f"allocated {holes_total_size}MBs")
holes_total_size -= 30*2
print(f"generated {holes_total_size}MBs of holes")
mem_report()

z = mem_allocate_mbs(512, fatal=True)
mem_report()

# cleanup
globals_unset(['buf', 'stack', 'z'])

consuming 4667MB to bring free mem to 1500MBs
free mem=1499
allocated 870MBs
generated 810MBs of holes
free mem=635
free mem=123
･ RAM: △Consumed △Peaked  Used Total | Exec time 2.343s
･ CPU:         0       0     2306 MB |
･ GPU:         0    6114     1952 MB |


In [11]:
globals_unset(['x1', 'x2', 'x3', 'buf'])

# this ensures we always test the same thing
buf = leave_free_mbs(1600)

# this one tries to create one single hole
                   # legend: [free block]  {used block}
                   # [1600]
x1 = mem_get(512)  # {512}[1092]
x2 = mem_get(512)  # {512}{512}[576]
print(f"have {mem_free():4d}, reclaiming first 512")
del x1             # [512]{512}[576]
x3 = mem_get(1024) # shouldn't be able to allocate 1024 contiguous mem
print(f"have {mem_free():4d}")

# cleanup
globals_unset(['x1', 'x2', 'x3', 'buf'])

consuming 6016MB to bring free mem to 1600MBs
have 1600, allocating 512
have 1088, allocating 512
have  576, reclaiming first 512
have 1088, allocating 1024
have   64
･ RAM: △Consumed △Peaked  Used Total | Exec time 0:00:26.974
･ CPU:         0    8192     2149 MB |
･ GPU:         0    7552      503 MB |


# All in one

In [16]:
# Same as the parts of the notebook above that tries to emulate fragmentation just in one chunks

import pynvml, torch, gc

pynvml.nvmlInit()
id = torch.cuda.current_device()
def mem_free():
    gc.collect()
    torch.cuda.empty_cache()
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return int( info.free / 2**20 )

def mem_report(): print(f"free mem={mem_free()}")

def mem_allocate_mbs(n, fatal=False): 
    " allocate n MBs, return the var holding it on success, None on failure "
    if n < 6: return None # don't try to allocate less than 6MB
    try:
        d = int(2**9*n**0.5)
        return torch.ones((d, d)).cuda().contiguous()
    except Exception as e:
        if not fatal: return None
        raise e
        
def leave_free_mbs(n):
    " consume whatever memory is needed so that n MBs are left free "
    avail = mem_free()
    assert avail > n, f"already have less available mem than desired {n}MBs"
    consume = avail - n
    print(f"consuming {consume}MB to bring free mem to {n}MBs")
    return mem_allocate_mbs(consume, fatal=True)

def globals_unset(var_names):
    " this is useful for re-running the cell, so that it resets the initial state or cleanup at the end of the cell"
    for x in var_names: 
        if x in globals(): 
            del globals()[x]
            
def mem_get(n):
    print(f"have {mem_free():4d}, allocating {n}")
    return mem_allocate_mbs(n, fatal=True)

globals_unset(['x1', 'x2', 'x3', 'buf'])
_=torch.ones(1).cuda()# preload

# this ensures we always test the same thing
buf = leave_free_mbs(1600)
    
                   # legend: [free block]  {used block}
                   # [1600]
x1 = mem_get(512)  # {512}[1092]
x2 = mem_get(512)  # {512}{512}[576]
print(f"have {mem_free():4d}, reclaiming first 512")
del x1             # [512]{512}[576]
x3 = mem_get(1024) # shouldn't be able to allocate 1024 contiguous mem
print(f"have {mem_free():4d}")

# cleanup
globals_unset(['x1', 'x2', 'x3', 'buf'])

consuming 4565MB to bring free mem to 1600MBs
have 1599, allocating 512
have 1087, allocating 512
have  575, reclaiming first 512
have 1087, allocating 1024
have   63
･ RAM: △Consumed △Peaked  Used Total | Exec time 2.363s
･ CPU:         0       0     2306 MB |
･ GPU:         2    6102     1954 MB |
