In [1]:
import pynvml, torch, gc
from ipyexperiments import IPyExperimentsPytorch

In [2]:
# Start an ipyexperiments session on the PyTorch backend; the RAM/GPU tables
# shown under the cells of this notebook start appearing with this call.
# NOTE(review): exp_enable=False presumably switches off the experiment-level
# memory management while keeping the per-cell reports — confirm against the
# ipyexperiments documentation.
exp = IPyExperimentsPytorch(exp_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, GeForce GTX 1070 Ti (8119 RAM)

･ RAM: △Consumed △Peaked  Used Total | Exec time 0.000s
･ CPU:         0       0     2110 MB |
･ GPU:         0       0     6010 MB |


In [3]:
# light weight humanize from https://stackoverflow.com/a/1094933/9201239 w/ tweaks
def hs(num, suffix='B'):
    """Format `num` as a human-readable size using 1024-based unit prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    known prefixes ('' through 'Z') are exhausted, in which case 'Y' is used.
    `suffix` is appended after the prefix, e.g. hs(1536) -> '1.5 KB'.
    """
    prefixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z')
    idx = 0
    while idx < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[idx], suffix)
        num /= 1024.0
        idx += 1
    # anything that survived all eight divisions is reported in 'Y' units
    return "%.1f %s%s" % (num, 'Y', suffix)

･ RAM: △Consumed △Peaked  Used Total | Exec time 0.001s
･ CPU:         0       0     2110 MB |
･ GPU:         0       0     6010 MB |


In [4]:
# Initialise NVML once, before mem_free() queries it for free GPU memory.
pynvml.nvmlInit()
# Index of PyTorch's current CUDA device; mem_free() passes it to NVML.
# NOTE(review): `id` shadows the Python builtin of the same name, and NVML
# device indices may differ from PyTorch's when CUDA_VISIBLE_DEVICES is set —
# confirm before reusing this on a multi-GPU machine.
id = torch.cuda.current_device()
def mem_free():
    """Return the free memory of GPU `id` in whole MBs, as reported by NVML.

    Python garbage is collected and PyTorch's CUDA cache is emptied first, so
    memory that PyTorch merely holds in its cache is not counted as used.
    """
    # release whatever can be released before measuring
    gc.collect()
    torch.cuda.empty_cache()
    free_bytes = pynvml.nvmlDeviceGetMemoryInfo(
        pynvml.nvmlDeviceGetHandleByIndex(id)
    ).free
    return int(free_bytes / 2**20)

･ RAM: △Consumed △Peaked  Used Total | Exec time 0.051s
･ CPU:         0       0     2110 MB |
･ GPU:         0       0     6010 MB |


In [None]:
def mem_report():
    """Print the free GPU memory, in MBs, as returned by mem_free()."""
    print(f"free mem={mem_free()}")

def mem_allocate_mbs(n, fatal=False):
    """Try to allocate roughly `n` MBs on the current CUDA device.

    A square tensor of ones with side int(512 * sqrt(n)) is created directly on
    the GPU; with 4-byte elements (PyTorch's default float32 dtype, assuming it
    was not changed) that is n * 2**20 bytes, up to int() truncation.

    Args:
        n: requested size in MBs; requests below 6 are refused.
        fatal: when False (default) any error raised by the allocation is
            swallowed and None is returned. When True, the current CUDA memory
            statistics are printed and the error is re-raised.

    Returns:
        The tensor holding the memory on success, None if n < 6 or if the
        allocation failed with fatal=False.

    Raises:
        Whatever exception the allocation raised, when fatal=True.
    """
    if n < 6: return None # don't try to allocate less than 6MB
    try:
        d = int(2**9*n**0.5)
        # Allocate on the device right away: building the tensor on the CPU and
        # copying it over needs the same amount of host RAM for no benefit, and
        # a freshly created tensor is already contiguous.
        return torch.ones((d, d), device="cuda")
    except Exception:
        if not fatal: return None
        # memory_cached()/max_memory_cached() are deprecated in favour of
        # memory_reserved()/max_memory_reserved(); prefer the new names and
        # fall back to the old ones on PyTorch versions that predate them.
        reserved     = getattr(torch.cuda, "memory_reserved", None)     or torch.cuda.memory_cached
        max_reserved = getattr(torch.cuda, "max_memory_reserved", None) or torch.cuda.max_memory_cached
        print(f"allocated={hs(torch.cuda.memory_allocated())}, max allocated={hs(torch.cuda.max_memory_allocated())}, cached={hs(reserved())}, max cached={hs(max_reserved())} ")
        raise  # bare raise re-raises the active exception unchanged

In [6]:
def consume_gpu_mbs(n):
    """Allocate roughly `n` MBs on the current CUDA device and return the tensor.

    The tensor is a square of ones with side int(512 * sqrt(n)). There is no
    minimum size and no error handling: any allocation error propagates to the
    caller.
    """
    d = int(2**9*n**0.5)
    # create the tensor on the GPU directly instead of staging it in host RAM
    return torch.ones((d, d), device="cuda")
# Drop the tensor left over from a previous run of this cell, if any, so its
# memory is not held while the new allocation is attempted.
if 'x1' in locals():
    del x1

# Probe: can a single 660MB block be allocated right now?
x1 = mem_allocate_mbs(660)
#x1 = consume_gpu_mbs(10)
print("yay" if x1 is not None else "failed to allocate")

# release the probe block again
del x1

yay
･ RAM: △Consumed △Peaked  Used Total | Exec time 0.308s
･ CPU:         0       0     2110 MB |
･ GPU:         0     660     6010 MB |


In [7]:
# Read a peak memory delta back from the experiment's cell logger and scale it by 2**20.
# NOTE(review): exp.cl.data[1] is presumably the GPU entry for the most recently
# executed cell, in bytes (so the result is in MBs) — confirm against ipyexperiments.
exp.cl.data[1].peaked_delta / 2**20

660.0

･ RAM: △Consumed △Peaked  Used Total | Exec time 0.006s
･ CPU:         0       0     2110 MB |
･ GPU:         0       0     6010 MB |


In [8]:
# Snapshot of the free GPU memory in MBs; mem_free() collects garbage and
# empties PyTorch's CUDA cache before measuring.
avail = mem_free()
print(f"Avail: {avail}")

Avail: 2109
･ RAM: △Consumed △Peaked  Used Total | Exec time 0.037s
･ CPU:         0       0     2110 MB |
･ GPU:         0       0     6010 MB |


In [9]:
# NOTE(review): `success` is not referenced by any code visible in this
# notebook — possibly a leftover.
success = []

# Smallest size, in MBs, worth probing: get() gives up once the requested size
# drops below it (or the search step drops below half of it), and memmap()
# stops once less than this much memory is free.
threshold = 10

def get(goal, want=0, delta=0, depth=0):
    """Search for the largest single block, in MBs, that can be allocated now.

    Tries to allocate `want` MBs via mem_allocate_mbs(). On success the search
    continues with want + delta, on failure with want - delta; delta is halved
    on every level. The search ends when `want` drops below `threshold` or
    the halved `delta` drops below threshold / 2.

    Args:
        goal: upper size of interest in MBs; also the starting value for
            `want` and `delta` when those are left at 0.
        want: size to try on this level (0 means "use goal").
        delta: current step before halving (0 means "use goal").
        depth: recursion level, only used in the progress printout.

    Returns:
        The largest size that was allocated successfully, or 0 if none was.
    """
    if want == 0:
        want = goal
    if delta == 0:
        delta = goal

    delta = int(delta / 2)

    # stop once either the size or the step became too small to bother
    if want < threshold or delta < threshold / 2:
        return 0

    torch.cuda.empty_cache()
    probe = mem_allocate_mbs(want)
    if probe is None:
        # failure, try less
        print(f" no: {depth} {want}, {delta} ({goal})")
        return get(goal, want - delta, delta, depth + 1)

    # success: release the probe right away, then try more
    del probe
    print(f"yes: {depth} {want}, {delta} ({goal})")
    return max(want, get(goal, want + delta, delta, depth + 1))


def memmap():
    """Print the sizes, in MBs, of the free GPU memory blocks that can be allocated.

    Each round asks mem_free() how much memory is free, lets get() search for
    the largest block that can be allocated, and then keeps that block
    allocated so that the next round probes the memory that is left. The loop
    ends when less than `threshold` MBs are free, when get() finds nothing, or
    when a block that get() reported cannot be allocated again. All blocks are
    released before the summary line is printed. Returns None.
    """
    blocks = []  # sizes found so far, as strings for the summary line
    store  = []  # keeps the found blocks allocated while the search goes on
    while True:
        avail = mem_free()
        print(f"have {avail}")
        if avail < threshold: break

        size = get(avail)
        print(f"wanted to get {avail} got {size}")
        if not size: break

        block = mem_allocate_mbs(size)
        if block is None:
            # Nothing was consumed, so the next round would see the same free
            # memory and retry the same size forever; do not report the block.
            print(f"failed to re-allocate block of size {size}")
            break
        blocks.append(str(size))
        store.append(block)
        del block  # `store` now holds the only reference
        print(f"got block of size {size}")

    # free the tmp memory
    store.clear()
    print(f"Free blocks in MBs: {', '.join(blocks)}")

          
# Unbind x1/x2/x3 left over from a previous run of this cell so that the
# tensors they refer to can be released. `del x` inside the loop would only
# unbind the loop variable itself, never the variable whose name it holds, so
# the names are removed from the module namespace explicitly.
for var_name in ('x1', 'x2', 'x3'):
    globals().pop(var_name, None)

gc.collect()

# baseline before anything is allocated
mem_report()

# Create a hole of about 514MB: allocate two 514MB blocks one after the other,
# then free the first one. fatal=True makes a failed allocation print the CUDA
# memory statistics and raise instead of returning None.
x1 = mem_allocate_mbs(514, fatal=True)
mem_report()

x2 = mem_allocate_mbs(514, fatal=True)
mem_report()

# NOTE(review): presumably x1 sits before x2 in GPU memory, so that freeing it
# leaves a gap rather than extending the free tail — confirm.
del x1
mem_report()

# ask for a block that is larger than the hole that was just freed
x3 = mem_allocate_mbs(1500, fatal=True)
mem_report()

# detect the hole (currently disabled)
#memmap()
# x = get(10000)
# print(x)
#mem_report()

# cleanup
del x2, x3

0

free mem=2109
free mem=1595
free mem=1081
free mem=1595
free mem=95
･ RAM: △Consumed △Peaked  Used Total | Exec time 1.000s
･ CPU:         0       0     2111 MB |
･ GPU:         0    2014     6010 MB |


In [10]:
# Experiment: for growing sizes (0, 128, ..., 896 MB) allocate a temporary
# block, then a second block of the same size that is kept in `stack`, then
# drop the temporary one again.
# NOTE(review): presumably meant to leave a freed hole next to every kept
# block — confirm the intent.
stack = []
for s in range(8):
    size = s*128
    print(f"have {mem_free()}, want {size}")
    # temporary block; for sizes below 6MB mem_allocate_mbs() returns None
    x1 = mem_allocate_mbs(size, fatal=True)
    print(f"have {mem_free()}, want {size}")
    # kept block; fatal=True makes a failed allocation print the CUDA memory
    # statistics and raise
    stack.append(mem_allocate_mbs(size, fatal=True))
    del x1

have 2109, want 0
have 2109, want 0
have 2109, want 128
have 1981, want 128
have 1981, want 256
have 1725, want 256
have 1725, want 384
have 1341, want 384
have 1341, want 512
have 829, want 512
have 829, want 640
have 189, want 640
allocated=1.9 GB, max allocated=2.0 GB, cached=1.9 GB, max cached=2.0 GB 


RuntimeError: CUDA out of memory. Tried to allocate 640.00 MiB (GPU 0; 7.93 GiB total capacity; 1.88 GiB already allocated; 189.56 MiB free; 0 bytes cached)

･ RAM: △Consumed △Peaked  Used Total | Exec time 1.569s
･ CPU:         0       0     2113 MB |
･ GPU:      1920       0     7930 MB |


In [None]:
stack = []