# Hugging Face Transformers Memory Leak Demo
This notebook demonstrates a possible GPU memory leak in Hugging Face's Transformers library.

In [None]:
import os
# Optional cache directory for downloaded model files, read from the
# HF_CACHE_DIR environment variable; None when the variable is unset.
HF_CACHE_LOCATION = os.environ.get("HF_CACHE_DIR")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gc

In [None]:
# Demo parameters: the checkpoint to load and the text to complete.
checkpoint = "facebook/opt-125m"
prompt = "Functional Programming is"

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"

In [None]:
# Baseline: nothing has been placed on the GPU yet, so the bytes reserved by
# PyTorch's CUDA allocator are expected to be zero.
print(f"CUDA MEMORY BEFORE: {torch.cuda.memory_reserved()}")  # 0

In [2]:
# Load the model and tokenizer, run a single generation, then drop every
# reference created in this cell so the allocator can release GPU memory.
model = AutoModelForCausalLM.from_pretrained(checkpoint, cache_dir=HF_CACHE_LOCATION).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=HF_CACHE_LOCATION)
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(torch_device)

# Gradients are not needed for generation; disabling them avoids keeping
# autograd state alive on the device.
with torch.no_grad():
    output = model.generate(inputs, max_length=100)
    print(tokenizer.batch_decode(output, skip_special_tokens=True))
    del output

# Delete only the objects this cell created. `prompt`, `torch_device` and
# `checkpoint` are plain strings defined in an earlier cell: they hold no GPU
# memory, and deleting them here made this cell raise NameError when re-run.
del model, tokenizer, inputs

# Collect any reference cycles first, then ask PyTorch to hand its cached,
# now-unused CUDA blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

CUDA MEMORY BEFORE: 0
['Functional Programming is a programming language that is used to create functional programming languages. It is a programming language that is used to create functional programming languages. It is a programming language that is used to create functional programming languages. It is a programming language that is used to create functional programming languages. It is a programming language that is used to create functional programming languages. It is a programming language that is used to create functional programming languages. It is a programming language that is used to create functional programming languages']
CUDA MEMORY AFTER: 20971520


In [4]:
# Bytes still reserved by PyTorch's CUDA allocator after the model, tokenizer
# and tensors were deleted and the cache was emptied.
# NOTE(review): 20971520 bytes is exactly 20 MiB. This may be memory that
# PyTorch itself keeps alive (e.g. a single allocator segment backing a cuBLAS
# workspace) rather than a Transformers leak — TODO confirm, for instance by
# inspecting torch.cuda.memory_summary().
print(f"CUDA MEMORY AFTER: {torch.cuda.memory_reserved()}")  # 20971520