### Open the notebook on Colab

We should have already started a notebook server in a container on a Chameleon GPU host, and set up an SSH tunnel to this notebook server. Now, we will connect this notebook to the runtime that you have in Chameleon. This is a convenient way to work, because the notebook and its outputs will be saved automatically in your Google Drive.

-   Next to the “Connect” button in the top right, there is a ▼ symbol. Click on this symbol to expand the menu, and choose “Connect to a local runtime”.
-   Paste the `http://127.0.0.1:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX` you copied earlier into this space, and choose “Connect”.

**Alternatively, if you prefer not to use Colab** (or can’t, for some reason): just put the `http://127.0.0.1:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX` URL you copied earlier into your browser to open the Jupyter interface directly. But, then you’ll have to open a terminal in that Jupyter interface and run

    wget https://raw.githubusercontent.com/teaching-on-testbeds/llm-chi/refs/heads/main/workspace/2_single_gpu_a100.ipynb

to get a copy of this notebook in that workspace.

In [1]:
# Install necessary packages
!pip install transformers datasets torch accelerate bitsandbytes sentencepiece



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from transformers import GenerationConfig
import time
import accelerate
from accelerate import infer_auto_device_map, dispatch_model
from types import MethodType
import gc

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
print(device)

cuda


In [5]:
# Function to load models and apply quantization
from transformers import BitsAndBytesConfig
def load_and_quantize_model(model_name):
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,  # You can use float16, but A100 supports bfloat16 very well
        device_map="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    model.eval()
    return tokenizer, model

In [6]:
import subprocess

def print_gpu_memory(note=""):
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"],
        stdout=subprocess.PIPE,
        text=True
    )
    used, total = map(int, result.stdout.strip().split(','))
    print(f"{note} GPU memory: {used} MiB / {total} MiB")


In [7]:

def generate_code_with_profiling(model, tokenizer, prompt, max_new_tokens=64):
    # Tokenize input and move to model's device
    device = next(model.parameters()).device  # Get the assigned device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)  # Move all inputs to device

    # Clear the GPU cache to ensure accurate measurement
    torch.cuda.empty_cache()
    torch.cuda.synchronize()  # Ensure previous GPU tasks are done

    with torch.no_grad():
        start_time = time.time()

        # Track memory usage before
        memory_before = torch.cuda.memory_allocated(device)
        memory_reserved_before = torch.cuda.memory_reserved(device)

        # Generate output
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True
        )

        torch.cuda.synchronize()  # Ensure generation completes

        # Track memory usage after
        memory_after = torch.cuda.memory_allocated(device)
        memory_reserved_after = torch.cuda.memory_reserved(device)

        end_time = time.time()

    # Calculate stats
    inference_time = end_time - start_time
    memory_usage = memory_after - memory_before
    reserved_memory_usage = memory_reserved_after - memory_reserved_before  # Optional, but useful

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Optionally print out reserved memory for diagnostics
    print(f"Memory Usage: {memory_usage} bytes")
    print(f"Reserved Memory: {reserved_memory_usage} bytes")

    return decoded_output, inference_time, memory_usage


In [8]:
# Load CoNaLa dataset
dataset = load_dataset("neulab/conala", split="train[:2]", trust_remote_code=True)  # Just a few samples for quick test

In [9]:
# # OpenCoder-Instruct
# model_name = "OpenCoder-Instruct"
# model_address = "infly/OpenCoder-8B-Instruct"
# print_gpu_memory("Before loading model")
# tokenizer, loaded_model = load_and_quantize_model(model_address)
# print_gpu_memory("After loading model")

In [10]:
# for idx, sample in enumerate(dataset):
#     print(f"\n==================== Sample {idx + 1} ====================")
#     print(f"Intent: {sample['intent']}")
#     prompt = f"### Instruction:\n{sample['intent']}\n\n### Response:"
#     output, inference_time, memory_usage = generate_code_with_profiling(loaded_model, tokenizer, prompt)
#     print(f"\n🔹 Output from {model_name}:\n{output}")
#     print(f"⏱️ Inference time: {inference_time:.4f} seconds\n")


In [11]:
# del loaded_model 
# del tokenizer
# gc.collect()
# torch.cuda.empty_cache()
# print_gpu_memory("After freeing previous model")

In [12]:
def patched_prepare_inputs_for_generation(
    self,
    input_ids,
    past_key_values=None,
    attention_mask=None,
    inputs_embeds=None,
    **kwargs,
):
    if past_key_values is not None:
        if isinstance(past_key_values, Cache):
            cache_length = past_key_values.get_seq_length()
            past_length = past_key_values.seen_tokens
            max_cache_length = past_key_values.get_max_cache_shape()
        else:
            cache_length = past_length = past_key_values[0][0].shape[2]
            max_cache_length = None

        # Keep only the unprocessed tokens:
        # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
        # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
        # input)
        if (
            attention_mask is not None
            and attention_mask.shape[1] > input_ids.shape[1]
        ):
            input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
        # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
        # input_ids based on the past_length.
        elif past_length < input_ids.shape[1]:
            input_ids = input_ids[:, past_length:]
        # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

        # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
        if (
            max_cache_length is not None
            and attention_mask is not None
            and cache_length + input_ids.shape[1] > max_cache_length
        ):
            attention_mask = attention_mask[:, -max_cache_length:]

    position_ids = kwargs.get("position_ids", None)
    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -input_ids.shape[1] :]

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    if inputs_embeds is not None and past_key_values is None:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}

    model_inputs.update(
        {
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
        }
    )
    return model_inputs


def load_and_patch_model(path):
    tokenizer, model = load_and_quantize_model(path)
    model.prepare_inputs_for_generation = MethodType(patched_prepare_inputs_for_generation, model)
    return tokenizer, model


In [13]:
# DeepSeek Coder V2
model_name = "Lite-Base"
model_address = "deepseek-ai/DeepSeek-Coder-V2-Lite-Base"
print_gpu_memory("Before loading model")
tokenizer, loaded_model = load_and_patch_model(model_address)
print_gpu_memory("After loading model")

Before loading model GPU memory: 4 MiB / 81920 MiB


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

After loading model GPU memory: 31469 MiB / 81920 MiB


In [14]:
# print(loaded_model.prepare_inputs_for_generation)

In [16]:
from transformers.cache_utils import Cache, DynamicCache

In [17]:
for idx, sample in enumerate(dataset):
    print(f"\n==================== Sample {idx + 1} ====================")
    print(f"Intent: {sample['intent']}")
    prompt = f"### Instruction:\n{sample['intent']}\n\n### Response:"
    output, inference_time, memory_usage = generate_code_with_profiling(loaded_model, tokenizer, prompt)
    print(f"\n🔹 Output from {model_name}:\n{output}")
    print(f"⏱️ Inference time: {inference_time:.4f} seconds\n")


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.



Intent: How to convert a list of multiple integers into a single integer?


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


Memory Usage: -1123333120 bytes
Reserved Memory: 50331648 bytes

🔹 Output from Lite-Base:
### Instruction:
How to convert a list of multiple integers into a single integer?

### Response:
To convert a list of multiple integers into a single integer, you can use the following steps:

1. Initialize an empty string variable to store the result.
2. Iterate over the list of integers.
3. For each integer, convert it to a string and append it to the result string
⏱️ Inference time: 4.4278 seconds


Intent: How to convert a list of multiple integers into a single integer?
Memory Usage: 1024 bytes
Reserved Memory: 14680064 bytes

🔹 Output from Lite-Base:
### Instruction:
How to convert a list of multiple integers into a single integer?

### Response:
To convert a list of multiple integers into a single integer, you can use the following steps:

1. Initialize an empty string variable to store the result.
2. Iterate over the list of integers.
3. For each integer, convert it to a string and append

In [None]:
# if hasattr(model, 'clear_cache'):
#     loaded_model.clear_cache()
# del loaded_model 
# del tokenizer
# gc.collect()
# torch.cuda.empty_cache()
# print_gpu_memory("After freeing previous model")