In [1]:
!pip install transformers torch psutil nvidia-ml-py3 fvcore ptflops thop pynvml dotenv



In [2]:
!nvidia-smi

Tue Mar 18 16:59:56 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     Off | 00000000:31:00.0 Off |                    0 |
|  0%   54C    P0              64W / 300W |     16MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A40                     Off | 00000000:B1:00.0 Off |  

In [3]:
import os
from dotenv import load_dotenv

load_dotenv()

token = os.getenv("token")

In [4]:
!huggingface-cli login --token {token} --add-to-git-credential

Token is valid (permission: read).
The token `LLM` has been saved to /home/ddal/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/ddal/.cache/huggingface/token
Login successful.
The current active token is: `LLM`


In [5]:
import torch
print('CUDA available:', torch.cuda.is_available())

device = "cuda" if torch.cuda.is_available() else "cpu"
print('Device:', device)

CUDA available: True
Device: cuda


In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList

## Version: refer to "https://huggingface.co/deepseek-ai"
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
model.eval()

print(tokenizer)
print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaTokenizerFast(name_or_path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', vocab_size=128000, model_max_length=16384, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<｜begin▁of▁sentence｜>', 'eos_token': '<｜end▁of▁sentence｜>', 'pad_token': '<｜end▁of▁sentence｜>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	128000: AddedToken("<｜begin▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<｜end▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|finetune_right_pad_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

In [7]:
input_max_length = 2048
input_text = "Tell me more about Deep Seek."

inputs = tokenizer(input_text, return_tensors="pt", max_length=input_max_length, padding="max_length").to(device)
inputs["input_ids"] = inputs["input_ids"].long()
print('Input ids:', inputs["input_ids"])
print('Length:', len(inputs["input_ids"][0]))

Input ids: tensor([[128001, 128001, 128001,  ...,  18682,  30839,     13]],
       device='cuda:0')
Length: 2048


In [8]:
from thop import profile

## FLOPs 측정 (input_text 기준)
flops, params = profile(model, inputs=(inputs["input_ids"],))

## GFLOPs (10⁹ FLOPs 단위) 변환
gflops = flops / 1e9

print('Params:', params)

[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
Params: 7504658432.0


In [9]:
# class MinLengthStoppingCriteria(StoppingCriteria):
#     def __init__(self, min_length):
#         self.min_length = min_length

#     def __call__(self, input_ids, scores, **kwargs):
#         return input_ids.shape[-1] >= self.min_length

In [10]:
import time
import pynvml

# 평균을 내기 위한 실행 횟수
num_runs = 10  
# 총 추론 시간(ms)
total_time = 0.0
# 총 사용 전력량
total_power_usage = 0.0
# 총 생성 토큰 수 
total_generated_tokens = 0.0
# 추가 생성 최대 토큰 수
max_new_tokens = 500
# 추가 생성 최소 토큰 수
min_new_tokens = 2048

# stopping_criteria = StoppingCriteriaList([MinLengthStoppingCriteria(len(inputs["input_ids"][0]) + min_new_tokens)])

# GPU 전력 측정 초기화
pynvml.nvmlInit()

handle = pynvml.nvmlDeviceGetHandleByIndex(0)
for _ in range(num_runs):

    torch.cuda.synchronize()

    # 전력 사용량 측정 시작
    power_usage = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # W

    total_power_usage += power_usage

    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(**inputs, \
            min_length=min_new_tokens + len(inputs["input_ids"][0]), \
            max_new_tokens=max_new_tokens, \
            # stopping_criteria=stopping_criteria, \
            use_cache=True)
        
    end_time = time.time()
    print('Output ids:', outputs)
    print('Output text[:200]:', tokenizer.decode(outputs[0], skip_special_tokens=True)[:100])
    
    num_generated_tokens = outputs.shape[1]
    print('Num of generated tokens (input tokens + output tokens):', num_generated_tokens)
    total_generated_tokens += num_generated_tokens
          
    inf_time = end_time - start_time
    print('Inferecne time:', inf_time)

    torch.cuda.synchronize()

    total_time += inf_time
    
# 평균 전력 사용량 (W)
avg_power_usage = total_power_usage / num_runs

# 평균 추론 시간
avg_time = total_time / num_runs # s

# 평균 토큰 생성량
avg_generated_tokens = total_generated_tokens / num_runs

# TOPS (Tera FLOPs per Second) 계산
tops = flops / (avg_time * 1e12)

# 초당 추론 가능량 (FPS)
fps = 1 / avg_time

# 초당 토큰 처리량 (TPS)
tps = avg_generated_tokens / avg_time

# fps/W
efficiency = fps / avg_power_usage

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output ids: tensor([[128001, 128001, 128001,  ...,   1162,   7978,    477]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. DeepSeek is an AI assistant developed by the Chinese company DeepSeek 
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 21.696008682250977


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output ids: tensor([[128001, 128001, 128001,  ...,    656,    814,   3790]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. Is it a Chinese company? What is their area of expertise? What makes t
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 21.667158842086792


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output ids: tensor([[128001, 128001, 128001,  ...,   6787,   3196,    389]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. DeepSeek is a company focused on AI and big data, providing intelligen
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 21.004376649856567


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output ids: tensor([[128001, 128001, 128001,  ...,   6505,   3834,    374]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. I've heard it's a new search engine, but I'm not exactly sure what mak
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 20.925662517547607


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output ids: tensor([[128001, 128001, 128001,  ...,   1205,  11944,  11156]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. Is it a company or a product?

DeepSeek is a company that specializes 
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 20.9649441242218


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output ids: tensor([[128001, 128001, 128001,  ...,    279,   3938,    315]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. I came across this term and am curious to know more about it.
DeepSeek
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 21.013421535491943


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output ids: tensor([[128001, 128001, 128001,  ...,     13,   4427,   2778]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. What is it exactly? I know it's a search engine, but what makes it dif
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 21.09519672393799


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output ids: tensor([[128001, 128001, 128001,  ...,    927,  12472,     11]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. I have a question about the company.
DeepSeek Artificial Intelligence 
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 21.140743494033813


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output ids: tensor([[128001, 128001, 128001,  ...,  34564,  21579,    612]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. What is it exactly?
DeepSeek Artificial Intelligence Co., Ltd. is a Ch
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 21.259671211242676
Output ids: tensor([[128001, 128001, 128001,  ...,     11,    323,  37843]],
       device='cuda:0')
Output text[:200]: Tell me more about Deep Seek. Is it a search engine, a tool, or something else? It seems like a vert
Num of generated tokens (input tokens + output tokens): 2548
Inferecne time: 21.11881995201111


In [11]:
print(f"FLOPs: {flops:.3e}")
print(f"GFLOPs: {gflops:.3f}")
print(f"TOPS: {tops:.6f}")
print(f"Avg inference time: {avg_time:.3f} sec")
print(f"FPS: {fps:.3f}")
print(f"Avg num of generated tokens: {avg_generated_tokens:.2f} sec")
print(f"TPS: {tps:.3f}")
print(f"Avg power usage: {avg_power_usage:.3f} W")
print(f"Efficiency: {efficiency:.6f} FPS/W")

FLOPs: 1.537e+13
GFLOPs: 15369.540
TOPS: 0.725368
Avg inference time: 21.189 sec
FPS: 0.047
Avg num of generated tokens: 2548.00 sec
TPS: 120.253
Avg power usage: 245.110 W
Efficiency: 0.000193 FPS/W
