In [2]:
import json
import os

from safetensors import safe_open

def inspect_safetensor_header(filepath):
    """Print the JSON header of a safetensors file in human-readable form.

    The file is expected to start with an 8-byte little-endian integer giving
    the header length, followed by that many bytes of JSON. Three sections are
    printed: the full header (pretty-printed), the entries of the optional
    ``__metadata__`` object, and the ``dtype`` / ``shape`` / ``data_offsets``
    of every other entry. Tensor data itself is never read.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        None. All results are written to stdout.

    Raises:
        ValueError: If the file is shorter than the 8-byte length prefix, if
            the declared header length exceeds the bytes actually present, or
            if the header is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, 'rb') as f:
        # The first 8 bytes hold the header length.
        size_prefix = f.read(8)
        if len(size_prefix) != 8:
            raise ValueError(
                f"{filepath}: file is too short to contain the 8-byte header length"
            )
        header_size = int.from_bytes(size_prefix, 'little')

        # Bound the declared length by what the file really contains. Without
        # this, arbitrary leading bytes can request an enormous read
        # (OverflowError / MemoryError) and a truncated file goes unnoticed.
        f.seek(0, 2)  # whence=2: position relative to end of file
        available = f.tell() - 8
        if header_size > available:
            raise ValueError(
                f"{filepath}: declared header length {header_size} exceeds "
                f"the {available} bytes remaining in the file"
            )

        # Read and parse the header itself.
        f.seek(8)
        header = json.loads(f.read(header_size))

    print("Safetensor Header:")
    print(json.dumps(header, indent=2))

    # Free-form metadata, if the file carries any.
    if '__metadata__' in header:
        print("\n**************************Metadata:")
        for key, value in header['__metadata__'].items():
            print(f"  {key}: {value}")

    # Per-tensor information: every entry other than the metadata object.
    print("\n**************************Tensors:")
    for key, info in header.items():
        if key != '__metadata__':
            print(f"  {key}:")
            print(f"    dtype: {info['dtype']}")
            print(f"    shape: {info['shape']}")
            print(f"    data_offsets: {info['data_offsets']}")

# Usage example. The path is anchored at the current user's home directory
# (Hugging Face hub cache under ~/.cache) rather than a hard-coded /home/<user>,
# so the cell also works for other accounts that have the same snapshot cached.
model_path = os.path.expanduser(
    "~/.cache/huggingface/hub/models--LGAI-EXAONE--EXAONE-4.0-1.2B/snapshots/e1bc152ddee871e02a4b91826a700898686c2012/model.safetensors"
)
inspect_safetensor_header(model_path)


Safetensor Header:
{
  "__metadata__": {
    "format": "pt"
  },
  "model.embed_tokens.weight": {
    "dtype": "BF16",
    "shape": [
      102400,
      2048
    ],
    "data_offsets": [
      0,
      419430400
    ]
  },
  "model.layers.0.mlp.down_proj.weight": {
    "dtype": "BF16",
    "shape": [
      2048,
      4096
    ],
    "data_offsets": [
      419430400,
      436207616
    ]
  },
  "model.layers.0.mlp.gate_proj.weight": {
    "dtype": "BF16",
    "shape": [
      4096,
      2048
    ],
    "data_offsets": [
      436207616,
      452984832
    ]
  },
  "model.layers.0.mlp.up_proj.weight": {
    "dtype": "BF16",
    "shape": [
      4096,
      2048
    ],
    "data_offsets": [
      452984832,
      469762048
    ]
  },
  "model.layers.0.post_attention_layernorm.weight": {
    "dtype": "BF16",
    "shape": [
      2048
    ],
    "data_offsets": [
      469762048,
      469766144
    ]
  },
  "model.layers.0.post_feedforward_layernorm.weight": {
    "dtype": "BF16",
  

In [None]:
# Inspect the first shard of a multi-file checkpoint. The path is resolved
# relative to the current user's home directory instead of a hard-coded
# /home/<user> prefix.
model_path = os.path.expanduser(
    "~/.cache/huggingface/hub/models--LGAI-EXAONE--EXAONE-3.5-2.4B-Instruct/snapshots/e949c91dec92095908d34e6b560af77dd0c993f8/model-00001-of-00002.safetensors"
)
inspect_safetensor_header(model_path)

In [7]:
from transformers import AutoTokenizer
from hashlib import sha256

# Model directory (a directory, not a single file). Resolved relative to the
# current user's home directory instead of a hard-coded /home/<user> prefix.
model_dir = os.path.expanduser(
    "~/.cache/huggingface/hub/models--LGAI-EXAONE--EXAONE-4.0-1.2B/snapshots/e1bc152ddee871e02a4b91826a700898686c2012"
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Test string (taken from existing code). It must stay byte-identical:
# any change to it changes the token ids and therefore the hash below.
chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

# Tokenize, then hash the string form of the token-id list with SHA-256.
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()

print(f"chktok: {chktok}")
print(f"chkhsh: {chkhsh}")

chktok: [560, 582, 560, 560, 582, 560, 560, 560, 582, 559, 582, 41, 582, 559, 560, 33, 560, 32, 560, 31, 560, 30, 560, 11881, 610, 584, 688, 13528, 370, 38961, 476, 45433, 11881, 596, 466, 533, 76100, 688, 87333, 97354, 648, 86597, 370, 78177, 14901, 461, 609, 11881, 461, 609, 582, 380, 582, 380, 380, 582, 380, 380, 380, 582, 380, 380, 380, 380, 582, 380, 380, 380, 380, 380, 582, 380, 380, 380, 380, 380, 380, 582, 380, 380, 380, 380, 380, 380, 380, 582, 380, 380, 380, 380, 380, 380, 380, 380, 582, 380, 375, 380, 582, 380, 951, 380, 582, 380, 4087, 380, 62775, 584, 35149, 476, 35149, 603, 68853, 595, 35149, 599, 68853, 586, 35149, 606, 35149, 477, 35149, 615, 68853, 585, 35149, 615, 35149, 457, 35149, 476, 35149, 589, 11881, 608, 585, 908, 48118, 57583, 19762, 46295, 55482, 20958, 378, 380, 378, 381, 378, 382, 378, 45152, 533, 483, 614, 31744, 24486, 23638, 23573, 2658, 19569, 58635, 44182, 5405, 19629, 24556, 21851, 3341, 5740, 7925, 7925, 77782, 5232, 2281, 2281, 52639, 55202, 3461, 6