In [1]:
import torch
import torch.nn.functional as F
from tqdm import tqdm
import timm
assert timm.__version__ == "0.3.2"
from torchvision import transforms
from PIL import Image
import requests
import random
import numpy as np

# 장치 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms

# 데이터 전처리
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # DeiT 모델 입력 크기에 맞춤
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

test_dataset = ImageFolder(root="C:/Users/seonahryu/Desktop/brp1/ILSVRC2012_img_val", transform=transform)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [3]:
def my_forward_wrapper(attn_obj):
    def my_forward(x):
        B, N, C = x.shape
        qkv = attn_obj.qkv(x).reshape(B, N, 3, attn_obj.num_heads, C // attn_obj.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)

        attn = (q @ k.transpose(-2, -1)) * attn_obj.scale
        attn = attn.softmax(dim=-1)
        attn = attn_obj.attn_drop(attn)

        # attn_map을 저장
        attn_maps.append(attn.detach())  # 주의 가중치를 저장
        attn_obj.cls_attn_map = attn[:, :, 0, 2:]  # cls attention map 저장

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = attn_obj.proj(x)
        x = attn_obj.proj_drop(x)
        return x
    return my_forward

In [4]:
def attention_rollout_function(attn_maps):
    I_size = 196  # 주의 맵의 크기를 196으로 설정
    attn_rollout = []

    for attn_map in attn_maps:
        # 현재 주의 맵의 크기 확인
        current_size = attn_map.size(-1)
        I = torch.eye(max(current_size, I_size)).to(attn_map.device)  # 동적 아이덴티티 행렬 생성

        if current_size < I_size:
            padding_size = I_size - current_size
            padded_attn_map = F.pad(attn_map, (0, padding_size, 0, 0))  # 오른쪽에 패딩 추가
            I_padded = F.pad(I, (0, padding_size, 0, 0))  # 아이덴티티 행렬 패딩
            prod = padded_attn_map + I_padded
        else:
            prod = attn_map + I

        attn_rollout.append(prod / prod.sum(dim=-1, keepdim=True))  # 정규화

    return attn_rollout

DeiT-small

In [5]:
model = torch.hub.load('facebookresearch/deit:main', 'deit_small_patch16_224', pretrained=True)
model

Using cache found in C:\Users\seonahryu/.cache\torch\hub\facebookresearch_deit_main


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (

In [6]:
print(f"Number of blocks in the model: {len(model.blocks)}")

Number of blocks in the model: 12


In [7]:
model.eval()

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x2a2ad8719d0>

In [8]:
attn_maps = []  # 전역 리스트를 사용하여 attn_map을 저장
cls_weights = []

# 각 블록을 순회하며 attention map과 class attention map을 추출
for block in tqdm(model.blocks):
    wrapped_forward = my_forward_wrapper(block.attn)
    
    # 입력 텐서가 필요하므로 임의의 입력을 생성
    x = torch.randn(1, 196, 384)  # B=1, N=196, C=384
    output = wrapped_forward(x)

    # attn_map이 저장되었는지 확인합니다.
    if attn_maps:
        print(f"Block {len(attn_maps)}: Attention map collected with shape {attn_maps[-1].shape}")
    else:
        print(f"Block {len(attn_maps)}: No attention map collected.")

    # cls_attn_map 저장
    cls_attn_map = block.attn.cls_attn_map.detach()  # cls_attn_map 저장
    cls_weights.append(cls_attn_map)

# 주의 맵이 저장되었는지 확인합니다.
if attn_maps:
    attn_map = attn_maps[-1].mean(dim=1).squeeze(0).detach()
else:
    print("attn_maps 내용:", attn_maps)  # 디버깅 출력 추가
    raise ValueError("No attention maps were collected.")

cls_weight = cls_weights[-1].max(dim=1).values.detach()

# Ensure tensors are on the CPU
attn_map_cpu = attn_map.cpu()
cls_weight_cpu = cls_weight.cpu()

# 클래스 가중치의 크기를 확인합니다.
print("CLS Weight Shape:", cls_weight.shape)

# Combine class scores of all blocks
try:
    cls_weight_combined = torch.prod(torch.stack(cls_weights), dim=0)
except RuntimeError as e:
    print(f"Error combining class weights: {e}")

# 주의 맵의 곱을 계산합니다
attn_maps_prod = torch.prod(torch.stack(attn_maps), dim=0)

# CPU로 전환
attn_maps_cpu = [attn_map.cpu() for attn_map in attn_maps]
cls_weights_cpu = [cls_weight.cpu() for cls_weight in cls_weights]

# 결과 확인
print("Attention Rollout:", attn_maps_prod.shape)
print("Combined CLS Weight Shape:", cls_weight_combined.shape)

100%|██████████| 12/12 [00:00<00:00, 292.63it/s]

Block 1: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 2: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 3: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 4: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 5: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 6: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 7: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 8: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 9: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 10: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 11: Attention map collected with shape torch.Size([1, 6, 196, 196])
Block 12: Attention map collected with shape torch.Size([1, 6, 196, 196])
CLS Weight Shape: torch.Size([1, 194])
Attention Rollout: torch.Size([1, 6, 196, 196])
Combined CLS Weight Shape: torch.S




In [9]:
def attention_rollout_function(attn_maps):
    attn_rollout = []
    I = torch.eye(attn_maps[0].shape[-1])  # Identity matrix
    prod = I
    for i, attn_map in enumerate(attn_maps):
        prod = prod @ (attn_map + I)  # Product of attention maps with identity matrix
        
        prod = prod / prod.sum(dim=-1, keepdim=True) # Normalize
        attn_rollout.append(prod)
    return attn_rollout

# Attention Rollout
attn_rollout = attention_rollout_function(attn_maps_cpu)

# For Class Weights
cls_weights_rollout = []

for i in tqdm(range(len(attn_rollout))):  # attn_rollout의 크기로 반복
    cls_weight_tensor = attn_rollout[i][0, 0, :, :]  # 첫 번째 헤드 사용

    # 텐서의 크기를 출력하여 디버깅
    print(f"Class weight tensor shape before view: {cls_weight_tensor.shape}")

    # 크기를 확인하고, 14x14로 변환할 수 있는지 확인합니다.
    if cls_weight_tensor.numel() == 14 * 14:
        cls_weights_rollout.append(cls_weight_tensor.view(14, 14))
    elif cls_weight_tensor.numel() == 196 * 196:
        # 196x196 텐서를 클래스 가중치로 사용
        cls_weights_rollout.append(cls_weight_tensor)  # 변환하지 않고 그대로 사용
    else:
        print(f"Skipping view for index {i}, tensor size is {cls_weight_tensor.numel()}")

# 결과 확인
if isinstance(attn_rollout, list):
    print("Attention Rollout Length:", len(attn_rollout))
    print("Attention Rollout Shapes:", [tensor.shape for tensor in attn_rollout])
else:
    print("Attention Rollout Shape:", attn_rollout.shape)

print("Class Weights Rollout Shapes:", [cls_weight.shape for cls_weight in cls_weights_rollout])

100%|██████████| 12/12 [00:00<00:00, 12003.73it/s]

Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Class weight tensor shape before view: torch.Size([196, 196])
Attention Rollout Length: 12
Attention Rollout Shapes: [torch.Size([1, 6, 196, 196]), torch.Size([1, 6, 196, 196]), torch.Size([1, 6, 196, 196]), torch.Size([1, 6, 196, 196]), torch.Size([1, 6, 196, 196]), torch.Size([1, 6, 196, 196]), torch.Size([1, 6, 19




In [10]:
attn_map

tensor([[0.0092, 0.0108, 0.0016,  ..., 0.0033, 0.0031, 0.0041],
        [0.0025, 0.0249, 0.0017,  ..., 0.0045, 0.0037, 0.0045],
        [0.0022, 0.0028, 0.0083,  ..., 0.0025, 0.0050, 0.0159],
        ...,
        [0.0022, 0.0089, 0.0015,  ..., 0.0199, 0.0061, 0.0046],
        [0.0014, 0.0043, 0.0024,  ..., 0.0043, 0.0238, 0.0065],
        [0.0042, 0.0048, 0.0018,  ..., 0.0042, 0.0047, 0.0244]])

In [11]:
cls_weight

tensor([[0.0028, 0.0103, 0.0145, 0.0100, 0.0049, 0.0047, 0.0083, 0.0101, 0.0075,
         0.0024, 0.0125, 0.0012, 0.0042, 0.0037, 0.0073, 0.0071, 0.0119, 0.0110,
         0.0049, 0.0063, 0.0066, 0.0083, 0.0065, 0.0068, 0.0030, 0.0534, 0.0020,
         0.0335, 0.0037, 0.0035, 0.0532, 0.0121, 0.0040, 0.0036, 0.0115, 0.0019,
         0.0060, 0.0106, 0.0159, 0.0096, 0.0117, 0.0069, 0.0179, 0.0196, 0.0382,
         0.0029, 0.0084, 0.0023, 0.0139, 0.0022, 0.0152, 0.0044, 0.0024, 0.0139,
         0.0097, 0.0583, 0.0025, 0.0032, 0.0253, 0.0018, 0.0032, 0.0034, 0.0056,
         0.0191, 0.0047, 0.0171, 0.0095, 0.0093, 0.0037, 0.0014, 0.0151, 0.0051,
         0.0043, 0.0063, 0.0054, 0.0231, 0.0095, 0.0082, 0.0100, 0.0046, 0.0108,
         0.0045, 0.0025, 0.0169, 0.0107, 0.0023, 0.0054, 0.0060, 0.0040, 0.0103,
         0.0049, 0.0091, 0.0056, 0.0041, 0.1168, 0.0315, 0.0022, 0.0107, 0.0085,
         0.0040, 0.0037, 0.0197, 0.0153, 0.0179, 0.0044, 0.0178, 0.0102, 0.0677,
         0.0103, 0.0061, 0.0

In [12]:
cls_weights_rollout

[tensor([[5.0145e-01, 1.0773e-04, 1.9219e-09,  ..., 3.0224e-07, 4.8908e-08,
          9.4525e-09],
         [1.7469e-07, 7.3549e-01, 1.9689e-09,  ..., 1.5312e-05, 1.8353e-07,
          9.8260e-07],
         [2.0269e-04, 8.8949e-03, 7.8070e-01,  ..., 8.4695e-05, 3.8640e-05,
          1.5120e-02],
         ...,
         [3.1366e-08, 1.4511e-03, 1.2052e-10,  ..., 5.2665e-01, 1.4668e-05,
          1.2199e-06],
         [6.7066e-11, 3.5859e-06, 1.0056e-12,  ..., 1.3806e-06, 5.0115e-01,
          6.2253e-07],
         [2.0210e-10, 8.3169e-06, 4.0810e-09,  ..., 2.4031e-07, 3.3919e-06,
          5.2196e-01]]),
 tensor([[2.8310e-01, 7.6627e-04, 1.8418e-03,  ..., 1.2459e-04, 8.3037e-03,
          8.7034e-04],
         [3.8865e-04, 3.8519e-01, 1.1187e-03,  ..., 5.4312e-04, 1.4127e-02,
          8.7212e-04],
         [2.2728e-04, 4.7906e-03, 7.5910e-01,  ..., 1.1250e-04, 3.2742e-03,
          9.5180e-03],
         ...,
         [2.5900e-04, 1.3777e-03, 2.0527e-03,  ..., 4.1472e-01, 1.6249e-03,
   

DeiT-tiny -> student model로 가정

In [13]:
model_st = torch.hub.load('facebookresearch/deit:main', 'deit_tiny_patch16_224', pretrained=True)
model_st

Using cache found in C:\Users\seonahryu/.cache\torch\hub\facebookresearch_deit_main


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv

In [14]:
print(f"Number of blocks in the model: {len(model_st.blocks)}")

Number of blocks in the model: 12


In [15]:
model_st.eval()

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x2a2ad8719d0>

In [16]:
# 전역 리스트를 사용하여 attn_map과 cls_weight 저장
attn_maps_st = []
cls_weights_st = []

# Forward hook을 설정하는 함수
def get_attention_map(module, input, output):
    attn_maps_st.append(output.detach())  # 주의 맵 저장
    cls_attn_map = output[:, :, 0]  # CLS token에 대한 attention map을 추출
    cls_weights_st.append(cls_attn_map.detach())  # 클래스 주의 맵 저장

# 각 블록을 순회하며 attention map과 class attention map을 추출합니다.
for block in tqdm(model_st.blocks):
    # Forward hook 등록
    hook = block.attn.register_forward_hook(get_attention_map)
    
    # 입력 텐서가 필요하므로 임의의 입력을 생성합니다.
    x = torch.randn(1, 3, 224, 224).to(device)  # B=1, C=3, H=224, W=224
    x_patch = model_st.patch_embed(x)  # 패치 임베딩 변환

    # wrapped_forward 함수로 출력 계산
    output = block.attn(x_patch)

    # hook 해제
    hook.remove()

# 주의 맵이 저장되었는지 확인합니다.
if attn_maps_st:
    attn_map_st = attn_maps_st[-1].mean(dim=1).squeeze(0).detach()
    print(f"Collected attention maps: {len(attn_maps_st)}")
else:
    print("attn_maps_st 내용:", attn_maps_st)  # 디버깅 출력 추가
    raise ValueError("No attention maps were collected.")

# cls_weight 계산
cls_weight_st = cls_weights_st[-1].max(dim=1).values.detach()

# Ensure tensors are on the CPU
attn_map_st_cpu = attn_map_st.cpu()
cls_weight_st_cpu = cls_weight_st.cpu()

# 클래스 가중치의 크기를 확인합니다.
print("CLS Weight Shape:", cls_weight_st.shape)

# Combine class scores of all blocks
try:
    cls_weight_st_combined = torch.prod(torch.stack(cls_weights_st), dim=0)
except RuntimeError as e:
    print(f"Error combining class weights: {e}")

# 주의 맵의 곱을 계산합니다
attn_maps_st_prod = torch.prod(torch.stack(attn_maps_st), dim=0)

# CPU로 전환
attn_maps_st_cpu = [attn_map.cpu() for attn_map in attn_maps_st]
cls_weights_st_cpu = [cls_weight.cpu() for cls_weight in cls_weights_st]

# 결과 확인
print("Attention Rollout:", attn_maps_st_prod.shape)
print("Combined CLS Weight Shape:", cls_weight_st_combined.shape)

100%|██████████| 12/12 [00:00<00:00, 266.64it/s]

Collected attention maps: 12
CLS Weight Shape: torch.Size([1])
Attention Rollout: torch.Size([1, 196, 192])
Combined CLS Weight Shape: torch.Size([1, 196])





In [17]:
# Attention Rollout
def attention_rollout_function(attn_maps):
    I_size = 196  # 주의 맵의 크기를 196으로 설정
    I = torch.eye(I_size).to(attn_maps[0].device)  # Identity matrix
    attn_rollout = []

    for attn_map in attn_maps:
        if attn_map.size(-1) == I_size:
            prod = attn_map + I
        elif attn_map.size(-1) < I_size:
            padding_size = I_size - attn_map.size(-1)
            padded_attn_map = F.pad(attn_map, (0, padding_size, 0, 0))  # 오른쪽에 패딩 추가
            prod = padded_attn_map + I
        else:
            raise ValueError("Unexpected attention map size.")

        attn_rollout.append(prod)

    attn_rollout = [prod / prod.sum(dim=-1, keepdim=True) for prod in attn_rollout]  # Normalize
    return attn_rollout

In [18]:
# Attention Rollout 실행
attn_rollout_st = attention_rollout_function(attn_maps_st_cpu)

# For Class Weights
cls_weights_rollout_st = []

for i in range(len(attn_rollout_st)):  # attn_rollout_st의 크기로 반복
    cls_weight_tensor_st = attn_rollout_st[i][0]  # 첫 번째 배치의 텐서 사용

    # 크기를 확인하고, 14x14로 변환할 수 있는지 확인
    if cls_weight_tensor_st.numel() == 14 * 14:
        cls_weights_rollout_st.append(cls_weight_tensor_st.view(14, 14))
    elif cls_weight_tensor_st.numel() == 196 * 196:
        cls_weights_rollout_st.append(cls_weight_tensor_st)  # 그대로 사용

# 결과 확인
print("Class Weights Rollout Shapes:", [cls_weight.shape for cls_weight in cls_weights_rollout_st])

Class Weights Rollout Shapes: [torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196]), torch.Size([196, 196])]


In [19]:
attn_map_st

tensor([ 0.1842,  0.6513,  0.9481, -0.0575, -0.4596,  0.9378, -0.0630,  0.0913,
        -0.2760,  0.8236, -0.3048, -0.5067, -0.2237,  0.0104,  0.2620,  0.4669,
         0.3408, -0.6330, -0.4091, -0.4896, -0.9567, -1.5660, -0.2059, -0.2811,
        -0.2813, -0.8560, -0.1496, -0.0455, -0.0833,  0.3670,  0.2738, -0.3592,
         0.3951,  0.6094,  0.5716, -0.0526,  0.3808, -0.0236, -0.7082,  0.1859,
         0.8467, -0.0996,  0.4303,  0.0613,  0.7428,  0.1889, -0.0631, -0.1368,
         0.1214,  0.5921,  0.1423, -0.4978,  0.5646, -0.2145,  0.1315, -0.3321,
         0.1872, -0.0260, -0.5459,  0.0105,  0.7164, -0.4216, -0.2821,  0.2232,
        -0.1487,  0.1884, -0.1680,  0.3930, -0.3026,  0.8395, -0.5283, -0.1191,
         0.5188, -1.0522,  0.4154, -0.1342, -0.8908,  1.1572,  0.0318,  0.0403,
        -0.4454,  0.0902, -0.5127,  0.6663, -0.3861,  0.5882,  0.0156,  0.5951,
        -0.7754,  0.4059,  0.1418,  0.9044, -1.3180, -0.1740,  0.9519, -0.7662,
        -0.1940, -0.4566,  0.9794,  0.26

In [20]:
cls_weight_st

tensor([0.2606])

In [21]:
cls_weights_rollout_st

[tensor([[-0.1554, -0.0586, -0.0729,  ..., -0.0000, -0.0000, -0.0000],
         [-0.0529, -0.2460, -0.0994,  ..., -0.0000, -0.0000, -0.0000],
         [-0.1179, -0.0144, -0.2249,  ..., -0.0000, -0.0000, -0.0000],
         ...,
         [-0.0701, -0.1249, -0.0585,  ..., -0.1516, -0.0000, -0.0000],
         [ 0.0249, -0.1786, -0.0892,  ..., -0.0000, -0.1801, -0.0000],
         [-0.1276, -0.1296, -0.0653,  ..., -0.0000, -0.0000, -0.1503]]),
 tensor([[-0.5720,  0.0253, -0.8815,  ..., -0.0000, -0.0000, -0.0000],
         [-0.1371, -0.5842, -0.8806,  ..., -0.0000, -0.0000, -0.0000],
         [ 0.0658, -0.2696, -1.0837,  ..., -0.0000, -0.0000, -0.0000],
         ...,
         [-0.0265, -0.1658, -0.9726,  ..., -0.4945, -0.0000, -0.0000],
         [-0.0341, -0.0129, -0.8782,  ..., -0.0000, -0.4722, -0.0000],
         [-0.0291,  0.0190, -1.0195,  ..., -0.0000, -0.0000, -0.5399]]),
 tensor([[ -5.2143,  -0.2073,  -9.1526,  ...,  -0.0000,  -0.0000,  -0.0000],
         [  1.0086,  -6.6903,  -8.8212,

두 모델 간의 차이 Loss 계산 (L1, L2, Cosine Similarity, KL Divergence로 구현)

In [None]:
# L1 (Mean Absolute Error): 두 모델의 각 레이어 가중치 간의 절대 차이 계산
# L1 손실 계산
loss_l1 = 0
layer_losses_l1 = []

for i in range(len(cls_weights_rollout_st)):
    # 레이어별 가중치 차이 계산
    layer_loss = F.l1_loss(cls_weights_rollout_st[i], cls_weights_rollout[i])
    layer_losses_l1.append(layer_loss.item())  # 손실을 리스트에 추가
    loss_l1 += layer_loss

# 총 손실 출력
total_loss_l1 = loss_l1.item()
print(f"Total L1 Loss: {total_loss_l1}")

# 레이어별 손실 비율 계산
layer_loss_percentages_l1 = [(layer_loss / total_loss_l1) * 100 for layer_loss in layer_losses_l1]

# 각 레이어 손실 및 비율 출력
for i, layer_loss in enumerate(layer_losses_l1):
    print(f"Layer {i} L1 Loss: {layer_loss:.4f}, Percentage of Total Loss: {layer_loss_percentages_l1[i]:.4f}%")
## 두 모델 간의 차이가 주로 Layer 2와 Layer 10에서 발생

Total L1 Loss: 14.616626739501953
Layer 0 L1 Loss: 0.1479, Percentage of Total Loss: 1.0115%
Layer 1 L1 Loss: 0.2261, Percentage of Total Loss: 1.5469%
Layer 2 L1 Loss: 5.1462, Percentage of Total Loss: 35.2075%
Layer 3 L1 Loss: 0.1554, Percentage of Total Loss: 1.0629%
Layer 4 L1 Loss: 0.2121, Percentage of Total Loss: 1.4508%
Layer 5 L1 Loss: 0.1961, Percentage of Total Loss: 1.3413%
Layer 6 L1 Loss: 0.2350, Percentage of Total Loss: 1.6081%
Layer 7 L1 Loss: 0.2223, Percentage of Total Loss: 1.5206%
Layer 8 L1 Loss: 0.5502, Percentage of Total Loss: 3.7645%
Layer 9 L1 Loss: 0.5793, Percentage of Total Loss: 3.9635%
Layer 10 L1 Loss: 4.3009, Percentage of Total Loss: 29.4244%
Layer 11 L1 Loss: 2.6453, Percentage of Total Loss: 18.0980%


In [None]:
# L2 (Euclidean Distance): 두 모델의 각 레이어 가중치 간의 유클리드 거리 계산
# L2 손실 계산
loss_l2 = 0
layer_losses_l2 = []

for i in range(len(cls_weights_rollout_st)):
    # 레이어별 가중치 차이 계산 (L2 손실)
    layer_loss = F.mse_loss(cls_weights_rollout_st[i], cls_weights_rollout[i])
    layer_losses_l2.append(layer_loss.item())  # 손실을 리스트에 추가
    loss_l2 += layer_loss

# 총 L2 손실 출력
total_loss_l2 = loss_l2.item()
print(f"Total L2 Loss: {total_loss_l2}")

# 레이어별 L2 손실 비율 계산
layer_loss_percentages_l2 = [(layer_loss / total_loss_l2) * 100 for layer_loss in layer_losses_l2]

# 각 레이어 L2 손실 및 비율 출력
for i, layer_loss in enumerate(layer_losses_l2):
    print(f"Layer {i} L2 Loss: {layer_loss:.4f}, Percentage of Total Loss: {layer_loss_percentages_l2[i]:.4f}%")

## Layer 2: 2번째 레이어의 L2 손실이 394.6398로, 전체 손실의 87.11% -> 레이어2에서 두 모델 간의 가중치 차이가 매우 크다
## 나머지 레이어들은 상대적으로 손실 값이 매우 작고, 전체 손실에서 차지하는 비율도 낮음.
## layer 2가 두 모델의 성능 차이에 큰 기여하고 있을 것이라 판단할 수 있음. 두 모델의 구조나 파라미터 조정시 해당 레이어 더 보기!

Total L2 Loss: 453.0188903808594
Layer 0 L2 Loss: 0.0445, Percentage of Total Loss: 0.0098%
Layer 1 L2 Loss: 0.0991, Percentage of Total Loss: 0.0219%
Layer 2 L2 Loss: 394.6398, Percentage of Total Loss: 87.1133%
Layer 3 L2 Loss: 0.0411, Percentage of Total Loss: 0.0091%
Layer 4 L2 Loss: 0.0765, Percentage of Total Loss: 0.0169%
Layer 5 L2 Loss: 0.0602, Percentage of Total Loss: 0.0133%
Layer 6 L2 Loss: 0.0887, Percentage of Total Loss: 0.0196%
Layer 7 L2 Loss: 0.0799, Percentage of Total Loss: 0.0176%
Layer 8 L2 Loss: 0.5019, Percentage of Total Loss: 0.1108%
Layer 9 L2 Loss: 0.5323, Percentage of Total Loss: 0.1175%
Layer 10 L2 Loss: 38.7059, Percentage of Total Loss: 8.5440%
Layer 11 L2 Loss: 18.1490, Percentage of Total Loss: 4.0062%


In [None]:
# Cosine Similarity: 두 모델의 가중치 벡터 간의 코사인 유사도 계산

from torch.nn.functional import cosine_similarity

# 코사인 유사도 손실 계산
loss_cosine = 0
layer_losses_cosine = []

for i in range(len(cls_weights_rollout_st)):
    # 레이어별 가중치 차이 계산
    layer_loss = 1 - cosine_similarity(cls_weights_rollout_st[i].view(1, -1), cls_weights_rollout[i].view(1, -1))
    layer_losses_cosine.append(layer_loss.item())  # 손실을 리스트에 추가
    loss_cosine += layer_loss

# 총 손실 출력
total_loss_cosine = loss_cosine.item()
print(f"Total Cosine Loss: {total_loss_cosine}")

# 레이어별 손실 비율 계산
layer_loss_percentages_cosine = [(layer_loss / total_loss_cosine) * 100 for layer_loss in layer_losses_cosine]

# 각 레이어 손실 및 비율 출력
for i, layer_loss in enumerate(layer_losses_cosine):
    print(f"Layer {i} Cosine Loss: {layer_loss:.4f}, Percentage of Total Loss: {layer_loss_percentages_cosine[i]:.4f}%")

## 각 레이어의 손실은 0.8에서 1.1 사이로 분포 -> 두 모델의 가중치가 레이어별로 상당히 유사
## 레이어 0에서 레이어 11까지의 손실 값이 비슷한 수준이므로, 전체적으로 두 모델의 가중치가 유사

Total Cosine Loss: 11.714558601379395
Layer 0 Cosine Loss: 1.0442, Percentage of Total Loss: 8.9137%
Layer 1 Cosine Loss: 1.0548, Percentage of Total Loss: 9.0045%
Layer 2 Cosine Loss: 1.0349, Percentage of Total Loss: 8.8344%
Layer 3 Cosine Loss: 0.8728, Percentage of Total Loss: 7.4502%
Layer 4 Cosine Loss: 0.8025, Percentage of Total Loss: 6.8504%
Layer 5 Cosine Loss: 0.9400, Percentage of Total Loss: 8.0241%
Layer 6 Cosine Loss: 1.0328, Percentage of Total Loss: 8.8163%
Layer 7 Cosine Loss: 0.9947, Percentage of Total Loss: 8.4910%
Layer 8 Cosine Loss: 0.9695, Percentage of Total Loss: 8.2759%
Layer 9 Cosine Loss: 0.9745, Percentage of Total Loss: 8.3191%
Layer 10 Cosine Loss: 0.9839, Percentage of Total Loss: 8.3986%
Layer 11 Cosine Loss: 1.0100, Percentage of Total Loss: 8.6217%


In [None]:
# KL Divergence: 두 모델의 가중치를 확률 분포로 간주하고, Kullback-Leibler Divergence 계산하여 두 분포 간의 차이 측정
# KL 발산 계산
loss_kl = 0
layer_losses_kl = []

for i in range(len(cls_weights_rollout_st)):
    # 확률 분포로 정규화
    p = F.softmax(cls_weights_rollout_st[i], dim=0) + 1e-10  # 작은 값 추가
    q = F.softmax(cls_weights_rollout[i], dim=0) + 1e-10  # 작은 값 추가
    
    # KL 발산 계산
    layer_loss = F.kl_div(p.log(), q, reduction='batchmean')
    layer_losses_kl.append(layer_loss.item())
    loss_kl += layer_loss

# 총 손실 출력
total_loss_kl = loss_kl.item()
print(f"Total KL Divergence Loss: {total_loss_kl}")

# 레이어별 손실 비율 계산
layer_loss_percentages_kl = [(layer_loss / total_loss_kl) * 100 for layer_loss in layer_losses_kl]

# 각 레이어 손실 및 비율 출력
for i, layer_loss in enumerate(layer_losses_kl):
    print(f"Layer {i} KL Loss: {layer_loss:.4f}, Percentage of Total Loss: {layer_loss_percentages_kl[i]:.4f}%")
## 두 모델 간의 차이가 주로 Layer 2, Layer 10, 그리고 Layer 11에서 발생

Total KL Divergence Loss: 31.367801666259766
Layer 0 KL Loss: 0.0042, Percentage of Total Loss: 0.0133%
Layer 1 KL Loss: 0.0067, Percentage of Total Loss: 0.0214%
Layer 2 KL Loss: 15.2732, Percentage of Total Loss: 48.6907%
Layer 3 KL Loss: 0.0010, Percentage of Total Loss: 0.0032%
Layer 4 KL Loss: 0.0025, Percentage of Total Loss: 0.0080%
Layer 5 KL Loss: 0.0023, Percentage of Total Loss: 0.0073%
Layer 6 KL Loss: 0.0020, Percentage of Total Loss: 0.0063%
Layer 7 KL Loss: 0.0030, Percentage of Total Loss: 0.0096%
Layer 8 KL Loss: 0.0287, Percentage of Total Loss: 0.0915%
Layer 9 KL Loss: 0.0246, Percentage of Total Loss: 0.0785%
Layer 10 KL Loss: 9.7367, Percentage of Total Loss: 31.0406%
Layer 11 KL Loss: 6.2829, Percentage of Total Loss: 20.0297%
