In [1]:
import sys
sys.path.append('../')

import torch
from torchvision.models.feature_extraction import get_graph_node_names
from models import get_model

def get_node_names(model):
    """Collect the dotted names of a model's nested submodules.

    Walks ``model.named_modules()`` and keeps only the names that contain
    a ``.``. Names without a dot (including the empty root name) are
    skipped.

    Args:
        model: Object exposing ``named_modules()`` that yields
            ``(name, module)`` pairs.

    Returns:
        list: The matching names, in the order they were yielded.
    """
    return [
        qualified_name
        for qualified_name, _module in model.named_modules()
        if '.' in qualified_name
    ]

## PyTorch 기본 모델 분석

In [2]:
# Build the project's 'resnet50' wrapper; the underlying network is read from
# its `.model` attribute below.
model = get_model('resnet50')
# torchvision's get_graph_node_names returns two lists of node names:
# one traced in train mode and one traced in eval mode.
train_nodes, eval_nodes = get_graph_node_names(model.model)
# Show the eval-mode node names as the cell output.
eval_nodes

['x',
 'conv1',
 'bn1',
 'relu',
 'maxpool',
 'layer1.0.conv1',
 'layer1.0.bn1',
 'layer1.0.relu',
 'layer1.0.conv2',
 'layer1.0.bn2',
 'layer1.0.relu_1',
 'layer1.0.conv3',
 'layer1.0.bn3',
 'layer1.0.downsample.0',
 'layer1.0.downsample.1',
 'layer1.0.add',
 'layer1.0.relu_2',
 'layer1.1.conv1',
 'layer1.1.bn1',
 'layer1.1.relu',
 'layer1.1.conv2',
 'layer1.1.bn2',
 'layer1.1.relu_1',
 'layer1.1.conv3',
 'layer1.1.bn3',
 'layer1.1.add',
 'layer1.1.relu_2',
 'layer1.2.conv1',
 'layer1.2.bn1',
 'layer1.2.relu',
 'layer1.2.conv2',
 'layer1.2.bn2',
 'layer1.2.relu_1',
 'layer1.2.conv3',
 'layer1.2.bn3',
 'layer1.2.add',
 'layer1.2.relu_2',
 'layer2.0.conv1',
 'layer2.0.bn1',
 'layer2.0.relu',
 'layer2.0.conv2',
 'layer2.0.bn2',
 'layer2.0.relu_1',
 'layer2.0.conv3',
 'layer2.0.bn3',
 'layer2.0.downsample.0',
 'layer2.0.downsample.1',
 'layer2.0.add',
 'layer2.0.relu_2',
 'layer2.1.conv1',
 'layer2.1.bn1',
 'layer2.1.relu',
 'layer2.1.conv2',
 'layer2.1.bn2',
 'layer2.1.relu_1',
 'layer2.

## HuggingFace 모델 분석

- MS ResNet50

In [3]:
# Load the project's 'ms_resnet50' wrapper. Per the recorded log output, this
# pulls the microsoft/resnet-50 checkpoint and re-initialises the classifier
# for 10 outputs.
model = get_model('ms_resnet50')
# Display the wrapped network's module tree as the cell output.
model.model

Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-50 and are newly initialized because the shapes did not match:
- classifier.1.weight: found shape torch.Size([1000, 2048]) in the checkpoint and torch.Size([10, 2048]) in the model instantiated
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


ResNetForImageClassification(
  (resnet): ResNetModel(
    (embedder): ResNetEmbeddings(
      (embedder): ResNetConvLayer(
        (convolution): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        (normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (activation): ReLU()
      )
      (pooler): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    )
    (encoder): ResNetEncoder(
      (stages): ModuleList(
        (0): ResNetStage(
          (layers): Sequential(
            (0): ResNetBottleNeckLayer(
              (shortcut): ResNetShortCut(
                (convolution): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )
              (layer): Sequential(
                (0): ResNetConvLayer(
                  (convolution): Conv2d(64

In [4]:
# Put the wrapper in eval mode and run a forward pass without gradient tracking.
model.eval()
with torch.no_grad():
    # Calls `base_model` rather than the full classification model.
    # NOTE(review): `example_input_array` is supplied by the project wrapper;
    # its shape is not visible here, so confirm it against `get_model`.
    outputs = model.model.base_model(model.example_input_array, output_hidden_states=True)
# Shape of the pooled output returned by the base model.
outputs.pooler_output.shape

torch.Size([1, 2048, 1, 1])

- ViT16

In [5]:
# Load the project's 'vit16' wrapper. Per the recorded log output, this pulls
# the google/vit-base-patch16-224 checkpoint and re-initialises the classifier
# for 10 outputs.
model = get_model('vit16')
# Display the wrapped network's module tree as the cell output.
model.model

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_

In [6]:
# Put the wrapper in eval mode and run a forward pass without gradient tracking.
model.eval()
with torch.no_grad():
    # Calls `base_model` rather than the full classification model, and asks
    # for the per-layer hidden states as well.
    outputs = model.model.base_model(model.example_input_array, output_hidden_states=True)
    # Kept for inspection; not used again within this cell.
    hidden_states = outputs.hidden_states
# Take position 0 along the sequence axis for every batch element, then print
# its shape followed by the values.
print(outputs.last_hidden_state[:, 0,:].shape)
print(outputs.last_hidden_state[:, 0,:])    # [CLS] token is at index 0

torch.Size([1, 768])
tensor([[-3.9416e-01, -8.6115e-01,  8.1237e-01, -8.1887e-01,  1.4707e-01,
          7.6216e-01, -1.6560e+00,  6.2770e-01, -1.6068e+00, -8.5001e-01,
         -1.5637e+00,  1.0181e+00, -1.2752e+00, -1.4599e-01, -9.5680e-01,
          1.0760e+00, -9.8362e-01, -7.2553e-01, -4.9964e-01, -1.0735e-03,
          1.1142e+00,  1.0845e+00, -4.8157e-01, -1.1751e-01,  1.1771e+00,
         -9.3265e-01,  3.7546e-01,  1.2659e-03,  1.4208e-01, -2.6776e-01,
         -1.0716e+00,  1.9308e-02, -5.4390e-02, -8.4021e-01, -5.4229e-01,
         -6.0595e-02,  9.6812e-01, -9.7414e-02, -2.5784e-01,  1.5728e+00,
          7.7523e-01,  7.0383e-01, -2.8317e-01, -1.0824e+00, -5.2147e-01,
         -1.3175e+00, -7.6344e-01, -7.6244e-01, -2.2818e-01,  5.2672e-01,
         -4.7757e-01,  2.3694e-01,  2.8581e-01, -5.7927e-01, -8.5997e-02,
          8.4022e-01,  6.0622e-01, -5.0133e-01,  8.2126e-01, -3.9532e-01,
         -8.2799e-01, -1.0612e+00, -3.3004e-01,  1.3755e+00, -1.8610e-01,
          7.9786e

- DeiT16

In [9]:
# Load the project's 'deit16' wrapper. Per the recorded log output, this pulls
# the facebook/deit-base-patch16-224 checkpoint into a ViTForImageClassification
# and re-initialises the classifier for 10 outputs.
model = get_model('deit16')
# Display the wrapped network's module tree as the cell output.
model.model

Some weights of ViTForImageClassification were not initialized from the model checkpoint at facebook/deit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_

In [8]:
# Put the wrapper in eval mode and run a forward pass without gradient tracking.
# NOTE(review): this cell uses whichever `model` is currently bound in the
# kernel; run the 'deit16' load cell first so the output matches this section.
model.eval()
with torch.no_grad():
    # Calls `base_model` rather than the full classification model, and asks
    # for the per-layer hidden states as well.
    outputs = model.model.base_model(model.example_input_array, output_hidden_states=True)
    # Kept for inspection; not used again within this cell.
    hidden_states = outputs.hidden_states
# Take position 0 along the sequence axis for every batch element, then print
# its shape followed by the values.
print(outputs.last_hidden_state[:, 0, :].shape)
print(outputs.last_hidden_state[:, 0, :])    # [CLS] token is at index 0

torch.Size([1, 768])
tensor([[-4.1362e-01, -2.3566e-01, -1.6842e-01,  2.4150e-01,  3.7727e-01,
          4.0036e-01, -1.9911e-01,  2.2879e-01,  1.8547e-01,  1.7879e-01,
          7.6214e-02,  2.4491e-01,  6.2432e-01, -3.6866e-01,  1.1163e-01,
          1.4406e-01,  9.6961e-02,  6.7247e-01,  1.4773e-01, -2.2477e-01,
         -1.6158e-01, -8.7985e-02,  1.0490e+00, -3.0650e-01,  7.7875e-01,
          5.3884e-02,  4.3531e-01, -1.8043e-01,  9.2140e-01, -4.9171e-03,
          5.4310e-01, -1.9963e-02, -5.2398e-01, -2.7369e-01,  4.8437e-01,
         -8.5049e-01,  5.3675e-01, -1.8170e-01,  4.7189e-01, -1.3445e-01,
         -1.8008e-01, -1.2907e-01, -2.6956e-01,  6.0644e-01, -3.0759e-01,
         -3.7427e-01,  2.5605e-01,  3.9832e-01, -1.2933e-02, -7.4581e-01,
         -1.2591e-01, -5.9723e-01,  1.2022e-01,  1.6797e-01,  8.8628e-02,
         -2.6627e-02,  8.5379e-01, -3.4380e-01,  5.9352e-01, -3.4835e-01,
         -2.2782e-01,  7.2224e-01,  1.0423e-01,  1.4847e-01, -8.2029e-01,
          8.7475e