In [2]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch
import sys
import argparse
from datasets import load_dataset
from torch.utils.data import DataLoader, SequentialSampler
import time
import math
import numpy as np
from torchinfo import summary

In [5]:
# dtype later passed as `torch_dtype` when model weights are loaded.
amp_dtype = torch.float16
# Checkpoint identifier later passed to `from_pretrained` for tokenizer and model.
model_name_or_path = "bert-large-uncased-whole-word-masking-finetuned-squad"

In [7]:
# Tokenizer for the configured checkpoint: lower-casing on, non-fast
# implementation requested, cache directory left at its default (None).
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=False,
    do_lower_case=True,
    cache_dir=None,
)
# Question-answering model from the same checkpoint, with weights requested
# in `amp_dtype`.
model = AutoModelForQuestionAnswering.from_pretrained(
    model_name_or_path,
    torch_dtype=amp_dtype,
)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Summarise the module tree down to depth 6; each row is labelled with the
# attribute name and the depth index of the layer.
summary(
    model,
    row_settings=["var_names", "depth"],
    depth=6,
)

Layer (type (var_name):depth-idx)                                           Param #
BertForQuestionAnswering (BertForQuestionAnswering)                         --
├─BertModel (bert): 1-1                                                     --
│    └─BertEmbeddings (embeddings): 2-1                                     --
│    │    └─Embedding (word_embeddings): 3-1                                31,254,528
│    │    └─Embedding (position_embeddings): 3-2                            524,288
│    │    └─Embedding (token_type_embeddings): 3-3                          2,048
│    │    └─LayerNorm (LayerNorm): 3-4                                      2,048
│    │    └─Dropout (dropout): 3-5                                          --
│    └─BertEncoder (encoder): 2-2                                           --
│    │    └─ModuleList (layer): 3-6                                         --
│    │    │    └─BertLayer (0): 4-1                                         --
│    │    │    │    └─BertAt

In [30]:
# Show the word-embedding module's repr, then its concrete class, one per line.
print(
    model.bert.embeddings.word_embeddings,
    type(model.bert.embeddings.word_embeddings),
    sep="\n",
)

Embedding(30522, 1024, padding_idx=0)
<class 'torch.nn.modules.sparse.Embedding'>


In [42]:
from torch import nn
class LSTMNet(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection.

    Args:
        vocab_size: Number of embedding rows; also the output width of the
            final linear layer.
        embed_dim: Size of each embedding vector, i.e. the LSTM input size.
        hidden_dim: LSTM hidden size, i.e. the linear layer's input width.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size=20, embed_dim=300, hidden_dim=512, num_layers=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Sub-modules are registered in embedding -> encoder -> decoder order.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.encoder = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.decoder = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Run the network on a tensor of token indices.

        Args:
            x: Index tensor fed to the embedding layer. The LSTM is built with
                ``batch_first=True`` and the reshape below reads dimension 2 of
                the projected output, so a 2-D input is expected
                (presumably ``(batch, seq)`` -- confirm against callers).

        Returns:
            A pair ``(out, hidden)``: ``out`` is the linear projection with its
            leading dimensions merged into one (2-D, last dimension kept), and
            ``hidden`` is the second value returned by the LSTM.
        """
        seq_out, state = self.encoder(self.embedding(x))
        logits = self.decoder(seq_out)
        # Collapse everything except the last (dimension 2) axis into rows.
        n_classes = logits.size(2)
        return logits.view(-1, n_classes), state

# Summarise a default LSTMNet for a (1, 100) integer input, listing kernel
# shape, output shape, parameter count and mult-adds per layer.
# verbose=2 makes torchinfo print the table (including weight/bias rows)
# itself; the trailing semicolon stops the notebook from also echoing the
# returned statistics object, which would render the same table a second time.
summary(
    LSTMNet(),
    (1, 100),
    dtypes=[torch.long],
    verbose=2,
    col_width=16,
    col_names=["kernel_size", "output_size", "num_params", "mult_adds"],
    row_settings=["var_names"],
);

Layer (type (var_name))                  Kernel Shape     Output Shape     Param #          Mult-Adds
LSTMNet (LSTMNet)                        --               [100, 20]        --               --
├─Embedding (embedding)                  --               [1, 100, 300]    6,000            6,000
│    └─weight                            [300, 20]                         └─6,000
├─LSTM (encoder)                         --               [1, 100, 512]    3,768,320        376,832,000
│    └─weight_ih_l0                      [2048, 300]                       ├─614,400
│    └─weight_hh_l0                      [2048, 512]                       ├─1,048,576
│    └─bias_ih_l0                        [2048]                            ├─2,048
│    └─bias_hh_l0                        [2048]                            ├─2,048
│    └─weight_ih_l1                      [2048, 512]                       ├─1,048,576
│    └─weight_hh_l1                      [2048, 512]                       ├─1,048,576
│    └

Layer (type (var_name))                  Kernel Shape     Output Shape     Param #          Mult-Adds
LSTMNet (LSTMNet)                        --               [100, 20]        --               --
├─Embedding (embedding)                  --               [1, 100, 300]    6,000            6,000
│    └─weight                            [300, 20]                         └─6,000
├─LSTM (encoder)                         --               [1, 100, 512]    3,768,320        376,832,000
│    └─weight_ih_l0                      [2048, 300]                       ├─614,400
│    └─weight_hh_l0                      [2048, 512]                       ├─1,048,576
│    └─bias_ih_l0                        [2048]                            ├─2,048
│    └─bias_hh_l0                        [2048]                            ├─2,048
│    └─weight_ih_l1                      [2048, 512]                       ├─1,048,576
│    └─weight_hh_l1                      [2048, 512]                       ├─1,048,576
│    └

In [None]:
# Single identifier for the GPT checkpoint so the tokenizer and the model are
# always loaded from the same place.
# NOTE(review): "gpt" must resolve to a local directory or a valid Hub model id
# -- confirm; also confirm that this architecture offers a question-answering
# head for AutoModelForQuestionAnswering.
gpt_name_or_path = "gpt"
tokenizer_gpt = AutoTokenizer.from_pretrained(gpt_name_or_path, do_lower_case=True, cache_dir=None, use_fast=False)
# Fix: this previously loaded `model_name_or_path` (the BERT checkpoint), so
# `model_gpt` did not match `tokenizer_gpt`.
model_gpt = AutoModelForQuestionAnswering.from_pretrained(gpt_name_or_path, torch_dtype=amp_dtype)