In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import torch
import random
# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
from torch.profiler import profile, record_function, ProfilerActivity

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time
import os


  assert(self.encoder != None, "the encoder cannot be None")
  assert(self.encoder.config.num_hidden_layers >= max_encoder_num, "the max encoder number should not exceed defined hidden layer")


In [2]:

device = torch.device('cuda:6' if torch.cuda.is_available() else 'cpu')
dirname = os.getcwd()

source_folder = os.path.join(dirname,'../data/imdb')
destination_folder =os.path.join(dirname,'./saved_models/progressive_shrinking')


In [3]:
def create_dataset(tokenizer_pretrained='bert-base-uncased', test_only = False):
    tokenizer = BertTokenizer.from_pretrained(tokenizer_pretrained)

    # Model parameter
    MAX_SEQ_LEN = 128
    PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    # Fields

    label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
    text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                       fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
    # fields = [('label', label_field), ('title', text_field), ('text', text_field), ('titletext', text_field)]
    fields = [('text', text_field),('sentiment', label_field)]

    # TabularDataset

    train, valid, test = TabularDataset.splits(path=source_folder, train='train.csv', validation='valid.csv',
                                               test='test.csv', format='CSV', fields=fields, skip_header=True)

    # Iterators
    if test_only:
        return Iterator(test, batch_size=16, device=device, train=False, shuffle=True, sort=False)

    train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.text),
                                device=device, train=True, sort=True, sort_within_batch=True)
    valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.text),
                                device=device, train=True, sort=True, sort_within_batch=True)
    test_iter = Iterator(test, batch_size=16, device=device, train=False, shuffle=True, sort=False)
    return train_iter, valid_iter,test_iter

In [4]:

class BERT(nn.Module):

    def __init__(self,options_name="bert-base-uncased"):
        super(BERT, self).__init__()

        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        loss, text_fea = self.encoder(text, labels=label)[:2]

        return loss, text_fea

In [5]:

class BERT_for_inference(nn.Module):

    def __init__(self,options_name="bert-base-uncased"):
        super(BERT_for_inference, self).__init__()

        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text):
        self.encoder(text)



In [6]:
def inference_mock(model, test_loader ):
    y_pred = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for (text,sentiment), _ in test_loader:
            text = text.type(torch.LongTensor)           
            text = text.to(device)
            sentiment = sentiment.type(torch.LongTensor)  
            sentiment = sentiment.to(device)
            
            ## model inference starts
#             output = model(text, sentiment)
            return count_ops(model,text)
            ## model inference ends.



In [7]:
model = BERT('google/bert_uncased_L-12_H-128_A-2').to(device)


Some weights of the model checkpoint at google/bert_uncased_L-12_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'bert.encoder.layer.11.attention.self.key.weight', 'bert.encoder.layer.8.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.2.attention.output.dense.bias', 'bert.encoder.layer.10.attention.output.dense.bias', 'bert.encoder.layer.8.attention.self.key.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.encoder.layer.7.attention.self.key.weight', 'bert.encoder.layer.3.attention.self.query.bias', 'bert.encoder.layer.3.attention.self.value.weight', 'bert.encoder.layer.4.attention.self.key.weight', 'bert.encoder.layer.2.attention.self.query.bias', 'bert.encoder.layer.4.attention.output.dense.bias', 'bert.encoder.layer.11.attention.self.value.bias', 'bert.encoder.layer.5.attention.output.dense.bias', 'bert.encoder.layer.7.attention.self.value.weight', 'bert.encoder.layer.9.

In [None]:
tokenizer.config

In [12]:
model.encoder.config

BertConfig {
  "_name_or_path": "google/bert_uncased_L-12_H-128_A-2",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [8]:
model.encoder.set_attention_ratio(0.5)
model.eval()

BERT(
  (encoder): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 128, padding_idx=0)
        (position_embeddings): Embedding(512, 128)
        (token_type_embeddings): Embedding(2, 128)
        (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): DynamicLinear(
                  (linear): Linear(in_features=128, out_features=128, bias=True)
                )
                (key): DynamicLinear(
                  (linear): Linear(in_features=128, out_features=128, bias=True)
                )
                (value): DynamicLinear(
                  (linear): Linear(in_features=128, out_features=128, bias=True)
                )
            

In [14]:
test_iter = create_dataset('google/bert_uncased_L-12_H-128_A-2', True)


In [15]:

# total_step = 0
# total_time = 0
# entry_size = 0
# warmup = 100
# total_step = 1000
with torch.no_grad():
    for step,((text,sentiment), _) in enumerate(test_iter):

        text = text.type(torch.LongTensor)           
        text = text.to(device)
        sentiment = sentiment.type(torch.LongTensor)  
        sentiment = sentiment.to(device)
        model(text,sentiment)
#         if step < warmup:
#             model(text, sentiment)
#             entry_size = len(text)
#         elif step < warmup + total_step:
#             start = time.time()

#             out = model(text,sentiment)

#             elapsed = time.time() - start
#             print(elapsed)
#             total_time += elapsed
#         else:
#             break

active_attention_head_size :  32
active_all_head_size :  64
query_layer shape :  tensor([[[[-0.0299, -0.0902, -0.2592,  ...,  0.0430, -0.1856,  0.1938],
          [ 0.2604,  0.4037, -0.1294,  ...,  0.4744, -0.1280,  0.0623],
          [-0.1937,  0.4894, -0.0918,  ..., -0.0568, -0.4527, -0.0615],
          ...,
          [ 0.0393,  0.0085,  0.0103,  ...,  0.3954, -0.0845,  0.2589],
          [ 0.1186,  0.2164, -0.0742,  ...,  0.3781, -0.1545,  0.1301],
          [ 0.2374,  0.3051, -0.0989,  ...,  0.2489, -0.1785,  0.0138]],

         [[-0.1585,  0.1640,  0.3701,  ..., -0.0520, -0.1855,  0.2997],
          [-0.5947, -0.0562,  0.2200,  ..., -0.1106,  0.1878,  0.2650],
          [-0.1355,  0.0229,  0.1192,  ..., -0.1424,  0.1240, -0.0241],
          ...,
          [-0.2013, -0.3482,  0.0724,  ..., -0.3078, -0.1164,  0.3388],
          [-0.1452, -0.2435, -0.0491,  ..., -0.4183, -0.0808,  0.3846],
          [-0.0690, -0.1144, -0.1393,  ..., -0.3868, -0.0632,  0.3792]]],


        [[[-0.0299,

RuntimeError: shape '[16, 128, 64]' is invalid for input of size 262144

In [8]:
print("creating the test_iter")
test_iter = create_dataset('google/bert_uncased_L-12_H-128_A-2', True)
print("loading the pretrained model")
model = BERT('google/bert_uncased_L-12_H-128_A-2').to(device)
model.eval()
with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/test'),
        record_shapes=True,
        with_stack=True
) as prof:

    with torch.no_grad():
        for step,((text,sentiment), _) in enumerate(test_iter):
            if step >= (1 + 1 + 3) * 2:
                break
            text = text.type(torch.LongTensor)           
            text = text.to(device)
            sentiment = sentiment.type(torch.LongTensor)  
            sentiment = sentiment.to(device)
            model(text, sentiment)

            prof.step()  # Need to call this at the end of each step to notify profiler of steps' boundary.

creating the test_iter
loading the pretrained model


Some weights of the model checkpoint at google/bert_uncased_L-12_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

In [9]:
with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/test'),
        record_shapes=True,
        with_stack=True
) as prof:

    with torch.no_grad():
        for step,((text,sentiment), _) in enumerate(test_iter):
            if step >= (1 + 1 + 3) * 2:
                break
            text = text.type(torch.LongTensor)           
            text = text.to(device)
            sentiment = sentiment.type(torch.LongTensor)  
            sentiment = sentiment.to(device)
            model(text, sentiment)

            prof.step()  

In [5]:
varying_encoder_layer_model_list=['google/bert_uncased_L-2_H-768_A-12','google/bert_uncased_L-4_H-768_A-12','google/bert_uncased_L-6_H-768_A-12,''google/bert_uncased_L-8_H-768_A-12','google/bert_uncased_L-10_H-768_A-12','google/bert_uncased_L-12_H-768_A-12']
varying_hidden_dim_model_list = ['google/bert_uncased_L-12_H-128_A-2','google/bert_uncased_L-12_H-256_A-4','google/bert_uncased_L-12_H-512_A-8','google/bert_uncased_L-12_H-768_A-12']

In [6]:
flops_varying_encoder =[]
flops_varying_hd = []

In [None]:
for pretrained in varying_encoder_layer_model_list:
    print("testing model ", pretrained)
    print("creating the test_iter")
    test_iter = create_dataset(pretrained, True)
    print("loading the pretrained model")
    model = BERT(pretrained).to(device)
    print("beginning inference mock")
    with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=4, repeat=5),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/'+pretrained),
        record_shapes=True,
        with_stack=True
    ) as prof:

        with torch.no_grad():
            for step,((text,sentiment), _) in enumerate(test_iter):
                if step >= (1 + 1 + 3) * 5:
                    break
                text = text.type(torch.LongTensor)           
                text = text.to(device)
                sentiment = sentiment.type(torch.LongTensor)  
                sentiment = sentiment.to(device)
                model(text, sentiment)

                prof.step()  
            



In [31]:
import torch
from torch import nn
from pthflops import count_ops

class CustomLayer(nn.Module):
    def __init__(self):
        super(CustomLayer, self).__init__()
        self.conv1 = nn.Conv2d(5, 5, 1, 1, 0)
        # ... other layers present inside will also be ignored

    def forward(self, x):
        return self.conv1(x)

# Create a network and a corresponding input
inp = torch.rand(1,5,7,7)
net = nn.Sequential(
    nn.Conv2d(5, 5, 1, 1, 0),
    nn.ReLU(inplace=True),
    CustomLayer()
)

# Count the number of FLOPs
count_ops(net, inp)

Operation  OPS    
---------  -----  
_0         1470   
_1         490    
_2_conv1   1470   
--------   ----   
Input size: (1, 5, 7, 7)
3,430 FLOPs or approx. 0.00 GFLOPs


(3430, [['_0', 1470], ['_1', 490], ['_2_conv1', 1470]])

In [45]:
from torch.profiler import profile, record_function, ProfilerActivity


In [11]:
print("creating the test_iter")
test_iter = create_dataset('google/bert_uncased_L-12_H-128_A-2', True)
print("loading the pretrained model")
model = BERT('google/bert_uncased_L-12_H-128_A-2').to(device)
model.eval()
with torch.no_grad():
    for (text,sentiment), _ in test_iter:
        text = text.type(torch.LongTensor)           
        text = text.to(device)
        sentiment = sentiment.type(torch.LongTensor)  
        sentiment = sentiment.to(device)

        ## model inference starts
        with profile(activities=[ProfilerActivity.CUDA],with_flops=True, profile_memory=True, record_shapes=True) as prof:
            model(text, sentiment)
        ## model inference ends.
#         


        table_cuda_self_memory = prof.key_averages()
#     .table(sort_by="self_cuda_memory_usage", row_limit=10,max_src_column_width=75)

        table_cuda_memory=prof.key_averages()
#     .table(sort_by="cuda_memory_usage", row_limit=10,max_src_column_width=75)
        break

creating the test_iter
loading the pretrained model


Downloading:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-12_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

In [25]:
for entry in table_cuda_self_memory:
    print(entry.key, entry.cpu_time)

[memory] 0.0
cudaLaunchKernel 41424.098360655735
cudaMalloc 225.44444444444446
cudaFree 0.3333333333333333
cudaDeviceGetAttribute 0.0
cudaMemcpy 34.0
cudaFuncSetAttribute 0.8165680473372781
cudaEventCreateWithFlags 0.25
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags 0.8783783783783784
cudaEventQuery 2.0
cudaEventRecord 0.75
cudaDeviceSynchronize 21357279.0


In [24]:
table_cuda_self_memory[3]

<FunctionEventAvg key=cudaFree self_cpu_time=1.000us cpu_time=0.333us  self_cuda_time=0.000us cuda_time=0.000us input_shapes= cpu_memory_usage=0 cuda_memory_usage=0>

In [57]:
prof.export_chrome_trace("sample.json")

In [19]:
my_table = prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10,max_src_column_width=30)


In [21]:
table_cuda_self_memory

[<FunctionEventAvg key=[memory] self_cpu_time=0.000us cpu_time=0.000us  self_cuda_time=0.000us cuda_time=0.000us input_shapes= cpu_memory_usage=0 cuda_memory_usage=0>,
 <FunctionEventAvg key=cudaLaunchKernel self_cpu_time=15.161s cpu_time=41.424ms  self_cuda_time=0.000us cuda_time=0.000us input_shapes= cpu_memory_usage=0 cuda_memory_usage=0>,
 <FunctionEventAvg key=cudaMalloc self_cpu_time=2.029ms cpu_time=225.444us  self_cuda_time=0.000us cuda_time=0.000us input_shapes= cpu_memory_usage=0 cuda_memory_usage=0>,
 <FunctionEventAvg key=cudaFree self_cpu_time=1.000us cpu_time=0.333us  self_cuda_time=0.000us cuda_time=0.000us input_shapes= cpu_memory_usage=0 cuda_memory_usage=0>,
 <FunctionEventAvg key=cudaDeviceGetAttribute self_cpu_time=0.000us cpu_time=0.000us  self_cuda_time=0.000us cuda_time=0.000us input_shapes= cpu_memory_usage=0 cuda_memory_usage=0>,
 <FunctionEventAvg key=cudaMemcpy self_cpu_time=34.000us cpu_time=34.000us  self_cuda_time=0.000us cuda_time=0.000us input_shapes= cp

In [20]:
my_table

'-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  \n                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  \n-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  \n                                               [memory]         0.00%       0.000us         0.00%       0.000us       0.000us           0 b           0 b           638  \n                                       cudaLaunchKernel        41.51%       15.161s        41.51%       15.161s      41.424ms           0 b           0 b           366  \n                                             cudaMalloc         0.01%       2.029ms         0.01%       2.029ms     225.444us           0 b     

In [52]:
type(my_table)

str

In [None]:
text_file = open("D:/data.txt", "w")
 
#write string to file
text_file.write('Python Tutorial by TutorialKart.')
 
#close file
text_file.close()

In [58]:
with open("table_cuda_self_memory.txt","w")as f:
    f.write(my_table)

In [None]:
model = BERT_for_inference('google/bert_uncased_L-2_H-768_A-12').to(device)
print("beginning inference mock")
flops= inference_mock(model,test_iter)
print("flops : ", flops)