In [1]:
#!bash download_model.sh

In [2]:
#!pip install onnxruntime-gpu

In [3]:
from transformers import BertModel
from torch import nn
import numpy as np
import torch

## Define Model

In [4]:
class BERT_Arch(nn.Module):
    """BERT encoder followed by a small classification head that returns class ids.

    Head layout: Linear(768, 512) -> ReLU -> Dropout(0.1) -> Linear(512, 2)
    -> LogSoftmax(dim=1). ``forward`` returns the index of the largest
    log-probability per row, cast to int32.
    """

    def __init__(self):
        super().__init__()
        # Encoder weights come from the 'bert-base-uncased' pretrained checkpoint.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Run the encoder and the head, returning predicted class indices.

        Args:
            sent_id: token ids passed to the encoder as its first argument.
            mask: passed to the encoder as ``attention_mask``.

        Returns:
            int32 tensor holding, for each row, the index of the maximum
            log-probability along dim 1.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element feeds the classification head.
        _unused, encoder_out = self.bert(sent_id, attention_mask=mask, return_dict=False)

        # Hidden layer: linear -> ReLU -> dropout.
        hidden = self.dropout(self.relu(self.fc1(encoder_out)))

        # Output layer followed by log-softmax over the class dimension.
        log_probs = self.softmax(self.fc2(hidden))

        # torch.max along dim 1 yields (values, indices); only the indices are returned.
        _max_vals, preds = torch.max(log_probs, dim=1)
        return preds.int()

## Load Model

In [5]:
#!bash download_model.sh

In [6]:
# Build the architecture, restore the fine-tuned weights, and prepare it for
# half-precision GPU inference.
model = BERT_Arch()
m_p = 'model.pt'
# Load the checkpoint onto CPU first so it does not depend on the device it was
# saved from; the whole model is moved to the GPU below.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
state_dict = torch.load(m_p, map_location='cpu')
model.load_state_dict(state_dict)
model = model.eval()  # inference mode: disables dropout
model = model.half().cuda()  # fp16 weights on the GPU

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Convert to ONNX (works on PyTorch 1.9.1+)
Fixed by PR: https://github.com/pytorch/pytorch/pull/53053

Following: https://github.com/huggingface/notebooks/blob/master/examples/onnx-export.ipynb


In [7]:
# import sys
# !{sys.executable} -m pip install -i https://test.pypi.org/simple/ ort-nightly
# !{sys.executable} -m pip install --upgrade onnxruntime-tools

In [8]:
# Trace the model on dummy inputs and export it to ONNX with dynamic batch and
# sequence-length axes.
with torch.no_grad():
    batch_size = 2
    sequence_length = 64
    # Dummy token ids; they only need to be valid indices for the trace.
    input_ids = np.random.randint(low=0, high=1024, size=(batch_size, sequence_length))
    # `high` is exclusive in np.random.randint: high=1 would yield an all-zero
    # mask (every token masked out), so use high=2 to draw 0/1 values.
    attention_mask = np.random.randint(low=0, high=2, size=(batch_size, sequence_length))
    # from_numpy keeps the integer dtype (no float32 round trip); cast to int32 on the GPU.
    input_ids = torch.from_numpy(input_ids).int().cuda()
    attention_mask = torch.from_numpy(attention_mask).int().cuda()

    # Reference PyTorch output for the same dummy inputs.
    torch_out = model(input_ids, attention_mask)

    torch.onnx.export(model,               # model being run
                      (input_ids, attention_mask), # model input (or a tuple for multiple inputs)
                      "sentiment_bert.onnx",   # where to save the model (can be a file or file-like object)
                      export_params=True,        # store the trained parameter weights inside the model file
                      opset_version=11,          # the ONNX opset version to export the model to
                      do_constant_folding=True,  # whether to execute constant folding for optimization
                      input_names = ['input_ids', 'attention_mask'],   # the model's input names
                      output_names = ['preds'], # the model's output names
                      dynamic_axes={'input_ids' : {0 : 'batch_size', 1: 'sequence_length'},  # variable length axes
                                    'attention_mask': {0 : 'batch_size', 1: 'sequence_length'},   # variable length axes
                                    'preds' : {0 : 'batch_size'}})  # variable length axes


In [9]:
# Show the dtype of the PyTorch predictions (forward() ends with `.int()`).
torch_out.dtype

torch.int32

## Optimize BERT ONNX
(Currently this makes the model slower :-( )

See Docs at: https://pypi.org/project/onnxruntime-tools/

In [10]:
from onnxruntime.transformers import optimizer

# Input: the ONNX file written by the export cell; output: the optimized graph.
exported_onnx_path = 'sentiment_bert.onnx'
optimized_onnx_path = 'sentiment_bert.opt.onnx'

# Run onnxruntime's transformer optimizer in 'bert' mode.
# num_heads / hidden_size presumably match bert-base-uncased - confirm if the
# encoder checkpoint changes.
opt_model = optimizer.optimize_model(
    exported_onnx_path, 'bert', num_heads=12, hidden_size=768, use_gpu=True
)

# Persist the optimized model next to the original export.
opt_model.save_model_to_file(optimized_onnx_path)

epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
epsilon value is not expeced: None
