In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv
import os
import logging
import argparse
import random
from tqdm import tqdm, trange

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [None]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

In [35]:
from pathlib import Path
import torch

import pytorch_pretrained_bert as _bert

import numpy as np

# from dougu import flatten, lines
def flatten(list_of_lists):
    for list in list_of_lists:
        for item in list:
            yield item

_device = torch.device("cuda")


class Bert():

    MASK = "[MASK]"
    CLS = "[CLS]"
    SEP = "[SEP]"

#     supported_langs = set(lines(
#         Path(__file__).parent / "data" / "bert_langs.wiki"))

    def __init__(self, model, model_name, device=None, half_precision=False):
        super().__init__()
        self.model_name = model_name
        self.device = device or _device
        do_lower_case = "uncased" in model_name
        self.tokenizer = _bert.BertTokenizer.from_pretrained(
            self.model_name, do_lower_case=do_lower_case)
        maybe_model_wrapper = model.from_pretrained(model_name).to(
            device=self.device)
        try:
            self.model = maybe_model_wrapper.bert
        except AttributeError:
            self.model = maybe_model_wrapper
        if half_precision:
            self.model.half()
        self.max_len = \
            self.model.embeddings.position_embeddings.weight.size(0)
#         self.max_len = 1000
        self.dim = self.model.embeddings.position_embeddings.weight.size(1)

    def tokenize(self, text, masked_idxs=None):
        tokenized_text = self.tokenizer.tokenize(text)
        if masked_idxs is not None:
            for idx in masked_idxs:
                tokenized_text[idx] = self.MASK
        # prepend [CLS] and append [SEP]
        # see https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py#L195  # NOQA
        tokenized = [self.CLS] + tokenized_text + [self.SEP]
        return tokenized

    def tokenize_to_ids(self, text, masked_idxs=None, pad=True):
        tokens = self.tokenize(text, masked_idxs)
        return self.convert_tokens_to_ids(tokens, pad=pad)

    def convert_tokens_to_ids(self, tokens, pad=True):
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        ids = torch.tensor([token_ids]).to(device=self.device)
        assert ids.size(1) < self.max_len
        if pad:
            padded_ids = torch.zeros(1, self.max_len).to(ids)
            padded_ids[0, :ids.size(1)] = ids
            mask = torch.zeros(1, self.max_len).to(ids)
            mask[0, :ids.size(1)] = 1
            return padded_ids, mask
        else:
            return ids

    def subword_tokenize(self, tokens):
        """Segment each token into subwords while keeping track of
        token boundaries.
        Parameters
        ----------
        tokens: A sequence of strings, representing input tokens.
        Returns
        -------
        A tuple consisting of:
            - A list of subwords, flanked by the special symbols required
                by Bert (CLS and SEP).
            - An array of indices into the list of subwords, indicating
                that the corresponding subword is the start of a new
                token. For example, [1, 3, 4, 7] means that the subwords
                1, 3, 4, 7 are token starts, while all other subwords
                (0, 2, 5, 6, 8...) are in or at the end of tokens.
                This list allows selecting Bert hidden states that
                represent tokens, which is necessary in sequence
                labeling.
        """
        subwords = list(map(self.tokenizer.tokenize, tokens))
        subword_lengths = list(map(len, subwords))
        subwords = [self.CLS] + list(flatten(subwords)) + [self.SEP]
        token_start_idxs = 1 + np.cumsum([0] + subword_lengths[:-1])
        return subwords, token_start_idxs

    def subword_tokenize_to_ids(self, tokens):
        """Segment each token into subwords while keeping track of
        token boundaries and convert subwords into IDs.
        Parameters
        ----------
        tokens: A sequence of strings, representing input tokens.
        Returns
        -------
        A tuple consisting of:
            - A list of subword IDs, including IDs of the special
                symbols (CLS and SEP) required by Bert.
            - A mask indicating padding tokens.
            - An array of indices into the list of subwords. See
                doc of subword_tokenize.
        """
#         import pdb;pdb.set_trace()
        subwords, token_start_idxs = self.subword_tokenize(tokens)
        subword_ids, mask = self.convert_tokens_to_ids(subwords)
        token_starts = torch.zeros(1, self.max_len).to(subword_ids)
        token_starts[0, token_start_idxs] = 1
        return subword_ids, mask, token_starts

    def segment_ids(self, segment1_len, segment2_len):
        ids = [0] * segment1_len + [1] * segment2_len
        return torch.tensor([ids]).to(device=self.device)

    @staticmethod
    def Model(model_name, **kwargs):
        return Bert(_bert.BertModel, model_name, **kwargs)

    @staticmethod
    def ForMaskedLM(model_name, **kwargs):
        return Bert(_bert.BertForMaskedLM, model_name, **kwargs)

    @staticmethod
    def ForSequenceClassification(model_name, **kwargs):
        return Bert(
            _bert.BertForSequenceClassification, model_name, **kwargs)

    @staticmethod
    def ForNextSentencePrediction(model_name, **kwargs):
        return Bert(_bert.BertForNextSentencePrediction, model_name, **kwargs)

    @staticmethod
    def ForPreTraining(model_name, **kwargs):
        return Bert(_bert.BertForPreTraining, model_name, **kwargs)

    @staticmethod
    def ForQuestionAnswering(model_name, **kwargs):
        return Bert(_bert.BertForQuestionAnswering, model_name, **kwargs)

In [36]:
text="TAX INVOICE GSTIN 03AUVPS8413J121 ORIGINAL FOR BUYER Ph:\n 098144-60916 R.S.Printers PLOT NO 5-A UDHYOG VIHAR ,OPP FCI GODN BHATTIAN ROAD VILL BHADUR LUDHIANA\n rsprinter.Idh@gmail.com INVOICE No. B20 Date 01.05.2018 Name Details of Buyer (Billed To)\n Name 'EXCEL TRADING CO. ANANDPURI COLONY NOORWALA ROAD\n LUDHIANA LUDHIANA\n GSTIN O3AUWPS7596M1ZC\n State Punjab State Code 03 LUDHIANA\n GSTIN O3AUWPS7596M1ZC\n State Punjab State Code 03 TRANSPORT GR.No. WEHICLE No. PVT MARK. DATE ..\n NO OF CASES.\n Taxable LIGST CGST\n Value Amt. Amt. Item Name Qty Rate HSN/\n SAC Amount SGST\n Amt. 0% 3.00 6.0 900.00 6.0 TRANSFER STICKER H.D\n TRANSFER STICKER SIZE LABEL TRANSFER STICKER SIZE LABEL 49089000\n 49089000\n 49089000 5000\n 2000\n 3530 1.50 15000.00\n 3000.00\n 3530.00 15000.00\n 3000.00\n 3530.00 6.0 180.00 6.0 900.00\n 180.00\n 212.00 1.00 6.0 212.00 6.0 Totals 21530.00 21530.00 1292.00 1292.00\n 21530.00 TAXABLE VALUE IGST :\n CGST SGST :\n Round off 1292.00\n 1292.00 Bank Details Total Value (in Figures) 24114.00\n"

In [37]:
featurized_sentences = []
# for tokens in sentences:
bert=Bert.Model("bert-base-cased")
tokens=text.split(' ')
features = {}
features["bert_ids"], features["bert_mask"], features["bert_token_starts"] = bert.subword_tokenize_to_ids(tokens)
featurized_sentences.append(features)

In [38]:
true=torch.Tensor([int(x) for x in list('0000000003000000000000000020400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000300000000000000000000000000000000000000')])

In [41]:
def collate_fn(featurized_sentences_batch):
    bert_batch = [torch.cat([features[key] for features in featurized_sentences], dim=0) for key in ("bert_ids", "bert_mask", "bert_token_starts")]
    return bert_batch

In [60]:
class SequenceTagger(torch.nn.Module):
    
        def __init__(self,bert):
                self.bert = bert

                bert_dim = 786 # (or get the dim from BertEmbeddings)
                n_labels = 5  # need to set this for your task
                self.out = torch.nn.Linear(bert_dim, n_labels)
    
        def forward(self, bert_batch, true_labels):
                bert_ids, bert_mask, bert_token_starts = bert_batch
                # truncate to longest sequence length in batch (usually much smaller than 512) to save GPU RAM
                max_length = (bert_mask != 0).max(0)[0].nonzero()[-1].item()
                if max_length < bert_ids.shape[1]:
                        bert_ids = bert_ids[:, :max_length]
                        bert_mask = bert_mask[:, :max_length]

                segment_ids = torch.zeros_like(bert_mask)  # dummy segment IDs, since we only have one sentence
                bert_last_layer = self.bert(bert_ids, segment_ids)[0][-1]
                # select the states representing each token start, for each instance in the batch
                bert_token_reprs = [
                         layer[starts.nonzero().squeeze(1)]
                         for layer, starts in zip(bert_last_layer, bert_token_starts)]
                # need to pad because sentence length varies
                padded_bert_token_reprs = pad_sequence(
                         bert_token_reprs, batch_first=True, padding_value=-1)
                # output/classification layer: input bert states and get log probabilities for cross entropy loss
                pred_logits = self.log_softmax(self.out(self.dropout(padded_bert_token_reprs)))
                mask = true_labels != -1  # I did set label = -1 for all padding tokens somewhere else
                loss = cross_entropy(pred_logits, true_labels)
                # average/reduce the loss according to the actual number of of predictions (i.e. one prediction per token).
                loss /= mask.float().sum()
                return loss


In [59]:
from pytorch_pretrained_bert import BertTokenizer, BertModel
bert = BertModel.from_pretrained('bert-base-cased').to(device=torch.device("cuda"))

In [64]:
bert_batch = collate_fn(featurized_sentences)

In [67]:
bert_ids, bert_mask, bert_token_starts = bert_batch

In [68]:
max_length = (bert_mask != 0).max(0)[0].nonzero()[-1].item()
if max_length < bert_ids.shape[1]:
        bert_ids = bert_ids[:, :max_length]
        bert_mask = bert_mask[:, :max_length]

In [69]:
segment_ids = torch.zeros_like(bert_mask)  # dummy segment IDs, since we on

In [71]:
bert_last_layer = bert(bert_ids, segment_ids)[0][-1]

In [74]:
bert_token_reprs = [
         layer[starts.nonzero().squeeze(1)]
         for layer, starts in zip(bert_last_layer, bert_token_starts)]
# need to pad because sentence length varies
padded_bert_token_reprs = pad_sequence(
         bert_token_reprs, batch_first=True, padding_value=-1)

NameError: name 'pad_sequence' is not defined

In [92]:
len(true)

217