## Data Preparation

In [16]:
import re
money_re = re.compile(r'(?i)^((inr|\(?r(s\.?\)?)?|\$)[ ]{0,3})?((\d{1,2}[,])(\d{2}[,])*)*\d{3}(\.\d{2})?(\/-)?(([ ]{0,3}inr|rs\.?|))?$')
number_re = re.compile(r'^-?[0-9]+([.,][0-9]+)*$')
phone_re = re.compile(r'^(((((\([0-9]{3}\))|([0-9]{3}))-?)[0-9]{3}-?)|([0-9]{3}-))[0-9]{4}$')
email_re = re.compile(r'^.+@.+\.[a-z]+$')
gst_re = re.compile(r'\d{2}[A-Z]{5}\d{4}[A-Z]{1}\d[A-Z0-9]{1}[A-Z\d]{1}')

pattern_date_1 = (
    "(2[01]\d{2}|19\d{2})[ ]?[- ///.][ ]?(0?[1-9]|1[012])"
    "[ ]?[- ///.][ ]?(3[01]|[012]?\d)"
)

# Examples:
# 2017-11-30

######################
pattern_date_2 = (
    "(3[01]|[012]?\d)[ ]?[-][ ]?(0?[1-9]|1[012])"
    "[ ]?[-][ ]?(2[01]\d{2}|19\d{2})"
)

# Examples:
# 29-01-2017
# 05-10-2017

######################
pattern_date_3 = (
    "(3[01]|[012]?\d)[ ]?[//][ ]?(0?[1-9]|1[012])"
    "[ ]?[//][ ]?(2[01]\d{2}|19\d{2})"
)
# 11/01/2018

######################
pattern_date_4 = (
    "(3[01]|[012]?\d)[ ]?[/.][ ]?(0?[1-9]|1[012])"
    "[ ]?[/.][ ]?(2[01]\d{2}|19\d{2})"
)
# 31.08.2017

#####################
pattern_date_5= (
    "(?i)(3[01]|[012]?\d)(th|nd|st)?[ ]?[-][ ]?"
    "(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?"
    "|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)"
    "[ ]?[-][ ]?\d{2}(\d{2})?"
)

# Examples:
# 21-SEP-19
# 07-Nov-2017
# 7-Nov-2017
# 26-Dec-2017

######################
pattern_date_6= (
    "(?i)(3[01]|[012]?\d)(th|nd|st)?[ ]?[ ,][ ]?"
    "(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?"
    "|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)"
    "[ ]?[ ,'][ ]?\d{2}(\d{2})?"
)
# 06 Dec 2017
# 5 Jan 2018
# 24 February 2018
# 28 Feb 2018
# 28 February,2018
# 18th Aug'17
# 09 Nov 2017

########################
pattern_date_7 = (
    "(?i)(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?"
    "|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)"
    "[- ][0123]?\d(th|nd|st)?[- ,'][ ]?\d{2}(\d{2})?"
)
date_re = re.compile('|'.join([pattern_date_1, pattern_date_2, pattern_date_3, \
pattern_date_4, pattern_date_5, pattern_date_6, pattern_date_7])
)
abbrev_dict = {
    "#": "number",
    "acct": "account",
    "amt":  "amount",
    "cnt":  "count",
    "cust": "customer",
    "dept": "department",
    "no.":  "number",
    "no":   "number",
    "num":  "number",
    "ord": "order",
    "pcs": "pieces",
    "qty": "quantity",
    "ref": "reference",
    "seq": "sequence",
    "shp": "ship",
    "tel": "telephone",
    "tkt": "ticket"
}
def get_type(words):
    # default type is text
#     word_type = "text"
    # try parsing as date
    if money_re.match(words):
        words = "money"
    elif number_re.match(words):
        words = "number"
    elif date_re.match(words):
        words = "date"
    return words



In [17]:
import pickle

X_train, X_val, y_train, y_val=pickle.load(open('train.pk','rb'))
X_train,X_val, y_train, y_val=list(X_train), list(X_val), list(y_train), list(y_val)

In [18]:
 # Since the model can't take more than 512 tokens for a particular sentence after word peice tokenizer
 # We can further divide the sentences into 50 words length sentences to fit into memory & can be used by Bert model
 def divide_chunks(l, n): 
    # looping till length l 
    for i in range(0, len(l), n):  
        yield l[i:i + n]
 

 def chunks(x,l,n=84):
    text = x.split(' ')
    labels = list(l)
    length = len(text)
    if length > n:
        ntext = list(divide_chunks(text,n))
        nlabels=list(divide_chunks(labels,n))
        return ntext,nlabels
    else:
        ntext = text
        nlabels=labels
        return ntext,nlabels


In [19]:
XT,YT=[],[]
for i in zip(X_val,y_val):
    xt2,yt2=i
    xt,yt=chunks(xt2,yt2,n=50)
    for j in zip(xt,yt):
        xt1,yt1=j
        
        XT.append([get_type(x) for x in xt1])

        YT.append(yt1)
XVAL=XT
YVAL=YT

In [20]:
XT,YT=[],[]
for i in zip(X_train,y_train):
    xt2,yt2=i
    xt,yt=chunks(xt2,yt2,n=50)
    for j in zip(xt,yt):
        xt1,yt1=j
        
        XT.append([get_type(x) for x in xt1])

        YT.append(yt1)
    


In [22]:
XVAL[9]

['of',
 'Goods',
 'HSN/SAC',
 'Quantity',
 'Rate',
 'per',
 'Disc.',
 'Amount',
 'No.',
 'ZIPPER',
 'ROLL',
 'HSN:',
 'number',
 'number',
 'number',
 'ROLL',
 'money',
 'ROLL',
 'money',
 'CGST',
 'OUTPUT\n',
 'SGST',
 'OUTPUT\n',
 'ROUND',
 'OFF',
 'money',
 'money',
 '(-)0.28',
 'Less',
 'Total\n',
 'number',
 'ROLL',
 'money',
 'Amount',
 'Chargeable',
 '(in',
 'words)',
 'E.',
 'O.E\n',
 'INR',
 'Sixty',
 'Eight',
 'Thousand',
 'Six',
 'Hundred',
 'Seventy',
 'One',
 'Only\n',
 'HSN/SAC\n',
 'Taxable']

# Define pre-trained tokenizer


In [5]:
from pathlib import Path
import torch
from torch import nn
# import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM,BertForTokenClassification

import pytorch_pretrained_bert as _bert

import numpy as np

def flatten(list_of_lists):
    for list in list_of_lists:
        for item in list:
            yield item



_device = torch.device("cuda")
class Bert():

    MASK = '[MASK]'
    CLS = "[CLS]"
    SEP = "[SEP]"

#     supported_langs = set(lines(
#         Path(__file__).parent / "data" / "bert_langs.wiki"))

    def __init__(self, model, model_name, device=None, half_precision=False):
        super().__init__()
        self.model_name = model_name
        self.device = device or _device
        do_lower_case = "uncased" in model_name
        self.tokenizer = _bert.BertTokenizer.from_pretrained(
            self.model_name, do_lower_case=do_lower_case)
        maybe_model_wrapper = model.from_pretrained(model_name).to(
            device=self.device)
        try:
            self.model = maybe_model_wrapper.bert
        except AttributeError:
            self.model = maybe_model_wrapper
        if half_precision:
            self.model.half()
        self.max_len = \
            self.model.embeddings.position_embeddings.weight.size(0)
        self.dim = self.model.embeddings.position_embeddings.weight.size(1)

    def tokenize(self, text, masked_idxs=None):
        tokenized_text = self.tokenizer.tokenize(text)
        if masked_idxs is not None:
            for idx in masked_idxs:
                tokenized_text[idx] = self.MASK
        # prepend [CLS] and append [SEP]
        # see https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py#L195  # NOQA
        tokenized = [self.CLS] + tokenized_text + [self.SEP]
        return tokenized

    def tokenize_to_ids(self, text, masked_idxs=None, pad=True):
        tokens = self.tokenize(text, masked_idxs)
        return self.convert_tokens_to_ids(tokens, pad=pad)

    def convert_tokens_to_ids(self, tokens, pad=True):
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        ids = torch.tensor([token_ids]).to(device=self.device)
        assert ids.size(1) < self.max_len
        if pad:
            padded_ids = torch.zeros(1, self.max_len).to(ids)
            padded_ids[0, :ids.size(1)] = ids
            mask = torch.zeros(1, self.max_len).to(ids)
            mask[0, :ids.size(1)] = 1
            return padded_ids, mask
        else:
            return ids

    def subword_tokenize(self, tokens):
        """Segment each token into subwords while keeping track of
        token boundaries.
        Parameters
        ----------
        tokens: A sequence of strings, representing input tokens.
        Returns
        -------
        A tuple consisting of:
            - A list of subwords, flanked by the special symbols required
                by Bert (CLS and SEP).
            - An array of indices into the list of subwords, indicating
                that the corresponding subword is the start of a new
                token. For example, [1, 3, 4, 7] means that the subwords
                1, 3, 4, 7 are token starts, while all other subwords
                (0, 2, 5, 6, 8...) are in or at the end of tokens.
                This list allows selecting Bert hidden states that
                represent tokens, which is necessary in sequence
                labeling.
        """
        subwords = list(map(self.tokenizer.tokenize, tokens))
        subword_lengths = list(map(len, subwords))
        subwords = [self.CLS] + list(flatten(subwords)) + [self.SEP]
        token_start_idxs = 1 + np.cumsum([0] + subword_lengths[:-1])
        return subwords, token_start_idxs

    def subword_tokenize_to_ids(self, tokens):
        """Segment each token into subwords while keeping track of
        token boundaries and convert subwords into IDs.
        Parameters
        ----------
        tokens: A sequence of strings, representing input tokens.
        Returns
        -------
        A tuple consisting of:
            - A list of subword IDs, including IDs of the special
                symbols (CLS and SEP) required by Bert.
            - A mask indicating padding tokens.
            - An array of indices into the list of subwords. See
                doc of subword_tokenize.
        """
        subwords, token_start_idxs = self.subword_tokenize(tokens)
        subword_ids, mask = self.convert_tokens_to_ids(subwords)
        token_starts = torch.zeros(1, self.max_len).to(subword_ids)
        token_starts[0, token_start_idxs] = 1
        return subword_ids, mask, token_starts

    def segment_ids(self, segment1_len, segment2_len):
        ids = [0] * segment1_len + [1] * segment2_len
        return torch.tensor([ids]).to(device=self.device)

    @staticmethod
    def Model(model_name, **kwargs):
        return Bert(_bert.BertModel, model_name, **kwargs)

    @staticmethod
    def ForMaskedLM(model_name, **kwargs):
        return Bert(_bert.BertForMaskedLM, model_name, **kwargs)

    @staticmethod
    def ForSequenceClassification(model_name, **kwargs):
        return Bert(
            _bert.BertForSequenceClassification, model_name, **kwargs)

    @staticmethod
    def ForNextSentencePrediction(model_name, **kwargs):
        return Bert(_bert.BertForNextSentencePrediction, model_name, **kwargs)

    @staticmethod
    def ForForPreTraining(model_name, **kwargs):
        return Bert(_bert.BertForPreTraining, model_name, **kwargs)

    @staticmethod
    def ForForQuestionAnswering(model_name, **kwargs):
        return Bert(_bert.BertForQuestionAnswering, model_name, **kwargs)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [6]:
bert = Bert(BertModel,'bert-base-cased')

In [7]:
def bert_features(tokens):
    featurized_sentences=[]
    for i in tokens:
        features = {}
        features["bert_ids"], features["bert_mask"], features["bert_token_starts"] = bert.subword_tokenize_to_ids(i)
        featurized_sentences.append(features)
    bert_batch = [torch.cat([features[key] for features in featurized_sentences], dim=0) for key in ("bert_ids", "bert_mask", "bert_token_starts")]
    return bert_batch

In [13]:
bert_feature = bert_features(XT)
bert_feature_val = bert_features(XVAL)

In [14]:
from sklearn.utils.class_weight import compute_class_weight

cc=compute_class_weight('balanced',[0,1,2,3,4],[int(y) for x in YT for y in x])

# Model Definintion & Learning

In [1]:
import pandas as pd
import pickle as pk
# from err import model

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence

import numpy as np

import re
from tqdm import tqdm
import ast
from sklearn.metrics import confusion_matrix, f1_score,classification_report
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM,BertForTokenClassification

pd.set_option('display.max_rows', 100)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [4]:
model = BertModel.from_pretrained("bert-base-cased").to(device=torch.device("cuda"))


In [2]:
class SequenceTagger(torch.nn.Module):
     def __init__(self):
            super(SequenceTagger, self).__init__()
            self.bert = model
            bert_dim = 768 # (or get the dim from BertEmbeddings)
            n_labels = 5  # need to set this for your task
            self.out = torch.nn.Linear(bert_dim, n_labels)
    
     def forward(self, bert_batch):
#             import pdb;pdb.set_trace()
            bert_ids, bert_mask, bert_token_starts = bert_batch
            
            # truncate to longest sequence length in batch (usually much smaller than 512) to save GPU RAM
            max_length = (bert_mask != 0).max(0)[0].nonzero()[-1].item()
            if max_length < bert_ids.shape[1]:
                    bert_ids = bert_ids[:, :max_length]
                    bert_mask = bert_mask[:, :max_length]

            segment_ids = torch.zeros_like(bert_mask)  # dummy segment IDs, since we only have one sentence
            bert_last_layer = self.bert(bert_ids, segment_ids)[0][-1]
            # select the states representing each token start, for each instance in the batch
            bert_token_reprs = [
                   layer[starts.nonzero().squeeze(1)]
                   for layer, starts in zip(bert_last_layer, bert_token_starts)]
            # need to pad because sentence length varies
            padded_bert_token_reprs = pad_sequence(
                   bert_token_reprs, batch_first=True, padding_value=-1)
            # output/classification layer: input bert states and get log probabilities for cross entropy loss
            pred_logits = self.out(padded_bert_token_reprs)

            return pred_logits

In [18]:
from torch.nn.utils.rnn import pad_sequence

encoder = SequenceTagger().cuda()

In [27]:
batch_size = 4


#weight=torch.tensor(list(cc)).cuda()
cross_entropy = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(encoder.parameters(), lr =  1e-5,weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10)

encoder.zero_grad()
epochs = 100
score = 0


train_size = len(XT)
loss_plot=[]
for iij in range(epochs):
    encoder.train()
    epoch_loss = 0
    for param_group in optimizer.param_groups:
        print ("Learning rate: ", param_group['lr'])
    
    for j, batch in enumerate(np.random.randint(0,train_size, size=(int(train_size/batch_size),batch_size))):
        optimizer.zero_grad()
        ytr=[]
        ypr=[]

        for index in batch:
            true_labels=torch.tensor([int(x) for x in list(list(YT)[index])]).long()
            
            bert_batch=[bert_feature[0][index].view([1,512]),bert_feature[1][index].view([1,512]),bert_feature[2][index].view([1,512])]
            pred_logits=encoder.forward(bert_batch)
            y_pred=pred_logits.view([pred_logits.size()[1],5])
            ytr.append(true_labels.cuda())
            ypr.append(y_pred)
        yp=torch.cat(ypr, dim=0)
        yt=torch.cat(ytr, dim=0).long()
        loss = cross_entropy(yp,yt)
        loss.backward()
        optimizer.step()
        epoch_loss +=loss.data
    
    loss_plot.append(epoch_loss/float(train_size/batch_size))
    scheduler.step(epoch_loss/float(train_size/batch_size))
######################################################################################
    encoder.eval()
    all_pred = []
    all_label = []
    for index, test_idx in enumerate(XT):

        true_labels=torch.tensor([int(x) for x in list(list(YT)[index])]).long()    
        bert_batch=[bert_feature[0][index].view([1,512]),bert_feature[1][index].view([1,512]),bert_feature[2][index].view([1,512])]
        pred_logits=encoder.forward(bert_batch)
        y_pred=pred_logits.view([pred_logits.size()[1],5])

        predictions = np.argmax(y_pred.cpu().data.numpy(), axis=1)

        all_pred = all_pred + list(predictions)
        all_label = all_label + list(true_labels)
    f1_train=f1_score(np.array(all_label), np.array(all_pred),average='macro')
#     print ('Epoch: ', iij, ' Loss: ', epoch_loss/float(train_size/batch_size),' ','F1 train: ',f1_train)
    all_pred = []
    all_label = []
    for index, test_idx in enumerate(XVAL):

        true_labels=torch.tensor([int(x) for x in list(list(YVAL)[index])]).long()    
        bert_batch=[bert_feature_val[0][index].view([1,512]),bert_feature_val[1][index].view([1,512]),bert_feature_val[2][index].view([1,512])]
        pred_logits=encoder.forward(bert_batch)
        y_pred=pred_logits.view([pred_logits.size()[1],5])

        predictions = np.argmax(y_pred.cpu().data.numpy(), axis=1)

        all_pred = all_pred + list(predictions)
        all_label = all_label + list(true_labels)
    f1_test=f1_score(np.array(all_label), np.array(all_pred),average='macro')


    print(classification_report(np.array(all_label), np.array(all_pred)))

    
    
    
#     scheduler.step(epoch_loss/float(train_size/batch_size))
    if score <= f1_test:
        print(classification_report(np.array(all_label), np.array(all_pred)))
        early_stop_count = 0
#         print ("\n Score increased from {} to {}".format(score, f1_test))
        score = f1_test

        if score > 0.95:
            print(classification_report(np.array(all_label), np.array(all_pred)))
            torch.save({
                'epoch': iij + 1,
                'state_dict': encoder.state_dict(),
                'optimizer': optimizer.state_dict(),
                'model': encoder,
                'score': score
            }, '/home/raghu/invoice/models/' + 'invoice_bert_withoutW_' + str(iij + 1)+ "_" + str(int(score*100)) + '.pt')
    else:
        early_stop_count = early_stop_count + 1
#         print ("Score decreased. Count: ", early_stop_count)    

#     if early_stop_count == 20:
#         break
    print ('Epoch: ', iij, ' Loss: ', epoch_loss/float(train_size/batch_size),' ','F1 Test Score',f1_test,' ','F1 Train Score',f1_train)
#     print ("---------------------------------------------------------------------------------")
    

Learning rate:  1e-05
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5868
           1       0.92      0.94      0.93        36
           2       0.95      0.92      0.93        38
           3       0.98      0.90      0.94       131
           4       1.00      0.97      0.99        37

   micro avg       1.00      1.00      1.00      6110
   macro avg       0.97      0.95      0.96      6110
weighted avg       1.00      1.00      1.00      6110

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5868
           1       0.92      0.94      0.93        36
           2       0.95      0.92      0.93        38
           3       0.98      0.90      0.94       131
           4       1.00      0.97      0.99        37

   micro avg       1.00      1.00      1.00      6110
   macro avg       0.97      0.95      0.96      6110
weighted avg       1.00      1.00      1.00      6110



KeyboardInterrupt: 

In [15]:
def final_predictions_from_block(text_blocks):
    all_pred = []
    for index, test_idx in enumerate(text_blocks):   
        bert_batch=[bert_feature_val[0][index].view([1,512]).cpu(),bert_feature_val[1][index].view([1,512]).cpu(),bert_feature_val[2][index].view([1,512]).cpu()]
        pred_logits=encoder.forward(bert_batch)
        y_pred=pred_logits.view([pred_logits.size()[1],5])

        predictions = np.argmax(y_pred.cpu().data.numpy(), axis=1)
        all_pred = all_pred + list(predictions)
    return all_pred

In [8]:
x = torch.load('/home/raghu/invoice/models/invoice_bert_withoutW_7_95.pt')

In [9]:
encoder = x['model']

In [7]:
encoder.eval()
all_pred = []
all_label = []
for index, test_idx in enumerate(XVAL):

    true_labels=torch.tensor([int(x) for x in list(list(YVAL)[index])]).long()    
    bert_batch=[bert_feature_val[0][index].view([1,512]),bert_feature_val[1][index].view([1,512]),bert_feature_val[2][index].view([1,512])]
    pred_logits=encoder.forward(bert_batch)
    y_pred=pred_logits.view([pred_logits.size()[1],5])

    predictions = np.argmax(y_pred.cpu().data.numpy(), axis=1)

    all_pred = all_pred + list(predictions)
    all_label = all_label + list(true_labels)
f1_test=f1_score(np.array(all_label), np.array(all_pred),average='macro')


print(classification_report(np.array(all_label), np.array(all_pred)))


NameError: name 'XVAL' is not defined

In [38]:
f1_test

0.9576969086397178

In [20]:
torch.save({
    'epoch': iij + 1,
    'state_dict': encoder.state_dict(),
    'optimizer': optimizer.state_dict(),
    'model': encoder,
    'score': score
}, '/home/raghu/invoice/models/' + 'invoice_all_withW_bert_' + str(iij + 1)+ "_" + str(int(f1_test*100)) + '.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [12]:
torch.save(encoder,'/home/raghu/invoice_bert_95.pt')