In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# data and model paths used in the cells below (all under
# /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm
import pickle

# Load the annotated training examples from Google Drive.
path = "/content/drive/MyDrive/Data extraction from financial documents/data/"
filename = "annotations_50_updated.pkl"

# Join with pathlib instead of string concatenation so the result does not
# silently depend on `path` ending with a trailing slash.
train_data_file = Path(path) / filename
if not train_data_file.is_file():
    raise FileNotFoundError(
        "Training annotations not found at %s (is Google Drive mounted?)"
        % train_data_file
    )

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load annotation files that you created yourself or otherwise trust.
with train_data_file.open('rb') as f:
    TRAIN_DATA = pickle.load(f)

In [None]:
# Inspect the loaded annotations. As the output below shows, each record is a
# (text, {'entities': [(start, end, label)]}) tuple, where the character span
# [start, end) covers the numeric amount and the label names the account.
TRAIN_DATA

[('Trade Receivable 858.0', {'entities': [(17, 22, 'Account Receivable')]}),
 ('Account Receivable 713.0', {'entities': [(19, 24, 'Account Receivable')]}),
 ('Investments 666.0', {'entities': [(12, 17, 'Investments')]}),
 ('Equity share capital 377.0',
  {'entities': [(21, 26, 'Equity share capital')]}),
 ('Borrowings 278.0', {'entities': [(11, 16, 'Borrowings')]}),
 ('trade receivable 321.0', {'entities': [(17, 22, 'Account Receivable')]}),
 ('account receivable 249.0', {'entities': [(19, 24, 'Account Receivable')]}),
 ('investments 786.0', {'entities': [(12, 17, 'Investments')]}),
 ('equity share capital 326.0',
  {'entities': [(21, 26, 'Equity share capital')]}),
 ('borrowings 212.0', {'entities': [(11, 16, 'Borrowings')]}),
 ('trade receiable 116.0', {'entities': [(16, 21, 'Account Receivable')]}),
 ('account receiable 243.0', {'entities': [(18, 23, 'Account Receivable')]}),
 ('invstments 470.0', {'entities': [(11, 16, 'Investments')]}),
 ('equity share capial 415.0',
  {'entities'

### Build a spaCy NER model

In [None]:
# Define variables required for the model
# model: name or path handed to spacy.load() in the next cell; None means a
# blank English pipeline is created instead.
model = None
# output_dir: directory the trained pipeline is written to (nlp.to_disk) and
# later reloaded from (spacy.load).
output_dir=Path("/content/drive/MyDrive/Data extraction from financial documents/models/model")
# n_iter: number of passes over TRAIN_DATA made by the training loop.
n_iter=100

In [None]:
# Obtain the pipeline: start from a blank English pipeline unless `model`
# names an existing one to load.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

# Make sure the pipeline has an NER component and keep a handle to it in `ner`.
if 'ner' in nlp.pipe_names:
    # Reuse the component that is already part of the pipeline.
    ner = nlp.get_pipe('ner')
else:
    # Create a fresh component and append it as the last pipeline step.
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

Created blank 'en' model


In [None]:
# Train the recognizer with every pipeline component except NER disabled.

# Register every entity label that occurs in the annotations with the NER
# component. Records without an 'entities' key contribute no labels instead
# of raising a TypeError on None.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities', []):
        ner.add_label(ent[2])  # ent is (start, end, label)

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    # begin_training() re-initialises the weights, which is only correct for
    # a blank pipeline. When an existing model was loaded (`model` is set),
    # resume_training() returns an optimizer while keeping its weights.
    optimizer = nlp.begin_training() if model is None else nlp.resume_training()

    # Shuffle a copy so the order of the global TRAIN_DATA is left untouched
    # and re-running other cells sees the data as it was loaded.
    train_examples = list(TRAIN_DATA)
    for itn in range(n_iter):
        random.shuffle(train_examples)
        losses = {}  # accumulated by nlp.update over this pass
        for text, annotations in tqdm(train_examples):
            # One example per update step.
            nlp.update(
                [text],
                [annotations],
                drop=0.5,
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 50/50 [00:01<00:00, 40.36it/s]


{'ner': 89.10421368278912}


100%|██████████| 50/50 [00:01<00:00, 40.83it/s]


{'ner': 42.70418373383231}


100%|██████████| 50/50 [00:01<00:00, 42.08it/s]


{'ner': 23.583780533152293}


100%|██████████| 50/50 [00:01<00:00, 41.15it/s]


{'ner': 12.787695145412123}


100%|██████████| 50/50 [00:01<00:00, 42.94it/s]


{'ner': 5.220235081368129}


100%|██████████| 50/50 [00:01<00:00, 43.01it/s]


{'ner': 2.475828200571298}


100%|██████████| 50/50 [00:01<00:00, 41.19it/s]


{'ner': 0.07110868781723223}


100%|██████████| 50/50 [00:01<00:00, 42.84it/s]


{'ner': 0.08858482616253233}


100%|██████████| 50/50 [00:01<00:00, 41.91it/s]


{'ner': 0.04083345770084793}


100%|██████████| 50/50 [00:01<00:00, 41.92it/s]


{'ner': 0.13819045472867972}


100%|██████████| 50/50 [00:01<00:00, 40.70it/s]


{'ner': 0.00014175815703918894}


100%|██████████| 50/50 [00:01<00:00, 35.57it/s]


{'ner': 7.171073451892381e-05}


100%|██████████| 50/50 [00:01<00:00, 31.06it/s]


{'ner': 2.460232106783249e-07}


100%|██████████| 50/50 [00:01<00:00, 27.68it/s]


{'ner': 0.004889017922736962}


100%|██████████| 50/50 [00:01<00:00, 27.56it/s]


{'ner': 0.0024016758694512}


100%|██████████| 50/50 [00:01<00:00, 27.13it/s]


{'ner': 1.1246613960352143}


100%|██████████| 50/50 [00:01<00:00, 27.55it/s]


{'ner': 0.0033596042174094177}


100%|██████████| 50/50 [00:01<00:00, 29.02it/s]


{'ner': 2.6425372814764986e-05}


100%|██████████| 50/50 [00:01<00:00, 33.13it/s]


{'ner': 7.606751139369673e-05}


100%|██████████| 50/50 [00:01<00:00, 40.20it/s]


{'ner': 0.00011700118304051276}


100%|██████████| 50/50 [00:01<00:00, 42.98it/s]


{'ner': 0.012480775100023088}


100%|██████████| 50/50 [00:01<00:00, 43.63it/s]


{'ner': 0.002242695759137215}


100%|██████████| 50/50 [00:01<00:00, 43.07it/s]


{'ner': 0.07430706971011132}


100%|██████████| 50/50 [00:01<00:00, 42.47it/s]


{'ner': 7.89205684233965e-05}


100%|██████████| 50/50 [00:01<00:00, 41.67it/s]


{'ner': 4.087016785523387e-05}


100%|██████████| 50/50 [00:01<00:00, 43.37it/s]


{'ner': 0.00022327006212371115}


100%|██████████| 50/50 [00:01<00:00, 42.95it/s]


{'ner': 1.9691368447299793}


100%|██████████| 50/50 [00:01<00:00, 38.13it/s]


{'ner': 3.116506835493114e-06}


100%|██████████| 50/50 [00:01<00:00, 29.70it/s]


{'ner': 8.796188356842536e-08}


100%|██████████| 50/50 [00:02<00:00, 22.15it/s]


{'ner': 1.2360826220697473e-07}


100%|██████████| 50/50 [00:02<00:00, 19.06it/s]


{'ner': 6.850363327776095e-07}


100%|██████████| 50/50 [00:02<00:00, 18.91it/s]


{'ner': 8.160082583942177e-08}


100%|██████████| 50/50 [00:02<00:00, 18.55it/s]


{'ner': 4.270769827473835e-05}


100%|██████████| 50/50 [00:02<00:00, 17.83it/s]


{'ner': 3.0347229513127832e-05}


100%|██████████| 50/50 [00:02<00:00, 17.29it/s]


{'ner': 3.2121521722899967e-07}


100%|██████████| 50/50 [00:02<00:00, 17.26it/s]


{'ner': 2.5232643285117187e-08}


100%|██████████| 50/50 [00:02<00:00, 17.35it/s]


{'ner': 2.6212035830197534e-07}


100%|██████████| 50/50 [00:02<00:00, 17.33it/s]


{'ner': 9.860260023996792e-06}


100%|██████████| 50/50 [00:02<00:00, 17.08it/s]


{'ner': 2.3512681120327863e-07}


100%|██████████| 50/50 [00:02<00:00, 17.20it/s]


{'ner': 7.982395210780874e-08}


100%|██████████| 50/50 [00:02<00:00, 17.08it/s]


{'ner': 1.3539496449972258e-07}


100%|██████████| 50/50 [00:02<00:00, 17.11it/s]


{'ner': 0.0021361069788754803}


100%|██████████| 50/50 [00:02<00:00, 16.98it/s]


{'ner': 0.0037209182044772113}


100%|██████████| 50/50 [00:02<00:00, 17.01it/s]


{'ner': 2.2841233364212235e-07}


100%|██████████| 50/50 [00:02<00:00, 17.37it/s]


{'ner': 0.002725441623389266}


100%|██████████| 50/50 [00:02<00:00, 17.05it/s]


{'ner': 7.565789296664306e-05}


100%|██████████| 50/50 [00:02<00:00, 17.04it/s]


{'ner': 7.974864292835674e-06}


100%|██████████| 50/50 [00:02<00:00, 16.97it/s]


{'ner': 0.005056563382310468}


100%|██████████| 50/50 [00:02<00:00, 16.95it/s]


{'ner': 7.712699409981696e-06}


100%|██████████| 50/50 [00:02<00:00, 17.05it/s]


{'ner': 3.13465784609301e-05}


100%|██████████| 50/50 [00:02<00:00, 17.08it/s]


{'ner': 7.514527465878135e-06}


100%|██████████| 50/50 [00:02<00:00, 17.09it/s]


{'ner': 6.976512748806686e-08}


100%|██████████| 50/50 [00:02<00:00, 17.05it/s]


{'ner': 1.2008024278717687e-06}


100%|██████████| 50/50 [00:02<00:00, 17.21it/s]


{'ner': 1.0564082985773681e-05}


100%|██████████| 50/50 [00:02<00:00, 17.03it/s]


{'ner': 2.8770833766597676e-08}


100%|██████████| 50/50 [00:02<00:00, 17.23it/s]


{'ner': 8.543837431442036e-09}


100%|██████████| 50/50 [00:02<00:00, 17.13it/s]


{'ner': 9.104235041966481e-07}


100%|██████████| 50/50 [00:02<00:00, 17.06it/s]


{'ner': 4.5197091570656045e-07}


100%|██████████| 50/50 [00:02<00:00, 17.12it/s]


{'ner': 2.396182946133014e-06}


100%|██████████| 50/50 [00:02<00:00, 17.19it/s]


{'ner': 2.8141345897237273e-07}


100%|██████████| 50/50 [00:02<00:00, 17.00it/s]


{'ner': 3.800249607555334e-07}


100%|██████████| 50/50 [00:02<00:00, 17.19it/s]


{'ner': 2.668559281755317e-08}


100%|██████████| 50/50 [00:02<00:00, 17.12it/s]


{'ner': 8.521883286545987e-06}


100%|██████████| 50/50 [00:02<00:00, 17.35it/s]


{'ner': 1.4771755601360042e-08}


100%|██████████| 50/50 [00:02<00:00, 17.18it/s]


{'ner': 6.415749197003479e-07}


100%|██████████| 50/50 [00:02<00:00, 17.05it/s]


{'ner': 7.030467882838133e-10}


100%|██████████| 50/50 [00:02<00:00, 17.21it/s]


{'ner': 3.2101015361657463e-10}


100%|██████████| 50/50 [00:02<00:00, 17.21it/s]


{'ner': 1.1128564377472813e-08}


100%|██████████| 50/50 [00:02<00:00, 17.19it/s]


{'ner': 1.678659408736956e-05}


100%|██████████| 50/50 [00:02<00:00, 17.18it/s]


{'ner': 1.137287847949104e-07}


100%|██████████| 50/50 [00:02<00:00, 17.02it/s]


{'ner': 4.24581874609763e-08}


100%|██████████| 50/50 [00:02<00:00, 17.26it/s]


{'ner': 4.779270882327051e-12}


100%|██████████| 50/50 [00:02<00:00, 17.16it/s]


{'ner': 2.300556186708032e-08}


100%|██████████| 50/50 [00:02<00:00, 17.32it/s]


{'ner': 4.527245506868951e-08}


100%|██████████| 50/50 [00:02<00:00, 17.11it/s]


{'ner': 2.080728568163061e-09}


100%|██████████| 50/50 [00:02<00:00, 16.98it/s]


{'ner': 3.594514833746586e-09}


100%|██████████| 50/50 [00:02<00:00, 17.11it/s]


{'ner': 3.841943616354099e-06}


100%|██████████| 50/50 [00:02<00:00, 16.96it/s]


{'ner': 2.472116082512276e-05}


100%|██████████| 50/50 [00:02<00:00, 17.16it/s]


{'ner': 6.5724633502729205e-09}


100%|██████████| 50/50 [00:02<00:00, 17.06it/s]


{'ner': 5.613870966792791e-08}


100%|██████████| 50/50 [00:02<00:00, 17.15it/s]


{'ner': 4.979878680177599e-10}


100%|██████████| 50/50 [00:02<00:00, 17.09it/s]


{'ner': 6.525206695970147e-07}


100%|██████████| 50/50 [00:02<00:00, 17.07it/s]


{'ner': 1.6879263492883573e-10}


100%|██████████| 50/50 [00:02<00:00, 17.19it/s]


{'ner': 2.061518279531495e-08}


100%|██████████| 50/50 [00:02<00:00, 16.95it/s]


{'ner': 4.807006978545002e-08}


100%|██████████| 50/50 [00:02<00:00, 17.09it/s]


{'ner': 0.00010684777539838382}


100%|██████████| 50/50 [00:02<00:00, 16.99it/s]


{'ner': 6.06358249401973e-11}


100%|██████████| 50/50 [00:02<00:00, 16.99it/s]


{'ner': 9.965709457001077e-10}


100%|██████████| 50/50 [00:02<00:00, 16.92it/s]


{'ner': 5.751823144790902e-09}


100%|██████████| 50/50 [00:02<00:00, 17.20it/s]


{'ner': 1.6099911419906622e-06}


100%|██████████| 50/50 [00:02<00:00, 17.04it/s]


{'ner': 1.2273324536113134e-07}


100%|██████████| 50/50 [00:02<00:00, 16.91it/s]


{'ner': 2.3702760743201402e-08}


100%|██████████| 50/50 [00:02<00:00, 16.95it/s]


{'ner': 0.0036541316702214054}


100%|██████████| 50/50 [00:02<00:00, 17.05it/s]


{'ner': 5.75670420646724e-06}


100%|██████████| 50/50 [00:02<00:00, 17.25it/s]


{'ner': 0.00016595520769340964}


100%|██████████| 50/50 [00:02<00:00, 17.14it/s]


{'ner': 1.9101671089365344}


100%|██████████| 50/50 [00:02<00:00, 16.97it/s]


{'ner': 0.024568175424885253}


100%|██████████| 50/50 [00:02<00:00, 16.92it/s]


{'ner': 5.3417767756996}


100%|██████████| 50/50 [00:02<00:00, 16.96it/s]


{'ner': 2.4070291068763363}


100%|██████████| 50/50 [00:02<00:00, 16.92it/s]

{'ner': 3.381840373008801}





In [None]:
# Save the trained pipeline to `output_dir` (skipped when output_dir is None).
if output_dir is not None:
    output_dir = Path(output_dir)
    # parents=True creates missing intermediate folders (e.g. "models/") and
    # exist_ok=True makes the call a no-op when the directory already exists,
    # so no separate exists() check is needed.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to /content/drive/MyDrive/Data extraction from financial documents/models/model


In [None]:
# Reload the pipeline that was just saved and try it on one misspelled,
# tab-separated sample line, printing each recognised entity as "label text".
nlp = spacy.load(output_dir)
doc = nlp("Trad receiable	819.0")
for entity in doc.ents:
    print(entity.label_, entity.text)

Account Receivable 819.0


In [None]:
# Run predictions
# Switch the working directory to the project folder on Drive and run
# predict.py from there with a shell command. predict.py itself is not part
# of this notebook.
# NOTE(review): this rebinds `path`, which the data-loading cell set to the
# data/ subfolder.
path = "/content/drive/MyDrive/Data extraction from financial documents/"
import os
os.chdir(path)
!python predict.py

Entity : Value
Account Receivable : 819.0
Investments : 254.5
Equity share capital : 355.0
Borrowings : 540.2
Account Receivable : 685.0

 The dataframe is :
     Company Financials  Amount
0    Account Receivable   819.0
1           Investments   254.5
2  Equity share capital   355.0
3            Borrowings   540.2
4    Account Receivable   685.0
