In [26]:
import spacy 
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline, used below only to turn raw text into Doc objects.
nlp = spacy.blank("en")
# Container that collects annotated Docs so they can be serialized to disk.
db = DocBin()

In [27]:
import json

# Load the annotated corpora. The `with` blocks close each file handle as soon
# as its JSON has been parsed instead of leaving it open for the kernel's lifetime.
with open("train.json", encoding="utf-8") as train:
    TRAIN_DATA = json.load(train)
with open("validation.json", encoding="utf-8") as validation:
    VALIDATION_DATA = json.load(validation)

In [28]:
# Build the training corpus in its own DocBin. Using a container created in this
# cell keeps the cell idempotent: re-running it no longer appends the same docs
# a second time, and no other cell's docs can end up in the training file.
train_db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot['entities']:
        # char_span returns None when the character offsets cannot be mapped
        # onto token boundaries; such annotations are reported and dropped.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping Entities")
        else:
            ents.append(span)
    doc.ents = ents
    train_db.add(doc)

train_db.to_disk("./training_data.spacy")   # save the DocBin object

100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 250.15it/s]


#### Create spaCy data for validation

In [29]:
# Build the validation corpus in a fresh DocBin. Reusing the DocBin that was
# filled with the training docs would write those docs into the validation file
# as well, so the dev scores would be computed partly on training data.
validation_db = DocBin()

for text, annot in tqdm(VALIDATION_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot['entities']:
        # char_span returns None when the character offsets cannot be mapped
        # onto token boundaries; such annotations are reported and dropped.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping Entities")
        else:
            ents.append(span)
    doc.ents = ents
    validation_db.add(doc)

validation_db.to_disk("./validation_data.spacy")   # save the DocBin object

100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 375.23it/s]


In [30]:
# Expand base_config.cfg into a complete training config and save it as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [31]:
# Train the pipeline described by config.cfg and write the results under ./output.
# No --paths.train / --paths.dev overrides are passed here, so the corpus
# locations must already be set inside the config.
# NOTE(review): confirm they point at ./training_data.spacy and ./validation_data.spacy.
!python -m spacy train config.cfg --output ./output

[i] Saving to output directory: output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     43.69    0.00    0.00    0.00    0.00
 36     200        222.12   1132.07   69.84   73.33   66.67    0.70
 80     400          0.01      0.00   69.84   73.33   66.67    0.70
135     600          0.00      0.00   69.84   73.33   66.67    0.70
202     800          0.00      0.00   70.97   75.86   66.67    0.71
291    1000          0.00      0.00   70.97   75.86   66.67    0.71
391    1200          0.00      0.00   70.97   75.86   66.67    0.71
500    1400          0.00      0.00   70.97   75.86   66.67    0.71
700    1600          0.00      0.00   73.33   81.48   66.67    0.73
900    1800          0.00      0.00   72.13   78.57   66.67    0.72
1100    2000          0.00      0

[2022-04-08 02:31:47,118] [INFO] Set up nlp object from config
[2022-04-08 02:31:47,137] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-04-08 02:31:47,146] [INFO] Created vocabulary
[2022-04-08 02:31:47,148] [INFO] Finished initializing nlp object
[2022-04-08 02:31:47,599] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [32]:
# Load the "model-best" pipeline saved in the training output directory.
nlp_ner = spacy.load("output/model-best")



In [33]:
# Sample passage for a qualitative check of the trained NER pipeline (text kept verbatim).
sample_text = '''A cryptocurrency, crypto-currency, or crypto is a digital currency designed to work as a medium of exchange through a computer network that is not reliant on any central authority, such as a government or bank, to uphold or maintain it.

Individual coin ownership records are stored in a digital ledger, which is a computerized database using strong cryptography to secure transaction records, to control the creation of additional coins, and to verify the transfer of coin ownership.[1][2][3] Despite their name, cryptocurrencies are not necessarily considered to be currencies in the traditional sense and while varying categorical treatments have been applied to them, including classification as commodities, securities, as well as currencies, cryptocurrencies are generally viewed as a distinct asset class in practice.[4][5][6] Some crypto schemes use validators to maintain the cryptocurrency. In a proof-of-stake model, owners put up their tokens as collateral. In return, they get authority over the token in proportion to the amount they stake. Generally, these token stakers get additional ownership in the token over time via network fees, newly minted tokens or other such reward mechanisms'''
doc = nlp_ner(sample_text)

In [34]:
# Visualize the entities predicted for the sample passage.
spacy.displacy.render(doc, style="ent")