In [10]:
#load spacy
# en_core_web_sm is a pretrained model
import spacy
nlp= spacy.load("en_core_web_sm")

In [11]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [12]:
# ner= name entity recognition
#This doc property is used for the named entities in the document
#If the entity recognizer has been applied, this property will return a tuple of named entity span objects.

In [13]:
doc= nlp("Tesla Inc is going to acquire twitter for $45 billion ")
for ent in doc.ents:
        print(ent.text, "|", ent.label_)

Tesla Inc | ORG
$45 billion | MONEY


In [14]:
doc= nlp("Tesla Inc is going to acquire twitter for $45 billion ")
for ent in doc.ents:
        print(ent.text, "|", ent.label, "|", spacy.explain(ent.label_))

Tesla Inc | 383 | Companies, agencies, institutions, etc.
$45 billion | 394 | Monetary values, including unit


In [15]:
## this can be render as below
from spacy import displacy

In [16]:
displacy.render(doc, style="ent")

In [17]:
#it didnt capture twitter,  tweak
doc= nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion ")
for ent in doc.ents:
        print(ent.text, "|", ent.label, "|", spacy.explain(ent.label_))

Tesla Inc | 383 | Companies, agencies, institutions, etc.
Twitter Inc | 383 | Companies, agencies, institutions, etc.
$45 billion | 394 | Monetary values, including unit


In [18]:
displacy.render(doc, style="ent")

In [19]:
## entities supported by Spacy
nlp.pipe_labels["ner"]

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [20]:
#doc= nlp("Michael Bloomerg founded by Bloomerg in 1982")
doc= nlp("Michael Bloomerg founded by Bloomerg Inc in 1982")
for ent in doc.ents:
        print(ent.text, "|", ent.label, "|", spacy.explain(ent.label_))

Michael Bloomerg | 380 | People, including fictional
Bloomerg Inc | 380 | People, including fictional
1982 | 391 | Absolute or relative dates or periods


In [21]:
## Spcay fails to detect Bloomerg as organization 
## 391 is date
## 380 is GPE

Setting Custom Entity

In [22]:
 ## Token is a slice of word in NLP
## Span is substring of a Token in Spacy  
## copying doc1 form above

In [23]:
doc= nlp("Tesla Inc is going to acquire Twitter for $45 billion ")
for ent in doc.ents:
        print(ent.text, "|", ent.label_)

Tesla Inc | ORG
Twitter | PERSON
$45 billion | MONEY


In [24]:
#doc[0]
type(doc)

spacy.tokens.doc.Doc

In [25]:
doc[2:5]

is going to

In [26]:
type(doc[2:5])

spacy.tokens.span.Span

In [27]:
##How to set entity annotation for tokens outside of any provided spans. 
#Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside".
# type str

In [28]:
from spacy.tokens import Span

s1= Span(doc, 0,1, label='ORG' )
s2= Span(doc, 6,7, label='ORG' )

doc.set_ents([s1, s2], default='unmodified')

In [29]:
for ent in doc.ents:
        print(ent.text, "|", ent.label_)

Tesla Inc | ORG
Twitter | ORG
$45 billion | MONEY


In [30]:
## you can create rule based entity using Spacy Entity Ruler

## Custom NER with spaCy v3 

In [31]:
! pip install -U spacy -q

In [71]:
## looking for spacy version
## to do custom NER using spacy 3

In [2]:
!python -m spacy info

[1m

spaCy version    3.4.3                         
Location         C:\Users\LZ575NE\Anaconda3\lib\site-packages\spacy
Platform         Windows-10-10.0.19044-SP0     
Python version   3.9.12                        
Pipelines        en_core_web_lg (3.4.1), en_core_web_sm (3.4.1)



In [73]:
## now convert the Json file into .spacy object
## spacy doesnt have trained NER process
## with .spacy doc object we will make custom NER

In [4]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [5]:
# load a new spacy model
nlp = spacy.blank("en") 

# create a DocBin object
db = DocBin() 

In [6]:
import json

In [7]:
f = open ('C:/Users/LZ575NE/OneDrive - EY/Desktop/project/ICDP/Data_scanning&redact_code/train2.json')
TRAIN_DATA = json.load(f)

In [8]:
## Now display custom the Tagged Entity / Annotations as in classes
TRAIN_DATA

{'classes': ['PERCENTAGE', 'VALUE', 'PERSON', 'ORG', 'CRYPTO'],
 'annotations': [['The global crypto market capitalisation tumbled 3.60 percent over the last 24 hours to $2.18 VALUE trillion while the total trading volume fell 6.79 percent to $95.77 PERCENTAGE billion.While DeFi ($15.16 billion) accounted for 15.83 percent of the trading volume, stablecoins ($75.16 billion) made up 78.48 percent. The market dominance of Bitcoin CRYPTO rose 0.29 percent to 40.45 percent today morning. Bitcoin is currently trading at $46,560.09 VALUE .As for major cryptocurrencies, Bitcoin tumbled 2.46 percent to trade at Rs 37,49,173 while Ethereum fell 4.48 percent at Rs 2,93,527.4. Cardano declined 7.25 percent to Rs 105.4. Avalanche fell 8.77 percent to Rs 8,048 PERCENTAGE , Polkadot tumbled 7.53 percent at Rs 2,137.04 and Litecoin dipped 1.04 percent to Rs 11,725.33 over the last 24 hours. Tether rose 0.12 percent to trade at Rs 80.18. Memecoin SHIB fell 5.61 percent while Dogecoin decreased 4.29 pe

In [9]:
## creating .doc object using the Annotations 

In [10]:
for text, annot in tqdm(TRAIN_DATA['annotations']): 
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents 
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.61it/s]


In [11]:
## Manually specifying the widget pipline and language as eng

In [12]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency


[x] The provided output file already exists. To force overwriting the config
file, set the --force or -F flag.



In [13]:
## Now train the Model using the config file since we have the training file
## ---output-- is output directory
## output --paths  dump everything in my current root folder
## using training file as validation object or the test object 
## To train the model kernal takes time more than usual

In [14]:
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[i] Saving to output directory: .

[2022-12-01 18:27:49,792] [INFO] Set up nlp object from config
[2022-12-01 18:27:49,812] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-12-01 18:27:49,820] [INFO] Created vocabulary
[2022-12-01 18:27:49,821] [INFO] Finished initializing nlp object
[2022-12-01 18:27:50,045] [INFO] Initialized pipeline components: ['tok2vec', 'ner']



[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    227.72    0.00    0.00    0.00    0.00
200     200        306.75   6005.77  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100.00  100.00    1.00
1800    1800          0.00      0.00  100.00  100.00  100.00    1.00
[+] Saved pipeline to output directory
model-last


In [39]:
## conversion will happen soon as we have taken  validation data same as training data 
## will test the Model using some more text data to check Model efficeincy

In [40]:
## now loading the model-best

In [41]:
##import os 

In [42]:
##os. getcwd() 

In [43]:
## dir_path = os.path.dirname(os.path.realpath('model-last'))

In [15]:
nlp_NER = spacy.load("C:/Users/LZ575NE/output/model-best")

In [16]:
## Now create the doc object

In [51]:
doc= nlp_NER(''' The price of bitcoin, the world's largest cryptocurrency, hovered around Rs 14.28 lakh, with a dominance of 38.65 percent

Major cryptocurrencies were trading in the green early on November 23 as the global crypto market cap jumped 3.68 percent to $819.18 billion, over the last day. On the other hand, the total crypto market volume dropped 5.92 percent to $62.21 billion over the last 24 hours.

The total volume in DeFi stood at $4.61 billion, which is 7.41 percent of the total crypto market 24-hour volume. The volume of all stable coins was $59.00 billion, which is 94.84 percent of the total crypto market 24-hour volume.

The price of bitcoin, the world's largest cryptocurrency, hovered around Rs 14.28 lakh, with a dominance of 38.65 percent, an increase of 0.05 per cent over the day, according to Coinmarketcap. ''')

In [47]:
## spacy render will render the Tags created 
# display in Jupyter

In [19]:
from spacy import displacy

In [None]:
#for i in doc:
#    print(i)
#colors = {"CRYPTO": "#F67DE3", "VALUE": "#7DF6D9", "PERCENTAGE":"#a6e22d"}
#options = {"colors": colors} 
#spacy.displacy.render(doc, style= 'ent', options= options,  jupyter=True)
#displacy.render(doc, style="ent", jupyter=True)
#spacy.displacy.render(doc, style="ent", jupyter=True)
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

