<a href="https://colab.research.google.com/github/shivams289/DS_Algo/blob/main/Spacy_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## spaCy's `ner` pipeline component identifies token spans that match a predetermined set of named-entity types. These spans are available as the `ents` property of a `Doc` object.


1.   `Doc.ents` are token spans with their own set of annotations:
2.   `ent.text`: original entity text
3.   `ent.label_`: entity type, e.g. `ORG` for an organisation, `PERSON` for a person, `PRODUCT` for a product
4.   `ent.start`: token index at which the span starts
5.   `ent.end`: token index at which the span ends (exclusive)
6.   `ent.start_char`: character offset at which the entity starts
7.   `ent.end_char`: character offset at which the entity ends (exclusive)








In [1]:
pip install snorkel

Collecting snorkel
  Downloading snorkel-0.9.7-py3-none-any.whl (145 kB)
[?25l[K     |██▎                             | 10 kB 21.6 MB/s eta 0:00:01[K     |████▌                           | 20 kB 12.4 MB/s eta 0:00:01[K     |██████▊                         | 30 kB 10.1 MB/s eta 0:00:01[K     |█████████                       | 40 kB 9.0 MB/s eta 0:00:01[K     |███████████▎                    | 51 kB 5.1 MB/s eta 0:00:01[K     |█████████████▌                  | 61 kB 5.6 MB/s eta 0:00:01[K     |███████████████▊                | 71 kB 5.6 MB/s eta 0:00:01[K     |██████████████████              | 81 kB 6.2 MB/s eta 0:00:01[K     |████████████████████▎           | 92 kB 4.6 MB/s eta 0:00:01[K     |██████████████████████▌         | 102 kB 4.8 MB/s eta 0:00:01[K     |████████████████████████▉       | 112 kB 4.8 MB/s eta 0:00:01[K     |███████████████████████████     | 122 kB 4.8 MB/s eta 0:00:01[K     |█████████████████████████████▎  | 133 kB 4.8 MB/s eta 0:00:01[K

In [2]:
from functools import lru_cache, partial
from typing import List, Tuple, Union

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
pip install tokenizers

In [14]:
from snorkel.labeling import labeling_function, PandasLFApplier
from snorkel.labeling.lf.core import LabelingFunction
from snorkel.preprocess import preprocessor

from tokenizers.pre_tokenizers import Whitespace

In [4]:
# Input CSV; the labeling functions read each row's `text` attribute, so the file
# needs a `text` column. Change DATA_PATH here when running outside this Drive layout.
DATA_PATH = '/content/drive/MyDrive/ner_news_10k_combined.csv'
df = pd.read_csv(DATA_PATH)

In [5]:
import spacy
# Shared spaCy pipeline; every labeling function below calls nlp(x.text) and reads doc.ents.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Disabled early draft of PERSON token labeling. It is wrapped in a string literal,
# so none of it executes. It is also incomplete: `label.append()` has no argument
# and refers to `label`, while the list defined in the draft is `labels`.
'''for text in df['text']:
  doc  = nlp(text)
  print(text)
  labels = []
  for ent in doc.ents:
    if ent.label_ == 'PERSON' :
      ents = [word for word in ent.text.split(' ') if word]
      for i, word in enumerate(ents):
        s, e = ent.start_char, ent.end_char
        new_token_span = (s + len(word), e - len(word))
        if i==0:
          label.append()

      #print([{ent.text:ent.label_}, {'Span:':(ent.start_char, ent.end_char)}])
      print('\n')'''

In [19]:
from enum import Enum
class Labels(Enum):
    """Integer codes emitted by the labeling functions.

    PER/LOC/ORG come in pairs: ``*_B`` marks the first token of an entity and
    ``*_I`` every following token. The remaining codes label a whole entity span.
    """

    # Person names.
    PER_B = 11
    PER_I = 12

    # Places: spaCy FAC (facilities such as buildings), GPE (countries, cities,
    # states) and LOC (locations that are not GPE).
    LOC_B = 21
    LOC_I = 22

    # Organisations.
    ORG_B = 31
    ORG_I = 32

    DATE = 4      # dates or periods
    MONEY = 5
    PRODUCT = 6
    NUMBERS = 8   # spaCy PERCENT, QUANTITY, ORDINAL and CARDINAL
    LAW = 9       # somewhat successful at labeling cases or articles of legal documents
    EVENT = 7     # useful for detecting catastrophes such as hurricanes, wars, sports events

In [20]:
pre_tokenizer = Whitespace()


@lru_cache(maxsize=1)
def _parse(text):
    """Run the spaCy pipeline on ``text``, remembering only the most recent result.

    The applier calls every labeling function on one row before moving to the
    next, so a one-entry cache replaces the repeated ``nlp(text)`` calls for a row
    with a single parse. Call ``_parse.cache_clear()`` after rebinding ``nlp``.
    """
    return nlp(text)


def _entity_char_spans(text, spacy_labels):
    """Return ``(start_char, end_char)`` for each spaCy entity whose label is in ``spacy_labels``."""
    return [
        (ent.start_char, ent.end_char)
        for ent in _parse(text).ents
        if ent.label_ in spacy_labels
    ]


def _tag_entity_tokens(x, spacy_labels, begin, inside):
    """Label every token of each matching entity: ``begin`` for the first, ``inside`` for the rest.

    Returns ``(x.text, labels)`` where each label is ``((start, end), code)`` and
    the offsets are character positions in ``x.text``.
    """
    text = x.text
    labels = []
    for start, end in _entity_char_spans(text, spacy_labels):
        # The pre-tokenizer reports offsets relative to the slice it was given,
        # so shift them by `start` to get offsets into the full text.
        entity_tokens = pre_tokenizer.pre_tokenize_str(text[start:end])
        for i, (_token, (s, e)) in enumerate(entity_tokens):
            tag = begin if i == 0 else inside
            labels.append(((s + start, e + start), tag.value))
    return text, labels


def _tag_entity_spans(x, spacy_labels, label):
    """Label each matching entity as one span: returns ``(x.text, [((start, end), code), ...])``."""
    text = x.text
    return text, [(span, label.value) for span in _entity_char_spans(text, spacy_labels)]


# NOTE: unlike ordinary Snorkel labeling functions, these return ``(text, labels)``
# rather than a single integer; they are consumed by PandasLFApplierForNER.

@labeling_function()
def label_per_library_spacy(x):
    """Token-level PER_B / PER_I labels for spaCy PERSON entities."""
    return _tag_entity_tokens(x, {'PERSON'}, Labels.PER_B, Labels.PER_I)


@labeling_function()
def label_org_library_spacy(x):
    """Token-level ORG_B / ORG_I labels for spaCy ORG entities."""
    return _tag_entity_tokens(x, {'ORG'}, Labels.ORG_B, Labels.ORG_I)


@labeling_function()
def label_loc_library_spacy(x):
    """Token-level LOC_B / LOC_I labels for spaCy LOC, GPE and FAC entities."""
    return _tag_entity_tokens(x, {'LOC', 'GPE', 'FAC'}, Labels.LOC_B, Labels.LOC_I)


@labeling_function()
def label_date_library_spacy(x):
    """Span-level DATE labels for spaCy DATE entities."""
    return _tag_entity_spans(x, {'DATE'}, Labels.DATE)


@labeling_function()
def label_money_library_spacy(x):
    """Span-level MONEY labels for spaCy MONEY entities."""
    return _tag_entity_spans(x, {'MONEY'}, Labels.MONEY)


@labeling_function()
def label_products_library_spacy(x):
    """Span-level PRODUCT labels for spaCy PRODUCT entities."""
    return _tag_entity_spans(x, {'PRODUCT'}, Labels.PRODUCT)


@labeling_function()
def label_numbers_library_spacy(x):
    """Span-level NUMBERS labels for spaCy PERCENT, QUANTITY, CARDINAL and ORDINAL entities."""
    return _tag_entity_spans(
        x, {'PERCENT', 'QUANTITY', 'CARDINAL', 'ORDINAL'}, Labels.NUMBERS
    )


@labeling_function()
def label_law_library_spacy(x):
    """Span-level LAW labels for spaCy LAW entities."""
    return _tag_entity_spans(x, {'LAW'}, Labels.LAW)


@labeling_function()
def label_events_library_spacy(x):
    """Span-level EVENT labels for spaCy EVENT entities."""
    return _tag_entity_spans(x, {'EVENT'}, Labels.EVENT)



In [21]:
from snorkel.labeling.lf import LabelingFunction
from snorkel.types import DataPoint

from snorkel.labeling.apply.core import ApplierMetadata, BaseLFApplier, _FunctionCaller



def apply_lfs_to_data_point(x: DataPoint, lfs: List[LabelingFunction], f_caller: _FunctionCaller):
    """Run every labeling function on one data point and keep the non-empty results.

    Args:
        x: The data point (a DataFrame row when used through the applier below).
        lfs: Labeling functions; each is expected to return ``(text, ner_tags)``.
        f_caller: Callable invoked as ``f_caller(lf, x)`` to run one function.

    Returns:
        A list of ``(text, ner_tags, j)`` tuples, where ``j`` is the position of
        the labeling function in ``lfs``. Functions that produced no tags are
        left out.
    """
    labels = []
    for j, lf in enumerate(lfs):
        result = f_caller(lf, x)
        try:
            text, ner_tags = result
        except TypeError:
            # Snorkel's fault-tolerant caller substitutes the integer -1 (abstain)
            # when a labeling function raises; there is nothing to unpack, so the
            # function simply contributes no tags for this data point.
            continue
        if len(ner_tags):
            labels.append((text, ner_tags, j))
    return labels


def rows_to_labels_dict(labels, num_lfs):
    """Collect per-row labeling results into one vote list per ``(text, span)``.

    Args:
        labels: Iterable of per-row results, each a list of
            ``(text, ner_tags, j)`` where ``ner_tags`` holds ``(span, label)``
            pairs and ``j`` is the index of the labeling function.
        num_lfs: Total number of labeling functions (length of each vote list).

    Returns:
        Dict mapping ``(text, span)`` to a list of ``num_lfs`` entries: the label
        assigned by each labeling function, or -1 where it assigned none.
    """
    votes_by_span = {}
    for row_results in labels:
        for text, ner_tags, lf_index in row_results:
            for span, label in ner_tags:
                # First sighting of this span starts from "every function abstained".
                votes = votes_by_span.setdefault((text, span), [-1] * num_lfs)
                votes[lf_index] = label
    return votes_by_span


class PandasLFApplierForNER(BaseLFApplier):
    """Applies span-producing labeling functions to every row of a DataFrame.

    Reads the labeling functions from ``self._lfs`` (presumably populated by
    ``BaseLFApplier.__init__`` from the constructor argument; confirm against Snorkel).
    """

    def apply(self,
              df: pd.DataFrame,
              progress_bar: bool = True,
              fault_tolerant: bool = False,
              return_meta: bool = False,
              ) -> dict:
        """Label every row of ``df`` and merge the results per ``(text, span)``.

        Args:
            df: Rows to label; each row is passed to the labeling functions.
            progress_bar: Show a tqdm progress bar while applying.
            fault_tolerant: Forwarded to ``_FunctionCaller``.
            return_meta: Accepted for signature compatibility; currently ignored.

        Returns:
            Dict mapping ``(text, span)`` to a list with one entry per labeling
            function (that function's label for the span, or -1).
        """
        # A row-wise apply over an empty frame does not yield per-row lists (pandas
        # hands back a frame whose iteration produces column names), so return early.
        if df.empty:
            return {}

        f_caller = _FunctionCaller(fault_tolerant)
        apply_fn = partial(apply_lfs_to_data_point, lfs=self._lfs, f_caller=f_caller)

        if progress_bar:
            tqdm.pandas()  # registers DataFrame.progress_apply
            call_fn = df.progress_apply
        else:
            call_fn = df.apply

        labels = call_fn(apply_fn, axis=1)
        return rows_to_labels_dict(labels, len(self._lfs))

In [22]:
# A function's position in this list is its index in every per-span vote list,
# so keep the order in sync with the label_* column names used further down.
lfs = [
    label_per_library_spacy,
    label_loc_library_spacy,
    label_org_library_spacy,
    label_money_library_spacy,
    label_date_library_spacy,
    label_numbers_library_spacy,
    label_products_library_spacy,
    label_law_library_spacy,
    label_events_library_spacy,
]
applier = PandasLFApplierForNER(lfs)
labels = applier.apply(df)

100%|██████████| 64/64 [00:13<00:00,  4.59it/s]


In [23]:
# One row per (text, span) key: [text, span, vote_of_lf_0, ..., vote_of_lf_N].
# Nothing here is assigned to `labels`, so the dict survives and this cell can be
# re-run without first re-running the applier.
rows = [[text, span] + lf_votes for (text, span), lf_votes in labels.items()]

In [24]:
# Column order after text/span follows the order of the labeling functions in `lfs`.
label_df = pd.DataFrame(
    data=rows,
    columns=[
        "text",
        "span",
        "label_per",
        "label_loc",
        "label_org",
        "label_money",
        "label_date",
        "label_numbers",
        "label_products",
        "label_law",
        "label_events",
    ],
)
label_df

Unnamed: 0,text,span,label_per,label_loc,label_org,label_money,label_date,label_numbers,label_products,label_law,label_events
0,"Lease Agreement, effective June 1, 1998, by an...","(77, 83)",11,-1,-1,-1,-1,-1,-1,-1,-1
1,"Lease Agreement, effective June 1, 1998, by an...","(84, 95)",12,-1,-1,-1,-1,-1,-1,-1,-1
2,"Lease Agreement, effective June 1, 1998, by an...","(56, 57)",-1,-1,31,-1,-1,-1,-1,-1,-1
3,"Lease Agreement, effective June 1, 1998, by an...","(57, 58)",-1,-1,32,-1,-1,-1,-1,-1,-1
4,"Lease Agreement, effective June 1, 1998, by an...","(58, 59)",-1,-1,32,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...
717,Her focus on meme-worthy investments and her u...,"(157, 162)",-1,-1,31,-1,-1,-1,-1,-1,-1
718,Her focus on meme-worthy investments and her u...,"(163, 166)",-1,-1,32,-1,-1,-1,-1,-1,-1
719,Her focus on meme-worthy investments and her u...,"(166, 167)",-1,-1,32,-1,-1,-1,-1,-1,-1
720,Her focus on meme-worthy investments and her u...,"(239, 247)",-1,-1,31,-1,-1,-1,-1,-1,-1


In [25]:
# Keep only the per-function vote columns: one matrix row per (text, span),
# one matrix column per labeling function.
label_matrix = label_df.loc[
    :,
    [
        "label_per",
        "label_loc",
        "label_org",
        "label_money",
        "label_date",
        "label_numbers",
        "label_products",
        "label_law",
        "label_events",
    ],
].to_numpy()
label_matrix

array([[11, -1, -1, ..., -1, -1, -1],
       [12, -1, -1, ..., -1, -1, -1],
       [-1, -1, 31, ..., -1, -1, -1],
       ...,
       [-1, -1, 32, ..., -1, -1, -1],
       [-1, -1, 31, ..., -1, -1, -1],
       [-1, -1, 32, ..., -1, -1, -1]])

In [27]:
from snorkel.labeling.model import LabelModel

# The Labels values are used directly as class ids, so the model needs one class
# for every integer up to the largest value (currently 32, i.e. cardinality 33).
# Deriving it keeps the model in step if Labels ever gains a larger code.
label_model = LabelModel(cardinality=max(label.value for label in Labels) + 1, verbose=True)
label_model.fit(L_train=label_matrix, n_epochs=10, log_freq=100, seed=123)

In [28]:
# Predict one class id per row of the label matrix, i.e. per (text, span).
# NOTE(review): predict may emit -1 for rows it cannot decide (ties under the
# default tie-break policy) — confirm against the Snorkel docs before relying on it.
train_predictions = label_model.predict(label_matrix)
train_predictions

array([11, 12, 31, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31,
       32, 32, 32, 32,  4,  4,  4,  8, 21, 22, 22, 21, 21, 21, 21, 31, 32,
       31, 31, 32,  4, 21, 31, 32, 31, 32, 31, 32, 31, 32, 32, 32, 32, 32,
       31,  5,  4,  4,  8,  8,  8,  8, 21,  5,  5,  4,  4,  4,  4, 31, 32,
       32, 31, 31,  5,  5,  5,  4,  4,  4,  4,  4,  8, 21, 22, 22, 22, 21,
       21, 31, 32, 32, 32, 32, 31, 32, 31, 32, 31, 31, 32, 31,  5,  4,  4,
        4,  4,  8,  8, 31, 32, 32, 32, 32, 31, 32, 32, 11, 11, 11, 11, 11,
       21, 22, 22, 22, 21, 21, 31, 31, 31, 32, 31, 31, 32,  5,  4,  4,  8,
       11, 12, 31, 32, 32, 31, 31,  5,  5,  5,  4,  4,  8,  8, 21, 22, 22,
       22, 21, 21, 31, 32, 31, 31, 32,  5,  4,  4,  4,  8,  8, 11, 12, 12,
       12, 12, 11, 12, 11, 12, 12, 12, 12, 11, 12, 11, 12, 21, 22, 21, 22,
       21, 22, 21, 22, 21, 22, 21, 21, 31, 31, 32, 32, 31, 32, 32, 32, 32,
       31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31,
       32, 32, 31, 31, 31

In [29]:
def generate_training_data(label_df, predictions):
    """Group predicted tags by sentence.

    Args:
        label_df: Frame with a ``text`` column (the sentence) and a ``span``
            column holding ``(start, end)`` character offsets into that sentence.
        predictions: One predicted class id per row of ``label_df``; must
            provide ``tolist()``.

    Returns:
        Dict mapping each sentence to a list of ``(span_text, label_name)``
        pairs in row order. Rows whose prediction is not a ``Labels`` value
        (for example -1, an abstain) contribute no pair instead of raising.
    """
    labelled_data = {}
    sentences = label_df['text'].tolist()
    spans = label_df['span'].tolist()
    preds = predictions.tolist()

    for i, (sentence, span) in enumerate(zip(sentences, spans)):
        s, e = span
        tags = labelled_data.setdefault(sentence, [])
        try:
            label_name = Labels(preds[i]).name
        except ValueError:
            # The model's class ids cover every integer below its cardinality plus
            # -1 for abstain; only some of them correspond to a Labels member.
            continue
        tags.append((sentence[s:e], label_name))

    return labelled_data

In [30]:
# Sentence -> [(span text, predicted label name), ...] built from the model's predictions.
labelled_data = generate_training_data(label_df, train_predictions)

In [31]:
# Show each sentence with its predicted tags, separated by a blank line.
for sentence, ner_tags in labelled_data.items():
    print(f"Sentence : {sentence}", f"NER tags : {ner_tags}", "", sep="\n")

Sentence : Lease Agreement, effective June 1, 1998, by and between R.K. Fitzpatrick and Cheryl Fitzpatrick, R.J. Fitzpatrick Smelters, Inc., and ISA Indiana, Inc. is incorporated by reference herein to Exhibit 10.21 of ISAs report on Form 10-K for the year ended December 31, 1999, as filed on April 14, 2000.
NER tags : [('Cheryl', 'PER_B'), ('Fitzpatrick', 'PER_I'), ('R', 'ORG_B'), ('.', 'ORG_I'), ('K', 'ORG_I'), ('.', 'ORG_I'), ('Fitzpatrick', 'ORG_I'), ('R', 'ORG_B'), ('.', 'ORG_I'), ('J', 'ORG_I'), ('.', 'ORG_I'), ('Fitzpatrick', 'ORG_I'), ('Smelters', 'ORG_I'), (',', 'ORG_I'), ('Inc', 'ORG_I'), ('.', 'ORG_I'), ('ISA', 'ORG_B'), ('Indiana', 'ORG_I'), (',', 'ORG_I'), ('Inc', 'ORG_I'), ('.', 'ORG_I'), ('June 1, 1998', 'DATE'), ('the year ended December 31, 1999', 'DATE'), ('April 14, 2000', 'DATE'), ('10.21', 'NUMBERS')]

Sentence : In January, 2009, we expanded into the stainless steel and high-temperature alloys recycling business by purchasing inventories from Ventures Metals, LLC,