# Named Entity Recognition

## Basic Setup


In [14]:
# import os
# import statistics
# import json
# import tiktoken
import re
import glob
import numpy as np
import pandas as pd
import swifter
from pathlib import Path
from dotenv import load_dotenv
##
load_dotenv('../.env') 

True

In [2]:
directory = Path("../data")

## Named Entity Recognition model

### Define model 

Roberta Named Entity works best for this task. 

Using this version from HF : `mn-xlm-roberta-base-named-entity`

In [3]:

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

## Roberta based NER
roberta_tokenizer = AutoTokenizer.from_pretrained("2rtl3/mn-xlm-roberta-base-named-entity")
roberta_model = AutoModelForTokenClassification.from_pretrained("2rtl3/mn-xlm-roberta-base-named-entity")
nlp_roberta = pipeline("ner", model=roberta_model, tokenizer=roberta_tokenizer)

print("Number of parameters ->", roberta_model.num_parameters()/1000000, "Mn")


Number of parameters -> 277.456901 Mn


In [4]:

entity_classes = list(roberta_model.config.label2id.keys())
entity_classes


['LOC', 'MISC', 'O', 'ORG', 'PER']

### Helper functions for combining tokens to words

In [5]:
## words that must not be considered as names
stop_words = ["", "the", "The", "THE", "Sir", "Dr", "Mr", "of", "and"]

In [6]:

## Helper funciton to combine tokens into names and output a list of objects with entity types and names
def combine_tokens_list(ner_results, stop_words = [], metadata={}):
    ## ner_results are the results out of the Roberta Named Entity Recognition model. 
    ## stop_words are the words that should be disccarded as named entities
    ## metadata is an optional dictionary of attributes to be added to the output.
    ##  This can be used to add section number, chapter number, etc to the output.
    name = ""
    entities_list = []
    entity = np.nan
    current_word_start = 0
    prev_word_end = 0
    for res in ner_results:
        word = res['word']
        current_word_start = res['start']
        if (word[0] == "▁") or (current_word_start > prev_word_end):
            ## Save previous name
            if not ( name in stop_words or len(name)<=2):
                ## If entity is not set yet, then set current entity.
                entity = res['entity'] if entity == "NA" else entity
                ## removing trailing hypens from the names before saving
                entities_list = entities_list + [{'name': name.rstrip("-"), 'entity': entity, **metadata}]
            name = re.sub(r'[^a-zA-Z0-9\-]', '', word)
            entity = res['entity']
        else:
            # Remove all the special characters except '-' from the token
            # Add token to the ongoing name. 
            name = name + re.sub(r'[^a-zA-Z0-9\-]', '', word)
            
        prev_word_end = res['end']
    
    ## append the last name
    entities_list = entities_list + [{'name': name, 'entity': entity, **metadata}]
    ## Return
    return entities_list


In [7]:

## Helper funciton to combine tokens into names and arrange them into dictionary
def combine_tokens_dict(ner_results, classes = [], stop_words = []):
    name = ""
    ## Define a dictionary of entities and add classes as keys in the python dict
    entities_dict = {}
    for label in classes:
        entities_dict[label] = []
    ##
    entity = "NA"
    current_word_start = 0
    prev_word_end = 0
    for res in ner_results:
        word = res['word']
        current_word_start = res['start']
        if (word[0] == "▁") or (current_word_start > prev_word_end):
            ## Save previous name
            if not ( name in stop_words or len(name)<=2):
                ## If entity is not set yet, then set current entity.
                entity = res['entity'] if entity == "NA" else entity
                ## removing trailing hypens from the names before saving
                entities_dict[entity] = entities_dict[entity] + [name.rstrip("-")]
            name = re.sub(r'[^a-zA-Z0-9\-]', '', word)
            entity = res['entity']
        else:
            # Remove all the special characters except '-' from the token.
            # Add token to the ongoing name. 
            name = name + re.sub(r'[^a-zA-Z0-9\-]', '', word)
            
        prev_word_end = res['end']
    
    ## append the last name
    entities_dict[entity] = entities_dict[entity] + [name.rstrip("-")]
    ## Calculate unique values per class
    for label in classes:
        entities_dict[label] = list(set(entities_dict[label]))
    return entities_dict


## Calculate Named entites

### Test out the model for multiple text

first lets run a test on example text


In [8]:

text = """Ugrasrava, the son of Lomaharshana, surnamed Sauti, well-versed in the
Puranas, bending with humility, one day approached the great sages of
rigid vows, sitting at their ease, who had attended the twelve years'
sacrifice of Saunaka, surnamed Kulapati, in the forest of Naimisha."""



#### Test the combine_tokens_dict function.

In [9]:

### Calculate named entities as dictionary. 
ner_results = nlp_roberta(text)
entities = combine_tokens_dict(ner_results, entity_classes, stop_words)
entities

{'LOC': ['Naimisha'],
 'MISC': ['Puranas'],
 'O': [],
 'ORG': [],
 'PER': ['Lomaharshana', 'Sauti', 'Kulapati', 'Ugrasrava', 'Saunaka']}

#### Test the combine_tokens_list function.

In [10]:
ner_results = nlp_roberta(text)
entities = combine_tokens_list(ner_results, stop_words, metadata = {'some': 'other'})
entities

[{'name': 'Ugrasrava', 'entity': 'PER', 'some': 'other'},
 {'name': 'Lomaharshana', 'entity': 'PER', 'some': 'other'},
 {'name': 'Sauti', 'entity': 'PER', 'some': 'other'},
 {'name': 'Puranas', 'entity': 'MISC', 'some': 'other'},
 {'name': 'Saunaka', 'entity': 'PER', 'some': 'other'},
 {'name': 'Kulapati', 'entity': 'PER', 'some': 'other'},
 {'name': 'Naimisha', 'entity': 'LOC', 'some': 'other'}]

## Reverse Indexing

Creating a reverse indexed entities dataframe here 

The idea is to create a dataframe of entities and tag every section they appear in. 

In [11]:
## Reverse index
## Index all the text chunks that contain a particular named entity. 

def row2NamedEntities(row):
    # print(row)
    ner_results = nlp_roberta(row['text'])
    entities = combine_tokens_list(ner_results, stop_words, metadata={'chunk_id': row['chunk_id']} )
    return entities


In [12]:
def dfText2DfNE(dataframe):
    ## Takes a dataframe from the parsed data and returns dataframe with named entities. 
    ## The input dataframe must have a text and a chunk_id column. 

    ## Using swifter for parallelism
    ## 1. Calculate named entities for each row of the dataframe. 
    results = dataframe.swifter.apply(row2NamedEntities, axis=1)

    ## Flatten the list of lists to one single list of entities. 
    entities_list = np.concatenate(results).ravel().tolist()

    ## Remove all NaN entities
    entities_dataframe = pd.DataFrame(entities_list).replace(' ', np.nan)
    entities_dataframe = entities_dataframe.dropna(subset=['entity'])

    ## Count the number of occurances per chunk id
    entities_dataframe = entities_dataframe.groupby(['name', 'entity', 'chunk_id']).size().reset_index(name='count')

    return entities_dataframe


### Reverse Index each file

In [26]:
file_list = glob.glob(f"{directory}/*.csv")

for file in file_list:
    file_name = file.split("/")[-1]
    print("Filename :", file_name, '...')
    if file_name in ['tiny_tales_glossary.csv', 'summaries_combined.csv']:
        print('Skipping: ', file_name)
        continue

    df_text = pd.read_csv(directory/file_name, sep="|")
    df_ne = dfText2DfNE(df_text)
    df_ne['file'] = file_name
    outfile_name = f"{file_name.replace('.csv', '_named_entities.csv')}"
    df_ne.to_csv(directory/outfile_name, index=False, sep="|")
    print("Wrote file :", outfile_name)



Filename : summaries_combined.csv ...
Skipping:  summaries_combined.csv
Filename : tiny_tales_summaries.csv ...


Pandas Apply:   0%|          | 0/200 [00:00<?, ?it/s]

Wrote file : tiny_tales_summaries_named_entities.csv
Filename : km_ganguli_translation_6.csv ...


Pandas Apply:   0%|          | 0/627 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_6_named_entities.csv
Filename : km_ganguli_translation_14.csv ...


Pandas Apply:   0%|          | 0/358 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_14_named_entities.csv
Filename : km_ganguli_translation_15.csv ...


Pandas Apply:   0%|          | 0/136 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_15_named_entities.csv
Filename : km_ganguli_translation_7.csv ...


Pandas Apply:   0%|          | 0/955 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_7_named_entities.csv
Filename : km_ganguli_translation_5.csv ...


Pandas Apply:   0%|          | 0/756 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_5_named_entities.csv
Filename : km_ganguli_translation_17.csv ...


Pandas Apply:   0%|          | 0/16 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_17_named_entities.csv
Filename : km_ganguli_translation_16.csv ...


Pandas Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_16_named_entities.csv
Filename : km_ganguli_translation_4.csv ...


Pandas Apply:   0%|          | 0/261 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_4_named_entities.csv
Filename : km_ganguli_translation_12.csv ...


Pandas Apply:   0%|          | 0/1970 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_12_named_entities.csv
Filename : km_ganguli_translation_13.csv ...


Pandas Apply:   0%|          | 0/1054 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_13_named_entities.csv
Filename : km_ganguli_translation_1.csv ...


Pandas Apply:   0%|          | 0/951 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_1_named_entities.csv
Filename : km_ganguli_translation_3.csv ...


Pandas Apply:   0%|          | 0/1260 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_3_named_entities.csv
Filename : tiny_tales_glossary.csv ...
Skipping:  tiny_tales_glossary.csv
Filename : km_ganguli_translation_11.csv ...


Pandas Apply:   0%|          | 0/89 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_11_named_entities.csv
Filename : km_ganguli_translation_10.csv ...


Pandas Apply:   0%|          | 0/87 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_10_named_entities.csv
Filename : km_ganguli_translation_2.csv ...


Pandas Apply:   0%|          | 0/309 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_2_named_entities.csv
Filename : km_ganguli_translation_9.csv ...


Pandas Apply:   0%|          | 0/334 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_9_named_entities.csv
Filename : kaggle_tilak_summaries.csv ...


Pandas Apply:   0%|          | 0/2376 [00:00<?, ?it/s]

Wrote file : kaggle_tilak_summaries_named_entities.csv
Filename : km_ganguli_translation_8.csv ...


Pandas Apply:   0%|          | 0/498 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_8_named_entities.csv
Filename : km_ganguli_translation_18.csv ...


Pandas Apply:   0%|          | 0/38 [00:00<?, ?it/s]

Wrote file : km_ganguli_translation_18_named_entities.csv
Filename : wikipedia_parva_summaries.csv ...


Pandas Apply:   0%|          | 0/19 [00:00<?, ?it/s]

Wrote file : wikipedia_parva_summaries_named_entities.csv
