# Named Entity Recognition

## Basic Setup


In [1]:
# import os
# import statistics
# import json
# import tiktoken
import re
import glob
import numpy as np
import pandas as pd
# import swifter
from pathlib import Path
from dotenv import load_dotenv
## Load environment variables from the project-level ../.env file.
load_dotenv('../.env') 

True

In [2]:
## Root data folder: the input CSV files and the named_entities output folder are resolved relative to it.
directory = Path("../data")

## Named Entity Recognition model

### Define model 

An XLM-RoBERTa-based named-entity model works best for this task.

We use this version from the Hugging Face Hub: `2rtl3/mn-xlm-roberta-base-named-entity`

In [3]:

from transformers import pipeline

## RoBERTa-based token-classification pipeline used for NER.
## aggregation_strategy="simple" makes the pipeline return grouped entities
## (exposing `word` and `entity_group`) instead of one label per token.
roberta = pipeline(
    "token-classification",
    model="2rtl3/mn-xlm-roberta-base-named-entity",
    aggregation_strategy="simple",
)

## Report the model size in millions of parameters.
print("Number of parameters ->", roberta.model.num_parameters() / 1_000_000, "Mn")


  from .autonotebook import tqdm as notebook_tqdm


Number of parameters -> 277.456901 Mn


In [4]:

## Entity labels the model can emit, read from the keys of its label -> id mapping.
entity_classes = list(roberta.model.config.label2id)
entity_classes


['LOC', 'MISC', 'O', 'ORG', 'PER']

## Calculate Named Entities

### Test the model on a sample text

First, let's run a test on an example text.


In [5]:

## Sample passage passed to the NER pipeline in the test cell below.
## NOTE(review): this looks like an excerpt from an English translation of the
## Mahabharata - confirm the exact source.
text = """The Rishi Vyasa published this mass of knowledge in both a detailed and
an abridged form. It is the wish of the learned in the world to possess
the details and the abridgement. Some read the Bharata beginning with the
initial mantra (invocation), others with the story of Astika, others with
Uparichara, while some Brahmanas study the whole. Men of learning display
their various knowledge of the institutes in commenting on the
composition. Some are skilful in explaining it, while others, in
remembering its contents.

The son of Satyavati having, by penance and meditation, analysed the
eternal Veda, afterwards composed this holy history, when that learned
Brahmarshi of strict vows, the noble Dwaipayana Vyasa, offspring of
Parasara, had finished this greatest of narrations, he began to consider
how he might teach it to his disciples. And the possessor of the six
attributes, Brahma, the world's preceptor, knowing of the anxiety of the
Rishi Dwaipayana, came in person to the place where the latter was, for
gratifying the saint, and benefiting the people. And when Vyasa,
surrounded by all the tribes of Munis, saw him, he was surprised; and,
standing with joined palms, he bowed and ordered a seat to be brought.
And Vyasa having gone round him who is called Hiranyagarbha seated on
that distinguished seat stood near it; and being commanded by Brahma
Parameshthi, he sat down near the seat, full of affection and smiling in
joy. Then the greatly glorious Vyasa, addressing Brahma Parameshthi,
said, "O divine Brahma, by me a poem hath been composed which is greatly
respected. The mystery of the Veda, and what other subjects have been
explained by me; the various rituals of the Upanishads with the Angas;
the compilation of the Puranas and history formed by me and named after
the three divisions of time, past, present, and future; the determination
of the nature of decay, fear, disease, existence, and non-existence, a
description of creeds and of the various modes of life; rule for the four
castes, and the import of all the Puranas; an account of asceticism and

"""


#### Test entity extraction (the `combine_tokens_list` post-processing step is currently disabled)

In [6]:
## Run the pipeline on the sample passage and keep, for every detected entity,
## only its matched text and its predicted class.
## NOTE: the combine_tokens_list post-processing step is not applied here.
ner_results = roberta(text)
entities = [
    {'name': hit['word'], 'entity': hit['entity_group']}
    for hit in ner_results
]

entities

[{'name': 'The Rishi Vyasa', 'entity': 'MISC'},
 {'name': 'the Bharata', 'entity': 'MISC'},
 {'name': 'Astika', 'entity': 'PER'},
 {'name': 'Uparichara', 'entity': 'PER'},
 {'name': 'Brahmanas', 'entity': 'MISC'},
 {'name': 'Satyavati', 'entity': 'PER'},
 {'name': 'Veda', 'entity': 'MISC'},
 {'name': 'Brahmarshi', 'entity': 'PER'},
 {'name': 'Dwaipayana Vyasa', 'entity': 'PER'},
 {'name': 'Parasara', 'entity': 'PER'},
 {'name': 'Brahma', 'entity': 'PER'},
 {'name': 'the Rishi Dwaipayana', 'entity': 'MISC'},
 {'name': 'Vyasa', 'entity': 'PER'},
 {'name': 'Munis', 'entity': 'LOC'},
 {'name': 'Vyasa', 'entity': 'PER'},
 {'name': 'Hiranyagarbha', 'entity': 'PER'},
 {'name': 'Brahma Parameshthi', 'entity': 'PER'},
 {'name': 'Vyasa', 'entity': 'PER'},
 {'name': 'Brahma Parameshthi', 'entity': 'PER'},
 {'name': 'Brahma', 'entity': 'PER'},
 {'name': 'the Veda', 'entity': 'MISC'},
 {'name': 'the Upanishads', 'entity': 'MISC'},
 {'name': 'the Angas', 'entity': 'MISC'},
 {'name': 'the Puranas', '

## Reverse Indexing

Here we create a reverse-indexed dataframe of entities.

The idea is to build a dataframe of entities and tag every section (text chunk) in which each entity appears.

In [7]:
## Reverse index
## Index all the text chunks that contain a particular named entity. 

def row2NamedEntities(row):
    """Extract the named entities found in one text chunk.

    Runs the module-level ``roberta`` NER pipeline on ``row['text']`` and
    tags every detected entity with the id of the chunk it came from.

    Args:
        row: Mapping-like record (for example a dataframe row) exposing
            ``text`` and ``chunk_id`` keys.

    Returns:
        A list with one dict per detected entity, holding ``name`` (the
        matched text), ``entity`` (the predicted class) and ``chunk_id``.
    """
    detections = roberta(row['text'])
    chunk_id = row['chunk_id']
    return [
        {'name': hit['word'], 'entity': hit['entity_group'], 'chunk_id': chunk_id}
        for hit in detections
    ]


In [8]:
def dfText2DfNE(dataframe):
    """Build a reverse index of named entities for a dataframe of text chunks.

    Every row is passed to ``row2NamedEntities``; the per-row entity lists are
    flattened and the occurrences of each (name, entity, chunk_id) combination
    are counted.

    Args:
        dataframe: Parsed text data. It must have a ``text`` and a
            ``chunk_id`` column.

    Returns:
        A dataframe with the columns ``name``, ``entity``, ``chunk_id`` and
        ``count``. It is empty (but still has these columns) when the input
        has no rows or when no entity was detected.
    """

    def _empty_result():
        # Same columns as the regular result, so callers can treat both alike.
        return pd.DataFrame({
            'name': pd.Series(dtype='object'),
            'entity': pd.Series(dtype='object'),
            'chunk_id': pd.Series(dtype='object'),
            'count': pd.Series(dtype='int64'),
        })

    ## Row-wise apply on an empty frame does not yield a Series of lists,
    ## so there is nothing to index and nothing safe to flatten.
    if dataframe.empty:
        return _empty_result()

    ## 1. Calculate named entities for each row of the dataframe.
    results = dataframe.apply(row2NamedEntities, axis=1)

    ## 2. Flatten the list of lists to one single list of entities.
    entities_list = [entity for row_entities in results for entity in row_entities]

    ## Without any entity the frame would have no 'entity' column to filter on.
    if not entities_list:
        return _empty_result()

    ## 3. Turn blank (' ') cells into NaN and drop rows without an entity class.
    entities_dataframe = pd.DataFrame(entities_list).replace(' ', np.nan)
    entities_dataframe = entities_dataframe.dropna(subset=['entity'])

    ## 4. Count the number of occurrences per chunk id.
    entities_dataframe = (
        entities_dataframe
        .groupby(['name', 'entity', 'chunk_id'])
        .size()
        .reset_index(name='count')
    )

    return entities_dataframe


In [9]:
## Smoke test: build the entity index for a single file (tiny_tales_summaries.csv).
## ner_directory and file_list are not used in this cell; the per-file loop
## further down defines them again.
ner_directory = directory/"named_entities"
file_list = glob.glob(f"{directory}/*.csv")
## The input file is pipe-delimited.
df_text = pd.read_csv(directory/"tiny_tales_summaries.csv", sep="|")
dfne = dfText2DfNE(df_text)
dfne

Unnamed: 0,name,entity,chunk_id,count
0,Abhimanyu,PER,cid_7a30e824ef804f2d956b7c340355ec0d,1
1,Abhimanyu,PER,cid_869bb0c9f5f04141a9786af859398348,1
2,Abhimanyu,PER,cid_956f25c0c15c429dbf75549523a50dca,1
3,Abhimanyu,PER,cid_9af9d8d833204d4da009adecad490677,1
4,Abhimanyu,PER,cid_b5f3052a52594a68b4388cfd926eb182,4
...,...,...,...,...
1323,vati,PER,cid_bfcd527551fc46929ad1153aa8d67b8a,1
1324,vigaha,PER,cid_be680f27c5e84a3e955c6353299b7e2e,1
1325,yagandha,PER,cid_5c4b2c4baa9945bab2a81a49f1d0a678,1
1326,yana,LOC,cid_b825b13ed08b406da76c9fea6b7d3002,1


### Reverse Index each file

In [10]:
## Build and save the named-entity reverse index for every CSV file in the data folder.
ner_directory = directory/"named_entities"
## to_csv does not create missing folders, so make sure the output folder exists.
ner_directory.mkdir(parents=True, exist_ok=True)
## glob returns files in arbitrary order; sort for a reproducible processing order.
file_list = sorted(glob.glob(f"{directory}/*.csv"))

## Input files that are skipped by this loop.
skipped_files = {'tiny_tales_glossary.csv', 'summaries_combined.csv'}

for file in file_list:
    ## Path.name works with any path separator, unlike splitting on "/".
    file_name = Path(file).name
    print("Filename :", file_name, '...')
    if file_name in skipped_files:
        print('Skipping: ', file_name)
        continue

    df_text = pd.read_csv(directory/file_name, sep="|")
    df_ne = dfText2DfNE(df_text)
    ## Record which source file each entity row came from.
    df_ne['file'] = file_name
    ## Derive the output name from the stem so that only the extension is replaced.
    outfile_name = f"{Path(file_name).stem}_named_entities.csv"
    df_ne.to_csv(ner_directory/outfile_name, index=False, sep="|")
    print("Wrote file :", outfile_name)



Filename : summaries_combined.csv ...
Skipping:  summaries_combined.csv
Filename : tiny_tales_summaries.csv ...
Wrote file : tiny_tales_summaries_named_entities.csv
Filename : km_ganguli_translation_6.csv ...
Wrote file : km_ganguli_translation_6_named_entities.csv
Filename : km_ganguli_translation_14.csv ...
Wrote file : km_ganguli_translation_14_named_entities.csv
Filename : km_ganguli_translation_15.csv ...
Wrote file : km_ganguli_translation_15_named_entities.csv
Filename : km_ganguli_translation_7.csv ...
Wrote file : km_ganguli_translation_7_named_entities.csv
Filename : km_ganguli_translation_5.csv ...
Wrote file : km_ganguli_translation_5_named_entities.csv
Filename : km_ganguli_translation_17.csv ...
Wrote file : km_ganguli_translation_17_named_entities.csv
Filename : km_ganguli_translation_16.csv ...
Wrote file : km_ganguli_translation_16_named_entities.csv
Filename : km_ganguli_translation_4.csv ...
Wrote file : km_ganguli_translation_4_named_entities.csv
Filename : km_gangu