# Named Entity Recognition(NER).

## Import Data

In [1]:
### Import the Text Data from WEB || Backup The Data || Find Format Of The Data

from urllib.request import urlopen

Link= 'https://www.gutenberg.org/files/65567/65567-0.txt'
RAW_Data = urlopen(url= Link).read()
Data_Backup = RAW_Data

print('The Format or The Type of the Data is:-',type(RAW_Data))
print('Total Length of the Raw Data is :-',len(RAW_Data))

The Format or The Type of the Data is:- <class 'bytes'>
Total Length of the Raw Data is :- 120585


In [2]:
### Converting The Data into String Format

RAW_Data= RAW_Data.decode("utf-8")
print('After Conversion The Format or The Type of the Data is:-',type(RAW_Data))

After Conversion The Format or The Type of the Data is:- <class 'str'>


In [3]:
### First few lines of the Data
RAW_Data[1400:2000]

'2\r\n\r\nCONDUCTED BY R. CHAMBERS (SECUNDUS)\r\n\r\nNO. 17.—VOL. I.       SATURDAY, APRIL 26, 1884.       PRICE 1½_d._]\r\n\r\n\r\n\r\n\r\nPOST-OFFICE LIFE-ASSURANCE AND ANNUITIES.\r\n\r\n\r\nThe numerous aids which the government have from time to time afforded\r\nthrough the agency of the Post-office for the encouragement of thrift\r\nand providence amongst the poorer classes have generally been attended\r\nwith so much success, that it is surprising to hear of even one\r\nexception in regard to such efforts. There is no doubt, however, as\r\nwas pointed out two years ago in this _Journal_, that the existing\r\nscheme of Post-'

In [4]:
### Find the starting position of Meaningful Data
RAW_Data.find('The numerous aids which the government have from time to time afforded')

1568

In [5]:
### Extract Meaningful Data from the Raw Data
Text_Data= RAW_Data[1568:]
print('Total Length of the Final Data is :-',len(Text_Data))

Total Length of the Final Data is :- 118134


In [6]:
### Some sample Data
Text_Data[:500]

'The numerous aids which the government have from time to time afforded\r\nthrough the agency of the Post-office for the encouragement of thrift\r\nand providence amongst the poorer classes have generally been attended\r\nwith so much success, that it is surprising to hear of even one\r\nexception in regard to such efforts. There is no doubt, however, as\r\nwas pointed out two years ago in this _Journal_, that the existing\r\nscheme of Post-office Life-assurance and Annuities, which has been in\r\noperation si'

In [7]:
### Cleaning the data.
import re

### Removing all the special characters.
Text_Data= re.sub(r'[?|.|!|:|,]',r'',Text_Data)

### Removing tha [\r\n] tag.
Text_Data= re.sub(r'[\r\n]',r' ',Text_Data)

### Removing extra spaces in the text.
Text_Data_Cleaned= re.sub(r' +', ' ', Text_Data)

In [8]:
### Some Sample Data
Text_Data_Cleaned[:500]

'The numerous aids which the government have from time to time afforded through the agency of the Post-office for the encouragement of thrift and providence amongst the poorer classes have generally been attended with so much success that it is surprising to hear of even one exception in regard to such efforts There is no doubt however as was pointed out two years ago in this _Journal_ that the existing scheme of Post-office Life-assurance and Annuities which has been in operation since 1865 has '

In [9]:
print('         Total Length of the Data:-',len(Text_Data_Cleaned))
print('Total Number of Words in the Data:-',len(Text_Data_Cleaned.split()))

         Total Length of the Data:- 112644
Total Number of Words in the Data:- 19799


# 1. NER Using Spacy
### Installing spacy library and the NER model

In [10]:
### Installing the spacy library
# !pip install spacy

### Installing the spacy NER model
# !python -m spacy download en_core_web_sm

In [11]:
import spacy
import en_core_web_sm

### Load SpaCy model
nlp = spacy.load('en_core_web_sm')

FoundEntities= nlp(Text_Data_Cleaned)

In [12]:
### Creating Data Frame for Named Entity Tagged Words.

entities = []
labels = []
position_start = []
position_end = []

import pandas as pd

for ent in FoundEntities.ents:
    entities.append(ent.text)
    labels.append(ent.label_)
    
Ner_Data = pd.DataFrame( {'Entities':entities, 'Labels':labels} )
Ner_Data['Labels_Explanation']= [spacy.explain(Ner_Data['Labels'][i]) for i in range(len(Ner_Data['Labels']))]

print('##### Total Found Named Entity:-',len(Ner_Data))
Ner_Data

##### Total Found Named Entity:- 846


Unnamed: 0,Entities,Labels,Labels_Explanation
0,Post,ORG,"Companies, agencies, institutions, etc."
1,one,CARDINAL,Numerals that do not fall under another type
2,two years ago,DATE,Absolute or relative dates or periods
3,Journal,ORG,"Companies, agencies, institutions, etc."
4,Post-office,ORG,"Companies, agencies, institutions, etc."
...,...,...,...
841,Project Gutenberg,PERSON,"People, including fictional"
842,Project,ORG,"Companies, agencies, institutions, etc."
843,Gutenberg Literary Archive Foundation,ORG,"Companies, agencies, institutions, etc."
844,eBooks,ORG,"Companies, agencies, institutions, etc."


# 2. NER Using NLTK algorithm

In [13]:
### Tokenize the Text Data || POS tagging || Labelling Named Entity(NE)

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

### Tokenize the Sentence --
word_list = word_tokenize(Text_Data)

### POS tagging
pos_tags = nltk.pos_tag(word_list)
print('\n############## After Tokenizing Total Number of POS Tagged words :-',len(pos_tags))
print('############## After Tokenizing Few POS Tagged Words:-  \n\n',pos_tags[:20])

### Labelling the words with Named Entity(NE)
chunk_word=[]
chunks = nltk.ne_chunk(pos_tags, binary=False)  # binary=True

print('\n############### Few Named Entity Tagged Words ###############\n')

Counter = 1
for chunk in chunks:
    if (hasattr(chunk,'label') and (Counter <=5)):
        print(chunk)
        Counter = Counter + 1


############## After Tokenizing Total Number of POS Tagged words :- 20521
############## After Tokenizing Few POS Tagged Words:-  

 [('The', 'DT'), ('numerous', 'JJ'), ('aids', 'NNS'), ('which', 'WDT'), ('the', 'DT'), ('government', 'NN'), ('have', 'VBP'), ('from', 'IN'), ('time', 'NN'), ('to', 'TO'), ('time', 'NN'), ('afforded', 'VBN'), ('through', 'IN'), ('the', 'DT'), ('agency', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Post-office', 'NNP'), ('for', 'IN'), ('the', 'DT')]

############### Few Named Entity Tagged Words ###############

(PERSON Annuities/NNP)
(PERSON Fawcett/NNP)
(ORGANIZATION Committee/NNP)
(ORGANIZATION House/NNP)
(ORGANIZATION Commons/NNP)


In [14]:
### Creating Data Frame For Named Entity Tagged Words.

import pandas as pd

entities =[]
labels =[]

for chunk in chunks:
    if hasattr(chunk,'label'):
        #print(chunk)
        
        entities.append(' '.join(i[0] for i in chunk))
        labels.append(chunk.label())
        
        entities_labels = list(set(zip(entities, labels)))
        NER_Data = pd.DataFrame(entities_labels, columns = ["Entities","Labels"])
        
print('##### Total Found Named Entity:-',len(NER_Data))
NER_Data

##### Total Found Named Entity:- 320


Unnamed: 0,Entities,Labels
0,Far,PERSON
1,Cameron,PERSON
2,Postume Postume,PERSON
3,Exactly That,PERSON
4,Buckwheat,PERSON
...,...,...
315,Providence,ORGANIZATION
316,Lavenham,GPE
317,Wood,PERSON
318,Wire,PERSON


&emsp;&emsp;&emsp; &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; &emsp;&emsp; &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; &emsp;&emsp; &emsp;&emsp;&emsp;&emsp;&emsp; &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;Pranab Kumar Paul.