# Named Entity Recognition (NER)

In [1]:
# Imports grouped in one place: standard library first, then third-party.
# `word_tokenize` was previously imported twice (from `nltk.tokenize` and from
# `nltk`); both paths bind the same function, so a single import is kept.
from collections import Counter

import nltk
from nltk import ne_chunk, pos_tag, word_tokenize

# NLTK data packages fetched by this notebook, in the original download order.
# NOTE(review): no visible cell uses 'wordnet' (or `Counter`); both are kept in
# case cells outside this view rely on them.
# NOTE(review): newer NLTK releases may expect differently named resources
# (e.g. 'punkt_tab') -- confirm against the installed version.
NLTK_RESOURCES = (
    'wordnet',
    'words',
    'averaged_perceptron_tagger',
    'punkt',
    'maxent_ne_chunker',
)

# A list (not a generator) is built on purpose so that every package is
# attempted even if an earlier one fails. The cell then displays a single flag
# covering all downloads, instead of only the result of the last one.
all([nltk.download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package wordnet to /home/reza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/reza/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/reza/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/reza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/reza/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [2]:
# Sample English paragraph used as input for the NLTK pipeline below.
# Adjacent string literals are concatenated by Python, so the value is one
# continuous string; the wrapping is only for readability.
example_text = (
    "Avengers: Endgame is a 2019 American superhero film based on the Marvel "
    "Comics superhero team the Avengers, produced by Marvel Studios and "
    "distributed by Walt Disney Studios Motion Pictures. The movie features "
    "an ensemble cast including Robert Downey Jr., Chris Evans, Mark Ruffalo, "
    "Chris Hemsworth, and others. (Source: wikipedia)."
)

### Tokenization and POS tagging

In [3]:
# Step 1: split the sample paragraph into tokens.
tokenized_words = nltk.word_tokenize(example_text)

# Step 2: tag each token with its part of speech; the tagged sequence is the
# input to the named-entity chunker in the next cell.
pos_tagging_words = nltk.pos_tag(tokenized_words)


### Chunking

In [4]:
# Run NLTK's named-entity chunker over the POS-tagged tokens. The result is a
# tree whose top-level children are either plain (token, tag) pairs or labelled
# subtrees such as PERSON / ORGANIZATION.
chunked = ne_chunk(pos_tagging_words)

# Walk the tree nodes directly instead of splitting str(chunked) into lines.
# The tree's pretty-printer wraps long subtrees over several lines, which would
# separate an entity label from its tokens, and a substring test on the text
# would also match a token whose own text happens to contain "/NN".
for node in chunked:
    if isinstance(node, nltk.Tree):
        # Labelled chunk: render it as "(LABEL tok/TAG tok/TAG ...)".
        tagged_tokens = node.leaves()
        rendered = '(%s %s)' % (
            node.label(),
            ' '.join('/'.join(pair) for pair in tagged_tokens),
        )
    else:
        # Plain token outside any chunk: render it as "tok/TAG".
        tagged_tokens = [node]
        rendered = '/'.join(node)

    # Keep only nodes that contain at least one noun tag (NN, NNS, NNP, NNPS),
    # matching the original "/NN" filter.
    if any(tag.startswith('NN') for _, tag in tagged_tokens):
        # Two-space indent mirrors the layout of the tree's string form.
        print('  ' + rendered)

  Avengers/NNS
  Endgame/NN
  superhero/NN
  film/NN
  (ORGANIZATION Marvel/NNP Comics/NNP)
  superhero/NN
  team/NN
  (ORGANIZATION Avengers/NNPS)
  (PERSON Marvel/NNP Studios/NNP)
  (PERSON Walt/NNP Disney/NNP Studios/NNP)
  Motion/NNP
  Pictures/NNP
  movie/NN
  cast/NN
  (PERSON Robert/NNP Downey/NNP Jr./NNP)
  (PERSON Chris/NNP Evans/NNP)
  (PERSON Mark/NNP Ruffalo/NNP)
  (PERSON Chris/NNP Hemsworth/NNP)
  others/NNS
  (PERSON Source/NN)
  wikipedia/NN


## NER in Persian

In [5]:
from hazm import WordTokenizer
from hazm import Normalizer
from hazm import Chunker
from hazm import POSTagger
from hazm import tree2brackets

In [6]:
# Sample Persian sentence used as input for the hazm pipeline below.
# Adjacent string literals are concatenated by Python, so the value is one
# continuous string; the wrapping is only for readability.
example_persian_sentence = (
    "نور خورشید به سختی از میان برگ‌های انبوه درختان کهنسال "
    "به زمین نفوذ می‌کرد و سایه‌ای خنک و دل‌انگیز بر بستر جنگل "
    "گسترانده بود."
)

### Tokenization

In [7]:
# Stage 1: pass the raw sentence through hazm's Normalizer before tokenizing.
normalizer = Normalizer()
normalized_sentence = normalizer.normalize(example_persian_sentence)

# Stage 2: split the normalized sentence into tokens with hazm's WordTokenizer.
wordTokenizer = WordTokenizer()
words = wordTokenizer.tokenize(normalized_sentence)


### POS tagging

In [10]:
# File name of the tagger model handed to hazm.
# NOTE(review): presumably resolved relative to the notebook's working
# directory -- confirm the file is present there before running.
POS_TAGGER_MODEL = "pos_tagger.model"

# Load the tagger, then tag the tokens produced by the tokenization cell.
posTagger = POSTagger(model=POS_TAGGER_MODEL)
tagged_words = posTagger.tag(words)


### Chunking

In [9]:
# File name of the chunker model handed to hazm.
# NOTE(review): presumably resolved relative to the notebook's working
# directory -- confirm the file is present there before running.
CHUNKER_MODEL = "chunker.model"
chunker = Chunker(model=CHUNKER_MODEL)

# Parse the tagged tokens into a chunk tree, then flatten it to bracket
# notation (e.g. "[... NP] [... VP]") for display.
# NOTE(review): `chunked` was bound to the NLTK chunk tree in the English
# section; from here on it holds a plain string instead.
chunk_tree = chunker.parse(tagged_words)
chunked = tree2brackets(chunk_tree)
chunked


'[نور خورشید NP] [به PP] [سختی NP] [از PP] [میان برگ\u200cهای انبوه درختان کهنسال NP] [به PP] [زمین NP] [نفوذ می\u200cکرد VP] و [سایه\u200cای خنک و دل\u200cانگیز NP] [بر PP] [بستر جنگل NP] [گسترانده_بود VP] .'