<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/nlp-in-real-world/02-advanced-nlp-applications/01_information_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [4]:
%%capture
# Install the notebook's dependencies and silence the (long) installer output.
# %pip is used instead of !pip so packages are installed into the environment of
# the running kernel rather than whichever `pip` happens to be first on PATH.

%pip install transformers
%pip install -q spacy
%pip install spacy-transformers==1.1.5 -f https://download.pytorch.org/whl/torch_stable.html

# Download the English spaCy pipelines (sm, lg and trf variants).
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_trf

In [None]:
# NOTE(review): transformers is also installed by the %%capture setup cell above,
# so this cell looks redundant — confirm before removing it.
# %pip (not !pip) installs into the running kernel's environment.
%pip install transformers

In [6]:
import re

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk import ne_chunk

from transformers import pipeline

import spacy
from spacy import displacy

In [None]:
# Download the NLTK data packages required by the NLTK section of this notebook.
nltk.download('averaged_perceptron_tagger')  # POS tagger model used by pos_tag
nltk.download('maxent_ne_chunker')           # named-entity chunker model used by ne_chunk
nltk.download('words')                       # word list the NE chunker depends on
nltk.download('punkt')                       # tokenizer model used by sent_tokenize / word_tokenize

## Entity extraction

In [3]:
# Sample passage shared by every entity-extraction approach in this notebook.
# Written as adjacent string literals (joined at compile time) to keep lines short;
# the resulting value is a single-line string.
raw_text = (
    "The Mars Orbiter Mission (MOM), informally known as Mangalyaan, "
    "was launched into Earth orbit on 5 November 2013 by the Indian Space "
    "Research Organisation (ISRO) and has entered Mars orbit on "
    "24 September 2014. "
    "India thus became the first country to enter Mars orbit on its first attempt. "
    "It was completed at a record low cost of $74 million."
)

In [3]:
#### RegEx ############

text_list = [
    "send to j_2.4-dj3@xyz.co.net for queries.",
    "follow me on twitter@jyotikasingh_."
]

# The pattern is a raw string so that \S and \. reach the regex engine untouched;
# in a normal string literal they are invalid escape sequences and trigger a
# DeprecationWarning / SyntaxWarning on recent Python versions.
#
# Pattern pieces:
#   \S  matches any non-whitespace character
#   @   the literal "@" in the email ID
#   \.  a literal period somewhere after the "@"
#   +   the preceding character class repeated one or more times
EMAIL_PATTERN = re.compile(r"\S+@\S+\.\S+")

for text in text_list:
  # The second sample has nothing after its final period, so it yields no match.
  print(EMAIL_PATTERN.findall(text))

['j_2.4-dj3@xyz.co.net']
[]


In [8]:
#### SpaCy ############

# Load the large English pipeline and run it over the sample passage.
nlp = spacy.load("en_core_web_lg")
doc = nlp(raw_text)

# One line per detected entity: its surface text followed by its label.
for ent in doc.ents:
  print(ent.text, ent.label_)

# Highlight the same entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

The Mars Orbiter Mission PRODUCT
Mangalyaan PERSON
Earth LOC
5 November 2013 DATE
the Indian Space Research Organisation ORG
ISRO ORG
Mars LOC
24 September 2014 DATE
India GPE
first ORDINAL
Mars LOC
first ORDINAL
$74 million MONEY


In [13]:
#### NLTK ############

# Tokenize the whole passage, then attach a part-of-speech tag to every token.
# NOTE: this rebinds `doc` (previously the spaCy result) to a list of
# (token, tag) tuples.
tokens = word_tokenize(raw_text)
doc = pos_tag(tokens)

In [14]:
# Show the (token, POS tag) pairs produced by the previous cell.
doc

[('The', 'DT'),
 ('Mars', 'NNP'),
 ('Orbiter', 'NNP'),
 ('Mission', 'NNP'),
 ('(', '('),
 ('MOM', 'NNP'),
 (')', ')'),
 (',', ','),
 ('informally', 'RB'),
 ('known', 'VBN'),
 ('as', 'IN'),
 ('Mangalyaan', 'NNP'),
 (',', ','),
 ('was', 'VBD'),
 ('launched', 'VBN'),
 ('into', 'IN'),
 ('Earth', 'NNP'),
 ('orbit', 'NN'),
 ('on', 'IN'),
 ('5', 'CD'),
 ('November', 'NNP'),
 ('2013', 'CD'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('Indian', 'JJ'),
 ('Space', 'NNP'),
 ('Research', 'NNP'),
 ('Organisation', 'NNP'),
 ('(', '('),
 ('ISRO', 'NNP'),
 (')', ')'),
 ('and', 'CC'),
 ('has', 'VBZ'),
 ('entered', 'VBN'),
 ('Mars', 'NNP'),
 ('orbit', 'NN'),
 ('on', 'IN'),
 ('24', 'CD'),
 ('September', 'NNP'),
 ('2014', 'CD'),
 ('.', '.'),
 ('India', 'NNP'),
 ('thus', 'RB'),
 ('became', 'VBD'),
 ('the', 'DT'),
 ('first', 'JJ'),
 ('country', 'NN'),
 ('to', 'TO'),
 ('enter', 'VB'),
 ('Mars', 'NNP'),
 ('orbit', 'NN'),
 ('on', 'IN'),
 ('its', 'PRP$'),
 ('first', 'JJ'),
 ('attempt', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),


In [15]:
# Entity labels to keep from NLTK's chunker output; everything else is ignored.
NLTK_LABELS = ["PERSON", "ORGANIZATION", "GPE"]

# POS-tag the passage one sentence at a time.
tagged_doc = [
    pos_tag(word_tokenize(sentence)) for sentence in sent_tokenize(raw_text)
]

# Collect (entity text, entity label) pairs across all sentences.
entities = []
for tagged_sentence in tagged_doc:
  for node in ne_chunk(tagged_sentence):
    # Skip elements that carry no label() or whose label is not one we keep.
    if not hasattr(node, "label") or node.label() not in NLTK_LABELS:
      continue

    kept_tokens = []
    for token, _tag in node:
      lowered = token.lower()
      # Drop noise: URL-like tokens, tokens containing a newline, blank tokens.
      if "http" in lowered or "\n" in lowered or len(token.strip()) == 0:
        continue
      kept_tokens.append(token)

    entities.append((" ".join(kept_tokens), node.label()))
print(entities)

[('Mars', 'ORGANIZATION'), ('MOM', 'ORGANIZATION'), ('Mangalyaan', 'GPE'), ('Earth', 'GPE'), ('Indian', 'GPE'), ('Space Research Organisation', 'ORGANIZATION'), ('ISRO', 'ORGANIZATION'), ('Mars', 'PERSON'), ('India', 'GPE'), ('Mars', 'PERSON')]


In [4]:
##### spaCy transformers #######
# Load the transformer-based English pipeline; this rebinds `nlp`.
nlp = spacy.load("en_core_web_trf")

# Process the same sample passage with the new pipeline; this rebinds `doc`.
doc = nlp(raw_text)

# Render the detected entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [None]:
####### Transformers ###########

# `grouped_entities=True` is deprecated in transformers; `aggregation_strategy="simple"`
# is its documented replacement and groups adjacent tokens that share a tag.
# NOTE(review): "simple" can still leave one word split into several word pieces;
# the "first" / "average" / "max" strategies aggregate at word level — consider
# switching if fragmented entities are a problem.
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

In [9]:
# Run the NER pipeline on the sample passage; the returned list of entity dicts
# is displayed as the cell output.
ner(raw_text)

[{'entity_group': 'MISC',
  'score': 0.7344233,
  'word': 'Mars Orbiter Mission',
  'start': 4,
  'end': 24},
 {'entity_group': 'MISC',
  'score': 0.6008749,
  'word': 'MOM',
  'start': 26,
  'end': 29},
 {'entity_group': 'LOC',
  'score': 0.43170488,
  'word': 'Man',
  'start': 52,
  'end': 55},
 {'entity_group': 'MISC',
  'score': 0.5044301,
  'word': '##gal',
  'start': 55,
  'end': 58},
 {'entity_group': 'LOC',
  'score': 0.47212598,
  'word': '##ya',
  'start': 58,
  'end': 60},
 {'entity_group': 'MISC',
  'score': 0.48969537,
  'word': '##an',
  'start': 60,
  'end': 62},
 {'entity_group': 'LOC',
  'score': 0.7542032,
  'word': 'Earth',
  'start': 82,
  'end': 87},
 {'entity_group': 'ORG',
  'score': 0.99907124,
  'word': 'Indian Space Research Organisation',
  'start': 120,
  'end': 154},
 {'entity_group': 'ORG',
  'score': 0.9986104,
  'word': 'ISRO',
  'start': 156,
  'end': 160},
 {'entity_group': 'LOC',
  'score': 0.99694604,
  'word': 'Mars',
  'start': 178,
  'end': 182},
