# Using SPACY

In [1]:
import spacy

In [2]:
nlp_sm = spacy.load("en_core_web_sm")
nlp_md = spacy.load("en_core_web_md")
nlp_lg = spacy.load("en_core_web_lg")

In [16]:
text = ["Missing Indian student in US confirmed dead; body found at Purdue University campus"]

In [23]:
text1 = ["Apple Inc. was founded by steve jobs and steve wozniak in Cupertiono. Google is located in Mountain view, CAlifornia."]

In [17]:
doc_sm = nlp_sm(text[0])
doc_md = nlp_md(text[0])
doc_lg = nlp_lg(text[0])

In [47]:
for i in doc_sm:
    print(f"{i}: {i.pos_}: {spacy.explain(str(i.pos_))}: {i.lemma_}")

Missing: VERB: verb: miss
Indian: ADJ: adjective: indian
student: NOUN: noun: student
in: ADP: adposition: in
US: PROPN: proper noun: US
confirmed: VERB: verb: confirm
dead: ADJ: adjective: dead
;: PUNCT: punctuation: ;
body: NOUN: noun: body
found: VERB: verb: find
at: ADP: adposition: at
Purdue: PROPN: proper noun: Purdue
University: PROPN: proper noun: University
campus: NOUN: noun: campus


In [46]:
t = []
for i in doc_sm:
    print(type(i.pos_))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


str

In [18]:
for i in doc_sm.ents:
    print(f"{i.text} | {i.label_} | {spacy.explain(i.label_)}")

Indian | NORP | Nationalities or religious or political groups
US | GPE | Countries, cities, states
Purdue University | ORG | Companies, agencies, institutions, etc.


In [19]:
for i in doc_md.ents:
    print(f"{i.text} | {i.label_} | {spacy.explain(i.label_)}")

Indian | NORP | Nationalities or religious or political groups
US | GPE | Countries, cities, states
Purdue University | ORG | Companies, agencies, institutions, etc.


In [20]:
for i in doc_lg.ents:
    print(f"{i.text} | {i.label_} | {spacy.explain(i.label_)}")

Indian | NORP | Nationalities or religious or political groups
US | GPE | Countries, cities, states
Purdue University | ORG | Companies, agencies, institutions, etc.


# Using NLTK

In [22]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

In [24]:
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\priyavrat.sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\priyavrat.sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\priyavrat.sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\priyavrat.sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [25]:
tokens = word_tokenize(text[0])
pos_tags = pos_tag(tokens)

In [26]:
tree = ne_chunk(pos_tags)

In [27]:
for subtree in tree:
    if isinstance(subtree,nltk.Tree):
        entity = " ".join([word for word,tag in subtree.leaves()])
        label = subtree.label()
        print(f"{entity}: {label}")

Indian: GPE
US: GSP
Purdue University: ORGANIZATION


# Using Transformer Based Model

In [28]:
from transformers import pipeline




In [29]:
nlp = pipeline(task = "ner", model = "dbmdz/bert-large-cased-finetuned-conll03-english")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
predictions = nlp(text[0])
print(predictions)

[{'entity': 'I-MISC', 'score': 0.99519265, 'index': 2, 'word': 'Indian', 'start': 8, 'end': 14}, {'entity': 'I-LOC', 'score': 0.99802136, 'index': 5, 'word': 'US', 'start': 26, 'end': 28}, {'entity': 'I-ORG', 'score': 0.9659601, 'index': 12, 'word': 'Purdue', 'start': 59, 'end': 65}, {'entity': 'I-ORG', 'score': 0.93503267, 'index': 13, 'word': 'University', 'start': 66, 'end': 76}]


In [33]:
for i in predictions:
    print(f"{i['entity']} | {i['score']} | {i['word']}")

I-MISC | 0.9951926469802856 | Indian
I-LOC | 0.9980213642120361 | US
I-ORG | 0.9659600853919983 | Purdue
I-ORG | 0.9350326657295227 | University
