In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [7]:
# word_tokenize() needs the Punkt tokenizer models and pos_tag() needs the
# averaged-perceptron tagger; fetch both so a fresh environment can run the
# cells that call preprocess(). The tagger download stays last so the cell's
# displayed result is unchanged.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sanjiv\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [3]:
def preprocess(sentence):
    """Tokenize ``sentence`` and tag every token with its part of speech.

    The text is split with ``word_tokenize`` and the resulting tokens are
    passed to ``pos_tag``; the tagger's result is returned unchanged
    (a list of ``(token, tag)`` pairs, as shown where ``pos_words`` is displayed).
    """
    return pos_tag(word_tokenize(sentence))

In [4]:
# Example news sentence fed to preprocess() for the POS-tagging demo.
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [8]:
# Tokenize and POS-tag the example sentence.
pos_words = preprocess(ex)

In [9]:
# Display the (token, tag) pairs produced by preprocess().
pos_words

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [12]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the pipeline shipped in the en_core_web_sm package; `nlp` is then
# called on raw text to produce the documents analysed below.
nlp = en_core_web_sm.load()

In [13]:
# Run the spaCy pipeline on the same example sentence used for the NLTK
# POS-tagging demo (`ex`) instead of repeating the literal, so both examples
# are guaranteed to analyse identical text.
doc = nlp(ex)

In [14]:
# Display the processed document; it is shown as the original sentence text.
doc

European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices

In [15]:
# Print every named entity found in the sentence as a (text, label) pair.
print([
    (entity.text, entity.label_)
    for entity in doc.ents
])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [16]:
# Two-paragraph passage about the Indian Premier League, used as a longer
# input for entity recognition. The bracketed citation markers ([3], [4], ...)
# are left in the text, so they reach the tokenizer as-is (note the
# 'cities.[3' token in the per-token output further down).
ipl = '''The Indian Premier League (IPL) is a professional Twenty20 cricket league, contested by eight teams based out of eight different Indian cities.[3] The league was founded by the Board of Control for Cricket in India (BCCI) in 2007. It is usually held between March and May of every year and has an exclusive window in the ICC Future Tours Programme.[4]

The IPL is the most-attended cricket league in the world and in 2014 was ranked sixth by average attendance among all sports leagues.[5] In 2010, the IPL became the first sporting event in the world to be broadcast live on YouTube.[6][7] The brand value of the IPL in 2019 was ₹475 billion (US$6.7 billion), according to Duff & Phelps.[8] According to BCCI, the 2015 IPL season contributed ₹11.5 billion (US$160 million) to the GDP of the Indian economy.[9] '''

In [18]:
# Run the spaCy pipeline over the multi-sentence IPL passage.
doc2 = nlp(ipl)

In [19]:
# Show every named entity detected in the IPL passage as (text, label).
[
    (entity.text, entity.label_)
    for entity in doc2.ents
]

[('Indian', 'NORP'),
 ('Twenty20', 'CARDINAL'),
 ('eight', 'CARDINAL'),
 ('eight', 'CARDINAL'),
 ('Indian', 'NORP'),
 ('the Board of Control for Cricket', 'ORG'),
 ('India', 'GPE'),
 ('BCCI', 'ORG'),
 ('2007', 'DATE'),
 ('between March and May of every year', 'DATE'),
 ('the ICC Future Tours', 'ORG'),
 ('2014', 'DATE'),
 ('sixth', 'ORDINAL'),
 ('2010', 'DATE'),
 ('first', 'ORDINAL'),
 ('2019', 'DATE'),
 ('US$6.7 billion', 'MONEY'),
 ('Duff &', 'ORG'),
 ('BCCI', 'ORG'),
 ('2015', 'DATE'),
 ('₹11.5 billion', 'MONEY'),
 ('US$160 million', 'MONEY'),
 ('Indian', 'NORP')]

In [22]:
# Token-level view of the entity annotations: for each token print the token
# itself, its IOB code ('B' begins an entity, 'I' continues one, 'O' is
# outside any entity) and the entity type ('' when the token is outside).
print([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc2
])

[(The, 'O', ''), (Indian, 'B', 'NORP'), (Premier, 'O', ''), (League, 'O', ''), ((, 'O', ''), (IPL, 'O', ''), (), 'O', ''), (is, 'O', ''), (a, 'O', ''), (professional, 'O', ''), (Twenty20, 'B', 'CARDINAL'), (cricket, 'O', ''), (league, 'O', ''), (,, 'O', ''), (contested, 'O', ''), (by, 'O', ''), (eight, 'B', 'CARDINAL'), (teams, 'O', ''), (based, 'O', ''), (out, 'O', ''), (of, 'O', ''), (eight, 'B', 'CARDINAL'), (different, 'O', ''), (Indian, 'B', 'NORP'), (cities.[3, 'O', ''), (], 'O', ''), (The, 'O', ''), (league, 'O', ''), (was, 'O', ''), (founded, 'O', ''), (by, 'O', ''), (the, 'B', 'ORG'), (Board, 'I', 'ORG'), (of, 'I', 'ORG'), (Control, 'I', 'ORG'), (for, 'I', 'ORG'), (Cricket, 'I', 'ORG'), (in, 'O', ''), (India, 'B', 'GPE'), ((, 'O', ''), (BCCI, 'B', 'ORG'), (), 'O', ''), (in, 'O', ''), (2007, 'B', 'DATE'), (., 'O', ''), (It, 'O', ''), (is, 'O', ''), (usually, 'O', ''), (held, 'O', ''), (between, 'B', 'DATE'), (March, 'I', 'DATE'), (and, 'I', 'DATE'), (May, 'I', 'DATE'), (of, 'I'