conda install -c conda-forge spacy-model-en_core_web_md

In [11]:
data = """The Mahabharata is a story about a great battle between the Kauravas and the Pandavas.
The battle was fought in Kurukshetra near Delhi. 
Many kings and princes took part in the battle. 
The Pandavas defeated the Kauravas.
The Bhagvad Gita is a holy book of the Hindus.
It is a part of the Mahabharata. 
Then, Lord Rama, with the help of It is a book of collection of teachings of Lord Krishna to Arjuna in the battlefield.
It is the longest epic in the world."""

In [20]:
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation


In [9]:
stopword_list = stopwords.words("english")
stopword_list.remove("no")
stopword_list.remove("not")
stopword_list.remove("nor")

def cleaning_text(data):
    tokens = word_tokenize(data)
    clean_tokens = [ i.lower() for i in tokens if (i.lower() not in stopword_list) and (i not in punctuation)]
    clean_tokens = [ i for i in clean_tokens if (len(i)>1 and i.isalpha())]
    return " ".join(clean_tokens)

In [12]:
clean_text = cleaning_text(data)
clean_text

'mahabharata story great battle kauravas pandavas battle fought kurukshetra near delhi many kings princes took part battle pandavas defeated kauravas bhagvad gita holy book hindus part mahabharata lord rama help book collection teachings lord krishna arjuna battlefield longest epic world'

In [18]:
nlp = spacy.load("en_core_web_md")



In [22]:
doc1 = nlp(clean_text)
doc1

mahabharata story great battle kauravas pandavas battle fought kurukshetra near delhi many kings princes took part battle pandavas defeated kauravas bhagvad gita holy book hindus part mahabharata lord rama help book collection teachings lord krishna arjuna battlefield longest epic world

In [23]:
[(token.text,token.pos_) for token in doc1]

[('mahabharata', 'PROPN'),
 ('story', 'NOUN'),
 ('great', 'ADJ'),
 ('battle', 'NOUN'),
 ('kauravas', 'NOUN'),
 ('pandavas', 'PROPN'),
 ('battle', 'PROPN'),
 ('fought', 'VERB'),
 ('kurukshetra', 'PROPN'),
 ('near', 'ADP'),
 ('delhi', 'PROPN'),
 ('many', 'ADJ'),
 ('kings', 'NOUN'),
 ('princes', 'NOUN'),
 ('took', 'VERB'),
 ('part', 'NOUN'),
 ('battle', 'NOUN'),
 ('pandavas', 'PROPN'),
 ('defeated', 'VERB'),
 ('kauravas', 'PROPN'),
 ('bhagvad', 'PROPN'),
 ('gita', 'PROPN'),
 ('holy', 'PROPN'),
 ('book', 'PROPN'),
 ('hindus', 'PROPN'),
 ('part', 'PROPN'),
 ('mahabharata', 'PROPN'),
 ('lord', 'PROPN'),
 ('rama', 'PROPN'),
 ('help', 'AUX'),
 ('book', 'NOUN'),
 ('collection', 'NOUN'),
 ('teachings', 'NOUN'),
 ('lord', 'PROPN'),
 ('krishna', 'PROPN'),
 ('arjuna', 'PROPN'),
 ('battlefield', 'VERB'),
 ('longest', 'ADJ'),
 ('epic', 'ADJ'),
 ('world', 'NOUN')]

In [24]:
# named entity
[(token.text,token.label) for token in doc1.ents]

[('kurukshetra', 381),
 ('delhi', 384),
 ('kauravas bhagvad gita', 380),
 ('krishna arjuna battlefield', 380)]

#### without cleaning

In [25]:
doc = nlp(data)

In [26]:
[(token.text,token.pos_) for token in doc]

[('The', 'DET'),
 ('Mahabharata', 'PROPN'),
 ('is', 'AUX'),
 ('a', 'DET'),
 ('story', 'NOUN'),
 ('about', 'ADP'),
 ('a', 'DET'),
 ('great', 'ADJ'),
 ('battle', 'NOUN'),
 ('between', 'ADP'),
 ('the', 'DET'),
 ('Kauravas', 'PROPN'),
 ('and', 'CCONJ'),
 ('the', 'DET'),
 ('Pandavas', 'PROPN'),
 ('.', 'PUNCT'),
 ('\n', 'SPACE'),
 ('The', 'DET'),
 ('battle', 'NOUN'),
 ('was', 'AUX'),
 ('fought', 'VERB'),
 ('in', 'ADP'),
 ('Kurukshetra', 'PROPN'),
 ('near', 'ADP'),
 ('Delhi', 'PROPN'),
 ('.', 'PUNCT'),
 ('\n', 'SPACE'),
 ('Many', 'ADJ'),
 ('kings', 'NOUN'),
 ('and', 'CCONJ'),
 ('princes', 'NOUN'),
 ('took', 'VERB'),
 ('part', 'NOUN'),
 ('in', 'ADP'),
 ('the', 'DET'),
 ('battle', 'NOUN'),
 ('.', 'PUNCT'),
 ('\n', 'SPACE'),
 ('The', 'DET'),
 ('Pandavas', 'PROPN'),
 ('defeated', 'VERB'),
 ('the', 'DET'),
 ('Kauravas', 'PROPN'),
 ('.', 'PUNCT'),
 ('\n', 'SPACE'),
 ('The', 'DET'),
 ('Bhagvad', 'PROPN'),
 ('Gita', 'PROPN'),
 ('is', 'AUX'),
 ('a', 'DET'),
 ('holy', 'ADJ'),
 ('book', 'NOUN'),
 ('of',

In [27]:
# named entity
[(token.text,token.label) for token in doc.ents]

[('Mahabharata', 380),
 ('Kurukshetra', 381),
 ('Delhi', 384),
 ('The Bhagvad Gita', 380),
 ('Hindus', 381),
 ('Rama', 380),
 ('Krishna', 380),
 ('Arjuna', 380)]

In [31]:
spacy.explain(380)

In [32]:
from spacy import displacy
displacy.render(doc1,style="ent",jupyter=True)

In [34]:
from spacy import displacy
displacy.render(doc,style="ent",jupyter=True)

In [35]:
nlp.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [38]:
vect1 = nlp("machine")
vect1.vector

array([-1.7952e+00,  8.8564e-01,  1.2995e+00,  5.3467e+00,  2.3356e+00,
        6.3150e-01,  3.2695e+00,  5.3845e+00, -4.7579e+00, -1.1556e+00,
        7.7207e+00,  1.7057e+00, -5.1134e+00,  5.7279e+00, -6.4964e-01,
        2.5735e+00,  4.0722e+00,  2.7346e+00, -3.6327e-01,  3.1659e-01,
        1.6892e+00,  4.0048e+00,  2.5983e-02,  1.4939e+00, -3.3041e+00,
       -1.4575e+00, -1.2547e+00, -4.5687e+00,  1.5173e+00,  6.8314e-01,
        5.7678e-01, -2.0256e+00,  9.1923e-01,  2.8375e+00,  1.7972e+00,
       -3.1808e-01,  5.2218e+00,  1.3707e+00,  3.3360e+00,  4.5912e+00,
       -1.0869e+00, -2.5762e+00,  4.5750e+00,  5.6772e-01,  9.5806e-01,
        7.2935e-01, -5.6902e-01,  8.2880e-01,  6.3900e-01, -4.3184e+00,
        2.8204e+00,  6.8541e-01,  3.3632e+00, -3.3472e+00, -4.6527e-01,
        2.1472e+00,  1.3270e+00,  8.0798e-01,  9.7008e-01,  2.0223e+00,
        4.2663e+00,  2.2851e+00, -3.5020e+00, -3.2424e+00, -1.6429e+00,
        8.8501e-01, -3.4272e+00, -4.5300e+00, -2.9577e+00, -6.50

In [39]:
vect1 = nlp("data")
vect1.vector

array([  8.433   ,  -0.62196 ,   4.5699  ,   4.2771  ,   7.5471  ,
        -3.4932  ,  -2.7744  ,  11.539   ,  -0.41976 ,  -5.7145  ,
        13.106   ,   5.9285  ,  -5.0834  ,   2.0165  ,  -1.3187  ,
        -3.1088  ,   8.6808  ,  -0.37767 , -12.834   ,  -7.7306  ,
        -1.3145  ,   0.19578 , -12.683   ,  -4.448   , -12.181   ,
        -6.8301  ,   3.1909  ,  -4.2021  ,  -1.1167  ,   5.6552  ,
         4.5593  ,   1.1842  ,  -8.839   ,   5.1291  ,   8.8025  ,
        -7.7602  ,  -7.9684  ,  -7.8937  ,  10.286   ,   5.7079  ,
         3.1926  ,  -0.52753 ,  -2.0976  ,   6.7266  ,  -5.971   ,
        -0.52903 ,   8.3433  ,  -1.7521  ,   4.4892  ,  -2.456   ,
         5.7153  ,  -0.29028 ,  -2.1078  ,  -4.7923  ,  -2.8459  ,
         2.5139  ,  -7.3703  ,   0.25596 ,   5.1344  , -12.615   ,
         5.0602  ,   6.3371  ,  -6.1839  ,   0.71782 ,   5.8044  ,
         4.7112  ,   0.20968 ,  -3.2077  ,  -0.62559 ,   0.22191 ,
         2.8759  ,   5.6576  ,  -4.5906  ,   5.0782  ,  -4.020

In [48]:
text1 = "Rakesh plays Basketball and Football"
doc2 = nlp(text1)
for ent in doc2.ents:
    print(ent.text,ent.label)

Basketball and Football 388


In [49]:
spacy.explain("Basketball")

  


In [52]:
nlp2 = spacy.load("en_core_web_md",disable= ["ner"])
ruler = nlp2.add_pipe("entity_ruler")
patterns = [{"label":"SPORT","pattern":"Basketball"},{"label":"SPORT","pattern":"Football"},{"label":"Person","pattern":"Rakesh"}]
ruler.add_patterns(patterns)
text1 = "Rakesh plays Basketball and Football"
doc2 = nlp2(text1)
for ent in doc2.ents:
    print(ent.text,ent.label_)



Rakesh Person
Basketball SPORT
Football SPORT
