In [None]:
! pip install spacy
! python -m spacy download en_core_web_md

In [1]:
# Sample passage about the Sun; this raw text is the input for both the
# cleaned and the uncleaned spaCy runs below. The line breaks inside the
# literal are part of the string.
data = """The sun is a huge ball of gases. It has a diameter
of 1,392,000 km. It is so huge that it can hold millions of planets 
inside it. The Sun is mainly made up of hydrogen and helium gas. The
surface of the Sun is known as the photosphere. The photosphere is surrounded 
by a thin layer of gas known as the chromospheres. Without the Sun, there would 
be no life on Earth. There would be no plants, no animals and no human beings. As,
all the living things on Earth get their energy from the Sun for their survival."""

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
# Start from NLTK's English stop words, then take the negations out of the
# list so that "no", "not" and "nor" survive the cleaning step.
stopword_list = stopwords.words("english")
for negation in ("no", "not", "nor"):
    stopword_list.remove(negation)
def cleaning_text(data):
    """Tokenize ``data`` and return the kept tokens joined by single spaces.

    Each token produced by ``word_tokenize`` is lowercased and kept only if:
    its lowercase form is not in ``stopword_list``, the token is not found in
    ``string.punctuation``, it is longer than one character, and it is purely
    alphabetic.
    """
    kept = []
    for raw_token in word_tokenize(data):
        lowered = raw_token.lower()
        # Skip stop words and punctuation tokens.
        if lowered in stopword_list or raw_token in punctuation:
            continue
        # Skip single characters and anything containing digits or symbols.
        if len(lowered) > 1 and lowered.isalpha():
            kept.append(lowered)
    return " ".join(kept)

In [3]:
# Clean the raw passage and display the resulting string.
clean_text = cleaning_text(data)
clean_text

'sun huge ball gases diameter km huge hold millions planets inside sun mainly made hydrogen helium gas surface sun known photosphere photosphere surrounded thin layer gas known chromospheres without sun would no life earth would no plants no animals no human beings living things earth get energy sun survival'

In [4]:
import spacy
# Load the 'en_core_web_md' pipeline fetched by the install cell at the top.
nlp = spacy.load('en_core_web_md')


In [5]:
# Run the spaCy pipeline on the cleaned text.
doc1 = nlp(clean_text)

In [7]:
# Part-of-speech tag of every token in the cleaned text, as (text, tag) pairs.
[(tok.text, tok.pos_) for tok in doc1]

[('sun', 'PROPN'),
 ('huge', 'ADJ'),
 ('ball', 'NOUN'),
 ('gases', 'NOUN'),
 ('diameter', 'NOUN'),
 ('km', 'ADP'),
 ('huge', 'ADJ'),
 ('hold', 'NOUN'),
 ('millions', 'NOUN'),
 ('planets', 'NOUN'),
 ('inside', 'ADP'),
 ('sun', 'NOUN'),
 ('mainly', 'ADV'),
 ('made', 'VERB'),
 ('hydrogen', 'NOUN'),
 ('helium', 'NOUN'),
 ('gas', 'NOUN'),
 ('surface', 'PROPN'),
 ('sun', 'PROPN'),
 ('known', 'VERB'),
 ('photosphere', 'PROPN'),
 ('photosphere', 'PROPN'),
 ('surrounded', 'VERB'),
 ('thin', 'ADJ'),
 ('layer', 'NOUN'),
 ('gas', 'NOUN'),
 ('known', 'VERB'),
 ('chromospheres', 'NOUN'),
 ('without', 'ADP'),
 ('sun', 'NOUN'),
 ('would', 'AUX'),
 ('no', 'DET'),
 ('life', 'NOUN'),
 ('earth', 'NOUN'),
 ('would', 'AUX'),
 ('no', 'DET'),
 ('plants', 'VERB'),
 ('no', 'DET'),
 ('animals', 'NOUN'),
 ('no', 'DET'),
 ('human', 'ADJ'),
 ('beings', 'NOUN'),
 ('living', 'VERB'),
 ('things', 'NOUN'),
 ('earth', 'NOUN'),
 ('get', 'VERB'),
 ('energy', 'NOUN'),
 ('sun', 'NOUN'),
 ('survival', 'NOUN')]

In [8]:
# Named entities found in the cleaned text, as (text, label) pairs.
[(ent.text, ent.label_) for ent in doc1.ents]

[('millions', 'CARDINAL'), ('sun', 'PERSON')]

In [9]:
# Without cleaning: run the same pipeline on the raw passage for comparison.
doc = nlp(data)

In [10]:
# Part-of-speech tag of every token in the raw text, as (text, tag) pairs.
[(tok.text, tok.pos_) for tok in doc]

[('The', 'DET'),
 ('sun', 'NOUN'),
 ('is', 'AUX'),
 ('a', 'DET'),
 ('huge', 'ADJ'),
 ('ball', 'NOUN'),
 ('of', 'ADP'),
 ('gases', 'NOUN'),
 ('.', 'PUNCT'),
 ('It', 'PRON'),
 ('has', 'VERB'),
 ('a', 'DET'),
 ('diameter', 'NOUN'),
 ('\n', 'SPACE'),
 ('of', 'ADP'),
 ('1,392,000', 'NUM'),
 ('km', 'NOUN'),
 ('.', 'PUNCT'),
 ('It', 'PRON'),
 ('is', 'AUX'),
 ('so', 'ADV'),
 ('huge', 'ADJ'),
 ('that', 'SCONJ'),
 ('it', 'PRON'),
 ('can', 'AUX'),
 ('hold', 'VERB'),
 ('millions', 'NOUN'),
 ('of', 'ADP'),
 ('planets', 'NOUN'),
 ('\n', 'SPACE'),
 ('inside', 'ADP'),
 ('it', 'PRON'),
 ('.', 'PUNCT'),
 ('The', 'DET'),
 ('Sun', 'PROPN'),
 ('is', 'AUX'),
 ('mainly', 'ADV'),
 ('made', 'VERB'),
 ('up', 'ADP'),
 ('of', 'ADP'),
 ('hydrogen', 'NOUN'),
 ('and', 'CCONJ'),
 ('helium', 'NOUN'),
 ('gas', 'NOUN'),
 ('.', 'PUNCT'),
 ('The', 'DET'),
 ('\n', 'SPACE'),
 ('surface', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('Sun', 'PROPN'),
 ('is', 'AUX'),
 ('known', 'VERB'),
 ('as', 'ADP'),
 ('the', 'DET'),
 ('photo

In [11]:
# Named entities found in the raw text, as (text, label) pairs.
[(ent.text, ent.label_) for ent in doc.ents]

[('1,392,000 km', 'QUANTITY'),
 ('millions', 'CARDINAL'),
 ('Sun', 'ORG'),
 ('Sun', 'ORG'),
 ('Sun', 'ORG'),
 ('Earth', 'LOC'),
 ('Earth', 'LOC'),
 ('Sun', 'ORG')]

In [12]:
# Look up spaCy's description of the CARDINAL label seen in the entity output.
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

In [13]:
# Look up spaCy's description of the ORG label (assigned to "Sun" above).
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

In [14]:
# Look up spaCy's description of the LOC label (assigned to "Earth" above).
spacy.explain('LOC')

'Non-GPE locations, mountain ranges, bodies of water'

In [15]:
from spacy import displacy
# Render the cleaned text inline with its entities highlighted.
displacy.render(doc1,style='ent',jupyter = True)

In [16]:
from spacy import displacy
# Render the raw text inline with its entities highlighted.
displacy.render(doc,style='ent',jupyter = True)

In [17]:
# List the entity labels exposed by the pipeline's 'ner' component.
nlp.get_pipe('ner').labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [18]:
# Process a single word and display the vector of the resulting Doc.
vect1 = nlp('machine')
vect1.vector

array([-1.7952e+00,  8.8564e-01,  1.2995e+00,  5.3467e+00,  2.3356e+00,
        6.3150e-01,  3.2695e+00,  5.3845e+00, -4.7579e+00, -1.1556e+00,
        7.7207e+00,  1.7057e+00, -5.1134e+00,  5.7279e+00, -6.4964e-01,
        2.5735e+00,  4.0722e+00,  2.7346e+00, -3.6327e-01,  3.1659e-01,
        1.6892e+00,  4.0048e+00,  2.5983e-02,  1.4939e+00, -3.3041e+00,
       -1.4575e+00, -1.2547e+00, -4.5687e+00,  1.5173e+00,  6.8314e-01,
        5.7678e-01, -2.0256e+00,  9.1923e-01,  2.8375e+00,  1.7972e+00,
       -3.1808e-01,  5.2218e+00,  1.3707e+00,  3.3360e+00,  4.5912e+00,
       -1.0869e+00, -2.5762e+00,  4.5750e+00,  5.6772e-01,  9.5806e-01,
        7.2935e-01, -5.6902e-01,  8.2880e-01,  6.3900e-01, -4.3184e+00,
        2.8204e+00,  6.8541e-01,  3.3632e+00, -3.3472e+00, -4.6527e-01,
        2.1472e+00,  1.3270e+00,  8.0798e-01,  9.7008e-01,  2.0223e+00,
        4.2663e+00,  2.2851e+00, -3.5020e+00, -3.2424e+00, -1.6429e+00,
        8.8501e-01, -3.4272e+00, -4.5300e+00, -2.9577e+00, -6.50

In [19]:
# Check which entities the pretrained pipeline finds in a short sentence.
text1 = 'Rajesh plays Basketball and Lagori'
doc2 = nlp(text1)
for ent in doc2.ents:
    print(f"{ent.text} {ent.label_}")

Rajesh ORG
Basketball WORK_OF_ART
Lagori PERSON


In [20]:
spacy.explain('WORK_OF_ART')

'Titles of books, songs, etc.'

In [21]:
# Second pipeline with the 'ner' component disabled and an entity ruler added,
# so the entities come from the patterns registered below.
nlp2 = spacy.load('en_core_web_md', disable=['ner'])
ruler = nlp2.add_pipe('entity_ruler')

# Exact phrases to match, mapped to the label each one should receive.
label_by_phrase = {
    'Basketball': 'SPORT',
    'Lagori': 'SPORT',
    'Rajesh': 'PERSON',
}
patterns = [
    {'label': tag, 'pattern': phrase}
    for phrase, tag in label_by_phrase.items()
]
ruler.add_patterns(patterns)

text1 = 'Rajesh plays Basketball and Lagori'
doc2 = nlp2(text1)
for ent in doc2.ents:
    print(f"{ent.text} {ent.label_}")

Rajesh PERSON
Basketball SPORT
Lagori SPORT


In [22]:
import pandas as pd

# Re-run the pipeline on the cleaned text and tabulate its entities:
# one row per entity with its label and character offsets in the text.
nlp = spacy.load('en_core_web_md')
doc1 = nlp(clean_text)

entities = [ent.text for ent in doc1.ents]
label = [ent.label_ for ent in doc1.ents]
start_position = [ent.start_char for ent in doc1.ents]
end_position = [ent.end_char for ent in doc1.ents]

df = pd.DataFrame(
    {
        'Entities': entities,
        'label': label,
        'start_position': start_position,
        'end_position': end_position,
    }
)
df

Unnamed: 0,Entities,label,start_position,end_position
0,millions,CARDINAL,42,50
1,sun,PERSON,198,201


In [23]:
import pandas as pd

# Same entity table as the previous cell, built from the raw passage.
# Note that this rebinds `doc1` to the Doc of the uncleaned text.
nlp = spacy.load('en_core_web_md')
doc1 = nlp(data)

found = doc1.ents
entities = [span.text for span in found]
label = [span.label_ for span in found]
start_position = [span.start_char for span in found]
end_position = [span.end_char for span in found]

columns = {
    'Entities': entities,
    'label': label,
    'start_position': start_position,
    'end_position': end_position,
}
df = pd.DataFrame(columns)
df

Unnamed: 0,Entities,label,start_position,end_position
0,"1,392,000 km",QUANTITY,54,66
1,millions,CARDINAL,99,107
2,Sun,ORG,135,138
3,Sun,ORG,204,207
4,Sun,ORG,331,334
5,Earth,LOC,363,368
6,Earth,LOC,457,462
7,Sun,ORG,489,492


In [None]:
# Example entity -> label table (notes, not executable code):
# Entities         Label
# Rajesh          Person
# Python          Skillset
# Bombay          Location

In [None]:
# Example table with one column per label (notes, not executable code):
# Person       Skillset   Location
#
# Rajesh      Python      Bombay
# Priyanka    Java        Bangalore