<a href="https://colab.research.google.com/github/ravi-kiran-iiml/my_git/blob/master/spacy_first_steps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: [A short introduction to NLP in Python with spaCy](https://towardsdatascience.com/a-short-introduction-to-nlp-in-python-with-spacy-d0aa819af3ad)

In [1]:
!pip install spacy



In [5]:
import spacy

# Load the small English pipeline by its full package name. The "en" shortcut
# link was removed in spaCy v3, where spacy.load("en") raises an error.
# The model package must be installed, e.g.:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
doc = nlp("The big grey dog ate all of the chocolate, but fortunately he wasn't sick!")

print(doc)


The big grey dog ate all of the chocolate, but fortunately he wasn't sick!


# Tokenization

In [6]:
# Baseline: plain whitespace splitting leaves punctuation glued to the words.
raw_text = doc.text
raw_text.split()

['The',
 'big',
 'grey',
 'dog',
 'ate',
 'all',
 'of',
 'the',
 'chocolate,',
 'but',
 'fortunately',
 'he',
 "wasn't",
 'sick!']

In [7]:
# spaCy's tokens, shown as their verbatim strings.
[tok.orth_ for tok in doc]

['The',
 'big',
 'grey',
 'dog',
 'ate',
 'all',
 'of',
 'the',
 'chocolate',
 ',',
 'but',
 'fortunately',
 'he',
 'was',
 "n't",
 'sick',
 '!']

In [8]:
# For every token: the Token object, its text (orth_) and the integer ID
# spaCy uses for that text (orth).
[(tok, tok.orth_, tok.orth) for tok in doc]

[(The, 'The', 5059648917813135842),
 (big, 'big', 15511632813958231649),
 (grey, 'grey', 10475807793332549289),
 (dog, 'dog', 7562983679033046312),
 (ate, 'ate', 10806788082624814911),
 (all, 'all', 13409319323822384369),
 (of, 'of', 886050111519832510),
 (the, 'the', 7425985699627899538),
 (chocolate, 'chocolate', 10946593968795032542),
 (,, ',', 2593208677638477497),
 (but, 'but', 14560795576765492085),
 (fortunately, 'fortunately', 13851269277375979931),
 (he, 'he', 1655312771067108281),
 (was, 'was', 9921686513378912864),
 (n't, "n't", 2043519015752540944),
 (sick, 'sick', 14841597609857081305),
 (!, '!', 17494803046312582752)]

In [9]:
# Keep only the tokens that are neither punctuation nor whitespace.
[tok.orth_ for tok in doc if not (tok.is_punct or tok.is_space)]

['The',
 'big',
 'grey',
 'dog',
 'ate',
 'all',
 'of',
 'the',
 'chocolate',
 'but',
 'fortunately',
 'he',
 'was',
 "n't",
 'sick']

# Lemmatization


In [11]:
# Three inflected forms of the same verb.
practice = "practice practiced practicing"
nlp_practice = nlp(practice)
print(nlp_practice)

# Lemma of each token in the parsed text.
[tok.lemma_ for tok in nlp_practice]

practice practiced practicing


['practice', 'practice', 'practice']

# POS Tagging

In [13]:
doc2 = nlp("Conor's dog's toy was hidden under the man's sofa in the woman's house")

# Pair every token with its part-of-speech tag string (tag_).
pos_tags = [(tok, tok.tag_) for tok in doc2]
pos_tags

[(Conor, 'NNP'),
 ('s, 'POS'),
 (dog, 'NN'),
 ('s, 'POS'),
 (toy, 'NN'),
 (was, 'VBD'),
 (hidden, 'VBN'),
 (under, 'IN'),
 (the, 'DT'),
 (man, 'NN'),
 ('s, 'POS'),
 (sofa, 'NN'),
 (in, 'IN'),
 (the, 'DT'),
 (woman, 'NN'),
 ('s, 'POS'),
 (house, 'NN')]

In [14]:
# Build (owner, possession) pairs: for each token tagged "POS" (the
# possessive marker), take the token just before it and the token just after.
owners_possessions = []
for token, tag in pos_tags:
    if tag != "POS":
        continue
    owner, possession = token.nbor(-1), token.nbor(1)
    owners_possessions.append((owner, possession))

owners_possessions

[(Conor, dog), (dog, toy), (man, sofa), (woman, house)]

In [15]:
# (owner, possession) pairs around each possessive marker, as one comprehension.
[(token.nbor(-1), token.nbor(1)) for token, tag in pos_tags if tag == "POS"]

[(Conor, dog), (dog, toy), (man, sofa), (woman, house)]

# Entity recognition

In [16]:
wiki_obama = """Barack Obama is an American politician who served as the 44th President of the United States from 2009 to 2017. He is the first African American to have served as president, as well as the first born outside the contiguous United States."""

nlp_obama = nlp(wiki_obama)

# Each recognized entity with its label string (label_) and integer label ID (label).
[(ent, ent.label_, ent.label) for ent in nlp_obama.ents]

[(American, 'NORP', 381),
 (44th, 'ORDINAL', 396),
 (the United States, 'GPE', 384),
 (2009 to 2017, 'DATE', 391),
 (first, 'ORDINAL', 396),
 (African American, 'NORP', 381),
 (first, 'ORDINAL', 396),
 (United States, 'GPE', 384)]

In [17]:
# Print the sentences spaCy detected in the text, numbered from 1.
for sent_no, sentence in enumerate(nlp_obama.sents, start=1):
    print("Sentence number {}: {}".format(sent_no, sentence))

Sentence number 1: Barack Obama is an American politician who served as the 44th President of the United States from 2009 to 2017.
Sentence number 2: He is the first African American to have served as president, as well as the first born outside the contiguous United States.
