In [1]:
text="""Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""

In [2]:
import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the 'en_core_web_sm' pipeline once and run it over the sample text;
# the resulting `doc` is the object every later cell inspects.
nlp = spacy.load(name='en_core_web_sm')
doc = nlp(text)

### Word Tokenization

In [3]:
# Iterating the doc yields its tokens one by one, so materialising the
# iteration gives the full token list.
tokens = list(doc)
print(tokens)

[Hello, Mr., Smith, ,, how, are, you, doing, today, ?, The, weather, is, great, ,, and, city, is, awesome, ., 
, The, sky, is, pinkish, -, blue, ., You, should, n't, eat, cardboard]


### Sentence Tokenization

In [4]:
# Collect the sentence spans into a list so they can be printed and counted.
sents = list(doc.sents)
print(sents, len(sents), sep='\n')

[Hello Mr. Smith, how are you doing today?, The weather is great, and city is awesome.
, The sky is pinkish-blue., You shouldn't eat cardboard]
4


### Stop Words

In [5]:
# Take the stop-word set from the loaded pipeline's language defaults
# (the author's earlier alternative was spacy.lang.en.stop_words.STOP_WORDS).
spacy_stopwords = nlp.Defaults.stop_words

stopword_total = len(spacy_stopwords)
stopword_listing = list(spacy_stopwords)
print('Number of stop words: %d' % stopword_total)
print('Stop words: %s' % stopword_listing)

Number of stop words: 305
Stop words: ['rather', 'could', 'two', 'next', 'most', 'into', 'may', 'towards', 'against', 'is', 'before', 'can', 'might', 'mine', 'throughout', 'each', 'such', 'very', 'yours', 'where', 'further', 'along', 'about', 'its', 're', 'various', 'bottom', 'wherein', 'go', 'indeed', 'my', 'have', 'make', 'neither', 'now', 'ours', 'sixty', 'that', 'would', 'take', 'this', 'moreover', 'over', 'them', 'here', 'perhaps', 'whenever', 'thru', 'hundred', 'these', 'beforehand', 'all', 'within', 'whether', 'same', 'fifteen', 'formerly', 'becoming', 'anywhere', 'still', 'thus', 'beyond', 'some', 'whoever', 'quite', 'therein', 'with', 'alone', 'through', 'doing', 'former', 'been', 'third', 'onto', 'sometimes', 'when', 'then', 'me', 'five', 'ever', 'has', 'beside', 'nor', 'not', 'other', 'side', 'amongst', 'above', 'nevertheless', 'unless', 'to', 'several', 'seem', 'upon', 'your', 'did', 'made', 'meanwhile', 'afterwards', 'since', 'he', 'had', 'top', 'already', 'own', 'she', 'e

In [6]:
# Keep only the tokens flagged as stop words. `is_stop` is used directly as
# the filter condition rather than being compared against True.
stop_word = [word for word in doc if word.is_stop]
print(stop_word)

[how, are, you, doing, is, and, is, is, should]


### Adding/removing stop words

In [7]:
# Every operation below mutates the shared STOP_WORDS set in place.
# NOTE(review): the POS-tagging output further down still reports `is` with
# is_stop True, so editing this set does not appear to change token.is_stop
# for the already-processed doc -- confirm against the installed spaCy version.

# add single word
STOP_WORDS.add("your_additional_stop_word_here")
# alternative noted by the author: nlp.Defaults.stop_words.add("my_new_stopword")

# add multiple words
STOP_WORDS.update({"my_new_stopword1021", "my_new_stopword21"})

# remove single word
# set.remove() raises KeyError when the word is absent; discard() is a no-op
# in that case, so no try/except is needed and no other error gets swallowed.
STOP_WORDS.discard("is")

# remove multiple words
# In-place set difference never raises, even for words that are already gone
# ("is" was removed just above), so it needs no exception handling either.
STOP_WORDS.difference_update({"is", "are"})

print(list(STOP_WORDS))

['rather', 'could', 'two', 'next', 'most', 'into', 'may', 'towards', 'against', 'before', 'your_additional_stop_word_here', 'can', 'might', 'mine', 'throughout', 'each', 'such', 'very', 'yours', 'where', 'further', 'along', 'about', 'its', 're', 'various', 'bottom', 'wherein', 'go', 'indeed', 'my', 'have', 'make', 'my_new_stopword21', 'neither', 'now', 'ours', 'sixty', 'that', 'would', 'take', 'this', 'moreover', 'over', 'them', 'here', 'perhaps', 'whenever', 'thru', 'hundred', 'these', 'beforehand', 'all', 'within', 'whether', 'same', 'fifteen', 'formerly', 'becoming', 'anywhere', 'still', 'thus', 'beyond', 'some', 'whoever', 'quite', 'therein', 'with', 'alone', 'through', 'doing', 'former', 'been', 'third', 'onto', 'sometimes', 'when', 'then', 'me', 'five', 'ever', 'has', 'beside', 'nor', 'not', 'other', 'side', 'amongst', 'above', 'nevertheless', 'unless', 'to', 'several', 'seem', 'upon', 'your', 'did', 'made', 'meanwhile', 'afterwards', 'since', 'he', 'had', 'top', 'already', 'own'

### POS tagging

In [8]:
# One output row per token, fields separated by spaces:
# text, lemma_, pos_, tag_, dep_, shape_, is_alpha, is_stop.
for token in doc:
    row = (token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
           token.shape_, token.is_alpha, token.is_stop)
    print(*row)

Hello hello INTJ UH compound Xxxxx True False
Mr. mr. PROPN NNP compound Xx. False False
Smith smith PROPN NNP dep Xxxxx True False
, , PUNCT , punct , False False
how how ADV WRB advmod xxx True True
are be VERB VBP aux xxx True True
you -PRON- PRON PRP nsubj xxx True True
doing do VERB VBG ROOT xxxx True True
today today NOUN NN npadvmod xxxx True False
? ? PUNCT . punct ? False False
The the DET DT det Xxx True False
weather weather NOUN NN nsubj xxxx True False
is be VERB VBZ ROOT xx True True
great great ADJ JJ acomp xxxx True False
, , PUNCT , punct , False False
and and CCONJ CC cc xxx True True
city city NOUN NN nsubj xxxx True False
is be VERB VBZ conj xx True True
awesome awesome ADJ JJ acomp xxxx True False
. . PUNCT . punct . False False

 
 SPACE   
 False False
The the DET DT det Xxx True False
sky sky NOUN NN nsubj xxx True False
is be VERB VBZ ROOT xx True True
pinkish pinkish ADJ JJ amod xxxx True False
- - PUNCT HYPH punct - False False
blue blue NOUN NN acomp xxxx Tr

In [9]:
# Render the dependency parse inline in the notebook; 'distance' is handed to
# displaCy as a rendering option.
dep_options = {'distance': 70}
displacy.render(doc, style='dep', jupyter=True, options=dep_options)

### NER Tagging

In [10]:
# For each entity span print its text, start/end character offsets and label.
for entity in doc.ents:
    fields = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(*fields)

# Then show the whole entity tuple at once.
print('Name Entity: {0}'.format(doc.ents))

Smith 10 15 PERSON
today 35 40 DATE

 84 85 GPE
Name Entity: (Smith, today, 
)


In [11]:
# Render the doc with the entity visualiser inline in the notebook output.
displacy.render(doc, jupyter=True, style='ent')