In [4]:
import spacy
from nltk.tokenize import TweetTokenizer

In [3]:
# Load the small English pipeline once; the cells below reuse `nlp`.
nlp = spacy.load('en_core_web_sm')
text = "Mary, don't slap the green witch"
# spaCy emits punctuation and the contraction "n't" as separate tokens.
print(list(map(str, nlp(text.lower()))))

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch']


In [5]:
# Python 3 string literals are already Unicode, so no `u` prefix is needed.
tweet = "Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()
# Hashtags, @-handles and emoticons each come back as a single token.
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


In [6]:
def n_grams(text, n):
  """Return every contiguous window of `n` items taken from `text`.

  Args:
    text: A sliceable sequence, e.g. a list of tokens (word n-grams) or a
      string (character n-grams).
    n: Window size; must be a positive integer.

  Returns:
    A list of slices of `text`, in left-to-right order, each of length `n`.
    The list is empty when `text` holds fewer than `n` items.

  Raises:
    ValueError: If `n` is smaller than 1.
  """
  if n < 1:
    # Without this guard n == 0 yields len(text) + 1 empty slices and a
    # negative n yields ragged slices; neither is a meaningful n-gram list.
    raise ValueError('n must be a positive integer, got {!r}'.format(n))
  return [text[i:i + n] for i in range(len(text) - n + 1)]

# Tokens as produced by the spaCy tokenization cell earlier in the notebook.
cleaned = ['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch']
trigrams = n_grams(cleaned, 3)
print(trigrams)

[['mary', ',', 'do'], [',', 'do', "n't"], ['do', "n't", 'slap'], ["n't", 'slap', 'the'], ['slap', 'the', 'green'], ['the', 'green', 'witch']]


In [16]:
# A lemma is the base (dictionary) form of a word, e.g. "running" -> "run".
doc = nlp("he was running late")
for token in doc:
  lemma = token.lemma_
  print('{} --> {}'.format(token, lemma))

he --> he
was --> be
running --> run
late --> late


In [21]:
# Part-of-speech (POS) tagging: print each token next to its POS tag.
doc = nlp("Mary slapped the green witch.")
for token in doc:
  pos_tag = token.pos_
  print('{} - {}'.format(token, pos_tag))

Mary - PROPN
slapped - VERB
the - DET
green - ADJ
witch - NOUN
. - PUNCT


In [25]:
# Chunking (shallow parsing): iterate over the noun chunks of the sentence
# and print each chunk together with its label.
doc = nlp("Mary slapped the green witch.")
for chunk in doc.noun_chunks:
  chunk_label = chunk.label_
  print('{} - {}'.format(chunk, chunk_label))

Mary - NP
the green witch - NP
