In [1]:
# Import Python library for NLP
import nltk

In [2]:
# Build the English-language tools used in the rest of the notebook:
# a WordNet lemmatizer and a Snowball stemmer
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.SnowballStemmer(language='english')

In [3]:
# Sample words that we would like to process
words = [
    'cement', 'read',
    'swimming', 'including', 'qualification', 'reinventing', 'underestimating',
    'are', 'was', 'being',
    'slept', 'arisen', 'dealt',
]

In [4]:
# Show, for every word, the lemma obtained without a POS hint
# next to the stem produced by the stemmer
for _word in words:
    print('Word "{0}": lemma - {1}, stem - {2}'.format(
        _word, lemmatizer.lemmatize(_word), stemmer.stem(_word)))

Word "cement": lemma - cement, stem - cement
Word "read": lemma - read, stem - read
Word "swimming": lemma - swimming, stem - swim
Word "including": lemma - including, stem - includ
Word "qualification": lemma - qualification, stem - qualif
Word "reinventing": lemma - reinventing, stem - reinvent
Word "underestimating": lemma - underestimating, stem - underestim
Word "are": lemma - are, stem - are
Word "was": lemma - wa, stem - was
Word "being": lemma - being, stem - be
Word "slept": lemma - slept, stem - slept
Word "arisen": lemma - arisen, stem - arisen
Word "dealt": lemma - dealt, stem - dealt


In [5]:
# The lemmatizer takes a part-of-speech tag as an optional parameter,
# which allows it to predict the lemma more accurately
lemmatizer.lemmatize('was', pos='v')

'be'

In [6]:
# Let's do POS-tagging for each word in our set; the result is a list of
# (word, tag) pairs.
# NOTE(review): the whole word list is passed to the tagger in one call, and the
# recorded output tags 'slept' as JJ and 'arisen'/'dealt' as NN - presumably
# because these isolated words are tagged as if they formed a sentence; confirm
# before relying on these tags.
tagged_words = nltk.pos_tag(words)
tagged_words

[('cement', 'NN'),
 ('read', 'VBD'),
 ('swimming', 'VBG'),
 ('including', 'VBG'),
 ('qualification', 'NN'),
 ('reinventing', 'VBG'),
 ('underestimating', 'VBG'),
 ('are', 'VBP'),
 ('was', 'VBD'),
 ('being', 'VBG'),
 ('slept', 'JJ'),
 ('arisen', 'NN'),
 ('dealt', 'NN')]

In [7]:
# Helper that converts an NLTK POS-tag into a WordNet POS-tag
def get_wordnet_pos(tag):
    """Return the WordNet POS letter matching the first character of ``tag``.

    Tags starting with 'J', 'V', 'N' or 'R' map to 'a', 'v', 'n' and 'r'
    respectively; any other tag (including an empty string) falls back to 'n'.
    """
    first_letter = tag[:1]
    wordnet_by_prefix = dict(J='a', V='v', N='n', R='r')
    if first_letter in wordnet_by_prefix:
        return wordnet_by_prefix[first_letter]
    # Unknown prefix: fall back to 'n'
    return 'n'

In [8]:
# Repeat the lemma/stem comparison, this time taking into account
# each word's POS-tag (converted to its WordNet form) when lemmatizing
for _word, _tag in tagged_words:
    _pos = get_wordnet_pos(_tag)
    print('Word "{0}": lemma - {1}, stem - {2}'.format(
        _word, lemmatizer.lemmatize(_word, pos=_pos), stemmer.stem(_word)))

Word "cement": lemma - cement, stem - cement
Word "read": lemma - read, stem - read
Word "swimming": lemma - swim, stem - swim
Word "including": lemma - include, stem - includ
Word "qualification": lemma - qualification, stem - qualif
Word "reinventing": lemma - reinvent, stem - reinvent
Word "underestimating": lemma - underestimate, stem - underestim
Word "are": lemma - be, stem - are
Word "was": lemma - be, stem - was
Word "being": lemma - be, stem - be
Word "slept": lemma - slept, stem - slept
Word "arisen": lemma - arisen, stem - arisen
Word "dealt": lemma - dealt, stem - dealt
