# Basic POS-based Lemmatisation using NLTK

Learn more about lemmatisation: https://en.wikipedia.org/wiki/Lemmatisation

In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Fetch every NLTK data package the cells below rely on, not just WordNet:
#   - 'wordnet' backs WordNetLemmatizer
#   - 'punkt' / 'punkt_tab' back nltk.word_tokenize
#   - 'averaged_perceptron_tagger' / '..._eng' back nltk.pos_tag
# Both the older and newer resource ids are requested because which one is
# looked up depends on the installed NLTK release -- TODO confirm and trim.
# nltk.download skips packages that are already up to date.
for resource in (
    'wordnet',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to /home/rahul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Shared module-level instances, created once and reused by the helpers below.
tweet_tknzr = TweetTokenizer()   # alternative tokenizer, used when tokenizer_flag != 0
wnl = WordNetLemmatizer()        # WordNet-backed lemmatizer

In [3]:
def word_lemmatizer(word, pos='n'):
    """Return the WordNet lemma of ``word`` for the given part of speech.

    Args:
        word: The token to lemmatise.
        pos: WordNet part-of-speech code handed to the lemmatizer;
            defaults to ``'n'``.

    Returns:
        Whatever the shared ``wnl`` lemmatizer returns for ``word``.
    """
    lemma = wnl.lemmatize(word, pos=pos)
    return lemma

In [4]:
def get_wordnet_pos(tag):
    """Map a POS tag to a WordNet part-of-speech code by its leading letter.

    Args:
        tag: POS tag string (e.g. the second item of an ``nltk.pos_tag`` pair).

    Returns:
        ``'a'`` for tags starting with ``J``, ``'v'`` for ``V``, ``'n'`` for
        ``N``, ``'r'`` for ``R``; any other tag falls back to ``'n'``.
    """
    prefix_to_wordnet = (
        ('J', 'a'),
        ('V', 'v'),
        ('N', 'n'),
        ('R', 'r'),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if tag.startswith(prefix):
            return wordnet_pos
    # Anything unrecognised is treated as a noun.
    return 'n'
    
# WordNet part-of-speech codes returned above:
# ADJ, ADV, NOUN, VERB = 'a', 'r', 'n', 'v'

In [5]:
def word_to_word(word):
    """Lemmatise a single word using the POS tag NLTK assigns to it in isolation.

    The word is tagged on its own (a one-element token list), so no sentence
    context is available to the tagger.

    Args:
        word: The word to lemmatise.

    Returns:
        The lemma produced by ``word_lemmatizer`` for the mapped WordNet POS.
    """
    tagged_tokens = nltk.pos_tag([word])
    _, treebank_tag = tagged_tokens[0]
    wordnet_pos = get_wordnet_pos(treebank_tag)
    return word_lemmatizer(word, wordnet_pos)

In [6]:
def sentence_to_list(sentence, tokenizer_flag=0):
    """Tokenise, POS-tag and lemmatise a sentence, returning the lemmas as a list.

    Args:
        sentence: The text to process.
        tokenizer_flag: ``0`` (default) tokenises with ``nltk.word_tokenize``;
            any other value tokenises with the shared ``TweetTokenizer``.

    Returns:
        A list with one lemma per token, in token order.
    """
    # Pick the tokenizer first; everything after that is the same pipeline.
    tokenize = nltk.word_tokenize if tokenizer_flag == 0 else tweet_tknzr.tokenize
    tagged_tokens = nltk.pos_tag(tokenize(sentence))

    return [
        word_lemmatizer(token, get_wordnet_pos(treebank_tag))
        for token, treebank_tag in tagged_tokens
    ]

In [7]:
def sentence_to_sentence(sentence, tokenizer_flag=0):
    """Lemmatise a sentence and return the lemmas joined into one string.

    Args:
        sentence: The text to process.
        tokenizer_flag: Passed through to ``sentence_to_list`` to select the
            tokenizer.

    Returns:
        The lemmas separated by single spaces.
    """
    return ' '.join(sentence_to_list(sentence, tokenizer_flag))

### How to use:

In [8]:
# Lemmatise a single word (it is POS-tagged on its own, without sentence context)
print(word_to_word("better"))

well


In [9]:
# Lemmatise a sentence and get the lemmas back as a list (one entry per token)
print(sentence_to_list("He was not playing"))

['He', 'be', 'not', 'play']


In [10]:
# Lemmatise a sentence and get the lemmas back as one space-joined string
print(sentence_to_sentence("He was not playing"))

He be not play


The default `tokenizer_flag` (0) uses `word_tokenize`, which splits contractions at the apostrophe (e.g. "He's" becomes "He" and "'s").

`tokenizer_flag = 1` uses `TweetTokenizer`, which keeps such contractions together as a single token.

In [11]:
# tokenizer_flag = 0 (default): tokens come from nltk.word_tokenize
print(sentence_to_list("He's playing"),"\n")
# tokenizer_flag = 1: tokens come from TweetTokenizer
print(sentence_to_list("He's playing",1))

['He', "'s", 'play'] 

["He's", 'playing']
