In [1]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize,sent_tokenize,WhitespaceTokenizer,TreebankWordTokenizer
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords,wordnet
import string
import re

In [2]:
stpwords = set(stopwords.words('english'))
panctuations = list(string.punctuation)

In [12]:
sentence = ('Then, beginning late last month, for the first time in a half-century, India carried out an offensive against China, taking back high ground the Chinese recently grabbed. Chinese forces were surprised when Indian troops mounted their attempt to retake strategic high points. Stunned Chinese soldiers retreated.')

In [13]:
sent_tokenize(sentence)

['Then, beginning late last month, for the first time in a half-century, India carried out an offensive against China, taking back high ground the Chinese recently grabbed.',
 'Chinese forces were surprised when Indian troops mounted their attempt to retake strategic high points.',
 'Stunned Chinese soldiers retreated.']

In [14]:
#Word Tokenization(Divide the sentences into words)
word_tokenize(sentence)

['Then',
 ',',
 'beginning',
 'late',
 'last',
 'month',
 ',',
 'for',
 'the',
 'first',
 'time',
 'in',
 'a',
 'half-century',
 ',',
 'India',
 'carried',
 'out',
 'an',
 'offensive',
 'against',
 'China',
 ',',
 'taking',
 'back',
 'high',
 'ground',
 'the',
 'Chinese',
 'recently',
 'grabbed',
 '.',
 'Chinese',
 'forces',
 'were',
 'surprised',
 'when',
 'Indian',
 'troops',
 'mounted',
 'their',
 'attempt',
 'to',
 'retake',
 'strategic',
 'high',
 'points',
 '.',
 'Stunned',
 'Chinese',
 'soldiers',
 'retreated',
 '.']

In [15]:
#White Space Tokenization(Tokenize the sentence based on space)
WhitespaceTokenizer().tokenize(sentence)

['Then,',
 'beginning',
 'late',
 'last',
 'month,',
 'for',
 'the',
 'first',
 'time',
 'in',
 'a',
 'half-century,',
 'India',
 'carried',
 'out',
 'an',
 'offensive',
 'against',
 'China,',
 'taking',
 'back',
 'high',
 'ground',
 'the',
 'Chinese',
 'recently',
 'grabbed.',
 'Chinese',
 'forces',
 'were',
 'surprised',
 'when',
 'Indian',
 'troops',
 'mounted',
 'their',
 'attempt',
 'to',
 'retake',
 'strategic',
 'high',
 'points.',
 'Stunned',
 'Chinese',
 'soldiers',
 'retreated.']

In [16]:
#Uses regular expressions to tokenize text
TreebankWordTokenizer().tokenize(sentence)

['Then',
 ',',
 'beginning',
 'late',
 'last',
 'month',
 ',',
 'for',
 'the',
 'first',
 'time',
 'in',
 'a',
 'half-century',
 ',',
 'India',
 'carried',
 'out',
 'an',
 'offensive',
 'against',
 'China',
 ',',
 'taking',
 'back',
 'high',
 'ground',
 'the',
 'Chinese',
 'recently',
 'grabbed.',
 'Chinese',
 'forces',
 'were',
 'surprised',
 'when',
 'Indian',
 'troops',
 'mounted',
 'their',
 'attempt',
 'to',
 'retake',
 'strategic',
 'high',
 'points.',
 'Stunned',
 'Chinese',
 'soldiers',
 'retreated',
 '.']

In [17]:
#Stemming
sentences = sent_tokenize(sentence)
stemmer = PorterStemmer()
print("{0:20}{1:20}".format("Word","Root Word"))
for i in range(len(sentences)):
    words = word_tokenize(sentences[i])
    for word in words:
        if (word not in stpwords and word not in panctuations):
            print("{0:20}{1:20}".format(word,stemmer.stem(word)))

Word                Root Word           
Then                then                
beginning           begin               
late                late                
last                last                
month               month               
first               first               
time                time                
half-century        half-centuri        
India               india               
carried             carri               
offensive           offens              
China               china               
taking              take                
back                back                
high                high                
ground              ground              
Chinese             chines              
recently            recent              
grabbed             grab                
Chinese             chines              
forces              forc                
surprised           surpris             
Indian              indian              
troops          

In [18]:

#Lemmatization
#Function to convert nltk tag to wordnet tag
def nltk_to_wordnet(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

sentences = sent_tokenize(sentence)
lematizer = WordNetLemmatizer()
print("{0:20}{1:20}".format("Word","Lemma"))
for i in range(len(sentences)):
    words = word_tokenize(sentences[i])
    nltk_tagged = pos_tag(words)
    wordnet_tagged = map(lambda x:(x[0],nltk_to_wordnet(x[1])),nltk_tagged)
    for word,tag in wordnet_tagged:
        if (word not in stpwords and word not in panctuations):
            if tag is not None:
                print("{0:20}{1:20}".format(word,lematizer.lemmatize(word,tag)))
            else:
                print("{0:20}{1:20}".format(word,lematizer.lemmatize(word)))


Word                Lemma               
Then                Then                
beginning           begin               
late                late                
last                last                
month               month               
first               first               
time                time                
half-century        half-century        
India               India               
carried             carry               
offensive           offensive           
China               China               
taking              take                
back                back                
high                high                
ground              ground              
Chinese             Chinese             
recently            recently            
grabbed             grab                
Chinese             Chinese             
forces              force               
surprised           surprise            
Indian              Indian              
troops          

In [10]:
nltk_taged = pos_tag(['I','am','going','to','school'])
wordnet_tagged = map(lambda x:(x[0],nltk_to_wordnet(x[1])),nltk_taged)
for word,tag in wordnet_tagged:
    print(f"{word}----->{tag}")

I----->None
am----->v
going----->v
to----->None
school----->n
