In [1]:
# Tokenization of paragraphs/sentences
import nltk

In [2]:
from nltk.stem import PorterStemmer

In [3]:
# Sample text used by every cell below: a speech stored as one triple-quoted
# string, so the line breaks and leading indentation are part of the value.
# NOTE(review): the word "transcendent" is split across two lines
# ("a t" / "ranscendent"), which is why the recorded outputs contain the
# separate tokens "t" and "ranscendent"/"ranscend". Left unchanged here so the
# recorded outputs stay consistent with the input.
paragraph = """Thank you all so very much. Thank you to the Academy. 
               Thank you to all of you in this room. I have to congratulate 
               the other incredible nominees this year. The Revenant was 
               the product of the tireless efforts of an unbelievable cast
               and crew. First off, to my brother in this endeavor, Mr. Tom 
               Hardy. Tom, your talent on screen can only be surpassed by 
               your friendship off screen … thank you for creating a t
               ranscendent cinematic experience. Thank you to everybody at 
               Fox and New Regency … my entire team. I have to thank 
               everyone from the very onset of my career … To my parents; 
               none of this would be possible without you. And to my 
               friends, I love you dearly; you know who you are. And lastly,
               I just want to say this: Making The Revenant was about
               man's relationship to the natural world. A world that we
               collectively felt in 2015 as the hottest year in recorded
               history. Our production needed to move to the southern
               tip of this planet just to be able to find snow. Climate
               change is real, it is happening right now. It is the most
               urgent threat facing our entire species, and we need to work
               collectively together and stop procrastinating. We need to
               support leaders around the world who do not speak for the 
               big polluters, but who speak for all of humanity, for the
               indigenous people of the world, for the billions and 
               billions of underprivileged people out there who would be
               most affected by this. For our children’s children, and 
               for those people out there whose voices have been drowned
               out by the politics of greed. I thank you all for this 
               amazing award tonight. Let us not take this planet for 
               granted. I do not take tonight for granted. Thank you so very much."""

In [4]:
# Split the paragraph into a list of sentence strings.
sentences = nltk.sent_tokenize(paragraph)

In [5]:
# Create the Porter stemmer instance that the stemming loop reuses.
stemmer = PorterStemmer()

In [6]:
# Stemming: replace each sentence, in place, with the space-joined stems of
# its tokens. `words` stays bound to the last sentence's stems after the loop.
for i, sentence in enumerate(sentences):
    words = list(map(stemmer.stem, nltk.word_tokenize(sentence)))
    sentences[i] = ' '.join(words)

In [7]:
# `words` is left over from the final iteration of the stemming loop, so this
# displays only the stemmed tokens of the last sentence.
words

['thank', 'you', 'so', 'veri', 'much', '.']

In [8]:
# Preview the first ten sentences after stemming.
sentences[:10]

['thank you all so veri much .',
 'thank you to the academi .',
 'thank you to all of you in thi room .',
 'i have to congratul the other incred nomine thi year .',
 'the reven wa the product of the tireless effort of an unbeliev cast and crew .',
 'first off , to my brother in thi endeavor , mr. tom hardi .',
 'tom , your talent on screen can onli be surpass by your friendship off screen … thank you for creat a t ranscend cinemat experi .',
 'thank you to everybodi at fox and new regenc … my entir team .',
 'i have to thank everyon from the veri onset of my career … to my parent ; none of thi would be possibl without you .',
 'and to my friend , i love you dearli ; you know who you are .']

In [9]:
#Lemmatization in NLTK

In [9]:
from nltk import WordNetLemmatizer

In [10]:
# Bind a second name to the same paragraph string for the lemmatization
# section, then display its type.
paragraph1 = paragraph
type(paragraph1)

str

In [11]:
# Re-tokenize from the original text: the stemming loop overwrote the previous
# contents of `sentences` with stemmed strings.
sentences = nltk.sent_tokenize(paragraph1)
lemmatizer = WordNetLemmatizer()  # lemmatizer instance reused by the loop below

In [12]:
# Lemmatization: replace each sentence, in place, with the space-joined lemmas
# of its tokens (the original author notes this runs a bit slower than stemming).
# NOTE(review): lemmatize() is called without a part-of-speech argument, so
# tokens are presumably treated as nouns (NLTK's documented default) - that
# would explain "was" becoming "wa" in the recorded output. Confirm.
for i, sentence in enumerate(sentences):
    # Raw tokens of the current sentence; `words` stays bound after the loop.
    words = nltk.word_tokenize(sentence)
    # One lemma per token, collected in a separate list so `words` is untouched.
    newwords = list(map(lemmatizer.lemmatize, words))
    sentences[i] = ' '.join(newwords)

In [13]:
# Preview the first ten sentences after lemmatization.
sentences[:10]

['Thank you all so very much .',
 'Thank you to the Academy .',
 'Thank you to all of you in this room .',
 'I have to congratulate the other incredible nominee this year .',
 'The Revenant wa the product of the tireless effort of an unbelievable cast and crew .',
 'First off , to my brother in this endeavor , Mr. Tom Hardy .',
 'Tom , your talent on screen can only be surpassed by your friendship off screen … thank you for creating a t ranscendent cinematic experience .',
 'Thank you to everybody at Fox and New Regency … my entire team .',
 'I have to thank everyone from the very onset of my career … To my parent ; none of this would be possible without you .',
 'And to my friend , I love you dearly ; you know who you are .']

In [14]:
# Part-of-speech tag the tokens in `words`; the recorded output shows each
# entry of the result as a (token, tag) pair.
# NOTE(review): `words` is the variable left over from the last iteration of
# the lemmatization loop, so only the final sentence's tokens are tagged here,
# not the whole paragraph. If the full text is intended, tokenize `paragraph`
# explicitly before tagging - confirm intent.
tagged_words = nltk.pos_tag(words)

In [15]:
# Peek at the first five (token, tag) pairs.
tagged_words[:5]

[('Thank', 'NNP'),
 ('you', 'PRP'),
 ('so', 'RB'),
 ('very', 'RB'),
 ('much', 'JJ')]

In [16]:
# The (token, tag) pairs are awkward to use as text, so glue each token to its
# POS tag as "token_TAG"; the resulting strings can be joined into a paragraph.
word_tags = [token + "_" + tag for token, tag in tagged_words]

In [17]:
# Peek at the first five "token_TAG" strings.
word_tags[:5]

['Thank_NNP', 'you_PRP', 'so_RB', 'very_RB', 'much_JJ']

In [18]:
# Join the "token_TAG" strings with single spaces to form one tagged text.
tagged_paragraph = ' '.join(word_tags)

In [19]:
# Display the tagged text built above.
tagged_paragraph

'Thank_NNP you_PRP so_RB very_RB much_JJ ._.'