## Practice 1 - NLP Tokenization Stemming, Lemmatization and POS tagging
### Strictly used for internal purpose in Singapore Polytechnic. Do not disclose!

In [1]:
import re
import string
from pprint import pprint
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
stop_words_nltk = set(stopwords.words('english'))
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Wilson\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Wilson\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Wilson\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Wilson\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [23]:
# This will be our corpus which we will work on
corpus_original = "Need to finalize the demo corpus which will be used for this notebook and it should be done soon !!. It should be done by the ending of this month. But will it? This notebook has been run 4 times !!"
corpus = "Need to finalize the demo corpus which will be used for this notebook & should be done soon !!. It should be done by the ending of this month. But will it? This notebook has been run 4 times !!"

In [24]:
# lower case the corpus
corpus = corpus.lower()
print(corpus)

need to finalize the demo corpus which will be used for this notebook & should be done soon !!. it should be done by the ending of this month. but will it? this notebook has been run 4 times !!


In [25]:
# removing digits in the corpus
corpus = re.sub(r'\d','', corpus)
print(corpus)

need to finalize the demo corpus which will be used for this notebook & should be done soon !!. it should be done by the ending of this month. but will it? this notebook has been run  times !!


In [26]:
# removing punctuations
corpus = corpus.translate(str.maketrans('', '', string.punctuation))
print(corpus)

need to finalize the demo corpus which will be used for this notebook  should be done soon  it should be done by the ending of this month but will it this notebook has been run  times 


In [27]:
#removing trailing whitespaces
corpus = ' '.join([token for token in corpus.split()])
corpus

'need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times'

## 1. Tokenizing the text

In [32]:
tokenized_corpus_nltk = word_tokenize(corpus)
print("\nNLTK\nTokenized corpus:",tokenized_corpus_nltk)
tokenized_corpus_without_stopwords = [i for i in tokenized_corpus_nltk if not i in stop_words_nltk]
print("\nTokenized corpus without stopwords:",tokenized_corpus_without_stopwords)


NLTK
Tokenized corpus: ['need', 'to', 'finalize', 'the', 'demo', 'corpus', 'which', 'will', 'be', 'used', 'for', 'this', 'notebook', 'should', 'be', 'done', 'soon', 'it', 'should', 'be', 'done', 'by', 'the', 'ending', 'of', 'this', 'month', 'but', 'will', 'it', 'this', 'notebook', 'has', 'been', 'run', 'times']

Tokenized corpus without stopwords: ['need', 'finalize', 'demo', 'corpus', 'used', 'notebook', 'done', 'soon', 'done', 'ending', 'month', 'notebook', 'run', 'times']


## 2. Stemming

In [37]:
stemmer= PorterStemmer()

print("Before Stemming:")
print(corpus)

print("\nAfter Stemming:")
for word in tokenized_corpus_nltk:
    print(stemmer.stem(word),end=" ")

Before Stemming:
need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times

After Stemming:
need to final the demo corpu which will be use for thi notebook should be done soon it should be done by the end of thi month but will it thi notebook ha been run time 

In [55]:
lemmatizer=WordNetLemmatizer()

print("Before Lemmatization:")
print(corpus)

print("\nAfter Lemmatization:")
for word in tokenized_corpus_nltk:
    print(lemmatizer.lemmatize(word),end=" ")

Before Lemmatization:
need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times

After Lemmatization:
need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook ha been run time 

### Difference between Stemming and Lemmatization
Stemming just removes or stems the last few characters of a word, often leading to incorrect meanings and spelling. Lemmatization considers the context and converts the word to its meaningful base form, which is called Lemma. Sometimes, the same word can have multiple different Lemmas.

## 3. POS Tagging

In [59]:
print("POS Tagging using NLTK:")
pprint(nltk.pos_tag(word_tokenize(corpus_original)))

POS Tagging using NLTK:
[('Need', 'NN'),
 ('to', 'TO'),
 ('finalize', 'VB'),
 ('the', 'DT'),
 ('demo', 'NN'),
 ('corpus', 'NN'),
 ('which', 'WDT'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('used', 'VBN'),
 ('for', 'IN'),
 ('this', 'DT'),
 ('notebook', 'NN'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('should', 'MD'),
 ('be', 'VB'),
 ('done', 'VBN'),
 ('soon', 'RB'),
 ('!', '.'),
 ('!', '.'),
 ('.', '.'),
 ('It', 'PRP'),
 ('should', 'MD'),
 ('be', 'VB'),
 ('done', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('ending', 'VBG'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('month', 'NN'),
 ('.', '.'),
 ('But', 'CC'),
 ('will', 'MD'),
 ('it', 'PRP'),
 ('?', '.'),
 ('This', 'DT'),
 ('notebook', 'NN'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('run', 'VBN'),
 ('4', 'CD'),
 ('times', 'NNS'),
 ('!', '.'),
 ('!', '.')]
