In [None]:
%pip install nltk

In [2]:
corpus = """
This is a tokenization example code. 
This is a sample text for tokenization!
I am using NLTK Library for this
"""

In [3]:
corpus

'\nThis is a tokenization example code. \nThis is a sample text for tokenization!\nI am using NLTK Library for this\n'

In [4]:
from nltk.tokenize import sent_tokenize

In [6]:
documents = sent_tokenize(corpus) #sentence tokenize - splits into sentences  , from paragraph/corpus 

In [7]:
## Paragraphs --> words
## sentence --> words

from nltk.tokenize import word_tokenize

In [10]:
word_tokenize(corpus) #using corpus directly

['This',
 'is',
 'a',
 'tokenization',
 'example',
 'code',
 '.',
 'This',
 'is',
 'a',
 'sample',
 'text',
 'for',
 'tokenization',
 '!',
 'I',
 'am',
 'using',
 'NLTK',
 'Library',
 'for',
 'this']

In [11]:
#using sentences: 
for sentence in documents: 
    word_tokenize(sentence)

In [14]:
from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize(corpus) # tokenizes punctuations like (', .) as well

['This',
 'is',
 'a',
 'tokenization',
 'example',
 'code',
 '.',
 'This',
 'is',
 'a',
 'sample',
 'text',
 'for',
 'tokenization',
 '!',
 'I',
 'am',
 'using',
 'NLTK',
 'Library',
 'for',
 'this']

In [15]:
from nltk.tokenize import TreebankWordTokenizer 
tokenizer=TreebankWordTokenizer() # Full stop is not treated as seperate word except in the last word.
tokenizer.tokenize(corpus)

['This',
 'is',
 'a',
 'tokenization',
 'example',
 'code.',
 'This',
 'is',
 'a',
 'sample',
 'text',
 'for',
 'tokenization',
 '!',
 'I',
 'am',
 'using',
 'NLTK',
 'Library',
 'for',
 'this']

## STEMMING

reducing the word to its stem that affixes to suffixes and prefixes to the roots of words known as a **lemma**. 

In [16]:
words = ['eating', 'eaten', 'writing', 'programming', 'programs', 'history', 'congratulations', 'finalized']

In [17]:
# PORTERSTEMMER
from nltk.stem import PorterStemmer

stemming = PorterStemmer() 
for word in words:
    print(word+'--->'+stemming.stem(word))

eating--->eat
eaten--->eaten
writing--->write
programming--->program
programs--->program
history--->histori
congratulations--->congratul
finalized--->final


Problems in words like congratulations and history and more happens in stemming-  can be fixed with lemmatization.

In [19]:
# RegexpStemmer Class
# reg exp stemmer algorithm

from nltk.stem import RegexpStemmer
reg_stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4) #morphologically affixes



In [25]:
reg_stemmer.stem('ingeating')
reg_stemmer = RegexpStemmer('ing|s$|e$|able$', min=4) #morphologically affixes
reg_stemmer.stem('ingeating')


'eat'

In [26]:
#Snowball Stemmer - better technique
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')

In [29]:
snowball_stemmer.stem('fairly')

'fair'

## Lemmatization

lemma is the root word which we get rather than the root stem we get in root stem. meaningful word. But lemmatization takes time.

### Wordnet Lemmatizer

In [30]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [32]:
''' 
POS- Noun-n
verb-v
adj-a
adverb-r
'''
lemmatizer.lemmatize('going') ## default pos = n (noun)

'going'

In [33]:
lemmatizer.lemmatize('going', pos='v')

'go'

In [35]:
for word in words:
    print(word+'--->'+lemmatizer.lemmatize(word, pos='v'))

eating--->eat
eaten--->eat
writing--->write
programming--->program
programs--->program
history--->history
congratulations--->congratulations
finalized--->finalize


## STOP WORDS

In [36]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [37]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soni2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
stopwords.words('ENGLISH')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [40]:
paragraph="Hellooo this is Shreyans. I am from bits pilani.Hellooo this is Shreyans. I am from bits pilani.Hellooo this is Shreyans. I am from bits pilani.Hellooo this is Shreyans. I am from bits pilani."

In [45]:
stemmer=SnowballStemmer('english')
sentences = nltk.sent_tokenize(paragraph)

In [47]:
## apply stopwords and filter and then apply stemming

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words) #words into sentences

sentences

['hellooo shreyan .',
 'bit pilani.hellooo shreyan .',
 'bit pilani.hellooo shreyan .',
 'bit pilani.hellooo shreyan .',
 'bit pilani .']

In [59]:
## apply stopwords and filter and then apply lemmatization
paragraph="Hellooo this is Shreyans. I am from bits pilani.Hellooo this is Shreyans. I am from bits pilani.Hellooo this is Shreyans. I am from bits pilani.Hellooo this is Shreyans. I am from bits pilani."
sentences = nltk.sent_tokenize(paragraph)

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words) #words into sentences

sentences

['hellooo shreyans .',
 'i bit pilani.hellooo shreyans .',
 'i bit pilani.hellooo shreyans .',
 'i bit pilani.hellooo shreyans .',
 'i bit pilani .']

## PARTS OF SPEECH TAGGING (POS Tag)

In [61]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\soni2\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [63]:
# Will find pos tag

pos_t = nltk.pos_tag(['BITS Pilani', 'I', 'me', 'Shreyans'])
print(pos_t)

[('BITS Pilani', 'NNP'), ('I', 'PRP'), ('me', 'PRP'), ('Shreyans', 'NNPS')]
