# Text preprocessing using NLTK Library

## Tokenization

In [4]:
import nltk

# word_tokenize / sent_tokenize need the Punkt sentence-tokenizer data.
# Older NLTK releases load the 'punkt' package, while recent releases (3.9+)
# look up 'punkt_tab' instead, so fetch both to keep the notebook runnable
# after a kernel restart on either version.
# nltk.download() reports a failed download by printing and returning False
# rather than raising, so requesting the extra package is harmless.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Tokenize the input sentence into words and punctuation marks, even when a punctuation mark is attached to the preceding or following word.

In [6]:
from nltk.tokenize import word_tokenize

# word_tokenize splits on whitespace (tabs, newlines, spaces) and also
# separates punctuation marks from the words they are attached to.
for sentence in (
  "The price\t of buger \nin BurgerKing is 12$.\n",
  "This is an example!An example of tokenization.\n",
):
  print(word_tokenize(sentence))

['The', 'price', 'of', 'buger', 'in', 'BurgerKing', 'is', '12', '$', '.']
['This', 'is', 'an', 'example', '!', 'An', 'example', 'of', 'tokenization', '.']


In [7]:
from nltk.tokenize import WhitespaceTokenizer

# Split on runs of whitespace only (tabs, newlines, spaces);
# punctuation stays attached to its neighbouring word, e.g. '12$.'.
sentence = "The price\t of buger \nin BurgerKing is 12$.\n"
tokenizer = WhitespaceTokenizer()
whitespace_tokens = tokenizer.tokenize(sentence)
print(whitespace_tokens)

['The', 'price', 'of', 'buger', 'in', 'BurgerKing', 'is', '12$.']


In [8]:
from nltk.tokenize import RegexpTokenizer

# Tokenize with a regex pattern of our own: every match of the pattern becomes
# a token. r'\w+' matches runs of word characters (letters, digits, underscore),
# so punctuation and symbols such as '$.' are dropped entirely.
tokenizer = RegexpTokenizer(r'\w+')
word_char_tokens = tokenizer.tokenize(sentence)
print(word_char_tokens)


['The', 'price', 'of', 'buger', 'in', 'BurgerKing', 'is', '12']


In [9]:
from nltk.tokenize import WordPunctTokenizer

# Split into runs of word characters and runs of punctuation; adjacent
# punctuation marks are kept together as a single token (here '$.').
tokenizer = WordPunctTokenizer()
word_punct_tokens = tokenizer.tokenize(sentence)
print(word_punct_tokens)

['The', 'price', 'of', 'buger', 'in', 'BurgerKing', 'is', '12', '$.']


In [14]:
from nltk.tokenize import sent_tokenize

# Sentence-level tokenization (useful when the input is a paragraph).
# In the examples below a '.' ends a sentence only when whitespace follows it:
#   - 'everyone . This' and 'session. Thank' are split after the period;
#   - 'session.Thank' and 'session .Thank' (no space after the period)
#     are kept together as one sentence.
for paragraph in (
  "Hello everyone . This is fourth session. Thank you for attending.",
  "Hello everyone . This is fourth session.Thank you for attending.",
  "Hello everyone . This is fourth session .Thank you for attending.",
):
  print(sent_tokenize(paragraph))

['Hello everyone .', 'This is fourth session.', 'Thank you for attending.']
['Hello everyone .', 'This is fourth session.Thank you for attending.']
['Hello everyone .', 'This is fourth session .Thank you for attending.']


## Normalization

Which normalization steps to apply depends on the task we are doing.

In [15]:
# Normalize case: lowercase the whole sentence, then tokenize it
lowercased_sentence = sentence.lower()
tokens = word_tokenize(lowercased_sentence)
print(tokens)

['the', 'price', 'of', 'buger', 'in', 'burgerking', 'is', '12', '$', '.']


In [17]:
# Keep only the purely alphabetic tokens; numbers and punctuation are dropped
tokens = list(filter(str.isalpha, word_tokenize(sentence)))
print(tokens)

['The', 'price', 'of', 'buger', 'in', 'BurgerKing', 'is']


## Lemmatization

In [22]:
# Fetch the 'omw-1.4' and 'wordnet' data packages into the local nltk_data
# directory (presumably the data the WordNetLemmatizer in the next cell relies on).
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

sentence = "been had done languages cities mice"

# split the sentence into individual words before lemmatizing
words = word_tokenize(sentence)

# The lemmatizer is given a POS tag (v = verb, n = noun). For this sentence:
#   pos='v' reduces the verb forms to their lemma and leaves the nouns as they are,
#   pos='n' reduces the noun forms to their lemma and leaves the verbs as they are.
tokens_v = [lemmatizer.lemmatize(w, pos='v') for w in words]
tokens_n = [lemmatizer.lemmatize(w, pos='n') for w in words]
print(tokens_v)
print(tokens_n)


['be', 'have', 'do', 'languages', 'cities', 'mice']
['been', 'had', 'done', 'language', 'city', 'mouse']


We can guess the POS tag of each word from the lemma it gets: in the output above, the verbs change only with `pos='v'` and the nouns change only with `pos='n'`.

## Stemming

In [26]:
from nltk.stem import PorterStemmer, SnowballStemmer

# SnowballStemmer supports several languages, so the language is passed to its constructor
p_stemmer = PorterStemmer()
s_stemmer = SnowballStemmer('english')

sentence = "There are several types of stemming algorithms."
# the stemmers work word by word, so tokenize the sentence first
words = word_tokenize(sentence)

# run every token through each stemmer
tokens_p = list(map(p_stemmer.stem, words))
tokens_s = list(map(s_stemmer.stem, words))

print(tokens_p)
print(tokens_s)


['there', 'are', 'sever', 'type', 'of', 'stem', 'algorithm', '.']
['there', 'are', 'sever', 'type', 'of', 'stem', 'algorithm', '.']


## Stopword

In [27]:
# Fetch the 'stopwords' corpus package into the local nltk_data directory
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [32]:
from nltk.corpus import stopwords


def _drop_stopwords(words, stop_words):
  """Lowercase each word and keep the ones that are not stopwords.

  Args:
    words: iterable of token strings.
    stop_words: iterable of stopword strings, compared against the
      lowercased tokens.

  Returns:
    A list of the lowercased tokens that do not appear in `stop_words`,
    in their original order.
  """
  # A set gives constant-time membership checks; a plain list would be
  # scanned from the start for every token.
  blocked = set(stop_words)
  lowered = (word.lower() for word in words)
  return [word for word in lowered if word not in blocked]


stop_words = stopwords.words('english')

sentence = "There are several types of stemming algorithms."

words = word_tokenize(sentence)

# filter with the stock English stopword list
token_1 = _drop_stopwords(words, stop_words)
print(token_1)

# add a custom stopword; stop_words is a plain list, so it is extended in place
stop_words.append('types')

# filter again, now that 'types' is also treated as a stopword
token_2 = _drop_stopwords(words, stop_words)
print(token_2)

['several', 'types', 'stemming', 'algorithms', '.']
['several', 'stemming', 'algorithms', '.']
