In [2]:
import nltk

In [3]:
# Sample sentence used below to demonstrate word-level tokenization.
my_strings = "I am learning Natural Language Processing."

In [4]:
# Split the sample sentence into word-level tokens.
# NOTE(review): word_tokenize relies on NLTK's Punkt tokenizer data, which is
# not downloaded in any visible cell -- presumably installed beforehand; confirm.
tokens = nltk.word_tokenize(my_strings)

In [5]:
# Display the tokens; note that the final '.' becomes a token of its own.
tokens

['I', 'am', 'learning', 'Natural', 'Language', 'Processing', '.']

In [6]:
# Token count: the six words plus the trailing period.
len(tokens)

7

In [7]:
# Two-sentence sample used to demonstrate sentence-level tokenization.
# NOTE(review): "Natural Learning Processing" is probably meant to read
# "Natural Language Processing"; left as-is because the recorded outputs echo it.
phrase = "I am learning Natural Learning Processing. I am learning how to tokenize!"

In [8]:
# Split the text into a list of sentences.
tokens_sent = nltk.sent_tokenize(phrase)

In [9]:
# Display the sentences; each one keeps its terminating punctuation.
tokens_sent

['I am learning Natural Learning Processing.',
 'I am learning how to tokenize!']

In [10]:
# Number of sentences found in the phrase.
len(tokens_sent)

2

In [12]:
# Two-level tokenization: word-tokenize each sentence separately and print
# one token list per sentence.
for item in tokens_sent:
    print(nltk.word_tokenize(item))

['I', 'am', 'learning', 'Natural', 'Learning', 'Processing', '.']
['I', 'am', 'learning', 'how', 'to', 'tokenize', '!']


In [17]:
#---------------
# NORMALIZING

In [18]:
# Load the word tokens of Moby Dick from NLTK's Gutenberg corpus.
# NOTE(review): the Gutenberg corpus data must already be installed; no visible
# cell downloads it -- confirm.
md = nltk.corpus.gutenberg.words("melville-moby_dick.txt")

In [22]:
# Keep only the first 22 tokens as a small sample for the normalization demos.
md_22 = md[:22]

In [23]:
# Print only purely alphabetic tokens; anything containing digits or
# punctuation (e.g. dates, brackets) is skipped.
for word in md_22:
    if not word.isalpha():
        continue
    print(word)

Moby
Dick
by
Herman
Melville
ETYMOLOGY
Supplied
by
a
Late
Consumptive
Usher
to
a
Grammar
School


In [28]:
# Change all words to the same (lower) case.
# Nothing is filtered here, so non-alphabetic tokens such as '[' and '1851'
# are printed too.
for word in md_22:
    print(word.lower())

[
moby
dick
by
herman
melville
1851
]
etymology
.
(
supplied
by
a
late
consumptive
usher
to
a
grammar
school
)


In [32]:
# Normalizing (text preprocessing) refers to some or all of the following:
# 1. converting all letters to lower or upper case.
# 2. converting numbers into words, or removing numbers.
# 3. removing punctuation, accent marks and other diacritics.
# 4. removing white space.
# 5. expanding abbreviations.
# 6. removing stop words, sparse terms, and particular words.
# 7. text canonicalization.
#
# Great link: https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908

# This cell combines the two previous demos: drop every token that is not
# purely alphabetic (numbers, punctuation) and lowercase what remains.
norm = [
    token.lower()
    for token in md_22
    if token.isalpha()
]

In [33]:
# Display the normalized sample: lowercase, alphabetic tokens only.
norm

['moby',
 'dick',
 'by',
 'herman',
 'melville',
 'etymology',
 'supplied',
 'by',
 'a',
 'late',
 'consumptive',
 'usher',
 'to',
 'a',
 'grammar',
 'school']

In [45]:
# Using stemmers - Porter
porter = nltk.PorterStemmer()
my_list = ["cat", "cats", "lie", "lying", "run", "running", "city", "cities", "woman", "women", "organize", "organization", "organizations"]
print([porter.stem(word) for word in my_list])

['cat', 'cat', 'lie', 'lie', 'run', 'run', 'citi', 'citi', 'woman', 'women', 'organ', 'organ', 'organ']


In [46]:
# Stemming with the Lancaster stemmer on the same word list.
# It truncates harder than Porter here ('wom', 'org' in the output) yet
# leaves 'lying' unchanged.
lanc = nltk.LancasterStemmer()
print(list(map(lanc.stem, my_list)))

['cat', 'cat', 'lie', 'lying', 'run', 'run', 'city', 'city', 'wom', 'wom', 'org', 'org', 'org']


In [47]:
# Lemmatization with the WordNet lemmatizer on the same word list.
# lemmatize() is called without a pos argument, so every word is treated as a
# noun (NLTK's default): 'women' -> 'woman' and 'cities' -> 'city', while verb
# forms such as 'lying' and 'running' come back unchanged.
wn_lem = nltk.WordNetLemmatizer()
print(list(map(wn_lem.lemmatize, my_list)))

['cat', 'cat', 'lie', 'lying', 'run', 'running', 'city', 'city', 'woman', 'woman', 'organize', 'organization', 'organization']
