# NLP Toolkits and Preprocessing Techniques

### Tokenization = splitting raw text into small, indivisible units for processing
These units can be: words, sentences, n-grams, or other character sequences defined by regular expressions.

In [1]:
import nltk

In [2]:
# Fetch the 'punkt' data package through the NLTK downloader.
# NOTE(review): presumably required by word_tokenize / sent_tokenize below, and
# recent NLTK releases may look for 'punkt_tab' instead — confirm against the
# installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Sample text for the tokenization cells below. It contains an abbreviation
# ("Mr."), a curly apostrophe ("I’m"), parentheses, a token with digits ("2lbs")
# and a hyphenated compound ("black-eyed").
my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up 2lbs of black-eyed peas as well?"

In [4]:
from nltk.tokenize import word_tokenize

# Word-level tokenization of the sample text: split first, then show the list.
word_tokens = word_tokenize(my_text)
print(word_tokens)

['Hi', 'Mr.', 'Smith', '!', 'I', '’', 'm', 'going', 'to', 'buy', 'some', 'vegetables', '(', 'tomatoes', 'and', 'cucumbers', ')', 'from', 'the', 'store', '.', 'Should', 'I', 'pick', 'up', '2lbs', 'of', 'black-eyed', 'peas', 'as', 'well', '?']


In [5]:
from nltk.tokenize import sent_tokenize

# Sentence-level tokenization of the same sample text.
sentences = sent_tokenize(my_text)
print(sentences)

['Hi Mr. Smith!', 'I’m going to buy some vegetables (tomatoes and cucumbers) from the store.', 'Should I pick up 2lbs of black-eyed peas as well?']


In [6]:
from nltk.util import ngrams

# n-grams are built over the word tokens of the sample text.
my_words = word_tokenize(my_text)

# Bigrams: every pair of adjacent tokens.
twograms = [*ngrams(my_words, 2)]
print(twograms)

# Visual separator between the two listings.
print('------------------------------------------')

# Trigrams: every run of three adjacent tokens.
threegrams = [*ngrams(my_words, 3)]
print(threegrams)

[('Hi', 'Mr.'), ('Mr.', 'Smith'), ('Smith', '!'), ('!', 'I'), ('I', '’'), ('’', 'm'), ('m', 'going'), ('going', 'to'), ('to', 'buy'), ('buy', 'some'), ('some', 'vegetables'), ('vegetables', '('), ('(', 'tomatoes'), ('tomatoes', 'and'), ('and', 'cucumbers'), ('cucumbers', ')'), (')', 'from'), ('from', 'the'), ('the', 'store'), ('store', '.'), ('.', 'Should'), ('Should', 'I'), ('I', 'pick'), ('pick', 'up'), ('up', '2lbs'), ('2lbs', 'of'), ('of', 'black-eyed'), ('black-eyed', 'peas'), ('peas', 'as'), ('as', 'well'), ('well', '?')]
------------------------------------------
[('Hi', 'Mr.', 'Smith'), ('Mr.', 'Smith', '!'), ('Smith', '!', 'I'), ('!', 'I', '’'), ('I', '’', 'm'), ('’', 'm', 'going'), ('m', 'going', 'to'), ('going', 'to', 'buy'), ('to', 'buy', 'some'), ('buy', 'some', 'vegetables'), ('some', 'vegetables', '('), ('vegetables', '(', 'tomatoes'), ('(', 'tomatoes', 'and'), ('tomatoes', 'and', 'cucumbers'), ('and', 'cucumbers', ')'), ('cucumbers', ')', 'from'), (')', 'from', 'the'), 

In [7]:
from nltk.tokenize import RegexpTokenizer

# Whitespace tokenizer: with gaps=True the pattern describes the separators
# between tokens, so the text is split on runs of whitespace and punctuation
# stays attached to the neighbouring word.
# The pattern is a raw string so that "\s" reaches the regex engine as written
# instead of relying on Python tolerating an unrecognised string escape
# (which triggers a warning on current Python versions).
whitespace_tokenize = RegexpTokenizer(r"\s+", gaps=True)

print(whitespace_tokenize.tokenize(my_text))

['Hi', 'Mr.', 'Smith!', 'I’m', 'going', 'to', 'buy', 'some', 'vegetables', '(tomatoes', 'and', 'cucumbers)', 'from', 'the', 'store.', 'Should', 'I', 'pick', 'up', '2lbs', 'of', 'black-eyed', 'peas', 'as', 'well?']


In [8]:
# RegexpTokenizer that keeps only capitalized words.
# The pattern needs an uppercase letter followed by at least one more word
# character, so a lone "I" is not matched and the "." of "Mr." is dropped.
# Raw string: stops "\w" from being read as an (invalid) Python string escape.

cap_tokenizer = RegexpTokenizer(r"[A-Z]\w+")
cap_tokenizer.tokenize(my_text)

['Hi', 'Mr', 'Smith', 'Should']

In [9]:
import re
import string


# Replace every ASCII punctuation character with a single space.
# string.punctuation is escaped so that characters such as "]", "\" and "-" are
# taken literally inside the character class. Marks outside string.punctuation
# (e.g. the curly apostrophe in "I’m") are left in place.
# The pattern is built with str.format; the previous '% s' placeholder carried a
# stray space flag that has no meaning for %s.
clean_text = re.sub('[{}]'.format(re.escape(string.punctuation)), ' ', my_text)
clean_text

'Hi Mr  Smith  I’m going to buy some vegetables  tomatoes and cucumbers  from the store  Should I pick up 2lbs of black eyed peas as well '

In [10]:
# Lower-case the cleaned text. The result is only displayed: it is not assigned,
# so clean_text itself keeps its original casing.
clean_text.lower()

'hi mr  smith  i’m going to buy some vegetables  tomatoes and cucumbers  from the store  should i pick up 2lbs of black eyed peas as well '

In [11]:
# Removes all words containing digits (each such word is replaced by one space).
# NOTE: the substitution starts again from my_text, so clean_text is overwritten
# and the punctuation removal from the earlier cell is not carried over.
# Raw string so "\w" / "\d" reach the regex engine as written rather than being
# treated as (invalid) Python string escapes.
clean_text = re.sub(r'\w*\d\w*', ' ', my_text)
clean_text

'Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up   of black-eyed peas as well?'

###### Lambda & Maps

In [12]:
# Three small sample strings, each containing one token with digits.
text1 = 'tomotos weighs 10lb'
text2 = 'i am going to 111 Wall st'
text3 = 'Box has 10 bottels of water'

text = [text1, text2, text3]

# Replace every token that contains a digit with a single space.
# Raw string so "\w" / "\d" are not treated as (invalid) Python string escapes.
remove_numbers = lambda x: re.sub(r'\w*\d\w*', ' ', x)

# map() applies the lambda to each string; list() materialises the result.
text = list(map(remove_numbers, text))
print(text)

['tomotos weighs  ', 'i am going to   Wall st', 'Box has   bottels of water']


###### Stemming

In [13]:
from nltk.stem.lancaster import LancasterStemmer

# Lancaster stemmer instance reused by the stemming cells below.
stmr1 = LancasterStemmer()

In [14]:
# Stem a single word and display the result. The value coming back from stem()
# is shown directly; the previous spec-less format() wrapper added nothing to it.
stmr1.stem('cities')

'city'

In [15]:
# Stem several inflections of "drive". Every row is printed with the same
# "drive" label, so only the stem on the right-hand side varies.
for inflection in ('drive', 'drives', 'driver', 'drivers', 'driven'):
    print('drive: {}'.format(stmr1.stem(inflection)))

drive: driv
drive: driv
drive: driv
drive: driv
drive: driv


###### Parts of Speech

In [16]:
from nltk.tag import pos_tag

In [17]:
# Fetch the 'words' data package through the NLTK downloader.
# NOTE(review): presumably a dependency of the named-entity chunker used later — confirm.
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [18]:
# Fetch the 'averaged_perceptron_tagger' data package through the NLTK downloader.
# NOTE(review): presumably the model behind pos_tag() — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [19]:
# Part-of-speech tagging: tokenize the sentence, then tag every token.
# Note that my_text is rebound here, replacing the earlier sample text.
my_text = 'Donald Trumph Lives in United States'

sentence_words = word_tokenize(my_text)
token = pos_tag(sentence_words)   # list of (word, tag) pairs

print(token)

[('Donald', 'NNP'), ('Trumph', 'NNP'), ('Lives', 'VBZ'), ('in', 'IN'), ('United', 'NNP'), ('States', 'NNPS')]


In [20]:
# Fetch the 'tagsets' data package through the NLTK downloader.
# NOTE(review): presumably the data read by nltk.help.upenn_tagset() — confirm.
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [21]:
# Print the reference listing (tag, description, example words) for the UPenn
# tag set. It is called without arguments, so the printed listing is long.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

###### Named Entity Recognition  (Entity Extraction)

In [22]:
# Fetch the 'maxent_ne_chunker' data package through the NLTK downloader.
# NOTE(review): presumably the model behind ne_chunk() — confirm.
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [23]:
from nltk import ne_chunk

# POS-tag whatever my_text currently holds; the tagged tokens are the input to
# the named-entity chunker. This cell depends on my_text from an earlier cell.
tokens = pos_tag(word_tokenize(my_text))

# Group the tagged tokens into named-entity chunks.
entities = ne_chunk(tokens)
# NOTE(review): draw() appears to render the result in a separate GUI window
# rather than inline (no output is recorded for this cell) — confirm it works in
# headless / "Restart & Run All" runs.
entities.draw()

###### Compound Term Extraction

In [24]:
from nltk.tokenize import MWETokenizer       # multi-word expression

my_text = "You all are the greatest students of all time."

# Token sequences that should be fused into a single token.
compound_terms = [('You', 'all'), ('of', 'all', 'time')]
mwe_tokenizer = MWETokenizer(compound_terms)

# Word-tokenize first, then merge the registered sequences.
plain_tokens = word_tokenize(my_text)
mwe_tokens = mwe_tokenizer.tokenize(plain_tokens)

print(mwe_tokens)

['You_all', 'are', 'the', 'greatest', 'students', 'of_all_time', '.']
