<a href="https://colab.research.google.com/github/saurabh-maurya/NLP-Simple-Implementation/blob/master/NLP_Toolkit_and_Preprocessing_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [1]:
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all
    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Package abc is already up-to-date!
       | Downloading package alpino to /root/nltk_data...
       |   Package alpino is already up-to-date!
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Package biocreative_ppi is already up-to-date!
       | Downloading package brown to /root/nltk_data...
       |   Package brown is already up-to-date!
       | Downloading package brown_tei to /root/nltk_data...
       |   Package brown_tei is already up-to-date!
       | Downloading package cess_cat to /root/nltk_data...
       |   Package cess_cat is 

True

# Preprocessing Techniques



---

# 1. Tokenization
 - Turn text into a meaningful format for analysis

In [2]:
# Tokenization (Words)

from nltk.tokenize import word_tokenize
my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?" 
print(word_tokenize(my_text))

['Hi', 'Mr.', 'Smith', '!', 'I', '’', 'm', 'going', 'to', 'buy', 'some', 'vegetables', '(', 'tomatoes', 'and', 'cucumbers', ')', 'from', 'the', 'store', '.', 'Should', 'I', 'pick', 'up', 'some', 'black-eyed', 'peas', 'as', 'well', '?']


In [3]:
# Tokenization (Sentences)

from nltk.tokenize import sent_tokenize
my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?" 
print(sent_tokenize(my_text))

['Hi Mr. Smith!', 'I’m going to buy some vegetables (tomatoes and cucumbers) from the store.', 'Should I pick up some black-eyed peas as well?']


In [4]:
# Tokenization (N-Grams)

from nltk.util import ngrams
my_words = word_tokenize(my_text) # This is the list of all words
twograms = list(ngrams(my_words,2)) # This is for two-word combos, but can pick any n
print(twograms)

[('Hi', 'Mr.'), ('Mr.', 'Smith'), ('Smith', '!'), ('!', 'I'), ('I', '’'), ('’', 'm'), ('m', 'going'), ('going', 'to'), ('to', 'buy'), ('buy', 'some'), ('some', 'vegetables'), ('vegetables', '('), ('(', 'tomatoes'), ('tomatoes', 'and'), ('and', 'cucumbers'), ('cucumbers', ')'), (')', 'from'), ('from', 'the'), ('the', 'store'), ('store', '.'), ('.', 'Should'), ('Should', 'I'), ('I', 'pick'), ('pick', 'up'), ('up', 'some'), ('some', 'black-eyed'), ('black-eyed', 'peas'), ('peas', 'as'), ('as', 'well'), ('well', '?')]


In [5]:
# Tokenization (Regular Expressions)

from nltk.tokenize import RegexpTokenizer
my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?" 
whitespace_tokenize = RegexpTokenizer("\s+", gaps = True)
print(whitespace_tokenize.tokenize(my_text))

# RegexpTokenizer to match only capitalized words
cap_tokenizer = RegexpTokenizer("[A-Z]['\w]+")
print(cap_tokenizer.tokenize(my_text))

['Hi', 'Mr.', 'Smith!', 'I’m', 'going', 'to', 'buy', 'some', 'vegetables', '(tomatoes', 'and', 'cucumbers)', 'from', 'the', 'store.', 'Should', 'I', 'pick', 'up', 'some', 'black-eyed', 'peas', 'as', 'well?']
['Hi', 'Mr', 'Smith', 'Should']


# 2. Clean the Data

**1. Remove: capital letters, punctuation, numbers, stop words**

In [6]:
# Replace Punctuations With A White Space

import re # Regular expression library
import string
clean_text = re.sub('[%s]' % re.escape(string.punctuation), ' ', my_text)
print(clean_text)

Hi Mr  Smith  I’m going to buy some vegetables  tomatoes and cucumbers  from the store  Should I pick up some black eyed peas as well 


In [7]:
# Make All Text Lowercase

clean_text = clean_text.lower()
clean_text


'hi mr  smith  i’m going to buy some vegetables  tomatoes and cucumbers  from the store  should i pick up some black eyed peas as well '

In [8]:
# Removes all words containing digits

clean_text = re.sub('\w*\d\w*', ' ', clean_text)
clean_text

'hi mr  smith  i’m going to buy some vegetables  tomatoes and cucumbers  from the store  should i pick up some black eyed peas as well '

In [10]:
#  Stop Words
from nltk.corpus import stopwords

my_text = ["Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?"]

# Incorporate stop words when creating the count vectorizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(my_text)
pd.DataFrame(X.toarray(), columns=cv.get_feature_names())

Unnamed: 0,black,buy,cucumbers,eyed,going,hi,mr,peas,pick,smith,store,tomatoes,vegetables
0,1,1,1,1,1,1,1,1,1,1,1,1,1


**2. Stemming**

In [11]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
# Try some stems
print('drive: {}'.format(stemmer.stem('drive')))
print('drives: {}'.format(stemmer.stem('drives')))
print('driver: {}'.format(stemmer.stem('driver')))
print('drivers: {}'.format(stemmer.stem('drivers')))
print('driven: {}'.format(stemmer.stem('driven')))

words = cv.get_feature_names()
for word in words:
  print(stemmer.stem(word), end = " ")
  

drive: driv
drives: driv
driver: driv
drivers: driv
driven: driv
black buy cucumb ey going hi mr pea pick smi stor tomato veget 

**3. Parts of Speech Tagging**

In [12]:
from nltk.tag import pos_tag
my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?" 
tokens = pos_tag(word_tokenize(my_text))
print(tokens)


[('Hi', 'NNP'), ('Mr.', 'NNP'), ('Smith', 'NNP'), ('!', '.'), ('I', 'PRP'), ('’', 'VBP'), ('m', 'RB'), ('going', 'VBG'), ('to', 'TO'), ('buy', 'VB'), ('some', 'DT'), ('vegetables', 'NNS'), ('(', '('), ('tomatoes', 'NNS'), ('and', 'CC'), ('cucumbers', 'NNS'), (')', ')'), ('from', 'IN'), ('the', 'DT'), ('store', 'NN'), ('.', '.'), ('Should', 'MD'), ('I', 'PRP'), ('pick', 'VB'), ('up', 'RP'), ('some', 'DT'), ('black-eyed', 'JJ'), ('peas', 'NNS'), ('as', 'IN'), ('well', 'RB'), ('?', '.')]


In [13]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

**4.  Named Entity Recognition**

In [None]:

from nltk.chunk import ne_chunk
my_text = "James Smith lives in the United States."
tokens = pos_tag(word_tokenize(my_text)) # this labels each word as a part of speech
entities = ne_chunk(tokens) # this extracts entities from the list of words
entities

**5. Compound Term Extraction**

In [15]:
from nltk.tokenize import MWETokenizer # multi-word expression
my_text = "You all are the greatest students of all time."
mwe_tokenizer = MWETokenizer([('You','all'), ('of', 'all', 'time')])
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(my_text))
mwe_tokens

['You_all', 'are', 'the', 'greatest', 'students', 'of_all_time', '.']