<a href="https://colab.research.google.com/github/sudeepjd/Data-Analytics/blob/master/09-Natural%20Language%20Processing/NLP_PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Pre-Processing for NLP

## Import the Libraries

In [90]:
import numpy as np
import nltk
nltk.download('punkt') #For word splitting
nltk.download('averaged_perceptron_tagger') #For POS tagging

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [91]:
sent = "The quick brown fox jumped over the lazy dog"

## To Uppercase

In [92]:
sent_upper = sent.upper()
print(sent_upper)

THE QUICK BROWN FOX JUMPED OVER THE LAZY DOG


## To Lowercase

In [93]:
sent_lower = sent.lower()
print(sent_lower)

the quick brown fox jumped over the lazy dog


##  Tokenise to words

In [94]:
words = nltk.word_tokenize(sent)
print(words)

['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


## Part of Speech Tagging

In [95]:
words = nltk.word_tokenize(sent)
pos_tags = nltk.pos_tag(words)
print(pos_tags)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


## Sentence Chunking

In [96]:
grammar = "NP: {<DT>?<JJ>*<NN>*}"
words = nltk.word_tokenize(sent)
pos_tags = nltk.pos_tag(words)
cp = nltk.RegexpParser(grammar)
result = cp.parse(pos_tags)
print(result)

(S
  (NP The/DT quick/JJ brown/NN fox/NN)
  jumped/VBD
  over/IN
  (NP the/DT lazy/JJ dog/NN))


## Stemming

In [97]:
sent = "Laziness is not going to be any form of governance for women"
words = nltk.word_tokenize(sent)

porter = nltk.PorterStemmer()
sent_porter_stem = [porter.stem(w) for w in words]
print ("Porter Stemmer")
print(sent_porter_stem)

lancester = nltk.LancasterStemmer()
sent_lanc_stem = [lancester.stem(w) for w in words]
print ("\nLancester Stemmer")
print(sent_lanc_stem)

Porter Stemmer
['lazi', 'is', 'not', 'go', 'to', 'be', 'ani', 'form', 'of', 'govern', 'for', 'women']

Lancester Stemmer
['lazy', 'is', 'not', 'going', 'to', 'be', 'any', 'form', 'of', 'govern', 'for', 'wom']


## Remove Stop Words

In [98]:
#nltk.download('stopwords')
from nltk.corpus import stopwords

print('Stopwords')
stop_words = set(stopwords.words('english')) 
print(stop_words)

print("\nRemoved Stop Words")
filtered_words = [w for w in words if not w in stop_words]
print(words)
print(filtered_words) 

Stopwords
{"should've", 'had', 'more', "you've", 'while', "you'll", "didn't", 'me', 'all', 'mustn', 'other', 'o', 'up', 'yours', 'of', 'shouldn', 'each', 'down', 'once', 'into', 'aren', 'mightn', 'is', 'his', 'just', 'yourself', 'them', 'wouldn', 'couldn', "mustn't", 'wasn', 've', 'ours', 'didn', 'themselves', 'very', "shouldn't", 'such', 'my', 'don', 'll', 'about', 'shan', 'why', 'are', 'did', 'at', "isn't", 'haven', 'itself', 'was', 'herself', 'will', 'if', 'too', 'so', 'been', 'needn', 'being', 'd', 'which', 'what', 'on', 'when', 'through', 'ma', 'and', 'during', 'having', 'the', 'its', 'with', 'most', 'few', 'has', 'here', 't', 'by', 'hadn', 'between', 'over', 'or', "that'll", "wasn't", 'hers', 'for', 'whom', 'those', 's', 'before', 'not', 'our', 'from', 'your', "couldn't", "doesn't", "weren't", 'now', 'should', 'nor', "don't", 'ourselves', 'but', "haven't", 'who', 'does', 'against', 'out', 'how', 'only', 'he', "aren't", 'after', 'do', 'an', 'any', 'weren', 'doesn', 'i', 'theirs', 