<a href="https://colab.research.google.com/github/sudeepjd/Data-Analytics/blob/master/09-Natural%20Language%20Processing/NLP_PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Pre-Processing for NLP

## Import the Libraries

In [None]:
import numpy as np
import nltk
nltk.download('punkt') #For word splitting
nltk.download('averaged_perceptron_tagger') #For POS tagging

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
sent = "The quick brown fox jumped over the lazy dog"

## To Uppercase

In [None]:
sent_upper = sent.upper()
print(sent_upper)

THE QUICK BROWN FOX JUMPED OVER THE LAZY DOG


## To Lowercase

In [None]:
sent_lower = sent.lower()
print(sent_lower)

the quick brown fox jumped over the lazy dog


##  Tokenise to words

In [None]:
words = nltk.word_tokenize(sent)
print(words)

['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


## Part of Speech Tagging

In [None]:
words = nltk.word_tokenize(sent)
pos_tags = nltk.pos_tag(words)
print(pos_tags)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


## Sentence Chunking

In [None]:
grammar = "NP: {<DT>?<JJ>*<NN>*}"
words = nltk.word_tokenize(sent)
pos_tags = nltk.pos_tag(words)
cp = nltk.RegexpParser(grammar)
result = cp.parse(pos_tags)
print(result)

(S
  (NP The/DT quick/JJ brown/NN fox/NN)
  jumped/VBD
  over/IN
  (NP the/DT lazy/JJ dog/NN))


## Stemming

In [None]:
sent = "Laziness is not going to be any form of governance for women"
words = nltk.word_tokenize(sent)

porter = nltk.PorterStemmer()
sent_porter_stem = [porter.stem(w) for w in words]
print ("-- Porter Stemmer")
print(sent_porter_stem)

lancester = nltk.LancasterStemmer()
sent_lanc_stem = [lancester.stem(w) for w in words]
print ("\n-- Lancester Stemmer")
print(sent_lanc_stem)

-- Porter Stemmer
['lazi', 'is', 'not', 'go', 'to', 'be', 'ani', 'form', 'of', 'govern', 'for', 'women']

-- Lancester Stemmer
['lazy', 'is', 'not', 'going', 'to', 'be', 'any', 'form', 'of', 'govern', 'for', 'wom']


## Remove Stop Words

In [None]:
#nltk.download('stopwords')
from nltk.corpus import stopwords

print('-- Stopwords')
stop_words = set(stopwords.words('english')) 
print(stop_words)

print("\n-- Removed Stop Words")
filtered_words = [w for w in words if not w in stop_words]
print(words)
print(filtered_words) 

-- Stopwords
{"should've", 'had', 'more', "you've", 'while', "you'll", "didn't", 'me', 'all', 'mustn', 'other', 'o', 'up', 'yours', 'of', 'shouldn', 'each', 'down', 'once', 'into', 'aren', 'mightn', 'is', 'his', 'just', 'yourself', 'them', 'wouldn', 'couldn', "mustn't", 'wasn', 've', 'ours', 'didn', 'themselves', 'very', "shouldn't", 'such', 'my', 'don', 'll', 'about', 'shan', 'why', 'are', 'did', 'at', "isn't", 'haven', 'itself', 'was', 'herself', 'will', 'if', 'too', 'so', 'been', 'needn', 'being', 'd', 'which', 'what', 'on', 'when', 'through', 'ma', 'and', 'during', 'having', 'the', 'its', 'with', 'most', 'few', 'has', 'here', 't', 'by', 'hadn', 'between', 'over', 'or', "that'll", "wasn't", 'hers', 'for', 'whom', 'those', 's', 'before', 'not', 'our', 'from', 'your', "couldn't", "doesn't", "weren't", 'now', 'should', 'nor', "don't", 'ourselves', 'but', "haven't", 'who', 'does', 'against', 'out', 'how', 'only', 'he', "aren't", 'after', 'do', 'an', 'any', 'weren', 'doesn', 'i', 'theirs

## Bag of Words - Words to Vectors

### Count Vectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]

# create the transform
cv = CountVectorizer()
vector = cv.fit_transform(text).toarray()

# summarize
print("-- Vocab")
print(cv.vocabulary_)
print("\n-- Vector")
print(vector)

-- Vocab
{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}

-- Vector
[[1 1 1 1 1 1 1 2]]


### TF-IDF Vectorizer

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
		"The dog.",
		"The fox"]

# create the transform
tfv = TfidfVectorizer()
vector = tfv.fit_transform(text)

# summarize
print("-- Vocab")
print(tfv.vocabulary_)
print("\n-- IDF")
print(tfv.idf_)

# encode document
vector0 = tfv.transform([text[0]])

# summarize encoded vector
print("\n-- Vector 0")
print(vector0)

-- Vocab
{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}

-- IDF
[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]

-- Vector 0
  (0, 7)	0.4298344050159891
  (0, 6)	0.3638864554802418
  (0, 5)	0.3638864554802418
  (0, 4)	0.3638864554802418
  (0, 3)	0.3638864554802418
  (0, 2)	0.27674502873103346
  (0, 1)	0.27674502873103346
  (0, 0)	0.3638864554802418
