## Natural Language - Text Processing

# Tokenization

In [52]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize , sent_tokenize

text = "I believe this would help the reader understand how tokenization \
works. as well as realize its importance."


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [53]:
sents = (sent_tokenize(text))
print(sents)

['I believe this would help the reader understand how tokenization works.', 'as well as realize its importance.']


In [54]:
words = (word_tokenize(text))
print(words)

['I', 'believe', 'this', 'would', 'help', 'the', 'reader', 'understand', 'how', 'tokenization', 'works', '.', 'as', 'well', 'as', 'realize', 'its', 'importance', '.']


In [55]:
words = [word_tokenize(sent) for sent in sents]
print (words)

[['I', 'believe', 'this', 'would', 'help', 'the', 'reader', 'understand', 'how', 'tokenization', 'works', '.'], ['as', 'well', 'as', 'realize', 'its', 'importance', '.']]


# Stop Word

In [56]:
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
text = "I believe this would help the reader understand how tokenization \
works. as well as realize its importance (text)."


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
print(punctuation)
# These are the punctuations available

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [58]:
custom_list = set(stopwords.words('english')+list(punctuation))
print(custom_list)

{'by', 'hers', 'yourself', "you're", 'won', 'her', 't', 'this', 'o', 'he', 've', 'can', 'needn', 'an', 'hasn', '/', "she's", 'she', 'do', 'as', 'not', 'be', 'will', 'after', 'other', 'on', 'further', '<', 'off', 'should', 'so', "aren't", 'doesn', 'now', 'hadn', 'for', 'with', 'while', 'some', 'y', 'who', 'haven', 'mustn', "'", 'ain', 'very', '^', '@', ';', '>', 'those', 'me', 'through', 'until', '}', 'the', 'ours', "didn't", 'shan', 'if', "hasn't", "should've", 'against', 'has', 'my', 'where', 'just', 'm', 'theirs', "couldn't", 'been', "shouldn't", '*', 'ma', 'don', 's', '-', 'any', '(', 'had', '$', 'couldn', 'does', 'again', 'doing', 'i', 'isn', ']', 'weren', "you'll", 'of', 'at', 'same', "needn't", 'or', ')', 'which', 'himself', "isn't", 'its', 'here', 'few', 'to', '%', 'myself', 'but', 'down', "mightn't", 'each', 'too', "mustn't", ',', 'below', 'how', 'have', 'their', 'in', 'over', '#', 'our', 'and', 'such', "hadn't", 'out', '_', 'both', 'up', 'from', 'ourselves', 'herself', 'a', 'd

In [59]:
word_list = [word for word in word_tokenize(text) if word not in custom_list ]
print(word_list)

['I', 'believe', 'would', 'help', 'reader', 'understand', 'tokenization', 'works', 'well', 'realize', 'importance', 'text']


# N-Grams

Bigram

In [73]:
import nltk
from nltk.collocations import BigramCollocationFinder

word_list = ['I', 'believe', 'would', 'help', 'reader', 'understand', 'tokenization', 'works', 'well', 'realize', 'importance', 'text']

In [74]:
finder = BigramCollocationFinder.from_words(word_list)
print(finder.ngram_fd.items())

dict_items([(('I', 'believe'), 1), (('believe', 'would'), 1), (('would', 'help'), 1), (('help', 'reader'), 1), (('reader', 'understand'), 1), (('understand', 'tokenization'), 1), (('tokenization', 'works'), 1), (('works', 'well'), 1), (('well', 'realize'), 1), (('realize', 'importance'), 1), (('importance', 'text'), 1)])


Trigram

In [75]:
import nltk
from nltk.collocations import TrigramCollocationFinder

word_list = ['I', 'believe', 'would', 'help', 'reader', 'understand', 'tokenization', 'works', 'well', 'realize', 'importance', 'text']

In [76]:
finder = TrigramCollocationFinder.from_words(word_list)
print(finder.ngram_fd.items())

dict_items([(('I', 'believe', 'would'), 1), (('believe', 'would', 'help'), 1), (('would', 'help', 'reader'), 1), (('help', 'reader', 'understand'), 1), (('reader', 'understand', 'tokenization'), 1), (('understand', 'tokenization', 'works'), 1), (('tokenization', 'works', 'well'), 1), (('works', 'well', 'realize'), 1), (('well', 'realize', 'importance'), 1), (('realize', 'importance', 'text'), 1)])


# Stemming

In [83]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer

new_text = "It is important to by very pythonly while you are pythoning) with python. All pythoners have pythoned poorly at least once]"


In [84]:
l_s = LancasterStemmer()
stem_lan = [l_s.stem(word)for word in word_tokenize(new_text)]
print(stem_lan)

['it', 'is', 'import', 'to', 'by', 'very', 'python', 'whil', 'you', 'ar', 'python', ')', 'with', 'python', '.', 'al', 'python', 'hav', 'python', 'poor', 'at', 'least', 'ont', ']']


# WSD- Word Sense Disambiguation

In [6]:
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [7]:
for ss in wordnet.synsets('mouse') :
  print(ss , ss.definition())

Synset('mouse.n.01') any of numerous small rodents typically resembling diminutive rats having pointed snouts and small ears on elongated bodies with slender usually hairless tails
Synset('shiner.n.01') a swollen bruise caused by a blow to the eye
Synset('mouse.n.03') person who is quiet or timid
Synset('mouse.n.04') a hand-operated electronic device that controls the coordinates of a cursor on your computer screen as you move it around on a pad; on the bottom of the device is a ball that rolls on the surface of the pad
Synset('sneak.v.01') to go stealthily or furtively
Synset('mouse.v.02') manipulate the mouse of a computer


In [8]:
import nltk

nltk.download('punkt')
nltk.download('wordnet')

from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

context_1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), "bass")
print(context_1, context_1.definition())

context_2 = lesk(word_tokenize("The sea bass is really very hard to catch"), "bass")
print(context_2, context_2.definition())


Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Count Vectorizer - Python

In [1]:
import pandas as pd
corpus = [
'This is the first document from heaven',
'but the second document is from mars',
'And this is the third one from nowhere',
'Is this the first document from nowhere?',
]

df = pd.DataFrame({'Text': corpus})
print(df)


                                       Text
0    This is the first document from heaven
1      but the second document is from mars
2    And this is the third one from nowhere
3  Is this the first document from nowhere?


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

count_v = CountVectorizer()
X = count_v.fit_transform(df.Text).toarray()
feature_names = count_v.get_feature_names_out()
print(feature_names)

['and' 'but' 'document' 'first' 'from' 'heaven' 'is' 'mars' 'nowhere'
 'one' 'second' 'the' 'third' 'this']


In [5]:
print(X)
print(count_v.vocabulary_)

[[0 0 1 1 1 1 1 0 0 0 0 1 0 1]
 [0 1 1 0 1 0 1 1 0 0 1 1 0 0]
 [1 0 0 0 1 0 1 0 1 1 0 1 1 1]
 [0 0 1 1 1 0 1 0 1 0 0 1 0 1]]
{'this': 13, 'is': 6, 'the': 11, 'first': 3, 'document': 2, 'from': 4, 'heaven': 5, 'but': 1, 'second': 10, 'mars': 7, 'and': 0, 'third': 12, 'one': 9, 'nowhere': 8}


In [7]:
count_v = CountVectorizer(stop_words = ['this','is'])
X = count_v.fit_transform(df.Text).toarray()

In [8]:
print(X)
print(count_v.vocabulary_)

[[0 0 1 1 1 1 0 0 0 0 1 0]
 [0 1 1 0 1 0 1 0 0 1 1 0]
 [1 0 0 0 1 0 0 1 1 0 1 1]
 [0 0 1 1 1 0 0 1 0 0 1 0]]
{'the': 10, 'first': 3, 'document': 2, 'from': 4, 'heaven': 5, 'but': 1, 'second': 9, 'mars': 6, 'and': 0, 'third': 11, 'one': 8, 'nowhere': 7}


# TF-IDF Vectorizer

In [19]:
corpus = [
'This is the first document from heaven',
'but the second document is from mars',
'And this is the third one from nowhere',
'Is this the first document from nowhere?',
]

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

In [20]:
print (vectorizer.vocabulary_)

{'this': 13, 'is': 6, 'the': 11, 'first': 3, 'document': 2, 'from': 4, 'heaven': 5, 'but': 1, 'second': 10, 'mars': 7, 'and': 0, 'third': 12, 'one': 9, 'nowhere': 8}


In [22]:
print(vectorizer.idf_)

[1.91629073 1.91629073 1.22314355 1.51082562 1.         1.91629073
 1.         1.91629073 1.51082562 1.91629073 1.91629073 1.
 1.91629073 1.22314355]


# Hashing Vectorizer

In [28]:
corpus = [
'This is the first document from heaven',
'but the second document is from mars',
'And this is the third one from nowhere',
'Is this the first document from nowhere?',
]

from sklearn.feature_extraction.text import HashingVectorizer
import pandas as pd

df = pd.DataFrame({'Text' :corpus})
hash_v = HashingVectorizer(n_features =9 , norm = None , alternate_sign = False)
print(hash_v.fit_transform(df.Text).toarray())

[[0. 1. 2. 2. 1. 0. 1. 0. 0.]
 [0. 1. 2. 1. 2. 1. 0. 0. 0.]
 [1. 1. 0. 2. 1. 2. 1. 0. 0.]
 [0. 1. 2. 2. 1. 0. 1. 0. 0.]]


In [29]:
hash_v = HashingVectorizer(n_features =15 , norm = None , alternate_sign = False)
print(hash_v.fit_transform(df.Text).toarray())

[[0. 0. 1. 2. 0. 0. 0. 2. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 2. 1. 0. 1. 0. 2. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 1. 1. 2. 0. 0. 0. 0. 2. 0. 1.]
 [0. 0. 1. 1. 0. 0. 0. 2. 0. 0. 0. 1. 2. 0. 0.]]


In [31]:
hash_v = HashingVectorizer(n_features =15 , norm = None , alternate_sign = True)
print(hash_v.fit_transform(df.Text).toarray())

[[ 0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0. -1.  1.  0.  0.]
 [ 0.  0.  0. -1.  0. -1.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0. -1.  1.  0.  0.  0.  0.  0.  2.  0. -1.]
 [ 0.  0. -1. -1.  0.  0.  0.  0.  0.  0.  0. -1.  2.  0.  0.]]


# Example's

# Spam filter using CountVectorizer

In [34]:
corpus = [
'i earn 20 lakh rupees per month just chitchating on the net!' , #spam Mail
'are you free for a meeting anytime tomorrow?' # Useful
]


df = pd.DataFrame({'Text': corpus})
print(df)

from sklearn.feature_extraction.text import CountVectorizer

count_v = CountVectorizer()
X = count_v.fit_transform(df.Text).toarray()
print(X)
feature_names = count_v.get_feature_names_out()

print(X)
print(count_v.vocabulary_)


[[1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 1 0 0]
 [0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1]]
{'earn': 4, '20': 0, 'lakh': 8, 'rupees': 14, 'per': 13, 'month': 10, 'just': 7, 'chitchating': 3, 'on': 12, 'the': 15, 'net': 11, 'are': 2, 'you': 17, 'free': 6, 'for': 5, 'meeting': 9, 'anytime': 1, 'tomorrow': 16}


In [42]:
# New mail -> same as not spam with new words with exiting count vector
new_txt = ['io etrn are you free ruppee for a monnth meeting chitcchting anytime tomorrow neet']
df_new = pd.DataFrame({'new_txt':new_txt})
y = count_v.transform(df_new.new_txt).toarray()
print(y)

[[0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1]]


In [46]:
# New Count Vector
corpus = [
'io etrn are you free ruppee for a monnth meeting chitcchting anytime tomorrow neet'
]

df = pd.DataFrame({'Text': corpus})
from sklearn.feature_extraction.text import CountVectorizer
count_v = CountVectorizer()
X = count_v.fit_transform(df.Text).toarray()
feature_names = count_v.get_feature_names_out()
print(X)


[[1 1 1 1 1 1 1 1 1 1 1 1 1]]


We Have to train the model again and again so that span and not span would be classified properly

# Spam Filter using Hashing

In [54]:
corpus = [
'i earn 20 lakh rupees per month just chitchating on the net!' , #spam Mail
'are you free for a meeting anytime tomorrow?' # Useful
]

from sklearn.feature_extraction.text import HashingVectorizer
import pandas as pd

df = pd.DataFrame({'Text' :corpus})
hash_v = HashingVectorizer(n_features =60 , norm = None , alternate_sign = False)
print(hash_v.fit_transform(df.Text).toarray())


[[0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0.
  1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0.]]


In [55]:
corpus = ['io etrn are you free ruppee for a monnth meeting chitcchting anytime tomorrow neet']
from sklearn.feature_extraction.text import HashingVectorizer
import pandas as pd

df = pd.DataFrame({'Text' :corpus})
hash_v = HashingVectorizer(n_features =60 , norm = None , alternate_sign = False)
print(hash_v.fit_transform(df.Text).toarray())


[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.
  0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.]]
