In [42]:
from nltk.tag import pos_tag

In [43]:
ex = 'The big yellow bird flew over my house'

In [44]:
def preprocess(sent):
    """Tokenize a raw sentence and tag each token with its part of speech.

    Parameters
    ----------
    sent : str
        The raw sentence text.

    Returns
    -------
    list of (str, str)
        ``(token, tag)`` pairs, exactly as returned by ``nltk.pos_tag``.
    """
    # Imported locally because the import cell above only brings in
    # `pos_tag`; without this the function raises NameError on a fresh
    # kernel unless some other cell happened to run `import nltk` first.
    import nltk

    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [45]:
sent = preprocess(ex)

In [46]:
sent

[('The', 'DT'),
 ('big', 'JJ'),
 ('yellow', 'JJ'),
 ('bird', 'NN'),
 ('flew', 'VBD'),
 ('over', 'IN'),
 ('my', 'PRP$'),
 ('house', 'NN')]

In [7]:
nltk.help.upenn_tagset("JJ")

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [48]:
nltk.help.upenn_tagset("VBD")

VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...


In [8]:
nltk.help.upenn_tagset("NN")

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [9]:
# Same VBD tag lookup as the earlier help cell; the printed description is identical.
nltk.help.upenn_tagset("VBD")

VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...


## Noun Phrase Chunking

In [49]:
np_rule = "NP: {<DT>?<JJ>*<NN>}"
# A noun phrase (NP) chunk is formed when the chunker finds an optional
# determiner (DT), followed by any number of adjectives (JJ), and then a noun (NN).

In [51]:
# Build a chunk parser from the single NP rule and run it over the
# POS-tagged tokens held in `sent`.
cp = nltk.RegexpParser(np_rule)
cs = cp.parse(sent)
# Printing the result shows the chunk structure as a bracketed tree.
print(cs)

(S
  (NP The/DT big/JJ yellow/JJ bird/NN)
  flew/VBD
  over/IN
  my/PRP$
  (NP house/NN))


In [53]:
cs.draw()

In [54]:
text = "the little yellow dog barked at cat"

In [55]:
sent = preprocess(text)

In [57]:
# Same parse steps as for the first example sentence; `sent` now holds the
# tagged tokens of `text`. The parser is rebuilt from the unchanged `np_rule`.
cp = nltk.RegexpParser(np_rule)
cs = cp.parse(sent)
print(cs)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP cat/NN))


In [58]:
cs.draw()

In [17]:
sentences = nltk.sent_tokenize(text)

In [18]:
sentences

['the little yellow dog barked at cat']

In [19]:
# Split each sentence string into word tokens.
# NOTE: this rebinds `sentences` (list of str -> list of token lists), so
# re-running the cell would hand token lists, not strings, to word_tokenize.
sentences = [nltk.word_tokenize(sent) for sent in sentences]


In [20]:
sentences

[['the', 'little', 'yellow', 'dog', 'barked', 'at', 'cat']]

In [21]:
# Tag every token of every sentence with its part of speech.
# NOTE: `sentences` is rebound again; afterwards each inner list holds
# (token, tag) pairs instead of plain token strings.
sentences = [nltk.pos_tag(sent) for sent in sentences]
print(sentences)

[[('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('cat', 'NN')]]


In [22]:
sentences

[[('the', 'DT'),
  ('little', 'JJ'),
  ('yellow', 'JJ'),
  ('dog', 'NN'),
  ('barked', 'VBD'),
  ('at', 'IN'),
  ('cat', 'NN')]]

In [23]:
# Chunk grammar with one NP rule, using the same tag pattern as `np_rule`:
# optional determiner, any number of adjectives, then a noun.
# The trailing "# NP" sits inside the string literal, so it is part of the
# grammar text handed to the parser rather than a Python comment.
grammer = ('''
            NP: {<DT>?<JJ>*<NN>} # NP
            ''')

In [24]:
cp = nltk.RegexpParser(grammer)

In [25]:
cp

<chunk.RegexpParser with 1 stages>

## Regex

In [62]:
# Standard-library regular expressions, used here to locate runs of digits.
import re

# Sample sentence containing several numbers.
input_str = "Team A has 6 batsman and 5 bowlers, while team b has 5 batsman and 6 bowlers"

# Compile the "one or more digits" pattern once, then delete every match.
# The spaces on either side of each number are left untouched.
digit_run = re.compile(r"\d+")
output = digit_run.sub("", input_str)

# Show the sentence with the numbers stripped out.
print("remove numbers :", output)


remove numbers : Team A has  batsman and  bowlers, while team b has  batsman and  bowlers


In [63]:
# Strip punctuation marks from a sentence.
# Only the characters listed in string.punctuation are removed.

# Regular expressions plus the string-constants module.
import re, string

# Sample sentence with punctuation scattered through it.
input_str = "Sentence. having. string with. Punctuation?"

# Build one character class out of every (escaped) punctuation mark,
# compile it once, and delete all matches.
punctuation_class = re.compile('[%s]' % re.escape(string.punctuation))
result = punctuation_class.sub('', input_str)

# Show the sentence with the punctuation stripped out.
print("result after removing punctuation :", result)


result after removing punctuation : Sentence having string with Punctuation


In [69]:
# Filter English stop words out of a sentence with NLTK.

# Stop-word corpus reader and word tokenizer.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample sentence to filter.
input_str = "Stop words are the words that are filtered before and after processing of text."

# A set gives constant-time membership tests in the comprehensions below.
stop_word = set(stopwords.words("english"))

# Split the sentence into word and punctuation tokens.
token = word_tokenize(input_str)

# Partition the tokens into kept words and stop words.
# The membership test uses the lower-cased token because the stop-word list
# is lower case; a capitalised stop word (e.g. a sentence-initial "The")
# would otherwise slip through. Tokens keep their original casing.
output = [i for i in token if i.lower() not in stop_word]
stop_text = [i for i in token if i.lower() in stop_word]

# Report the original sentence, the filtered tokens and the distinct stop words found.
print("original text :", input_str )
print("-------------------------------------------------------------------------------------")
print("remove stopwords :", output)
print("-------------------------------------------------------------------------------------")
print("stopwords :", set(stop_text))

original text : Stop words are the words that are filtered before and after processing of text.
-------------------------------------------------------------------------------------
remove stopwords : ['Stop', 'words', 'words', 'filtered', 'processing', 'text', '.']
-------------------------------------------------------------------------------------
stopwords : {'are', 'before', 'of', 'the', 'that', 'after', 'and'}


In [73]:
# Take the sentence at index 10 from the CoNLL-2000 corpus reader's
# chunk-annotated sentences.
from nltk.corpus import conll2000
chunked_sent = conll2000.chunked_sents()[10]
# Printed as a bracketed tree whose subtrees are the NP / VP / PP chunks.
print(chunked_sent)

(S
  (NP He/PRP)
  (VP reckons/VBZ)
  (NP the/DT current/JJ account/NN deficit/NN)
  (VP will/MD narrow/VB)
  (PP to/TO)
  (NP only/RB #/# 1.8/CD billion/CD)
  (PP in/IN)
  (NP September/NNP)
  ./.)


In [74]:
chunked_sent.draw()

In [75]:
# Inside-Outside-Beginning (IOB) format: the chunk tree is flattened to one
# (word, POS tag, chunk tag) triple per token, where a "B-" tag starts a
# chunk, "I-" continues it and "O" marks a token outside any chunk.
# `conlltags2tree` is imported but not used in the cells shown here.
from nltk.chunk import tree2conlltags, conlltags2tree
iob_tageed = tree2conlltags(chunked_sent)
print(iob_tageed)

[('He', 'PRP', 'B-NP'), ('reckons', 'VBZ', 'B-VP'), ('the', 'DT', 'B-NP'), ('current', 'JJ', 'I-NP'), ('account', 'NN', 'I-NP'), ('deficit', 'NN', 'I-NP'), ('will', 'MD', 'B-VP'), ('narrow', 'VB', 'I-VP'), ('to', 'TO', 'B-PP'), ('only', 'RB', 'B-NP'), ('#', '#', 'I-NP'), ('1.8', 'CD', 'I-NP'), ('billion', 'CD', 'I-NP'), ('in', 'IN', 'B-PP'), ('September', 'NNP', 'B-NP'), ('.', '.', 'O')]


In [76]:
print(len(conll2000.chunked_words()))
print(len(conll2000.chunked_sents()))

166433
10948


### *Chinking* means identifying the pieces of information you would like to remove from the chunks found by the chunker

In [79]:
import nltk
from nltk.tokenize import word_tokenize

In [80]:
sent = """We are going to chink this sentence to remove all nouns. All of the other
    words will be there. Except for the nouns"""

In [81]:
print(nltk.pos_tag(word_tokenize(sent)))

[('We', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('to', 'TO'), ('chink', 'VB'), ('this', 'DT'), ('sentence', 'NN'), ('to', 'TO'), ('remove', 'VB'), ('all', 'DT'), ('nouns', 'NNS'), ('.', '.'), ('All', 'DT'), ('of', 'IN'), ('the', 'DT'), ('other', 'JJ'), ('words', 'NNS'), ('will', 'MD'), ('be', 'VB'), ('there', 'RB'), ('.', '.'), ('Except', 'IN'), ('for', 'IN'), ('the', 'DT'), ('nouns', 'NNS')]


In [82]:
chunkRule = r"Chunk: {<.*>+}"

In [83]:
chunkParser = nltk.RegexpParser(chunkRule)
chunkSent = chunkParser.parse(nltk.pos_tag(word_tokenize(sent)))

In [84]:
chunkSent.draw()

In [None]:
# Chunk rule delimiters:  {...}
# Chink rule delimiters:  }...{

In [39]:
# Two-step rule: first chunk every token ({<.*>+}), then chink runs of
# noun tags back out of that chunk (}...{).
# `NN.?` already matches NN, NNS and NNP, so of the extra alternatives only
# NNPS adds anything.
chunkRule = r"""Chunk: {<.*>+}
                        }<NN.?|NNS|NNP|NNPS>+{"""

In [85]:
chunkParser = nltk.RegexpParser(chunkRule)
chunkSent = chunkParser.parse(nltk.pos_tag(word_tokenize(sent)))

In [86]:
chunkSent.draw()

In [66]:
import nltk
from nltk.tokenize import word_tokenize

# Demo sentence for verb chinking. "remove" was previously misspelled, which
# put an unknown token in the place where the sentence needs a verb.
sent = "We are going to chink this sentence to remove all verbs"

# Chunk every token, then chink runs of verb tags back out of the chunk.
# `VB.?` on its own already matches VBP and VBG; the extra alternatives are
# kept exactly as written.
chunkRule = r"""Chunk: {<.*>+}
                        }<VB.?|VBP|VBG>+{"""
chunkParser = nltk.RegexpParser(chunkRule)
chunkSent = chunkParser.parse(nltk.pos_tag(word_tokenize(sent)))

# Render the resulting tree graphically.
chunkSent.draw()