# Nltk for NLP by www.youtube.com/sentdex



#### Video 1 - Tokenization


In [1]:

import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize 

example_text = "December, it unexpectedly headed southeast toward Futuna. The system peaked at Category 3 on 28 December, with sustained winds of around 150 km/h (90 mph). It turned southwest the next day, toward Fiji and several smaller islands in the Lau group. The storm dissipated on 5 January over the north Tasman Sea. Raja caused two deaths as it impacted the island nations of Tuvalu, Wallis and Futuna, Tonga and Fiji. Gusty winds and rough seas caused extensive damage to crops, coastal installations and buildings in Tuvalu, and greater destruction in Futuna. Raja was responsible for the worst flood of the Labasa River in Fiji since 1929."

# print (sent_tokenize(example_text))
# print (word_tokenize(example_text))


## word tokenizing 
for i in word_tokenize (example_text):
    print (i)
    

December
,
it
unexpectedly
headed
southeast
toward
Futuna
.
The
system
peaked
at
Category
3
on
28
December
,
with
sustained
winds
of
around
150
km/h
(
90
mph
)
.
It
turned
southwest
the
next
day
,
toward
Fiji
and
several
smaller
islands
in
the
Lau
group
.
The
storm
dissipated
on
5
January
over
the
north
Tasman
Sea
.
Raja
caused
two
deaths
as
it
impacted
the
island
nations
of
Tuvalu
,
Wallis
and
Futuna
,
Tonga
and
Fiji
.
Gusty
winds
and
rough
seas
caused
extensive
damage
to
crops
,
coastal
installations
and
buildings
in
Tuvalu
,
and
greater
destruction
in
Futuna
.
Raja
was
responsible
for
the
worst
flood
of
the
Labasa
River
in
Fiji
since
1929
.


#### Video 2 - Stopwords 


In [2]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

example_sentence  = "This is an example showing off stop word filtraions."
stop_words = set(stopwords.words("english"))

#print (stop_words)    #Full collection of english stopwords 

words = word_tokenize(example_sentence)


# filtered_sentence = []
# for w in words :
#     if w not in stop_words:
#         filtered_sentence.append(w)
# print (filtered_sentence)


## also by using list comprehension
filtered_sentence  =[w for w in words if not w in stop_words]
print (filtered_sentence)

['This', 'example', 'showing', 'stop', 'word', 'filtraions', '.']



#### Video 3 - Stemming  


In [3]:
# removing sentenence that have same meaning

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
example_words  = ["Pyhton","Pythoner","Pythoning","Pythonly"]

# for w in example_words:
#     print (ps.stem(w))


new_text = "It is very import to be pythonly while you are pythoning the python . All pyhton have pyhton at least poorly"
words    =  word_tokenize(new_text)
for w in words:
    print (ps.stem(w))


It
is
veri
import
to
be
pythonli
while
you
are
python
the
python
.
all
pyhton
have
pyhton
at
least
poorli



#### Video 4 - Parts of speech tagging  


In [4]:
import nltk 
from nltk.corpus import state_union
from nltk.tokenize  import PunktSentenceTokenizer



"""
POS tag list:

CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent\'s
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when

"""

train_text  =  state_union.raw("2005-GWBush.txt")
sample_text =  state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer =  PunktSentenceTokenizer(train_text)

tokenized  =  custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

### video - 5  Chunking  

In [5]:
import nltk 
from nltk.corpus import state_union
from nltk.tokenize  import PunktSentenceTokenizer

train_text  =  state_union.raw("2005-GWBush.txt")
sample_text =  state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer =  PunktSentenceTokenizer(train_text)
tokenized  =  custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r""" Chunk: {<RB.?>*<VB,?>*<NNP.?>+<NN>?} """ #finding adverb,verb,propernoun,noun 
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked    = chunkParser.parse(tagged)
            print(chunked)
            #chunked.draw()  #for drawing graph of chunked data
                       
    except Exception as e:
        print(str(e))


process_content()



(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  called/VBD
  (Chunk America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  and/CC
  carried/VBD
  on/IN
  a/DT
  noble/JJ
  dream/NN
  ./.)
(S
  Tonight/NN
  we/PRP
  are/VBP
  comfort

### Video-6 Chinking 

In [6]:
#removal of something except something 
import nltk 
from nltk.corpus import state_union
from nltk.tokenize  import PunktSentenceTokenizer

train_text  =  state_union.raw("2005-GWBush.txt")
sample_text =  state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer =  PunktSentenceTokenizer(train_text)
tokenized  =  custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r""" Chunk: {<.*>+}
                                     }<VB.?|IN|DT|TO>+{"""   #chunk everything and keep verb out or prepostion,deteminant 
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked    = chunkParser.parse(tagged)
            #print(chunked)
            chunked.draw()  #for drawing graph of chunked data
                       
    except Exception as e:
        print(str(e))


process_content()



### video - 7 Name entity recognition

In [7]:
#removal of something except something 
import nltk 
from nltk.corpus import state_union
from nltk.tokenize  import PunktSentenceTokenizer

train_text  =  state_union.raw("2005-GWBush.txt")
sample_text =  state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer =  PunktSentenceTokenizer(train_text)
tokenized  =  custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            namedEnt  = nltk.ne_chunk(tagged,binary = True ) #binary = true ,it classify everthing as name entity period
            
            #namedEnt.draw()
            print(namedEnt)
            
    except Exception as e:
        print(str(e))


process_content()


"""
Name Entity  Type and Examples
ORGANIZATION - Georgia-Pacific Corp., WHO
PERSON - Eddy Bonte, President Obama
LOCATION - Murray River, Mount Everest
DATE - June, 2008-06-29
TIME - two fifty a m, 1:30 p.m.
MONEY - 175 million Canadian Dollars, GBP 10.40
PERCENT - twenty pct, 18.75 %
FACILITY - Washington Monument, Stonehenge
GPE - South East Asia, Midlothian
"""



'\nName Entity  Type and Examples\nORGANIZATION - Georgia-Pacific Corp., WHO\nPERSON - Eddy Bonte, President Obama\nLOCATION - Murray River, Mount Everest\nDATE - June, 2008-06-29\nTIME - two fifty a m, 1:30 p.m.\nMONEY - 175 million Canadian Dollars, GBP 10.40\nPERCENT - twenty pct, 18.75 %\nFACILITY - Washington Monument, Stonehenge\nGPE - South East Asia, Midlothian\n'

### video-8 lemmitizing 

In [1]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# print (lemmatizer.lemmatize("cats"))
# print (lemmatizer.lemmatize("cacti"))
# print (lemmatizer.lemmatize("geeze"))
# print (lemmatizer.lemmatize("rocks"))
# print (lemmatizer.lemmatize("pythoning "))

print (lemmatizer.lemmatize("better",pos="a"))
print (lemmatizer.lemmatize("best",pos="a"))

print (lemmatizer.lemmatize("ran","v"))

print (lemmatizer.lemmatize("better"))



good
best
run
better


###  video - 9 corpora

In [12]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample =  gutenberg.raw("bible-kjv.txt")
tok    = sent_tokenize(sample)

print (tok[5:15])


['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.', 'And the evening and the\nmorning were the second day.', '1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', '1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', '1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', '1:12 And the earth brought forth grass, and

### video - 10  wordnet 