In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
# tokenize means grouping of text either words or group of sentences
#lexicon - words and their meanings
# corpors = body of text
example_sent= "Hello, Mr. Smith, how are you doing today? The weather is great and Python is awesome. The sky is Pinkish-blue. Don't eat the cardboard."

print(word_tokenize(example_sent))
print(sent_tokenize(example_sent))

['Hello', ',', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'Pinkish-blue', '.', 'Do', "n't", 'eat', 'the', 'cardboard', '.']
['Hello, Mr. Smith, how are you doing today?', 'The weather is great and Python is awesome.', 'The sky is Pinkish-blue.', "Don't eat the cardboard."]


In [2]:
from nltk.corpus import stopwords
# stopwords are most common words in english language
stop_words= set(stopwords.words("english"))

print(stop_words)

filtered_sent= []
words= word_tokenize(example_sent)
for w in words:
    if w not in stop_words:
        filtered_sent.append(w)
        
print(filtered_sent)        

{'to', "you'll", 'yourself', 'in', "shouldn't", 'you', 'why', 'few', 'for', 'than', 'she', "it's", 'is', 't', 'will', 'all', 'the', 'these', 'been', 'themselves', 'aren', 'where', 'out', 'has', 'each', 'own', 'o', 'ain', 'because', 'mightn', 'by', 'under', 'so', 'not', "couldn't", 'or', 'once', 'again', 'having', "you'd", 'from', 'how', 'shan', 'up', 'hers', "won't", 'myself', 'while', 'here', 'wouldn', 'of', 'too', "didn't", 'its', 'on', "haven't", 'doesn', 'have', 'then', 've', 'whom', 'some', 'into', 'a', 'down', 'had', 'same', 'against', 'between', 'hasn', 'himself', "weren't", 'if', 'what', 'other', "hasn't", 'your', 'he', 'herself', 'doing', 'wasn', 'won', 'them', 'm', 'needn', 'my', 'shouldn', 'through', 'any', "mustn't", 'haven', "don't", 'below', 'ma', 'at', 'their', 'no', "you've", 'can', "aren't", 'nor', 'itself', 'further', 'him', 'those', 'ours', 'now', 'only', "should've", 'that', 'was', "she's", 'there', 'about', 'just', 's', 'which', 'we', 'until', 'didn', 'with', 'don'

In [7]:
'''
A stemmer for English, for example, should identify the string "cats" (and possibly "catlike", "catty" etc.) 
as based on the root "cat", and "stems", "stemmer", "stemming", "stemmed" as based on "stem". 
A stemming algorithm reduces the words "fishing", "fished", and "fisher" to the root word, "fish"
'''

from nltk.stem import PorterStemmer

ps= PorterStemmer()
new_txt= "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly in python atleast once."
words= word_tokenize(new_txt)

for w in words:
    print(ps.stem(w))

It
is
veri
import
to
be
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
in
python
atleast
onc
.


#### Part of Speech tagging does exactly what it sounds like, it tags each word in a sentence with the part of speech for that word. This means it labels words as noun, adjective, verb, etc. PoS tagging also covers tenses of the parts of speech. 

In [6]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer    # this tokenizer is ml based and it first get trained and then predicts

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)  # train the tokenizer first and then use it
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))
        
process_content()        

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

[('So', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('of', 'IN'), ('America', 'NNP'), ('supports', 'NNS'), ('democratic', 'JJ'), ('reform', 'NN'), ('across', 'IN'), ('the', 'DT'), ('broader', 'JJR'), ('Middle', 'NNP'), ('East', 'NNP'), ('.', '.')]
[('Elections', 'NNS'), ('are', 'VBP'), ('vital', 'JJ'), (',', ','), ('but', 'CC'), ('they', 'PRP'), ('are', 'VBP'), ('only', 'RB'), ('the', 'DT'), ('beginning', 'NN'), ('.', '.')]
[('Raising', 'VBG'), ('up', 'RP'), ('a', 'DT'), ('democracy', 'NN'), ('requires', 'VBZ'), ('the', 'DT'), ('rule', 'NN'), ('of', 'IN'), ('law', 'NN'), (',', ','), ('and', 'CC'), ('protection', 'NN'), ('of', 'IN'), ('minorities', 'NNS'), (',', ','), ('and', 'CC'), ('strong', 'JJ'), (',', ','), ('accountable', 'JJ'), ('institutions', 'NNS'), ('that', 'IN'), ('last', 'JJ'), ('longer', 'JJR'), ('than', 'IN'), ('a', 'DT'), ('single', 'JJ'), ('vote', 'NN'), ('.', '.')]
[('The', 'DT'), ('great', 'JJ'), ('people', 'NNS'), ('of', 'IN'), ('Egypt', 'NNP'), ('ha

[('And', 'CC'), ('we', 'PRP'), ('must', 'MD'), ('have', 'VB'), ('a', 'DT'), ('rational', 'JJ'), (',', ','), ('humane', 'JJ'), ('guest', 'JJS'), ('worker', 'NN'), ('program', 'NN'), ('that', 'WDT'), ('rejects', 'VBZ'), ('amnesty', 'JJ'), (',', ','), ('allows', 'VBZ'), ('temporary', 'JJ'), ('jobs', 'NNS'), ('for', 'IN'), ('people', 'NNS'), ('who', 'WP'), ('seek', 'VBP'), ('them', 'PRP'), ('legally', 'RB'), (',', ','), ('and', 'CC'), ('reduces', 'NNS'), ('smuggling', 'VBG'), ('and', 'CC'), ('crime', 'NN'), ('at', 'IN'), ('the', 'DT'), ('border', 'NN'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('Keeping', 'VBG'), ('America', 'NNP'), ('competitive', 'JJ'), ('requires', 'VBZ'), ('affordable', 'JJ'), ('health', 'NN'), ('care', 'NN'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('Our', 'PRP$'), ('government', 'NN'), ('has', 'VBZ'), ('a', 'DT'), ('responsibility', 'NN'), ('to', 'TO'), ('provide', 'VB'), ('health', 'NN'), ('care', 'NN'), (

[('We', 'PRP'), ('see', 'VBP'), ('great', 'JJ'), ('changes', 'NNS'), ('in', 'IN'), ('science', 'NN'), ('and', 'CC'), ('commerce', 'NN'), ('that', 'WDT'), ('will', 'MD'), ('influence', 'VB'), ('all', 'DT'), ('our', 'PRP$'), ('lives', 'NNS'), ('.', '.')]
[('Sometimes', 'RB'), ('it', 'PRP'), ('can', 'MD'), ('seem', 'VB'), ('that', 'DT'), ('history', 'NN'), ('is', 'VBZ'), ('turning', 'VBG'), ('in', 'IN'), ('a', 'DT'), ('wide', 'JJ'), ('arc', 'NN'), (',', ','), ('toward', 'IN'), ('an', 'DT'), ('unknown', 'JJ'), ('shore', 'NN'), ('.', '.')]
[('Yet', 'CC'), ('the', 'DT'), ('destination', 'NN'), ('of', 'IN'), ('history', 'NN'), ('is', 'VBZ'), ('determined', 'VBN'), ('by', 'IN'), ('human', 'JJ'), ('action', 'NN'), (',', ','), ('and', 'CC'), ('every', 'DT'), ('great', 'JJ'), ('movement', 'NN'), ('of', 'IN'), ('history', 'NN'), ('comes', 'VBZ'), ('to', 'TO'), ('a', 'DT'), ('point', 'NN'), ('of', 'IN'), ('choosing', 'NN'), ('.', '.')]
[('Lincoln', 'NNP'), ('could', 'MD'), ('have', 'VB'), ('accepte

#### Chunking in Natural Language Processing (NLP) is the process by which we group various words together by their part of speech tags. 

![image.png](attachment:image.png)

In [4]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer    # this tokenizer is ml based and it first get trained and then predicts

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)  # train the tokenizer first and then use it
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # r before the string is used for Raw text and not string where we have to use \\ instead of \
            # first part of regular expression is chunking and second part is chinking
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?} 
                                    }<RB.?|VB.?>+{"""     # this is called chinking which is removing something from chunk
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            #chunked.draw()
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
            print(chunked)
            
    except Exception as e:
        print(str(e))
        
process_content() 

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk America/NNP)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguishe

(Chunk Applause/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Hindsight/NNP)
(S
  (Chunk Hindsight/NNP)
  alone/RB
  is/VBZ
  not/RB
  wisdom/JJ
  ,/,
  and/CC
  second-guessing/NN
  is/VBZ
  not/RB
  a/DT
  strategy/NN
  ./.)
(Chunk Applause/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  With/IN
  so/RB
  much/JJ
  in/IN
  the/DT
  balance/NN
  ,/,
  those/DT
  of/IN
  us/PRP
  in/IN
  public/JJ
  office/NN
  have/VBP
  a/DT
  duty/NN
  to/TO
  speak/VB
  with/IN
  candor/NN
  ./.)
(Chunk Iraq/NNP)
(Chunk Iraqi/NNP)
(Chunk Laden/NNP)
(Chunk Zarqawi/NNP)
(Chunk America/NNP)
(S
  A/DT
  sudden/JJ
  withdrawal/NN
  of/IN
  our/PRP$
  forces/NNS
  from/IN
  (Chunk Iraq/NNP)
  would/MD
  abandon/VB
  our/PRP$
  (Chunk Iraqi/NNP)
  allies/NNS
  to/TO
  death/NN
  and/CC
  prison/NN
  ,/,
  would/MD
  put/VB
  men/NNS
  like/IN
  bin/NN
  (Chunk Laden/NNP)
  and/CC
  (Chunk Zarqawi/NNP)
  in/IN
  charge/NN
  of/IN
  a/DT
  strategic/JJ
  country/NN
  ,/,
  and/CC
  show/VBP
  that/IN
  a/

(S
  Protectionists/NNS
  want/VBP
  to/TO
  escape/VB
  competition/NN
  ,/,
  pretending/VBG
  that/IN
  we/PRP
  can/MD
  keep/VB
  our/PRP$
  high/JJ
  standard/NN
  of/IN
  living/NN
  while/IN
  walling/VBG
  off/RP
  our/PRP$
  economy/NN
  ./.)
(Chunk Washington/NNP)
(S
  Others/NNS
  say/VBP
  that/IN
  the/DT
  government/NN
  needs/VBZ
  to/TO
  take/VB
  a/DT
  larger/JJR
  role/NN
  in/IN
  directing/VBG
  the/DT
  economy/NN
  ,/,
  centralizing/VBG
  more/JJR
  power/NN
  in/IN
  (Chunk Washington/NNP)
  and/CC
  increasing/VBG
  taxes/NNS
  ./.)
(S
  We/PRP
  hear/VBP
  claims/NNS
  that/IN
  immigrants/NNS
  are/VBP
  somehow/RB
  bad/JJ
  for/IN
  the/DT
  economy/NN
  --/:
  even/RB
  though/IN
  this/DT
  economy/NN
  could/MD
  not/RB
  function/VB
  without/IN
  them/PRP
  ./.)
(Chunk Applause/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  All/PDT
  these/DT
  are/VBP
  forms/NNS
  of/IN
  economic/JJ
  retreat/NN
  ,/,
  and/CC
  they/PRP
  lead/VBP
  in/IN
  the

(S
  Violent/JJ
  crime/NN
  rates/NNS
  have/VBP
  fallen/VBN
  to/TO
  their/PRP$
  lowest/JJS
  levels/NNS
  since/IN
  the/DT
  1970s/CD
  ./.)
(S
  Welfare/NN
  cases/NNS
  have/VBP
  dropped/VBN
  by/IN
  more/JJR
  than/IN
  half/NN
  over/IN
  the/DT
  past/JJ
  decade/NN
  ./.)
(S
  Drug/NN
  use/NN
  among/IN
  youth/NN
  is/VBZ
  down/RB
  19/CD
  percent/NN
  since/IN
  2001/CD
  ./.)
(Chunk America/NNP)
(S
  There/EX
  are/VBP
  fewer/JJR
  abortions/NNS
  in/IN
  (Chunk America/NNP)
  than/IN
  at/IN
  any/DT
  point/NN
  in/IN
  the/DT
  last/JJ
  three/CD
  decades/NNS
  ,/,
  and/CC
  the/DT
  number/NN
  of/IN
  children/NNS
  born/VBN
  to/TO
  teenage/VB
  mothers/NNS
  has/VBZ
  been/VBN
  falling/VBG
  for/IN
  a/DT
  dozen/NN
  years/NNS
  in/IN
  a/DT
  row/NN
  ./.)
(Chunk Applause/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  These/DT
  gains/NNS
  are/VBP
  evidence/NN
  of/IN
  a/DT
  quiet/JJ
  transformation/NN
  --/:
  a/DT
  revolution/NN
  of/IN
  cons

#### Named entity recognition is useful to quickly find out what the subjects of discussion are. NLTK comes packed full of options for us. We can find just about any named entity, or we can look for specific ones.

#### NLTK can either recognize a general named entity, or it can even recognize locations, names, monetary amounts, dates, and more. 

In [5]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer    # this tokenizer is ml based and it first get trained and then predicts

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)  # train the tokenizer first and then use it
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # look for "white house" with and without binary true
            # chunking is gouping words based on their pos tag
            # binary true removes named entity type and can group continuous words
            namedEnt = nltk.ne_chunk(tagged, binary = True) 
            print(namedEnt)
            
    except Exception as e:
        print(str(e))
        
process_content() 

(S
  PRESIDENT/NNP
  (PERSON GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (ORGANIZATION ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  (ORGANIZATION JOINT/NNP)
  SESSION/NNP
  OF/IN
  (ORGANIZATION THE/NNP)
  (ORGANIZATION CONGRESS/NNP)
  ON/NNP
  THE/NNP
  (ORGANIZATION STATE/NNP OF/IN)
  (ORGANIZATION THE/NNP)
  (ORGANIZATION UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  (ORGANIZATION THE/NNP)
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
(S
  (PERSON Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (PERSON Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (ORGANIZATION Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (ORGANIZATION Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  called/VBD
  (GPE America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  

(S
  We/PRP
  are/VBP
  the/DT
  nation/NN
  that/IN
  saved/VBD
  liberty/NN
  in/IN
  (GPE Europe/NNP)
  ,/,
  and/CC
  liberated/VBD
  death/NN
  camps/NNS
  ,/,
  and/CC
  helped/VBD
  raise/VB
  up/RP
  democracies/NNS
  ,/,
  and/CC
  faced/VBD
  down/IN
  an/DT
  evil/JJ
  empire/NN
  ./.)
(S
  Once/RB
  again/RB
  ,/,
  we/PRP
  accept/VBP
  the/DT
  call/NN
  of/IN
  history/NN
  to/TO
  deliver/VB
  the/DT
  oppressed/VBN
  and/CC
  move/VB
  this/DT
  world/NN
  toward/IN
  peace/NN
  ./.)
(S
  We/PRP
  remain/VBP
  on/IN
  the/DT
  offensive/JJ
  against/IN
  terror/NN
  networks/NNS
  ./.)
(S
  We/PRP
  have/VBP
  killed/VBN
  or/CC
  captured/VBN
  many/JJ
  of/IN
  their/PRP$
  leaders/NNS
  --/:
  and/CC
  for/IN
  the/DT
  others/NNS
  ,/,
  their/PRP$
  day/NN
  will/MD
  come/VB
  ./.)
(S
  President/NNP
  (PERSON George/NNP W./NNP Bush/NNP)
  greets/VBZ
  members/NNS
  of/IN
  (ORGANIZATION Congress/NNP)
  after/IN
  his/PRP$
  State/NN
  of/IN
  the/DT
  (ORGANIZAT

(S
  Democracies/NNS
  in/IN
  the/DT
  (GPE Middle/NNP East/NNP)
  will/MD
  not/RB
  look/VB
  like/IN
  our/PRP$
  own/JJ
  ,/,
  because/IN
  they/PRP
  will/MD
  reflect/VB
  the/DT
  traditions/NNS
  of/IN
  their/PRP$
  own/JJ
  citizens/NNS
  ./.)
(S
  Yet/RB
  liberty/NN
  is/VBZ
  the/DT
  future/NN
  of/IN
  every/DT
  nation/NN
  in/IN
  the/DT
  (GPE Middle/NNP East/NNP)
  ,/,
  because/IN
  liberty/NN
  is/VBZ
  the/DT
  right/NN
  and/CC
  hope/NN
  of/IN
  all/DT
  humanity/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  President/NNP
  (PERSON George/NNP W./NNP Bush/NNP)
  waves/VBZ
  toward/IN
  the/DT
  upper/JJ
  visitors/NNS
  gallery/NN
  of/IN
  the/DT
  (ORGANIZATION House/NNP)
  Chamber/NNP
  following/VBG
  his/PRP$
  State/NN
  of/IN
  the/DT
  (ORGANIZATION Union/NNP)
  remarks/NNS
  Tuesday/NNP
  ,/,
  (PERSON Jan/NNP)
  ./.)
(S
  31/CD
  ,/,
  2006/CD
  at/IN
  the/DT
  (GPE United/NNP States/NNPS)
  Capitol/NNP
  ./.)
(S
  (FACILITY White/NNP)


  ./.)
(S
  In/IN
  a/DT
  dynamic/JJ
  world/NN
  economy/NN
  ,/,
  we/PRP
  are/VBP
  seeing/VBG
  new/JJ
  competitors/NNS
  ,/,
  like/IN
  (GPE China/NNP)
  and/CC
  (GPE India/NNP)
  ,/,
  and/CC
  this/DT
  creates/VBZ
  uncertainty/NN
  ,/,
  which/WDT
  makes/VBZ
  it/PRP
  easier/JJR
  to/TO
  feed/VB
  people/NNS
  's/POS
  fears/NNS
  ./.)
(S
  So/IN
  we/PRP
  're/VBP
  seeing/VBG
  some/DT
  old/JJ
  temptations/NNS
  return/NN
  ./.)
(S
  Protectionists/NNS
  want/VBP
  to/TO
  escape/VB
  competition/NN
  ,/,
  pretending/VBG
  that/IN
  we/PRP
  can/MD
  keep/VB
  our/PRP$
  high/JJ
  standard/NN
  of/IN
  living/NN
  while/IN
  walling/VBG
  off/RP
  our/PRP$
  economy/NN
  ./.)
(S
  Others/NNS
  say/VBP
  that/IN
  the/DT
  government/NN
  needs/VBZ
  to/TO
  take/VB
  a/DT
  larger/JJR
  role/NN
  in/IN
  directing/VBG
  the/DT
  economy/NN
  ,/,
  centralizing/VBG
  more/JJR
  power/NN
  in/IN
  (GPE Washington/NNP)
  and/CC
  increasing/VBG
  taxes/NNS
  ./.)
(S


(S
  And/CC
  here/RB
  we/PRP
  have/VBP
  a/DT
  serious/JJ
  problem/NN
  :/:
  (GPE America/NNP)
  is/VBZ
  addicted/VBN
  to/TO
  oil/NN
  ,/,
  which/WDT
  is/VBZ
  often/RB
  imported/VBN
  from/IN
  unstable/JJ
  parts/NNS
  of/IN
  the/DT
  world/NN
  ./.)
(S
  The/DT
  best/JJS
  way/NN
  to/TO
  break/VB
  this/DT
  addiction/NN
  is/VBZ
  through/IN
  technology/NN
  ./.)
(S
  Since/IN
  2001/CD
  ,/,
  we/PRP
  have/VBP
  spent/VBN
  nearly/RB
  $/$
  10/CD
  billion/CD
  to/TO
  develop/VB
  cleaner/JJR
  ,/,
  cheaper/JJR
  ,/,
  and/CC
  more/RBR
  reliable/JJ
  alternative/JJ
  energy/NN
  sources/NNS
  --/:
  and/CC
  we/PRP
  are/VBP
  on/IN
  the/DT
  threshold/NN
  of/IN
  incredible/JJ
  advances/NNS
  ./.)
(S
  So/RB
  tonight/JJ
  ,/,
  I/PRP
  announce/VBP
  the/DT
  (ORGANIZATION Advanced/NNP Energy/NNP)
  Initiative/NNP
  --/:
  a/DT
  22-percent/JJ
  increase/NN
  in/IN
  clean-energy/JJ
  research/NN
  --/:
  at/IN
  the/DT
  (ORGANIZATION Department/NNP)
 

(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  Today/NN
  marks/VBZ
  the/DT
  official/JJ
  retirement/NN
  of/IN
  a/DT
  very/RB
  special/JJ
  (GPE American/NNP)
  ./.)
(S
  For/IN
  24/CD
  years/NNS
  of/IN
  faithful/JJ
  service/NN
  to/TO
  our/PRP$
  nation/NN
  ,/,
  the/DT
  (GPE United/NNP States/NNPS)
  is/VBZ
  grateful/JJ
  to/TO
  (ORGANIZATION Justice/NNP Sandra/NNP)
  Day/NNP
  O'Connor/NNP
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  A/DT
  hopeful/JJ
  society/NN
  has/VBZ
  institutions/NNS
  of/IN
  science/NN
  and/CC
  medicine/NN
  that/WDT
  do/VBP
  not/RB
  cut/VB
  ethical/JJ
  corners/NNS
  ,/,
  and/CC
  that/IN
  recognize/VBP
  the/DT
  matchless/NN
  value/NN
  of/IN
  every/DT
  life/NN
  ./.)
(S
  Tonight/NNP
  I/PRP
  ask/VBP
  you/PRP
  to/TO
  pass/VB
  legislation/NN
  to/TO
  prohibit/VB
  the/DT
  most/RBS
  egregious/JJ
  abuses/NNS
  of/IN
  medical/JJ
  research/NN
  :/:
  human/JJ
  cloning/VBG
  in/IN
  all/DT
  its/PRP$
  fo

In [8]:
import nltk
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Sudhanshu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.


True

#### A very similar operation to stemming is called lemmatizing. The major difference between these is, as you saw earlier, stemming can often create non-existent words.

#### So, your root stem, meaning the word you end up with, is not something you can just look up in a dictionary.

#### A root lemma, on the other hand, is a real word. Many times, you will wind up with a very similar word, but sometimes, you will wind up with a completely different word.

In [25]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("kitten"))
print(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("better", pos='a'))  # a for adjective
# for many words to lemmatize use POS tagger and then use lemmatizer to get better stemmed word

cat
kitten
better
cactus
good


In [12]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sudhanshu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

#### Part of the NLTK Corpora is WordNet. I wouldn't totally classify WordNet as a Corpora, if anything it is really a giant Lexicon, but, either way, it is super useful. With WordNet we can do things like look up words and their meaning according to their parts of speech, we can find synonyms, antonyms, and even examples of the word in use. 

In [41]:
from nltk.corpus import wordnet

syns = wordnet.synsets('program')
# synset
print(syns)
# first word of synset
print(syns[0].name())
# just name of 1st synset
print(syns[0].lemmas()[0].name())
# definition
print(syns[0].definition())
#example
print(syns[0].examples())

synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        # Synsets represent the set of different senses of a particular word. Whereas lemmas as the synonyms within each sense
        synonyms.append(l.name())
        # The lemmas will be synonyms, and then you can use .antonyms to find the antonyms to the lemmas
        if(l.antonyms()):
            antonyms.append(l.antonyms()[0].name())
            
print(set(synonyms))
print(set(antonyms))

w1 = wordnet.synset("ship.n.01")   # it is synset not synsets ( diff of plural )
w2 = wordnet.synset("boat.n.01")
print(w1.wup_similarity(w2))

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")
print(w1.wup_similarity(w2))

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")
print(w1.wup_similarity(w2))

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]
plan.n.01
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']
{'right', 'near', 'secure', 'dependable', 'serious', 'good', 'unspoilt', 'full', 'salutary', 'estimable', 'well', 'honorable', 'soundly', 'just', 'thoroughly', 'beneficial', 'practiced', 'in_effect', 'commodity', 'respectable', 'safe', 'adept', 'upright', 'sound', 'undecomposed', 'goodness', 'honest', 'dear', 'unspoiled', 'skillful', 'trade_good', 'in_force', 'expert', 'ripe', 'effective', 'proficient', 'skilful'}
{'ill', 'badness', 'evilness', 'bad', 'evil'}
0.9090909090909091
0.6956521739130435
0.32


#### Now that we understand some of the basics of of natural language processing with the Python NLTK module, we're ready to try out text classification. This is where we attempt to identify a body of text with some sort of label. 

#### To start, we're going to use some sort of binary label. Examples of this could be identifying text as spam or not, or, like what we'll be doing, positive sentiment or negative sentiment. 

In [6]:
import nltk
import random  # for random shuffling of sentences
from nltk.corpus import movie_reviews

# documents contain list of tuples of words of a fileid of particular category and their movie category
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# first 1000 fileids in each category of corpora are positive sentiment and next 1000 are negative, so random shuffle is used
random.shuffle(documents)    
print(documents[2])        

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

# converting list of words (each word can occur multiple times) to frequency distribution (dictionary of words and their count) 
all_words = nltk.FreqDist(all_words)  
print(all_words.most_common(15))
print(all_words["stupid"])

(['capsule', ':', 'dumb', 'dud', 'of', 'an', 'entry', 'in', 'the', 'body', 'heat', 'sweepstakes', ',', 'and', 'now', 'something', 'of', 'a', 'landmark', 'for', 'having', 'spawned', 'a', 'jillion', 'clones', '.', 'basic', 'instinct', "'", 's', 'worst', 'crime', 'is', 'that', 'it', "'", 's', 'not', 'just', 'a', 'bad', 'movie', ',', 'but', 'in', 'many', 'ways', ',', 'an', 'incompetent', 'one', 'as', 'well', '.', 'it', "'", 's', 'stupid', 'in', 'itself', ',', 'and', 'stupid', 'to', 'think', 'its', 'audience', 'is', 'as', 'stupid', 'as', 'it', 'is', '.', 'it', "'", 's', 'supposed', 'to', 'be', 'a', 'thriller', ',', 'but', 'the', 'plot', 'is', 'dead', 'on', 'its', 'feet', ',', 'and', 'ultimately', 'arbitrary', ';', 'it', "'", 's', 'only', 'put', 'into', 'the', 'movie', 'to', 'jerk', 'us', 'around', '.', 'it', "'", 's', 'also', 'supposed', 'to', 'be', 'sexy', ',', 'but', 'it', "'", 's', 'instead', 'got', 'the', 'unpleasant', 'rawness', 'of', 'a', 'teenager', 'showing', 'younger', 'kids', 'his

In [43]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Sudhanshu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

#### For our text classification, we have to find some way to "describe" bits of data, which are labeled as either positive or negative for machine learning training purposes. 

#### These descriptions are called "features" in machine learning. For our project, we're just going to simply classify each word within a positive or negative review as a "feature" of that review. 

#### Then, as we go on, we can train a classifier by showing it all of the features of positive and negative reviews (all the words), and let it try to figure out the more meaningful differences between a positive review and a negative review, by simply looking for common negative review words and common positive review words.

### Naive Bayes Algorithm
#### The algorithm of choice, at least at a basic level, for text analysis is often the Naive Bayes classifier. Part of the reason for this is that text data is almost always massive in size. The Naive Bayes algorithm is so simple that it can be used at scale very easily with minimal process requirements

In [8]:
import nltk
import random
from nltk.corpus import movie_reviews

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# first 1000 fileids in corpora are positive sentiment and next 1000 are negative, so random shuffle is used
random.shuffle(documents)    
#print(documents[2])        

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)  # converting list of words to frequency distribution dictionary
#print(all_words.most_common(15))
#print(all_words["stupid"])

# taking only top 3000 words as features
word_features = [w[0] for w in all_words.most_common(3000)]
#word_features = list(all_words.keys())[:3000]

# function to find words which are present in a document
def find_features(document):
    words= set(document)   # this is all unique words in the document
    features = {}
    for w in word_features:
        features[w] = (w in words)  # boolean of whether w is present in the document or not
        
    return features 

#print(find_features(movie_reviews.words('neg/cv000_29416.txt')))
# featureset is a list of tuples of category and corresponding words boolean in documents
featureset = [(find_features(rev), category) for (rev, category) in documents]   # rev is review words
#featureset[:2]
print(len(featureset))
training_set = featureset[:1900]
testing_set = featureset[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive bayes Accuracy is : ", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

2000
Naive bayes Accuracy is :  88.0
Most Informative Features
              schumacher = True              neg : pos    =     11.2 : 1.0
             outstanding = True              pos : neg    =     11.2 : 1.0
                  seagal = True              neg : pos    =      8.3 : 1.0
                   mulan = True              pos : neg    =      8.2 : 1.0
                  finest = True              pos : neg    =      8.0 : 1.0
             wonderfully = True              pos : neg    =      6.7 : 1.0
                 idiotic = True              neg : pos    =      6.7 : 1.0
                   jolie = True              neg : pos    =      5.9 : 1.0
                   flynt = True              pos : neg    =      5.6 : 1.0
                lebowski = True              pos : neg    =      5.6 : 1.0
                   damon = True              pos : neg    =      5.6 : 1.0
                   inept = True              neg : pos    =      5.5 : 1.0
                   awful = True      

### Saving the classifier using pickle and loading it up again to predict on new values

In [10]:
import nltk
import random
from nltk.corpus import movie_reviews
import pickle

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# first 1000 fileids in corpora are positive sentiment and next 1000 are negative, so random shuffle is used
random.shuffle(documents)    
#print(documents[2])        

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)  # converting list of words to frequency distribution dictionary
#print(all_words.most_common(15))
#print(all_words["stupid"])

# taking only top 3000 words as features
word_features = [w[0] for w in all_words.most_common(3000)]
#word_features = list(all_words.keys())[:3000]
# function to find words which are present in a document
def find_features(document):
    words= set(document)   # this is all unique words in the document
    features = {}
    for w in word_features:
        features[w] = (w in words)  # boolean of whether w in present in the document
        
    return features 

#print(find_features(movie_reviews.words('neg/cv000_29416.txt')))
featureset = [(find_features(rev), category) for (rev, category) in documents]
#featureset[:2]
print(len(featureset))
training_set = featureset[:1900]
testing_set = featureset[1900:]

#classifier = nltk.NaiveBayesClassifier.train(training_set)

classifier_f= open("naivebayes.pickle", 'rb')
classifier= pickle.load(classifier_f)
classifier_f.close()


print("Naive bayes Accuracy is : ", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
#uncomment below for saving the classifier
'''
# save_classifier is the file opened in write byte mode and file name is naivebayes.pickle
save_classifier = open("naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)   # .dump() will save classifier in opened file
save_classifier.close()
'''

2000
Naive bayes Accuracy is :  87.0
Most Informative Features
             outstanding = True              pos : neg    =     11.2 : 1.0
                   mulan = True              pos : neg    =      8.4 : 1.0
                   damon = True              pos : neg    =      7.9 : 1.0
                  seagal = True              neg : pos    =      7.8 : 1.0
                  finest = True              pos : neg    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      7.4 : 1.0
             wonderfully = True              pos : neg    =      7.3 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                   inept = True              neg : pos    =      6.1 : 1.0
                   jolie = True              neg : pos    =      5.8 : 1.0
                 flubber = True              neg : pos    =      5.6 : 1.0
                    lame = True              neg : pos    =      5.6 : 1.0
                   waste = True      

'\n# save_classifier is the file opened in write byte mode and file name is naivebayes.pickle\nsave_classifier = open("naivebayes.pickle", "wb")\npickle.dump(classifier, save_classifier)   # .dump() will save classifier in opened file\nsave_classifier.close()\n'

#### Despite coming packed with some classifiers, NLTK is mainly a toolkit focused on natural language processing, and not machine learning specifically. 
#### A module that is focused on machine learning is scikit-learn, which is packed with a large array of machine learning algorithms which are optimized in C. 

#### Luckily NLTK has recognized this and comes packaged with a special classifier that wraps around scikit learn. In NLTK, this is: nltk.classify.scikitlearn, specifically the class:  SklearnClassifier is what we're interested in.
#### This allows us to port over any of the scikit-learn classifiers that are compatible, which is most.

In [11]:
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# first 1000 fileids in corpora are positive sentiment and next 1000 are negative, so random shuffle is used
random.shuffle(documents)    
#print(documents[2])        

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)  # converting list of words to frequency distribution dictionary

# taking only top 3000 words as features
word_features = [w[0] for w in all_words.most_common(3000)]

# function to find words which are present in a document
def find_features(document):
    words= set(document)   # this is all unique words in the document
    features = {}
    for w in word_features:
        features[w] = (w in words)  # boolean of whether w in present in the document
        
    return features 

featureset = [(find_features(rev), category) for (rev, category) in documents]

training_set = featureset[:1900]
testing_set = featureset[1900:]

#classifier = nltk.NaiveBayesClassifier.train(training_set)

classifier_f= open("naivebayes.pickle", 'rb')
classifier= pickle.load(classifier_f)
classifier_f.close()


print("Original Naive bayes Accuracy is : ", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier Accuracy is : ", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)


BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BNB_classifier Accuracy is : ", (nltk.classify.accuracy(BNB_classifier, testing_set))*100)


Original Naive bayes Accuracy is :  86.0
Most Informative Features
             outstanding = True              pos : neg    =     11.2 : 1.0
                   mulan = True              pos : neg    =      8.4 : 1.0
                   damon = True              pos : neg    =      7.9 : 1.0
                  seagal = True              neg : pos    =      7.8 : 1.0
                  finest = True              pos : neg    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      7.4 : 1.0
             wonderfully = True              pos : neg    =      7.3 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                   inept = True              neg : pos    =      6.1 : 1.0
                   jolie = True              neg : pos    =      5.8 : 1.0
                 flubber = True              neg : pos    =      5.6 : 1.0
                    lame = True              neg : pos    =      5.6 : 1.0
                   waste = True  