In [None]:
import nltk 

In [None]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [None]:
# Example paragraph used by the tokenizing and stop-word cells.
# Adjacent string literals are concatenated by Python, one sentence per line.
example_txt = (
    "The focus of such an essay predicts its structure. "
    "It dictates the information readers need to know and the order in which they need to receive it. "
    "Thus your essay's structure is necessarily unique to the main claim you're making. "
    "Although there are guidelines for constructing certain classic essay types (e.g., comparative analysis), there are no set formula."
)
# Split the example paragraph into sentences and print the resulting list.
print(sent_tokenize(example_txt))


In [None]:
# Break the example paragraph into word-level tokens; the stop-word filter
# further down iterates over this list.
words=word_tokenize(example_txt)

In [None]:
# stopwords
from nltk.corpus import stopwords
# Keep the English stop words in a set so membership tests are cheap.
stop_words = {*stopwords.words("english")}
print(stop_words)

In [None]:
# Drop stop words from the tokenized example text.
# NLTK's English stop-word list is lowercase, so compare the lowercased token;
# a case-sensitive test lets sentence-initial words such as "The", "It" or
# "Thus" slip through. The original casing of the kept tokens is preserved.
filtered_sentence = [w for w in words if w.lower() not in stop_words]
print(filtered_sentence)

In [None]:
# stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
# Print the Porter stem of each variant, one per line.
for stem in map(ps.stem, example_words):
    print(stem)

In [None]:
#pos tagging
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [None]:
# Read two raw texts through the state_union corpus reader.
# NOTE(review): nothing in this notebook shows where these two files come from;
# presumably they were added by hand to the local state_union corpus folder —
# confirm they exist before running on a fresh machine.
train_txt=state_union.raw("aboutapj1.txt")  # passed to PunktSentenceTokenizer in the next cell
sample_txt=state_union.raw("aboutapj.txt")  # split into sentences in the next cell


In [10]:
# Build a Punkt sentence tokenizer from train_txt, then split sample_txt with it.
# NOTE(review): passing text to the constructor presumably trains the tokenizer
# on that text — confirm against the NLTK docs for the installed version.
custom_sent_tokenizer=PunktSentenceTokenizer(train_txt)
# `tokenized` is the global the process_content() cells iterate over.
tokenized=custom_sent_tokenizer.tokenize(sample_txt)

In [11]:
def process_content():
    """Print the part-of-speech tags for every sentence in ``tokenized``.

    Reads the module-level ``tokenized`` list. Each sentence is word-tokenized,
    tagged with ``nltk.pos_tag`` and the tagged list is printed. Any exception
    stops the loop and only its message is printed.
    """
    try:
        for sentence in tokenized:
            print(nltk.pos_tag(nltk.word_tokenize(sentence)))
    except Exception as err:
        print(str(err))
process_content()

[('Kizie', 'NNP'), ('Basu', 'NNP'), ('is', 'VBZ'), ('fighting', 'VBG'), ('thyroid', 'JJ'), ('cancer', 'NN'), ('when', 'WRB'), ('she', 'PRP'), ('meets', 'VBZ'), ('Immanuel', 'NNP'), ("'Manny", 'NNP'), ("'", 'POS'), ('Rajkumar', 'NNP'), ('Junior', 'NNP'), (',', ','), ('who', 'WP'), ('has', 'VBZ'), ('previously', 'RB'), ('suffered', 'VBN'), ('from', 'IN'), ('osteosarcoma', 'NN'), ('and', 'CC'), ('is', 'VBZ'), ('in', 'IN'), ('remission', 'NN'), ('.', '.')]
[('Manny', 'NNP'), ('and', 'CC'), ('his', 'PRP$'), ('friend', 'NN'), ('JP', 'NNP'), (',', ','), ('who', 'WP'), ('is', 'VBZ'), ('suffering', 'VBG'), ('from', 'IN'), ('glaucoma', 'NN'), ('and', 'CC'), ('is', 'VBZ'), ('blind', 'VBN'), ('in', 'IN'), ('one', 'CD'), ('eye', 'NN'), (',', ','), ('are', 'VBP'), ('making', 'VBG'), ('a', 'DT'), ('movie', 'NN'), ('together', 'RB'), (',', ','), ('inspired', 'VBN'), ('by', 'IN'), ('Rajinikanth', 'NNP'), ("'s", 'POS'), ('films', 'NNS'), ('.', '.')]
[('Manny', 'NNP'), ('invites', 'VBZ'), ('Kizie', 'NNP'

In [12]:
# chunking
def process_content():
    """Chunk every sentence in ``tokenized`` and show the resulting trees.

    Reads the module-level ``tokenized`` list. Each sentence is word-tokenized
    and POS-tagged, then parsed with a regular-expression chunk grammar that
    groups: any run of tags matching ``RB.?``, then any run of tags matching
    ``VB.?``, then one ``NNP``, optionally followed by one ``NN``.

    Each tree is printed and then passed to ``draw()``. Any exception stops
    the loop and only its message is printed.
    """
    try:
        # The grammar does not depend on the sentence, so build the parser once
        # instead of recompiling it on every loop iteration.
        chunkGram = r"""Chunk:{<RB.?>*<VB.?>*<NNP><NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        for sentence in tokenized:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))
process_content()

(S
  (Chunk Kizie/NNP)
  (Chunk Basu/NNP)
  is/VBZ
  fighting/VBG
  thyroid/JJ
  cancer/NN
  when/WRB
  she/PRP
  (Chunk meets/VBZ Immanuel/NNP)
  (Chunk 'Manny/NNP)
  '/POS
  (Chunk Rajkumar/NNP)
  (Chunk Junior/NNP)
  ,/,
  who/WP
  has/VBZ
  previously/RB
  suffered/VBN
  from/IN
  osteosarcoma/NN
  and/CC
  is/VBZ
  in/IN
  remission/NN
  ./.)
(S
  (Chunk Manny/NNP)
  and/CC
  his/PRP$
  friend/NN
  (Chunk JP/NNP)
  ,/,
  who/WP
  is/VBZ
  suffering/VBG
  from/IN
  glaucoma/NN
  and/CC
  is/VBZ
  blind/VBN
  in/IN
  one/CD
  eye/NN
  ,/,
  are/VBP
  making/VBG
  a/DT
  movie/NN
  together/RB
  ,/,
  inspired/VBN
  by/IN
  (Chunk Rajinikanth/NNP)
  's/POS
  films/NNS
  ./.)
(S
  (Chunk Manny/NNP)
  (Chunk invites/VBZ Kizie/NNP)
  to/TO
  be/VB
  the/DT
  female/JJ
  lead/NN
  ./.)
(S
  The/DT
  two/CD
  bond/NN
  over/IN
  his/PRP$
  love/NN
  for/IN
  (Chunk Rajnikanth/NNP)
  's/POS
  movies/NNS
  and/CC
  her/PRP$
  love/NN
  for/IN
  music/NN
  ,/,
  specifically/RB
  an/DT
  inc

In [13]:
# chinking
def process_content():
    """Chunk every sentence in ``tokenized``, then chink out selected tags.

    Reads the module-level ``tokenized`` list. Each sentence is word-tokenized
    and POS-tagged, then parsed with a two-rule grammar: the first rule chunks
    every token (``{<.*>+}``), the second removes runs of tags matching
    ``VB.?``, ``IN``, ``DT`` or ``TO`` from those chunks (``}...{``).

    Each tree is printed and then passed to ``draw()``. Any exception stops
    the loop and only its message is printed.
    """
    try:
        # The grammar does not depend on the sentence, so build the parser once
        # instead of recompiling it on every loop iteration.
        chunkGram = r"""Chunk:{<.*>+}
                                }<VB.?|IN|DT|TO>+{"""
        chunkParser = nltk.RegexpParser(chunkGram)
        for sentence in tokenized:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))
process_content()

(S
  (Chunk Kizie/NNP Basu/NNP)
  is/VBZ
  fighting/VBG
  (Chunk thyroid/JJ cancer/NN when/WRB she/PRP)
  meets/VBZ
  (Chunk
    Immanuel/NNP
    'Manny/NNP
    '/POS
    Rajkumar/NNP
    Junior/NNP
    ,/,
    who/WP)
  has/VBZ
  (Chunk previously/RB)
  suffered/VBN
  from/IN
  (Chunk osteosarcoma/NN and/CC)
  is/VBZ
  in/IN
  (Chunk remission/NN ./.))
(S
  (Chunk Manny/NNP and/CC his/PRP$ friend/NN JP/NNP ,/, who/WP)
  is/VBZ
  suffering/VBG
  from/IN
  (Chunk glaucoma/NN and/CC)
  is/VBZ
  blind/VBN
  in/IN
  (Chunk one/CD eye/NN ,/,)
  are/VBP
  making/VBG
  a/DT
  (Chunk movie/NN together/RB ,/,)
  inspired/VBN
  by/IN
  (Chunk Rajinikanth/NNP 's/POS films/NNS ./.))
(S
  (Chunk Manny/NNP)
  invites/VBZ
  (Chunk Kizie/NNP)
  to/TO
  be/VB
  the/DT
  (Chunk female/JJ lead/NN ./.))
(S
  The/DT
  (Chunk two/CD bond/NN)
  over/IN
  (Chunk his/PRP$ love/NN)
  for/IN
  (Chunk Rajnikanth/NNP 's/POS movies/NNS and/CC her/PRP$ love/NN)
  for/IN
  (Chunk music/NN ,/, specifically/RB)
  an/DT

In [17]:
# named entity
def process_content():
    """Print and draw the named-entity tree for every sentence in ``tokenized``.

    Reads the module-level ``tokenized`` list. Each sentence is word-tokenized,
    POS-tagged and passed to ``nltk.ne_chunk`` with ``binary=True``. The tree
    is printed and then passed to ``draw()``. Any exception stops the loop and
    only its message is printed.
    """
    try:
        for sentence in tokenized:
            tokens = nltk.word_tokenize(sentence)
            named_ent = nltk.ne_chunk(nltk.pos_tag(tokens), binary=True)
            print(named_ent)
            named_ent.draw()
    except Exception as err:
        print(str(err))
process_content()

(S
  (NE Kizie/NNP Basu/NNP)
  is/VBZ
  fighting/VBG
  thyroid/JJ
  cancer/NN
  when/WRB
  she/PRP
  meets/VBZ
  (NE Immanuel/NNP)
  'Manny/NNP
  '/POS
  (NE Rajkumar/NNP Junior/NNP)
  ,/,
  who/WP
  has/VBZ
  previously/RB
  suffered/VBN
  from/IN
  osteosarcoma/NN
  and/CC
  is/VBZ
  in/IN
  remission/NN
  ./.)
(S
  (NE Manny/NNP)
  and/CC
  his/PRP$
  friend/NN
  JP/NNP
  ,/,
  who/WP
  is/VBZ
  suffering/VBG
  from/IN
  glaucoma/NN
  and/CC
  is/VBZ
  blind/VBN
  in/IN
  one/CD
  eye/NN
  ,/,
  are/VBP
  making/VBG
  a/DT
  movie/NN
  together/RB
  ,/,
  inspired/VBN
  by/IN
  (NE Rajinikanth/NNP)
  's/POS
  films/NNS
  ./.)
(S
  (NE Manny/NNP)
  invites/VBZ
  (NE Kizie/NNP)
  to/TO
  be/VB
  the/DT
  female/JJ
  lead/NN
  ./.)
(S
  The/DT
  two/CD
  bond/NN
  over/IN
  his/PRP$
  love/NN
  for/IN
  (NE Rajnikanth/NNP)
  's/POS
  movies/NNS
  and/CC
  her/PRP$
  love/NN
  for/IN
  music/NN
  ,/,
  specifically/RB
  an/DT
  incomplete/NN
  song/NN
  by/IN
  retired/JJ
  songwriter/N

In [19]:
# lemmatizing
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# No pos argument: the lemmatizer's default part of speech is used (noun).
for noun in ("cats", "cacti", "geese", "rocks"):
    print(lemmatizer.lemmatize(noun))
# pos="a": lemmatize as adjectives.
for adjective in ("better", "best"):
    print(lemmatizer.lemmatize(adjective, pos="a"))
# "run" with the default part of speech, then explicitly as a verb.
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run", pos="v"))

cat
cactus
goose
rock
good
best
run
run


In [22]:
# corpora
# The corpus files live in the nltk_data folder; per the original note, search
# for %appdata% to locate it (that variable is Windows-specific).
# NOTE(review): `abc` here is bound to an NLTK corpus reader, which shadows the
# name of Python's standard-library `abc` module for the rest of the notebook.
from nltk.corpus import abc
sample=abc.raw("science.txt")
# Despite its name, `tokens` holds sentences (sent_tokenize), not words.
tokens=sent_tokenize(sample)
# Show the sentences at indices 1 through 9; index 0 is skipped.
print(tokens[1:10])


["That's the conclusion of two studies published in this week's issue of The New England Journal of Medicine.", 'They found that inhaling a mist with a salt content of 7 or 9% improved lung function and, in some cases, produced less absenteeism from school or work.', 'Cystic fibrosis, a progressive and frequently fatal genetic disease that affects about 30,000 young adults and children in the US alone, is marked by a thickening of the mucus which makes it harder to clear the lungs of debris and bacteria.', 'The salt water solution "really opens up a new avenue for approaching patients with cystic fibrosis and how to treat them," says Dr Gail Weinmann, of the US National Heart, Lung, and Blood Institute, which sponsored one of the studies.', 'Mark Elkins of the Royal Prince Alfred Hospital in Sydney, Australia and colleagues authored one of the new published studies.', 'The team found that the 83 volunteers who regularly inhaled a 7% mist of salty water had fewer breathing problems and 

In [24]:
# WordNet
# synsets() looks a word up in WordNet and returns its synsets.
from nltk.corpus import wordnet
syns = wordnet.synsets("great")
# Everything below inspects the first synset returned.
first_synset = syns[0]

# synset identifier
print(first_synset.name())

# only the word (name of the synset's first lemma)
print(first_synset.lemmas()[0].name())

# definition
print(first_synset.definition())

# example sentences
print(first_synset.examples())

great.n.01
great
a person who has achieved distinction and honor in some field
['he is one of the greats of American music']


In [25]:
# synonyms and antonyms using wordnet
synonyms = []
antonyms = []
for synset in wordnet.synsets("good"):
    for lemma in synset.lemmas():
        # Every lemma name of every synset counts as a synonym.
        synonyms.append(lemma.name())
        # Only the first antonym of a lemma is recorded, when it has any.
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())
# Convert to sets for display so duplicates are shown once.
print(set(synonyms))
print(set(antonyms))

{'estimable', 'honorable', 'adept', 'dear', 'near', 'undecomposed', 'expert', 'goodness', 'soundly', 'in_effect', 'beneficial', 'unspoilt', 'salutary', 'sound', 'in_force', 'skillful', 'well', 'upright', 'proficient', 'full', 'trade_good', 'secure', 'unspoiled', 'good', 'just', 'right', 'honest', 'respectable', 'thoroughly', 'commodity', 'dependable', 'effective', 'serious', 'practiced', 'safe', 'ripe', 'skilful'}
{'bad', 'evilness', 'ill', 'badness', 'evil'}


In [26]:
# semantic similarity between words
# Compare "ship" with three other noun synsets using wup_similarity and print
# one score per comparison, in this order: boat, car, cactus.
w1 = wordnet.synset("ship.n.01")
for other_name in ("boat.n.01", "car.n.01", "cactus.n.01"):
    w2 = wordnet.synset(other_name)
    print(w1.wup_similarity(w2))


0.9090909090909091
0.6956521739130435
0.38095238095238093
