## Find synonym


In [38]:
import nltk
from nltk.corpus import wordnet 

# Find synonym of "aspirin"
syns = wordnet.synsets("fever")

for k,syn in enumerate(syns):
    print("-----\nSyn #{}:".format(k))        
    print(syn.name()) 

    # Just the word: 
    print(syn.lemmas()[0].name()) 

    # Definition of that first synset: 
    print(syn.definition()) 

    # Examples of the word in use in sentences: 
    print("Examples:", syn.examples()) 

-----
Syn #0:
fever.n.01
fever
a rise in the temperature of the body; frequently a symptom of infection
Examples: []
-----
Syn #1:
fever.n.02
fever
intense nervous anticipation
Examples: ['in a fever of resentment']


## Stopwords

In [10]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

## Using Name Entity   ## Chunking

In [55]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

sample = """
I have a fever
what is aspirin use for?
"""

def preprocess(sent):
    sent = word_tokenize(sent)
    sent = pos_tag(sent)
    return sent


ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
ex = "What can aspirin be used for?"
ex = "What does aspirin be used for?"
ex = "Acetaminophen is a pain reliever and a fever reducer"
ex = "Acne is most common among teenagers, though it affects people of all ages."
sent = preprocess(ex)
print("Tagged:", sent)


# Chunking, by noun phrase, usually by this pattern
# determiner, adjectives, noun
pattern = 'NP: {<DT>?<JJ>*<NN>*}'
chunk_parser = nltk.RegexpParser(pattern)
chunk = chunk_parser.parse(sent)
print("Chunk", type(chunk))
print(chunk.subtrees)

# rule1 = ChunkRUle()


for subtree in chunk.subtrees(filter=lambda t: t.label() == 'NP'):
    # print the noun phrase as a list of part-of-speech tagged words
    print("===", subtree.leaves())
    
    for leaf in subtree.leaves():
        print(leaf[0])



Tagged: [('Acne', 'NNP'), ('is', 'VBZ'), ('most', 'RBS'), ('common', 'JJ'), ('among', 'IN'), ('teenagers', 'NNS'), (',', ','), ('though', 'IN'), ('it', 'PRP'), ('affects', 'VBZ'), ('people', 'NNS'), ('of', 'IN'), ('all', 'DT'), ('ages', 'NNS'), ('.', '.')]
Chunk <class 'nltk.tree.Tree'>
<bound method Tree.subtrees of Tree('S', [('Acne', 'NNP'), ('is', 'VBZ'), ('most', 'RBS'), Tree('NP', [('common', 'JJ')]), ('among', 'IN'), ('teenagers', 'NNS'), (',', ','), ('though', 'IN'), ('it', 'PRP'), ('affects', 'VBZ'), ('people', 'NNS'), ('of', 'IN'), Tree('NP', [('all', 'DT')]), ('ages', 'NNS'), ('.', '.')])>
=== [('common', 'JJ')]
common
=== [('all', 'DT')]
all


## Chunking

In [None]:
"""
sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

def extract_entity_names(t):
    entity_names = []

    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))

    return entity_names

entity_names = []
for tree in chunked_sentences:
    # Print results per sentence
    # print extract_entity_names(tree)

    entity_names.extend(extract_entity_names(tree))

# Print all entity names
#print entity_names

# Print unique entity names
print(set(entity_names))
"""


        # # Chunking, by noun phrase, usually by this pattern
        # # determiner, adjectives, noun
        # chunk_parser = RegexpParser('''
        #                                             NP: {<DT>+<JJ>*<NN>*}
        #                                             ''')
        # chunk = chunk_parser.parse(tagged)
        #
        # match_chunks = []
        # for subtree in chunk.subtrees(filter=lambda t: t.label() == 'NP'):
        #     match_chunks.append(subtree)
        #
        # if len(match_chunks) != 0:
        #     passages.append(raw_sent)
        # pass


## Wordnet Hypernym

In [58]:
from nltk.corpus import wordnet as wn

word = 'aspirin'
synset = wn.synsets(word)
print("Hypernyms of {}:", synset)
for k,syn in enumerate(synset):
    print("============")
    hypernyms = syn.hypernyms()
    print("------Hypernyms of {}:".format(syn), hypernyms)    
    #print("Root hypernym of {}:".format(syn), syn.root_hypernyms)
    
    for j, hyp in enumerate(hypernyms):                
        hypernyms = hyp.hypernyms()
        print("----Hypernyms of {}:".format(hyp), hyp.hypernyms())        
        for a, hyp in enumerate(hypernyms):        
            print("--Hypernyms of {}:".format(hyp), hyp.hypernyms())
        
print(wn.synsets('liquid_body_substance.n.01'))

Hypernyms of {}: [Synset('aspirin.n.01')]
------Hypernyms of Synset('aspirin.n.01'): [Synset('analgesic.n.01'), Synset('salicylate.n.01')]
----Hypernyms of Synset('analgesic.n.01'): [Synset('medicine.n.02')]
--Hypernyms of Synset('medicine.n.02'): [Synset('drug.n.01')]
----Hypernyms of Synset('salicylate.n.01'): [Synset('nonsteroidal_anti-inflammatory.n.01'), Synset('salt.n.01')]
--Hypernyms of Synset('nonsteroidal_anti-inflammatory.n.01'): [Synset('anti-inflammatory.n.01')]
--Hypernyms of Synset('salt.n.01'): [Synset('compound.n.02')]
[]


In [13]:
custom_stop_words = ["few", "little", "much", "more", "cause", "symptom", "treatment", "prevent"]
wn_stop_words = stopwords.words('english')
custom_stop_words.extend(wn_stop_words)
custom_stop_words

['few',
 'little',
 'much',
 'more',
 'cause',
 'symptom',
 'treatment',
 'prevent',
 'i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',


In [59]:

def find_hypernym_names(syn):
        names = set()
            
        # Find hypernyms of each syn
        for syn in syns:
            hypernyms = syn.hypernyms()            
            for hypernym in hypernyms:
                names.add(hypernym.name())
            
        return names

syns = wn.synsets("fever")
print(find_hypernym_names(syns))

{'symptom.n.01', 'anticipation.n.01'}


In [60]:
# find hypernyms 2 level up
def find_hypernyms(syn):
    names = set()
    print("hypernym")
    # Find hypernyms of each syn
    for syn in syns:
        hypernyms = syn.hypernyms()        
        # find hypernyms one more level up
        for hypernym in hypernyms: 
            names.add(hypernym.name())
            hypernyms_second = hypernym.hypernyms()
            for h in hypernyms_second:
                names.add(h.name())
            
    return names

syns = wn.synsets("amitriptyline")
print(find_hypernyms(syns))

hypernym
{'antidepressant.n.01', 'tricyclic.n.01'}


In [37]:
syns = wn.synsets("tumor.n.01")
print(syns)

[]


In [None]:
Keep looping until it reach entity