In [32]:
import spacy
import nltk
from nltk.tag import PerceptronTagger
import string
import re

# Shared spaCy pipeline used by chunk_spacy(); loaded once at import time.
spacy_nlp = spacy.load("en_core_web_sm")

# Chunk grammar for nltk.RegexpParser. Every rule produces an "NP" chunk:
#   1. optional determiner/possessive, any number of adjectives, 1+ singular nouns
#   2. one or more proper nouns
#   3. one or more plural nouns
# NOTE(review): the tagger below presumably emits Penn Treebank tags, where
# possessive pronouns are PRP$ rather than PP$ -- confirm whether the PP\$
# alternative can ever match.
np_patterns = r"""
                NP: {<DT|PP\$>?<JJ>*<NN>+}
                    {<NNP>+}
                    {<NNS>+}
                    """

# Chunker and part-of-speech tagger shared by chunk_nltk().
np_chunker = nltk.RegexpParser(np_patterns)
pos_tagger = PerceptronTagger()



# Lemmas that chunk_spacy() drops from noun chunks before tagging them.
# Kept as a list (callers concatenate it with other lists); one row per
# initial letter so additions are easy to place.
adj_stopwords = [
    'able', 'available',
    'brief',
    'certain',
    'different', 'due',
    'enough', 'especially',
    'few', 'fifth', 'former',
    'his', 'howbeit',
    'immediate', 'important', 'inc', 'its',
    'last', 'latter', 'least', 'less', 'likely', 'little',
    'many', 'ml', 'more', 'most', 'much', 'my',
    'necessary', 'new', 'next', 'non',
    'old', 'other', 'our', 'ours', 'own',
    'particular', 'past', 'possible', 'present', 'proud',
    'recent',
    'same', 'several', 'significant', 'similar', 'such', 'sup', 'sure',
]

def _is_example_idiom(doc, sentence, chunk):
    """Return True when *chunk* is the "example" in "for example" / "example of".

    ``chunk.start`` and ``chunk.end`` are document-level token offsets, so the
    neighbouring tokens are read from ``doc`` (not from the sentence span) and
    only when they lie inside the same sentence.
    """
    if chunk.start > sentence.start:
        if doc[chunk.start - 1].lemma_.lower() == "for":
            return True
    if chunk.end < sentence.end:
        return doc[chunk.end].lemma_.lower() == "of"
    return False


def chunk_spacy(text):
    """Tag the noun chunks of *text* using the module-level spaCy pipeline.

    Each sentence is reduced to its lemmatised text, and every noun chunk
    found by spaCy is rewritten in place as a single ``NP_word1_word2`` token.
    Lemmas listed in ``adj_stopwords`` (plus "i.e." / "e.g.") are dropped from
    a chunk before it is tagged.  The chunks "type" and the "example" of
    "for example" / "example of" are left untagged.

    Args:
        text: Raw text to process.

    Returns:
        A list with one lemmatised, NP-tagged string per sentence.
    """
    doc = spacy_nlp(text)
    # Built once per call rather than once per token.
    ignored_lemmas = set(adj_stopwords) | {"i.e.", "e.g."}

    chunks = []
    for sentence in doc.sents:
        sentence_text = sentence.lemma_
        for chunk in sentence.noun_chunks:
            chunk_key = chunk.lemma_.lower()
            if chunk_key == "type":
                continue
            if chunk_key == "example" and _is_example_idiom(doc, sentence, chunk):
                continue

            kept_lemmas = [
                token.lemma_ for token in chunk
                if token.lemma_ not in ignored_lemmas
            ]
            if not kept_lemmas:
                # Every token was a stopword: fall back to the chunk's last
                # token so the tag still carries a word instead of a bare "NP_".
                kept_lemmas = [chunk[-1].lemma_]

            chunk_lemma = ' '.join(kept_lemmas)
            # Only alphanumeric characters are kept in the tag so that it
            # stays a single \w+ token for the downstream regexes.
            tag_parts = [
                ''.join(char for char in lemma if char.isalnum())
                for lemma in kept_lemmas
            ]
            replacement_value = 'NP_' + '_'.join(tag_parts)
            if chunk_lemma:
                sentence_text = re.sub(r'\b%s\b' % re.escape(chunk_lemma),
                                       replacement_value,
                                       sentence_text)
        chunks.append(sentence_text)
    return chunks
    
def chunk_nltk(text):
    """Tag the noun phrases of *text* using the NLTK tagger and chunker.

    The text is split into sentences, each sentence is tokenised and
    POS-tagged, parsed with the module-level ``np_chunker`` and flattened
    by ``prepare_chunks``.

    Returns:
        A list with one NP-tagged string per sentence.
    """
    tagged_sentences = [
        pos_tagger.tag(nltk.word_tokenize(raw_sentence))
        for raw_sentence in nltk.sent_tokenize(text.strip())
    ]
    return [
        prepare_chunks(np_chunker.parse(tagged))
        for tagged in tagged_sentences
    ]


def prepare_chunks(chunks):
    """Flatten a chunk-parsed sentence into a single NP-tagged string.

    Args:
        chunks: Iterable whose items are either ``(token, pos)`` pairs or
            chunk subtrees exposing ``label()`` and iterating over
            ``(token, pos)`` pairs.

    Returns:
        The sentence as space-separated terms.  Each labelled subtree becomes
        one ``NP_tok1_tok2`` term; loose tokens are kept as they are, except
        those tagged '.', ':', '-' or '_', which are dropped.
    """
    skipped_tags = {'.', ':', '-', '_'}
    terms = []
    for chunk in chunks:
        # Plain (token, pos) tuples have no label(); probing for the method
        # avoids a bare `except` that would hide unrelated errors.
        label_getter = getattr(chunk, "label", None)
        label = label_getter() if callable(label_getter) else None

        if label is None:
            token, pos = chunk[0], chunk[1]
            if pos not in skipped_tags:
                terms.append(token)
        else:
            terms.append("NP_" + "_".join(leaf[0] for leaf in chunk))
    return ' '.join(terms)

def get_hyponyms(text, lib_type):
    """Extract (specific, general) pairs matching "X, Y and other Z" in *text*.

    Args:
        text: Raw text to analyse.
        lib_type: 'spacy' or 'nltk', selecting the noun-phrase chunker.  Any
            other value produces no sentences and therefore an empty result.

    Returns:
        A list of ``(specific_term, general_term)`` tuples.  At most one
        pattern match (the first) is taken from each sentence.

    Side effects:
        Prints the general/specific terms of every match to stdout.
    """
    if lib_type == 'spacy':
        np_tagged_sentences = chunk_spacy(text)
    elif lib_type == 'nltk':
        np_tagged_sentences = chunk_nltk(text)
    else:
        np_tagged_sentences = []

    # Raw strings: "\w" inside a normal string literal is an invalid escape.
    adjacent_nps = re.compile(r"(NP_\w+ NP_\w+)+")
    hearst_pattern = re.compile(r"((NP_\w+ ?(, )?)+(and )?other NP_\w+)")

    hyponyms = []
    for tagged_sentence in np_tagged_sentences:
        # Fuse directly adjacent NP tags ("NP_a NP_b" -> "NP_a_b").
        sentence = adjacent_nps.sub(
            lambda m: m.expand(r'\1').replace(" NP_", "_"),
            tagged_sentence,
        )
        match = hearst_pattern.search(sentence)
        if not match:
            continue

        nps = [word for word in match.group(0).split() if word.startswith("NP_")]
        # The NP after "other" is the general term; the ones before it are
        # the specific terms.
        raw_specifics = nps[:-1]
        general = clean_term(nps[-1])
        general_term = [general]
        specific_term = []
        for raw_specific in raw_specifics:
            if 'federal' in general_term:
                # Diagnostic trace kept from the original implementation.
                print (raw_specific, '\n\n')
            specific = clean_term(raw_specific)
            specific_term.append(specific)
            hyponyms.append((specific, general))
        print(general_term, specific_term, '\n')
    return hyponyms

def clean_term(term):
    """Turn an ``NP_word1_word2`` tag back into plain text ("word1 word2")."""
    without_marker = term.replace("NP_", "")
    return " ".join(without_marker.split("_"))
    
    
def process_file(infile, lib_type):
    """Read *infile* and return the hyponym pairs found in its text.

    Args:
        infile: Path of the text file to read.
        lib_type: Chunker selector forwarded to ``get_hyponyms``.

    Returns:
        The list produced by ``get_hyponyms``, or an empty list when the
        file cannot be read (a message is printed in that case).

    Side effects:
        Prints the number of extracted pairs to stdout.
    """
    try:
        # Context manager so the handle is closed even if read() fails.
        with open(infile) as handle:
            text = handle.read()
    except (OSError, UnicodeError) as exc:
        # Report the exception class directly; the previous handler relied
        # on `sys`, which this file never imports.
        print('Failed when reading file', infile, type(exc))
        return []
    hyponyms = get_hyponyms(text, lib_type)
    print('Count is ', len(hyponyms))
    return hyponyms

 

# Run the extractor on the input file with the spaCy chunker. The call is
# left as a bare expression so the notebook displays the returned pairs.
infile = 'and_other_sentences_NYT_sum.txt'  # path is relative to the working directory
process_file(infile, 'spacy')


['element'] ['an american troop increase'] 

['mistreatment'] ['sexual abuse'] 

['wild animal'] ['president', 'candidate', 'donor', 'activist', 'alligator'] 

['market observer'] ['he'] 

['officer'] ['he'] 

['blogger'] ['he'] 

['item'] ['material', 'furniture'] 

['unauthorized absence'] ['desertion'] 

['piece'] ['32 painting'] 

['director'] ['she'] 

['large airport'] ['la guardia airport'] 

['online video site'] ['youtube'] 

['gift'] ['vacation trip'] 

['payment'] ['fine'] 

['favor'] ['gold coin'] 

['artifact'] ['gold coin'] 

['top aide'] ['karl rove'] 

['agency'] ['the white house'] 

['envoy'] ['monday'] 

['gas'] ['carbon dioxide'] 

['profession'] ['politic', 'law', 'the news medium'] 

['dangerous item'] ['bomb', 'chemical'] 

['government agency'] ['the president', 'the prime minister', 'parliament'] 

['protection'] ['a strong real estate market'] 

['financial firm'] ['investment company'] 

['offense'] ['justice'] 

['agricultural pollutant'] ['manure'] 

['trai

[('an american troop increase', 'element'),
 ('sexual abuse', 'mistreatment'),
 ('president', 'wild animal'),
 ('candidate', 'wild animal'),
 ('donor', 'wild animal'),
 ('activist', 'wild animal'),
 ('alligator', 'wild animal'),
 ('he', 'market observer'),
 ('he', 'officer'),
 ('he', 'blogger'),
 ('material', 'item'),
 ('furniture', 'item'),
 ('desertion', 'unauthorized absence'),
 ('32 painting', 'piece'),
 ('she', 'director'),
 ('la guardia airport', 'large airport'),
 ('youtube', 'online video site'),
 ('vacation trip', 'gift'),
 ('fine', 'payment'),
 ('gold coin', 'favor'),
 ('gold coin', 'artifact'),
 ('karl rove', 'top aide'),
 ('the white house', 'agency'),
 ('monday', 'envoy'),
 ('carbon dioxide', 'gas'),
 ('politic', 'profession'),
 ('law', 'profession'),
 ('the news medium', 'profession'),
 ('bomb', 'dangerous item'),
 ('chemical', 'dangerous item'),
 ('the president', 'government agency'),
 ('the prime minister', 'government agency'),
 ('parliament', 'government agency'),
 (