In [134]:
#!python -m spacy download en_core_web_lg

In [135]:
from nltk.corpus import wordnet as wn
import nltk
from pprint import pprint
import spacy
import re

# Load the 'en_core_web_lg' spaCy model once for the whole notebook; the model
# has to be installed first (see the commented-out download command in the first cell).
nlp = spacy.load('en_core_web_lg')

In [136]:
# Seed verbs: get_synonims_lemmas() expands them with the lemma names of their WordNet verb synsets.
verbs = ['say', 'tell', 'speak', 'claim', 'communicate']

In [137]:
def get_synonims_lemmas(verbs):
    """Collect the WordNet lemma names of all verb synsets of the given words.

    Args:
        verbs: iterable of words to look up with ``wn.synsets``.

    Returns:
        A set of lemma-name strings taken from every synset whose part of
        speech is ``'v'``. Multi-word lemma names are returned as WordNet
        spells them, i.e. joined with underscores (e.g. ``'tell_apart'``).
    """
    # Lazily walk every synset of every word, keeping only the verb senses.
    verb_synsets = (
        synset
        for word in verbs
        for synset in wn.synsets(word)
        if synset.pos() == 'v'
    )

    lemma_names = set()
    for synset in verb_synsets:
        lemma_names.update(synset.lemma_names())
    return lemma_names
# Build the synonym set once and reuse it for both the size and the listing,
# instead of repeating every WordNet lookup for a single print.
synonims_lemmas = get_synonims_lemmas(verbs)
print(len(synonims_lemmas), synonims_lemmas)

48 {'mouth', 'intercommunicate', 'communicate', 'enounce', 'address', 'tell', 'pronounce', 'severalize', 'convey', 'separate', 'exact', 'say', 'enunciate', 'claim', 'verbalize', 'secern', 'differentiate', 'verbalise', 'articulate', 'recite', 'take', 'assure', 'commune', 'distinguish', 'narrate', 'tell_apart', 'state', 'suppose', 'talk', 'arrogate', 'enjoin', 'transmit', 'evidence', 'sound_out', 'severalise', 'read', 'order', 'speak', 'secernate', 'allege', 'utter', 'recount', 'pass_on', 'aver', 'lay_claim', 'pass_along', 'pass', 'put_across'}


In [185]:
def child_ly_adverbs_in_sentence(sentence_doc, verb_lemma):
    """Return the lemmas of '-ly' adverbs attached to occurrences of a verb.

    For every token in ``sentence_doc`` that is tagged ``VERB`` and whose lemma
    equals ``verb_lemma``, the function collects:

    1. the verb's direct children that are adverbs (``pos_ == 'ADV'``) whose
       text ends with ``'ly'``;
    2. '-ly' adverbs coordinated with those adverbs, i.e. reachable from them
       through ``conj`` dependencies ("calmly and confidently").

    Args:
        sentence_doc: iterable of tokens exposing ``pos_``, ``lemma_``,
            ``text``, ``dep_`` and ``children`` (e.g. a spaCy ``Doc``).
        verb_lemma: lemma of the verb whose adverbs are wanted.

    Returns:
        A list of adverb lemmas (duplicates kept). For each matching verb the
        direct adverbs come first, followed by their coordinated adverbs.
    """
    def is_ly_adverb(token):
        return token.pos_ == 'ADV' and token.text.endswith('ly')

    def coordinated_ly_lemmas(token):
        # Conjuncts may hang off one another (a -> b -> c) rather than all off
        # the first adverb, so the whole 'conj' chain is followed. Conjuncts
        # that are not '-ly' adverbs are skipped but still traversed.
        lemmas = []
        for child in token.children:
            if child.dep_ == 'conj':
                if is_ly_adverb(child):
                    lemmas.append(child.lemma_)
                lemmas.extend(coordinated_ly_lemmas(child))
        return lemmas

    adverb_lemmas = []
    for verb in sentence_doc:
        if verb.pos_ != 'VERB' or verb.lemma_ != verb_lemma:
            continue
        direct_adverbs = [child for child in verb.children if is_ly_adverb(child)]
        adverb_lemmas.extend(adverb.lemma_ for adverb in direct_adverbs)
        for adverb in direct_adverbs:
            adverb_lemmas.extend(coordinated_ly_lemmas(adverb))

    return adverb_lemmas

# Smoke checks on real spaCy parses.
# Adverbs belonging to a different verb ('tell') must not be reported for 'say'.
_say_doc = nlp('say very Delightfully, tell quietly and seriously')
assert child_ly_adverbs_in_sentence(_say_doc, 'say') == ['delightfully']

# Several adverbs on the same verb; order is not checked, only membership.
for _sentence, _expected_adverbs in (
    ('I speak quietly, seriously', {'quietly', 'seriously'}),
    ('He speaks calmly and confidently', {'calmly', 'confidently'}),
):
    assert set(child_ly_adverbs_in_sentence(nlp(_sentence), 'speak')) == _expected_adverbs

In [186]:
import time

# Read the corpus; every line of the file is later parsed as its own spaCy document.
with open('blog2008.txt') as f:
    blogs_text = list(f)

start = time.time()

verbs_lemmas = get_synonims_lemmas(verbs)
# Quick-iteration alternative: verbs_lemmas = ['say', 'tell']
# (took about 77 seconds with blogs_text = blogs_text[:10000]).

# stats maps verb lemma -> {adverb lemma: number of occurrences}.
stats = {verb: {} for verb in verbs_lemmas}

# NOTE(review): n_threads is reportedly deprecated in newer spaCy releases —
# confirm against the installed version before re-running.
for doc in nlp.pipe(blogs_text, n_threads=16):
    for stat_word, adverb_counts in stats.items():
        for adv in child_ly_adverbs_in_sentence(doc, stat_word):
            adverb_counts[adv] = adverb_counts.get(adv, 0) + 1

end = time.time()
print('time:', end - start)

time: 2311.156805038452


In [191]:
import json
# Persist the complete verb -> adverb -> count mapping as JSON.
with open('all_stats.json', 'w') as stats_file:
    json.dump(stats, stats_file)

In [192]:
# Condensed view of `stats`: for each verb keep (at most) its ten most frequent
# adverbs as (adverb, count) pairs, visiting verbs from the one with the most
# distinct adverbs to the one with the fewest.
short_stats = {}
verbs_by_variety = sorted(stats.items(), key=lambda item: len(item[1]), reverse=True)
for verb, adverb_counts in verbs_by_variety:
    top_adverbs = sorted(adverb_counts.items(), key=lambda item: item[1], reverse=True)[:10]
    # Drop verbs with fewer than five listed adverbs, or whose most frequent
    # adverb was seen fewer than five times.
    if len(top_adverbs) <= 4 or top_adverbs[0][1] <= 4:
        continue
    short_stats[verb] = top_adverbs

with open('short_stats.json', 'w') as short_stats_file:
    json.dump(short_stats, short_stats_file)