In [1]:
import re
from tqdm import tqdm
from collections import defaultdict

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_md', disable=['ner'])

In [3]:
verbs = [
    'say', 'tell', 'speak', 'claim', 
    'communicate', 'convey', 'inform', 
    'declare', 'explain', 'announce', 
    'instruct', 'mention', 'broadcast', 'disclose'
]

verb_adv_stats = dict((v, defaultdict(int)) for v in verbs)

verb_pattern = lambda verb: [{'LEMMA': verb, 'POS': 'VERB'}]
is_ly_adv = lambda t: (t.pos_ == 'ADV') and (t.lower_[-2:] == 'ly')


def process_match(matcher, doc, i, matches):
    verb = doc[matches[i][1]]
    advs = find_advs(verb)
    count_verb_adv_pairs(verb, advs)
    
def find_advs(verb):
    advs = list(filter(is_ly_adv, verb.rights))
    for adv in advs:
        advs.extend(filter(is_ly_adv, adv.conjuncts))
    return advs

def count_verb_adv_pairs(verb, advs):
    for adv in advs:
        verb_adv_stats[verb.lemma_][adv.lower_] += 1
        

matcher = Matcher(nlp.vocab)
for verb in verbs:
    matcher.add(verb.upper(), process_match, verb_pattern(verb))

In [4]:
# filtering for faster processing
verb_pattern = '''(
      say|said|
      tell|told|
      speak|spoke|
      claim|communicate|convey|
      inform|declare|explain|
      announce|instruct|mention|
      broadcast|disclose'
)'''

def line_reader(file_name):
    with open(file_name, 'r') as f:
        for line in f:
            if re.search(verb_pattern, line, re.VERBOSE):
                yield line.strip()

In [5]:
BLOG_FILE = '../../../tasks/02-structural-linguistics/blog2008.txt'
n_lines = sum(1 for _ in line_reader(BLOG_FILE))

In [6]:
for doc in tqdm(nlp.pipe(line_reader(BLOG_FILE), disable=['ner']), total=n_lines):
    matcher(doc)

100%|██████████| 44630/44630 [04:31<00:00, 164.53it/s]


In [7]:
for verb, adv_counts in verb_adv_stats.items():
    top_adv = sorted(adv_counts.items(), key=lambda x: -x[1])[:10]
    top_adv_str = ''.join(['{:>15}: {:<3}\n'.format(w, c) for w, c in top_adv])
    print('-== {} ==-'.format(verb.upper()))
    print(top_adv_str)
    print('\n')

-== SAY ==-
       recently: 35 
     repeatedly: 28 
       publicly: 26 
     explicitly: 12 
      privately: 10 
        clearly: 9  
         simply: 8  
        bluntly: 8  
         flatly: 7  
   definitively: 6  



-== TELL ==-
       recently: 10 
     personally: 4  
        exactly: 3  
       reliably: 3  
        frankly: 2  
     repeatedly: 2  
         slowly: 2  
   specifically: 2  
      privately: 2  
   definitively: 2  



-== SPEAK ==-
       directly: 31 
       publicly: 12 
       fiercely: 12 
        briefly: 8  
     forcefully: 8  
         loudly: 7  
        clearly: 7  
         openly: 7  
     eloquently: 6  
    anonymously: 5  



-== CLAIM ==-
       publicly: 2  
     previously: 1  
        falsely: 1  
         loudly: 1  
    indignantly: 1  
         really: 1  
    erroneously: 1  
    idiotically: 1  
    incorrectly: 1  
       recently: 1  



-== COMMUNICATE ==-
       directly: 3  
    effectively: 2  
         freely: 1  
     indirec