In [76]:
import spacy
import en_core_web_lg
from sklearn.externals import joblib
import pandas as pd

In [3]:
# Load the en_core_web_lg spaCy pipeline once; the functions defined in the
# cells below read `nlp` as a module-level global.
nlp = en_core_web_lg.load()

In [75]:
# Execute ArgExtraction.ipynb inside this kernel.
# NOTE(review): names used later but never defined or imported in this notebook
# (replace_quotes_to_double_quotes, replace_search_strings, run_rules, tqdm) are
# presumably brought into scope by this run -- TODO confirm.
%run ArgExtraction.ipynb

phrase                 attendees posted at least one post with <sentiment> sentiment
replacements                                               {"sentiment": "negative"}
label                                                                           post
ex                               {"post.sentiment": "negative", "post.count": ">=1"}
final           (attendees, posted, at, least, one, post, with, negative, sentiment)
orig_final                attendees posted at least one post with negative sentiment
grepl                                                                             []
Name: 290, dtype: object
attendees posted at least one post with negative sentiment


attendee posted nsubj
post posted ROOT
at least advmod
least one advmod
one post nummod
post posted dobj
with post prep
negative sentiment amod
sentiment with pobj


In [270]:
def extract(tokens):
    """Build one context-window feature dict per token.

    Each dict describes the token itself plus its neighbours at offsets
    -1, -2, +1 and +2. A neighbour that falls outside the sequence is encoded
    with the string 'NONE'.

    Note that the token's own 'pos' feature is read from ``tag_`` while the
    neighbour 'pos±k' features are read from ``pos_``.

    Args:
        tokens: an indexable, sized sequence of token objects exposing
            ``text``, ``lemma_``, ``tag_`` and ``pos_``.

    Returns:
        A list with one feature dict per token, in token order.
    """
    count = len(tokens)

    def attr_at(position, attr):
        # Neighbours outside the sequence are replaced by the 'NONE' sentinel.
        if 0 <= position < count:
            return getattr(tokens[position], attr)
        return 'NONE'

    # (offset, text key, lemma key, pos key) for every neighbour in the window.
    window = (
        (-1, 'text-1', 'lemma-1', 'pos-1'),
        (-2, 'text-2', 'lemma-2', 'pos-2'),
        (1, 'text+1', 'lemma+1', 'pos+1'),
        (2, 'text+2', 'lemma+2', 'pos+2'),
    )

    rows = []
    for index in range(count):
        current = tokens[index]
        row = {
            'text': current.text,
            'lemma': current.lemma_,
            'pos': current.tag_,
        }
        for shift, text_key, lemma_key, pos_key in window:
            neighbour = index + shift
            row[text_key] = attr_at(neighbour, 'text')
            row[lemma_key] = attr_at(neighbour, 'lemma_')
            row[pos_key] = attr_at(neighbour, 'pos_')
        rows.append(row)
    return rows

def extract_all(sentences, nlp):
    """Extract token features and labels for a collection of labelled sentences.

    Args:
        sentences: iterable of sentences; each sentence is a sequence of pairs
            whose element 0 is the word and element 1 is its label.
        nlp: spaCy pipeline providing ``vocab``, ``tagger`` and ``parser``.

    Returns:
        A tuple ``(features, labels)`` of two flat lists covering every token
        of every sentence, in order.
    """
    flat_features = []
    flat_labels = []
    for sentence in sentences:
        words = [pair[0] for pair in sentence]
        # Build the Doc from the given words, then run the tagger and parser
        # components on it directly.
        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        nlp.tagger(doc)
        nlp.parser(doc)
        sentence_features = extract(doc)
        flat_labels.extend([pair[1] for pair in sentence])
        flat_features.extend(sentence_features)
    return (flat_features, flat_labels)

In [279]:
def sent2features(sent):
    """Return the per-token feature dicts for one labelled sentence.

    Args:
        sent: sequence of pairs whose element 0 is the word.

    Returns:
        The list produced by ``extract`` for the tagged and parsed words.
    """
    words = [pair[0] for pair in sent]
    # Uses the module-level `nlp` pipeline; the Doc is built from the given
    # words and then passed through the tagger and parser components.
    doc = spacy.tokens.Doc(nlp.vocab, words=words)
    nlp.tagger(doc)
    nlp.parser(doc)
    features = extract(doc)
    return features

def sent2labels(sent):
    """Map each pair's label to 'E' (label equals 1) or 'N' (anything else).

    Args:
        sent: sequence of pairs whose element 1 is the label.

    Returns:
        A list of 'E'/'N' strings, one per pair.
    """
    labels = []
    for pair in sent:
        if pair[1] == 1:
            labels.append('E')
        else:
            labels.append('N')
    return labels

def crf_extract(sents):
    """Compute ``sent2features`` for every sentence, iterating through ``tqdm``.

    NOTE(review): ``tqdm`` is not imported in the visible cells; presumably it
    comes from the ``%run`` notebook -- TODO confirm.

    Returns:
        A list with one feature list per sentence.
    """
    per_sentence = []
    for sent in tqdm(sents):
        per_sentence.append(sent2features(sent))
    return per_sentence

def crf_labels(sents):
    """Compute ``sent2labels`` for every sentence, iterating through ``tqdm``.

    NOTE(review): ``tqdm`` is not imported in the visible cells; presumably it
    comes from the ``%run`` notebook -- TODO confirm.

    Returns:
        A list with one label list per sentence.
    """
    per_sentence = []
    for sent in tqdm(sents):
        per_sentence.append(sent2labels(sent))
    return per_sentence

In [9]:
def spacy_tokenizer(x):
    """Tokenise ``x`` with the module-level ``nlp`` pipeline and return lemmas.

    Args:
        x: text to process.

    Returns:
        A list with the ``lemma_`` of every token, in order.
    """
    lemmas = []
    for token in nlp(x):
        lemmas.append(token.lemma_)
    return lemmas

In [284]:
# Load the persisted models used by `split` (splitter) and `classify`
# (filter_classifier). Both paths are relative to the current working directory.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
splitter = joblib.load('crf_splitter.pkl')
filter_classifier = joblib.load('filter_classifier.pkl')

In [277]:
# Stop tokens whose text is in this set are emitted as separate 'conj' parts.
CONJUNCTIONS = {'and', 'or', 'but', 'except'}

def split(sent, verbose=False):
    """Split a sentence into phrase / conjunction parts at predicted stop tokens.

    This version expects ``splitter.predict(features)`` to return a flat,
    mutable sequence of 0/1 flags (one per token). A later cell redefines
    ``split`` for a splitter that returns 'E'/'N' label sequences.

    Phrase boundaries are taken from each token's real character offset
    (``token.idx``) instead of being rebuilt as ``len(token.text) + 1`` per
    token; the rebuilt offsets drift whenever a token is not followed by
    exactly one space (quotes, punctuation, contractions).

    Args:
        sent: the sentence to split.
        verbose: when true, print the stop flags after de-duplication.

    Returns:
        A list of dicts, either ``{'type': 'phrase', 'phrase': str}`` or
        ``{'type': 'conj', 'conj': str}``. Every phrase except the last keeps
        the whitespace that precedes the next stop token; the last is stripped.
    """
    tokens = nlp(sent)
    features = extract(tokens)
    stops = splitter.predict(features)

    # When two adjacent tokens are both flagged, keep only the first flag.
    # The indices are collected before mutating so a run 1,1,1 becomes 1,0,0.
    doubled = [i + 1 for i in range(len(stops) - 1)
               if stops[i] == 1 and stops[i + 1] == 1]
    for idx in doubled:
        stops[idx] = 0
    if verbose:
        print(stops)

    parts = []
    start = 0  # character offset where the current phrase begins
    for token, stop in zip(tokens, stops):
        if stop == 0:
            continue
        # Close the phrase that ends right before this stop token.
        parts.append({'type': 'phrase',
                      'phrase': sent[start:token.idx]})
        if token.text in CONJUNCTIONS:
            parts.append({'type': 'conj',
                          'conj': token.text})
            # The conjunction itself belongs to no phrase: resume after it
            # and its trailing whitespace.
            start = token.idx + len(token.text_with_ws)
        else:
            # A non-conjunction stop token opens the next phrase.
            start = token.idx
    parts.append({'type': 'phrase',
                  'phrase': sent[start:].strip()})
    return parts

# Demo run; the recorded output shows the stop flags as a flat 0/1 array and the
# sentence cut before 'whose', 'but' and 'who'.
split("Find all attendees whose post contains dog but not who liked a positive post", verbose=True)

[0 0 0 1 0 0 0 1 0 1 0 0 0 0]


[{'phrase': 'Find all attendees ', 'type': 'phrase'},
 {'phrase': 'whose post contains dog ', 'type': 'phrase'},
 {'conj': 'but', 'type': 'conj'},
 {'phrase': 'not ', 'type': 'phrase'},
 {'phrase': 'who liked a positive post', 'type': 'phrase'}]

In [286]:
# Stop tokens whose text is in this set are emitted as separate 'conj' parts.
CONJUNCTIONS = {'and', 'or', 'but', 'except'}

def split(sent, verbose=False):
    """Split a sentence into phrase / conjunction parts at predicted stop tokens.

    The module-level ``splitter`` is called with a one-sentence batch and is
    expected to return one label sequence per sentence; a token labelled 'E'
    is a stop, every other label is not.

    Phrase boundaries are taken from each token's real character offset
    (``token.idx``) instead of being rebuilt as ``len(token.text) + 1`` per
    token; the rebuilt offsets drift whenever a token is not followed by
    exactly one space (quotes, punctuation, contractions).

    Args:
        sent: the sentence to split.
        verbose: when true, print the 0/1 stop flags.

    Returns:
        A list of dicts, either ``{'type': 'phrase', 'phrase': str}`` or
        ``{'type': 'conj', 'conj': str}``. Every phrase except the last keeps
        the whitespace that precedes the next stop token; the last is stripped.
    """
    tokens = nlp(sent)
    features = extract(tokens)
    labels = splitter.predict([features])[0]
    stops = [1 if label == 'E' else 0 for label in labels]
    if verbose:
        print(stops)

    parts = []
    start = 0  # character offset where the current phrase begins
    for token, stop in zip(tokens, stops):
        if stop == 0:
            continue
        # Close the phrase that ends right before this stop token.
        parts.append({'type': 'phrase',
                      'phrase': sent[start:token.idx]})
        if token.text in CONJUNCTIONS:
            parts.append({'type': 'conj',
                          'conj': token.text})
            # The conjunction itself belongs to no phrase: resume after it
            # and its trailing whitespace.
            start = token.idx + len(token.text_with_ws)
        else:
            # A non-conjunction stop token opens the next phrase.
            start = token.idx
    parts.append({'type': 'phrase',
                  'phrase': sent[start:].strip()})
    return parts

# Demo run of the redefined split; the recorded output shows cuts before 'whose',
# 'but' and 'except', with both conjunctions emitted as separate 'conj' parts.
split("Find all attendees whose post contains dog but not who liked a positive post except thoose who liked you", verbose=True)

[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]


[{'phrase': 'Find all attendees ', 'type': 'phrase'},
 {'phrase': 'whose post contains dog ', 'type': 'phrase'},
 {'conj': 'but', 'type': 'conj'},
 {'phrase': 'not who liked a positive post ', 'type': 'phrase'},
 {'conj': 'except', 'type': 'conj'},
 {'phrase': 'thoose who liked you', 'type': 'phrase'}]

In [244]:
def classify(part):
    """Attach the predicted class of ``part['phrase']`` to the part.

    The module-level ``filter_classifier`` is called with a one-element batch.
    ``part`` is modified in place (key 'class') and also returned.
    """
    predictions = filter_classifier.predict([part['phrase']])
    part['class'] = predictions[0]
    return part

In [245]:
def extract_args(part, verbose=False):
    """Fill ``part['args']`` and return the (mutated) part.

    Parts whose 'class' is not 'post' get an empty argument list. For 'post'
    parts the phrase is passed through ``replace_quotes_to_double_quotes``,
    stripped, placed in a one-row DataFrame, processed by
    ``replace_search_strings`` and ``nlp``, and handed to ``run_rules``.

    NOTE(review): the three helpers are not defined in this notebook;
    ``replace_search_strings`` presumably adds the 'grepl' column to the frame
    in place -- TODO confirm.

    Args:
        part: dict with at least 'class' and (for 'post') 'phrase'.
        verbose: when true, print the part before processing.
    """
    if verbose:
        print("Extracting args: {}".format(part))

    # Only 'post' phrases go through argument extraction.
    if part['class'] != 'post':
        part['args'] = []
        return part

    quoted = replace_quotes_to_double_quotes(part['phrase'])
    frame = pd.DataFrame({'final': [quoted.strip()]})
    replace_search_strings(frame)
    frame['final'] = frame['final'].apply(nlp)
    part['args'] = run_rules(frame['final'], frame['grepl'])
    return part

In [246]:
def organize_in_tree_(parts, tree):
    """Fold ``parts`` into a left-nested ``(left, node, right)`` tuple tree.

    Parts are consumed in order, starting from ``tree``:
      * a 'phrase' part fills the left slot if it is empty, otherwise the
        right slot; if both are taken an exception is raised;
      * any other part (a conjunction) fills the node slot if it is empty;
        otherwise the current tree becomes the left child of a new tree whose
        node is that part.

    Args:
        parts: sequence of part dicts with a 'type' key (falsy means no parts).
        tree: the starting ``(left, node, right)`` tuple.

    Returns:
        The resulting tree tuple.

    Raises:
        Exception: when a phrase arrives while both left and right are filled.
    """
    for part in parts or ():
        left, node, right = tree
        if part['type'] == 'phrase':
            if not left:
                tree = (part, node, right)
            elif not right:
                tree = (left, node, part)
            else:
                raise Exception("node in invalid position")
        elif not node:
            tree = (left, part, right)
        else:
            # Node slot already taken: nest the whole current tree on the left.
            tree = (tree, part, None)
    return tree
        
def organize_in_tree(parts):
    """Build the phrase/conjunction tree for ``parts`` from an empty tree.

    A shallow copy of ``parts`` is handed to ``organize_in_tree_`` together
    with the empty ``(None, None, None)`` tree.
    """
    empty_tree = (None, None, None)
    parts_copy = parts[:]
    return organize_in_tree_(parts_copy, empty_tree)

In [287]:
def transform(s, verbose=False):
    """Parse a query sentence into a tree of classified filter phrases.

    Steps: ``split`` the sentence, drop the first part (the text before the
    first stop), drop a conjunction that would then lead the list, run
    ``classify`` and ``extract_args`` on every phrase part, and arrange the
    result with ``organize_in_tree``.

    A sentence for which ``split`` yields a single part leaves nothing to
    parse; the result of ``organize_in_tree([])`` is returned instead of
    raising IndexError on the empty list.

    Args:
        s: the query sentence.
        verbose: forwarded to ``split`` and ``extract_args``; also prints the
            raw parts.

    Returns:
        Whatever ``organize_in_tree`` returns for the parsed parts.
    """
    parts = split(s, verbose)
    if verbose:
        print(parts)
    # Discard the leading part, i.e. everything before the first stop token.
    parts = parts[1:]
    # Guard the lookup: `parts` is empty when no stop token was predicted.
    if parts and parts[0]['type'] == 'conj':
        parts = parts[1:]
    parsed = [part if part['type'] == 'conj'
              else extract_args(classify(part), verbose)
              for part in parts]
    # Printed unconditionally, as in the recorded cell outputs.
    print(parsed)
    return organize_in_tree(parsed)
# Demo: two filter phrases joined by 'but'; the recorded result is a single
# (phrase, conj, phrase) tuple.
transform("Find all attendees whose post contains dog but not who liked a positive post")

[{'type': 'phrase', 'phrase': 'whose post contains dog ', 'class': 'post', 'args': [{'post.text': 'dog'}]}, {'type': 'conj', 'conj': 'but'}, {'type': 'phrase', 'phrase': 'not who liked a positive post', 'class': 'like', 'args': []}]


({'args': [{'post.text': 'dog'}],
  'class': 'post',
  'phrase': 'whose post contains dog ',
  'type': 'phrase'},
 {'conj': 'but', 'type': 'conj'},
 {'args': [],
  'class': 'like',
  'phrase': 'not who liked a positive post',
  'type': 'phrase'})

In [288]:
# Demo: three filter phrases; the recorded result nests the first two under the
# second conjunction, i.e. ((p1, 'and', p2), 'but', p3).
transform("find attendees who wrote about honey and liked a post but not who wrote 2 positive posts")

[{'type': 'phrase', 'phrase': 'who wrote about honey ', 'class': 'post', 'args': [{'post.text': 'honey'}]}, {'type': 'conj', 'conj': 'and'}, {'type': 'phrase', 'phrase': 'liked a post ', 'class': 'like', 'args': []}, {'type': 'conj', 'conj': 'but'}, {'type': 'phrase', 'phrase': 'not who wrote 2 positive posts', 'class': 'post', 'args': [{'post.count': '2', 'post.sentiment': 'positive'}]}]


(({'args': [{'post.text': 'honey'}],
   'class': 'post',
   'phrase': 'who wrote about honey ',
   'type': 'phrase'},
  {'conj': 'and', 'type': 'conj'},
  {'args': [], 'class': 'like', 'phrase': 'liked a post ', 'type': 'phrase'}),
 {'conj': 'but', 'type': 'conj'},
 {'args': [{'post.count': '2', 'post.sentiment': 'positive'}],
  'class': 'post',
  'phrase': 'not who wrote 2 positive posts',
  'type': 'phrase'})

In [289]:
# Demo: 'post', 'like' and 'ad' phrases chained with 'and'; in the recorded
# output only the 'post' phrase gets non-empty args.
transform("Find people who wrote posts mentioning Stacy Vaughn and liked any post 9 times and clicked on ad with 'blackout' in it")

[{'type': 'phrase', 'phrase': 'who wrote posts mentioning Stacy Vaughn ', 'class': 'post', 'args': [{'post.mention': 'Stacy Vaughn'}]}, {'type': 'conj', 'conj': 'and'}, {'type': 'phrase', 'phrase': 'liked any post 9 times ', 'class': 'like', 'args': []}, {'type': 'conj', 'conj': 'and'}, {'type': 'phrase', 'phrase': "clicked on ad with 'blackout' in it", 'class': 'ad', 'args': []}]


(({'args': [{'post.mention': 'Stacy Vaughn'}],
   'class': 'post',
   'phrase': 'who wrote posts mentioning Stacy Vaughn ',
   'type': 'phrase'},
  {'conj': 'and', 'type': 'conj'},
  {'args': [],
   'class': 'like',
   'phrase': 'liked any post 9 times ',
   'type': 'phrase'}),
 {'conj': 'and', 'type': 'conj'},
 {'args': [],
  'class': 'ad',
  'phrase': "clicked on ad with 'blackout' in it",
  'type': 'phrase'})

In [290]:
# Demo: mixed 'or' / 'and'; the recorded tree is left-nested in reading order,
# ((p1, 'or', p2), 'and', p3), with no operator precedence applied.
transform("Find eveyrone who wrote at least 4 posts or any positive post and has 'captive' in their bio")

[{'type': 'phrase', 'phrase': 'who wrote at least 4 posts ', 'class': 'post', 'args': [{'post.count': '>=4'}]}, {'type': 'conj', 'conj': 'or'}, {'type': 'phrase', 'phrase': 'any positive post ', 'class': 'post', 'args': [{'post.sentiment': 'positive'}]}, {'type': 'conj', 'conj': 'and'}, {'type': 'phrase', 'phrase': "has 'captive' in their bio", 'class': 'profile', 'args': []}]


(({'args': [{'post.count': '>=4'}],
   'class': 'post',
   'phrase': 'who wrote at least 4 posts ',
   'type': 'phrase'},
  {'conj': 'or', 'type': 'conj'},
  {'args': [{'post.sentiment': 'positive'}],
   'class': 'post',
   'phrase': 'any positive post ',
   'type': 'phrase'}),
 {'conj': 'and', 'type': 'conj'},
 {'args': [],
  'class': 'profile',
  'phrase': "has 'captive' in their bio",
  'type': 'phrase'})