In [155]:
import en_core_web_lg
import pandas as pd
from tqdm import tqdm, tqdm_pandas
import qgrid
import re
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import f1_score, make_scorer
import json
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy.random as rnd
import numpy as np
from stanfordcorenlp import StanfordCoreNLP

pd.set_option('display.max_colwidth', 300)

In [40]:
# Register tqdm's `progress_apply` on pandas objects; the tokenization cell
# further down relies on it.
tqdm.pandas()

In [41]:
# Load the spaCy pipeline once; `nlp` is the shared object used below for
# tokenizing raw text and for tagging/parsing pre-split words.
nlp = en_core_web_lg.load()

### Split

In [2]:
# Read the annotated queries (first CSV column becomes the index). As the
# preview shows, clause boundaries are marked inline with a `$$` prefix.
sdf = pd.read_csv('new_splitted.csv', index_col=0)
sdf.head()

Unnamed: 0,text
0,Find people $$who wrote posts mentioning Stacy Vaughn $$and liked any post 9 times $$and clicked on ad with 'blackout' in it
1,Find all attendees $$who wrote at least one post with mention of Stacy Vaughn $$and liked any post 9 or more times $$and clickd on ad with text 'blackout' in it
2,Find all attendees $$who wrote a post about Stacy Vaughn $$and have at least nine likes $$and clickd on add that contained 'blackout'
3,Find all attendees $$who clicked on ad about blackout $$and also liked post 9 times $$and wrote post with Stacy Vaughn's mention
4,Find client $$who wrote Stacy Vaughn's post $$and liked it 9 times $$as well as clicked on ad that contains 'blackout'


#### Try to expand corpus

In [212]:
# Augment the corpus: on every pass the first "$$and " of each query is
# rewritten with a randomly drawn connective, so the splitter sees more than
# a plain "and". The underscore variants are placeholders that are expanded to
# two words after de-duplication.
EXPANSION_SEED = 42   # fixed so the generated corpus is reproducible run to run
EXPANSION_EPOCHS = 15
candidates = ['$$or', '$$and', '$$but not', '$$except who', '$$except those who', '$$and_also', '$$and_not']
weights = [1/5,1/15,1/5,1/5,1/5,1/15,1/15]
expansion_rng = rnd.RandomState(EXPANSION_SEED)

new_items = []
for epoch in range(EXPANSION_EPOCHS):
    for text in sdf['text']:
        choice = expansion_rng.choice(candidates, 1, p=weights)[0]
        new_items.append({'text': re.sub(r'\$\$and\s', choice+' ', text, count=1)})

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` yields the same rows.
all_texts = pd.concat([sdf[['text']], pd.DataFrame(new_items, columns=['text'])])['text']
ex_sdf = pd.DataFrame({'text': all_texts.unique()})
ex_sdf['text'] = ex_sdf['text'].apply(lambda x: re.sub(r'\$\$and_also', '$$and also', x))
ex_sdf['text'] = ex_sdf['text'].apply(lambda x: re.sub(r'\$\$and_not', '$$and not', x))
# Shuffle with the same seed so downstream cells see a stable row order.
ex_sdf = ex_sdf.sample(frac=1.0, random_state=EXPANSION_SEED)
len(ex_sdf)

926

In [214]:
def extract_splitter_ids(text):
    """Return the offsets of the `$$` markers as they fall in the marker-free text.

    Every marker is two characters wide, so the k-th marker (counting from
    zero) moves left by ``2 * k`` once the markers before it are removed.
    """
    marker_width = 2
    return [
        found.start() - marker_width * seen
        for seen, found in enumerate(re.finditer(r'\$\$', text))
    ]

# For every expanded query keep (a) the character offsets at which a clause
# starts once the `$$` markers are gone and (b) the marker-free text itself.
splitter_offsets = ex_sdf['text'].apply(extract_splitter_ids)
marker_free_text = ex_sdf['text'].apply(lambda t: re.sub(r'\$\$', '', t))
cdf = ex_sdf.assign(idxs=splitter_offsets).assign(clean=marker_free_text)[['idxs', 'clean']]

### Tokenize

In [215]:
# Run spaCy over every marker-free sentence (with a progress bar) and store
# the resulting Doc next to the boundary offsets.
spacy_docs = cdf['clean'].progress_apply(lambda text: nlp(text))
tokenized = cdf.assign(tokens=spacy_docs)
tokenized.head()

100%|██████████| 926/926 [00:13<00:00, 70.86it/s]


Unnamed: 0,idxs,clean,tokens
342,"[18, 62, 98]","Search for people who replied to post written by Andy Chandler, with bio containing word 'beyond' or who viewed page 2 times.","(Search, for, people, who, replied, to, post, written, by, Andy, Chandler, ,, with, bio, containing, word, ', beyond, ', or, who, viewed, page, 2, times, .)"
87,"[15, 41, 69]","Find attendees who reply to Stacy Vaughn and phone number 7699165982 and post has ""rare""","(Find, attendees, who, reply, to, Stacy, Vaughn, and, phone, number, 7699165982, and, post, has, "", rare, "")"
304,"[15, 44, 66, 77]",Find attendees that participated in no poll or never answered yes except who who has 'captive' in bio,"(Find, attendees, that, participated, in, no, poll, or, never, answered, yes, except, who, who, has, ', captive, ', in, bio)"
28,"[15, 54, 81]","Find attendees who rated page containing 'abomination', has phone (895) 592-8974 and not enjoyed the event.","(Find, attendees, who, rated, page, containing, ', abomination, ', ,, has, phone, (, 895, ), 592, -, 8974, and, not, enjoyed, the, event, .)"
200,"[19, 60, 80]","Find all attendees that liked a post with positive sentiment, connected facebook, and downloaded a file with name 'presentation.pptx'.","(Find, all, attendees, that, liked, a, post, with, positive, sentiment, ,, connected, facebook, ,, and, downloaded, a, file, with, name, ', presentation.pptx, ', .)"


In [216]:
# Turn each tokenized sentence into a list of (token text, label) pairs. A
# token is labelled 1 when its character offset is one of the recorded
# splitter offsets, otherwise 0.
examples = []
for _, row in tokenized.iterrows():
    boundary_offsets = set(row['idxs'])
    examples.append([
        (token.text, 1 if token.idx in boundary_offsets else 0)
        for token in row['tokens']
    ])

# Persist the labelled corpus so later cells can start from the JSON file.
with open('labled_tokenized_v3.json', 'w') as f:
    json.dump(examples, f)

#### Train/test split

In [217]:
# Reload the labelled sentences from disk and persist a train/test split.
with open('labled_tokenized_v3.json', 'r') as f:
    examples = json.load(f)

# Fixed seed: without it every run shuffles differently and overwrites the
# split files with a new partition, so scores are not comparable across runs.
# NOTE(review): the corpus-expansion cell derives several variants from each
# base query, and this split is per sentence, so variants of one query can land
# on both sides — consider splitting by base query instead.
train,test = train_test_split(examples, random_state=42)
with open('labled_tokenized.train.v3.json', 'w') as trf, open('labled_tokenized.test.v3.json', 'w') as tt:
    json.dump(train, trf)
    json.dump(test, tt)

### Feature extraction

In [117]:
# Client for a CoreNLP server addressed at http://localhost, port 9000.
# NOTE(review): `snlp` is not referenced by any other cell shown here.
snlp = StanfordCoreNLP('http://localhost', port=9000)

In [219]:
# Load the training portion written by the train/test split cell. The context
# manager closes the file handle that a bare `open(...)` would leave dangling.
with open('labled_tokenized.train.v3.json') as train_file:
    train = json.load(train_file)

In [119]:
def common_ancestor(tree, i, j):
    """Return the label of the deepest node that dominates leaves `i` and `j`.

    `tree` must provide `leaves()`, `leaf_treeposition(index)`, indexing by a
    tuple of child positions, and `label()` on the node that indexing returns.
    The string 'NONE' is returned when either index is at or beyond the number
    of leaves.
    """
    leaf_count = len(tree.leaves())
    if not (i < leaf_count and j < leaf_count):
        return 'NONE'

    path_i = tree.leaf_treeposition(i)
    path_j = tree.leaf_treeposition(j)

    # The longest shared prefix of the two leaf paths addresses the ancestor.
    shared = []
    for step_i, step_j in zip(path_i, path_j):
        if step_i != step_j:
            break
        shared.append(step_i)
    return tree[tuple(shared)].label()

In [105]:
def extract(tokens):
    """Build one feature dict per token from the token and a +/-2 window.

    For the token itself the keys 'text', 'lemma' and 'pos' hold `text`,
    `lemma_` and `tag_`. For each neighbour at offsets -1, -2, +1 and +2 the
    keys 'text<off>', 'lemma<off>' and 'pos<off>' hold `text`, `lemma_` and
    `pos_`. A neighbour outside the sequence contributes the string 'NONE'.

    NOTE(review): the centre token's 'pos' reads `tag_` while the neighbours
    read `pos_`. This is kept as-is so the feature values do not change;
    confirm whether the mix is intentional.
    """
    total = len(tokens)

    def window(position, attribute):
        # Attribute of the token at `position`, or 'NONE' outside the sequence.
        if 0 <= position < total:
            return getattr(tokens[position], attribute)
        return 'NONE'

    rows = []
    for index in range(total):
        centre = tokens[index]
        row = {
            'text': centre.text,
            'lemma': centre.lemma_,
            'pos': centre.tag_,
        }
        for offset in (-1, -2, 1, 2):
            suffix = '%+d' % offset          # '-1', '-2', '+1', '+2'
            neighbour = index + offset
            row['text' + suffix] = window(neighbour, 'text')
            row['lemma' + suffix] = window(neighbour, 'lemma_')
            row['pos' + suffix] = window(neighbour, 'pos_')
        rows.append(row)
    return rows

In [220]:
def extract_all(sentences):
    """Flatten labelled sentences into parallel per-token features and labels.

    Each sentence is a sequence of (word, label) pairs. A spaCy Doc is built
    from the pre-split words, tagged and parsed, and passed to `extract`.
    Returns a tuple ``(features, labels)`` with one entry per token across all
    sentences, in order.
    """
    feature_rows = []
    labels = []
    for sent in sentences:
        words = [pair[0] for pair in sent]
        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        nlp.tagger(doc)
        nlp.parser(doc)

        feature_rows.extend(extract(doc))
        labels.extend([pair[1] for pair in sent])
    return (feature_rows, labels)

In [221]:
# Flat per-token feature dicts and 0/1 labels for all training sentences.
all_features,ys = extract_all(train)

In [108]:
# Baseline: vectorize the per-token feature dicts and fit a multinomial naive
# Bayes classifier that decides, token by token, whether a clause starts there.
pipe = Pipeline([
    ('vectorizer', DictVectorizer()),
    ('classifier', MultinomialNB())
])
# Ask for train scores explicitly (as the CRF cell does): newer scikit-learn
# releases leave them out by default, and the error analysis compares
# train and test F1.
cv = cross_validate(pipe, all_features, ys, scoring='f1', return_train_score=True)

In [109]:
# Show fit/score timings and the per-fold F1 scores.
cv



{'fit_time': array([0.22493625, 0.20296097, 0.19855189]),
 'score_time': array([0.09168005, 0.10262704, 0.09270811]),
 'test_score': array([0.92124105, 0.92822967, 0.92515924]),
 'train_score': array([0.93002799, 0.92589641, 0.92295473])}

In [110]:
# Fit on the complete training set, then put features, gold labels and the
# model's own predictions into one frame for error analysis.
pipe.fit(all_features, ys)
train_predictions = pipe.predict(all_features)
df = pd.DataFrame(all_features).assign(label=ys).assign(predicted=train_predictions)

In [57]:
# False negatives on the training data: gold boundary tokens the model missed.
df[(df['label'] == 1) & (df['predicted'] == 0)]

Unnamed: 0,lemma,lemma+1,lemma-1,pos,pos+1,pos-1,text,text+1,text-1,label,predicted
1735,attendee,name,for,NNS,VERB,ADP,attendees,named,for,1,0
3879,click,5,user,VBD,NUM,NOUN,clicked,5,users,1,0
3895,vida,NONE,find,FW,NONE,VERB,vida,NONE,find,1,0
3966,attendee,name,for,NNS,VERB,ADP,attendees,named,for,1,0
4037,biologist,NONE,every,NN,NONE,DET,biologist,NONE,every,1,0
5124,mention,',who,VBD,PUNCT,NOUN,mentioned,',Who,1,0
6755,who,be,NONE,WP,VERB,NONE,Who,is,NONE,1,0
7541,mention,',who,VBD,PUNCT,NOUN,mentioned,',Who,1,0
8338,attendee,name,for,NNS,VERB,ADP,attendees,named,for,1,0
8816,mention,',who,VBD,PUNCT,NOUN,mentioned,',Who,1,0


In [66]:
# False positives on the training data: tokens wrongly predicted as boundaries.
df[(df['label'] == 0) & (df['predicted'] == 1)]

Unnamed: 0,lemma,lemma+1,lemma-1,pos,pos+1,pos-1,text,text+1,text-1,label,predicted
30,but,have,",",CC,VERB,PUNCT,but,have,",",0,1
134,as,first,',IN,ADJ,PUNCT,as,first,',0,1
158,as,participate,well,IN,VERB,ADV,as,participated,well,0,1
163,who,answer,except,WP,VERB,ADP,who,answered,except,0,1
227,that,visit,",",WDT,VERB,PUNCT,that,visited,",",0,1
250,who,have,except,WP,VERB,ADP,who,has,except,0,1
273,who,rat,except,WP,VERB,ADP,who,rated,except,0,1
319,who,do,",",WP,VERB,PUNCT,who,did,",",0,1
362,and,be,",",CC,VERB,PUNCT,and,is,",",0,1
387,who,view,or,WP,VERB,CCONJ,who,viewed,or,0,1


In [111]:
# Build features/labels for the held-out sentences. The context manager closes
# the file handle that a bare `open(...)` would leave dangling.
# NOTE(review): this reads the *v2* test file, while the split cell in this
# notebook only writes v3 files and `train` is loaded from v3 — confirm that
# evaluating against v2 is intended.
with open('labled_tokenized.test.v2.json') as test_file:
    test_features, test_ys = extract_all(json.load(test_file))

In [112]:
# Per-token predictions of the fitted naive Bayes pipeline on the test features.
test_predicted = pipe.predict(test_features)

In [113]:
# F1 of the boundary class on the held-out tokens.
f1_score(test_ys, test_predicted)

0.917741935483871

In [114]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted naive Bayes pipeline.
joblib.dump(pipe, 'splitter.v2.pkl')

['splitter.v2.pkl']

### CRF

In [223]:
def sent2features(sent):
    """Turn one labelled sentence into its sequence of per-token feature dicts.

    `sent` is a sequence of (word, label) pairs; only the words are used. The
    Doc is assembled from the pre-split words, then tagged and parsed before
    `extract` is applied.
    """
    words = []
    for pair in sent:
        words.append(pair[0])
    doc = spacy.tokens.Doc(nlp.vocab, words=words)
    nlp.tagger(doc)
    nlp.parser(doc)
    return extract(doc)

def sent2labels(sent):
    """Map every (word, label) pair to 'E' when the label equals 1, else 'N'."""
    tags = []
    for pair in sent:
        if pair[1] == 1:
            tags.append('E')
        else:
            tags.append('N')
    return tags

def crf_extract(sents):
    """Apply `sent2features` to every sentence, showing a tqdm progress bar."""
    feature_sequences = []
    for sent in tqdm(sents):
        feature_sequences.append(sent2features(sent))
    return feature_sequences

def crf_labels(sents):
    """Apply `sent2labels` to every sentence, showing a tqdm progress bar."""
    label_sequences = []
    for sent in tqdm(sents):
        label_sequences.append(sent2labels(sent))
    return label_sequences

In [224]:
# Per-sentence feature sequences and 'E'/'N' tag sequences for CRF training,
# built with the helpers defined in the previous cell.
crf_X = crf_extract(train)
crf_y = crf_labels(train)

100%|██████████| 694/694 [00:06<00:00, 111.77it/s]
100%|██████████| 694/694 [00:00<00:00, 261296.86it/s]


In [225]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Linear-chain CRF over the same per-token features as the baseline.
crf_settings = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_settings)

# The scorer is built from `crf.classes_`, so the model is fitted once before
# cross-validation.
crf.fit(crf_X, crf_y)
tag_set = list(crf.classes_)
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=tag_set)
crf_cv = cross_validate(crf, crf_X, crf_y, scoring=f1_scorer, return_train_score=True)
crf_cv

{'fit_time': array([1.08352685, 1.10711503, 1.11112976]),
 'score_time': array([0.0501492 , 0.04611683, 0.048769  ]),
 'test_score': array([0.99887628, 0.99868143, 0.99708823]),
 'train_score': array([0.99980899, 1.        , 0.99981227])}

In [180]:
# Feature and tag sequences for the held-out sentences currently in `test`.
crf_X_test = crf_extract(test)
crf_y_test = crf_labels(test)

100%|██████████| 210/210 [00:01<00:00, 110.58it/s]
100%|██████████| 210/210 [00:00<00:00, 101498.48it/s]


In [226]:
# Refit the CRF on all training sequences and tag the held-out sentences.
crf.fit(crf_X, crf_y)
test_predicted = crf.predict(crf_X_test)

In [227]:
def binarize(y):
    """Convert nested 'E'/'N' tag sequences into nested 1/0 lists (1 only for 'E')."""
    converted = []
    for sent in y:
        flags = []
        for tag in sent:
            flags.append(1 if tag == 'E' else 0)
        converted.append(flags)
    return converted

In [228]:
# Reload the held-out sentences written by the train/test split cell. The
# context manager closes the file handle that a bare `open(...)` would leave
# dangling.
with open('labled_tokenized.test.v3.json') as test_file:
    test = json.load(test_file)

In [229]:
# Rebuild the test feature and tag sequences from the reloaded v3 test file.
crf_X_test = crf_extract(test)
crf_y_test = crf_labels(test)

100%|██████████| 232/232 [00:02<00:00, 109.60it/s]
100%|██████████| 232/232 [00:00<00:00, 131426.06it/s]


In [230]:
# Fit on the training sequences and tag the test sequences rebuilt in the
# preceding cell.
crf.fit(crf_X, crf_y)
test_predicted = crf.predict(crf_X_test)

In [231]:
def binarize(y):
    """Convert nested 'E'/'N' tag sequences into nested 1/0 lists (1 only for 'E').

    Behaves the same as the `binarize` defined a few cells earlier; this cell
    re-defines it.
    """
    def as_flag(tag):
        return 1 if tag == 'E' else 0

    return [list(map(as_flag, sent)) for sent in y]

In [232]:
# NOTE(review): `labels` is built here (with an extra 'O' entry) but is not
# passed to the metric call below, which uses the literal `[0,1]` instead.
labels=list(crf.classes_)
labels.append('O')
# Score the binarized gold tags against the binarized CRF predictions.
metrics.flat_f1_score(binarize(crf_y_test),
                      binarize(test_predicted), labels=[0,1])

0.9992395437262357

In [233]:
# Persist the fitted CRF; `joblib` comes from the import in the earlier cell
# that saved the naive Bayes pipeline.
joblib.dump(crf, 'crf_splitter.pkl')

['crf_splitter.pkl']

### Organize into tree

In [75]:
def split(cls, sent):
    """Cut a raw sentence into clause segments with a fitted token classifier.

    The sentence is tokenized with `nlp`, features come from `extract`, and
    `cls.predict` is expected to yield one truthy/falsy flag per token. The
    result alternates lists of tokens (segments) with the individual tokens
    that were flagged; it always starts and ends with a (possibly empty) list.
    """
    doc = nlp(sent)
    flags = cls.predict(extract(doc))

    # The last element is always the segment currently being filled.
    pieces = [[]]
    for token, is_boundary in zip(doc, flags):
        if is_boundary:
            pieces.append(token)
            pieces.append([])
        else:
            pieces[-1].append(token)
    return pieces

# Try the splitter with the naive Bayes pipeline on a three-clause query.
split(pipe, "Find all attendees who clicked on ad about blackout and also liked post 9 times and wrote post with Stacy Vaughn's mention")

[[Find, all, attendees],
 who,
 [clicked, on, ad, about, blackout],
 and,
 [also, liked, post, 9, times],
 and,
 [wrote, post, with, Stacy, Vaughn, 's, mention]]

In [85]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib


def spacy_tokenizer(xs):
    """Return the lemma of every token in `xs`."""
    return [x.lemma_ for x in xs]


# NOTE(review): presumably the pickled classifier looks up `spacy_tokenizer`
# by name, which is why it is defined before loading — confirm.
# Only load pickles from a trusted source: unpickling can run arbitrary code.
fclf = joblib.load('filter_classifier.pkl')

In [90]:
# Keep the split of the example query for the tree-building cells below.
parts = split(pipe, "Find all attendees who clicked on ad about blackout and also liked post 9 times and wrote post with Stacy Vaughn's mention")

In [95]:
def organize_in_tree_(parts, tree):
    """Fold alternating segments and connectives into a left-nested binary tree.

    Parameters
    ----------
    parts : sequence
        Items to consume in order. A `list` is treated as a clause segment
        (an operand); anything else is treated as a connective (a node).
    tree : tuple
        Accumulator ``(left, node, right)``; unfilled slots are ``None``.

    Returns
    -------
    tuple
        The accumulated ``(left, node, right)``. When a connective arrives
        while the node slot is already filled, the current triple becomes the
        left operand of a new triple. `tree` itself is returned when `parts`
        is empty.

    Raises
    ------
    Exception
        "node in invalid position" when a segment arrives while both operand
        slots are already filled.
    """
    if not parts:
        return tree

    left, node, right = tree
    # Iterating avoids the per-step list copy and the recursion-depth limit of
    # a recursive formulation.
    for part in parts:
        if isinstance(part, list):
            # Slots are compared with None rather than by truthiness, so an
            # empty segment ([]) still counts as filled and is not overwritten
            # by the next segment.
            if left is None:
                left = part
            elif right is None:
                right = part
            else:
                raise Exception("node in invalid position")
        elif node is None:
            node = part
        else:
            left, node, right = (left, node, right), part, None
    return (left, node, right)
        
def organize_in_tree(parts):
    """Build the clause tree for a list of parts.

    The first two items of `parts` are dropped; the remaining items are folded
    by `organize_in_tree_`, starting from an empty (None, None, None) triple.
    """
    clause_parts = parts[2:]
    empty_root = (None, None, None)
    return organize_in_tree_(clause_parts, empty_root)

In [96]:
# Show the tree built from the example query's parts.
organize_in_tree(parts)

(([clicked, on, ad, about, blackout], and, [also, liked, post, 9, times]),
 and,
 [wrote, post, with, Stacy, Vaughn, 's, mention])