In [2]:
import en_core_web_md
import pandas as pd
from tqdm import tqdm, tqdm_pandas
import qgrid
import re
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import f1_score
import json
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

pd.set_option('display.max_colwidth', 300)

In [3]:
tqdm.pandas()

In [4]:
nlp = en_core_web_md.load()

In [4]:
cleaned = pd.read_csv('dataset/phrases_cleaned_1522422847595.csv', usecols=['id', 'phrase'])
cleaned = cleaned.assign(doc=cleaned['phrase'].progress_apply(nlp))

100%|██████████| 287/287 [00:03<00:00, 77.94it/s]


In [21]:
SPLITS = {'who', ',','and'}
cleaned = cleaned.assign(idxs=cleaned['doc'].progress_apply(lambda doc: [tok.idx for tok in doc if tok.lemma_ in SPLITS]))
splitted_again = []
with_indicies = []
for i,row in cleaned.iterrows():
    idxs = set(row['idxs'])
    phrase = row['phrase']
    subphrases = []
    subphrase = ''
    for i,ch in enumerate(phrase):
        if i in idxs:
            subphrases.append(subphrase)
            subphrase = ch
        else:
            subphrase += ch
    splitted_again.append(subphrases)
    with_indicies.append([[tok.idx,tok] for tok in row['doc']])
        

100%|██████████| 287/287 [00:00<00:00, 30910.16it/s]


In [8]:
def add_dollars(phrase):
    doc = nlp(phrase)
    new_phrase = phrase
    acc = 0
    for tok in doc:
        if tok.lemma_ in SPLITS:
            new_phrase = new_phrase[:tok.idx+acc] + '$$' + new_phrase[tok.idx+acc:]
            acc += 2
    return new_phrase

#qdf = pd.DataFrame({'text': cleaned['phrase'].apply(add_dollars)})
qdf = pd.read_csv('new_splitted.csv', index_col=0)
widget = qgrid.show_grid(qdf)
widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [29]:
splitted = pd.read_csv('dataset/phrases_cleaned_1522422847595.csv')
splitted.iloc[[84]]

Unnamed: 0,id,author,created,modified,search_attrs,phrase
84,98be4665-b3db-4133-a8cf-299dfa8205ab,Kostia Rybnikov,1522249286679,1522249944131,"[{""short"": ""whose bio contains 'biplane'"", ""type"": ""text"", ""name"": ""profile.bio"", ""example"": ""1""}, {""short"": ""1 times"", ""name"": ""events.count""}, {""short"": ""with positive sentiment"", ""type"": ""Sentiment"", ""name"": ""like.postSentiment"", ""human"": ""liked {sentiment} post"", ""example"": ""Attendees who li...","From people with a ""biplane"" in their profile description find those who attended an event once and liked a post that can be said to be a positive one"


In [9]:
widget.get_changed_df().to_csv('new_splitted.csv')

### Split

In [15]:
sdf = pd.read_csv('new_splitted.csv', index_col=0)
sdf.head()

Unnamed: 0,text
0,Find people $$who wrote posts mentioning Stacy Vaughn $$and liked any post 9 times $$and clicked on ad with 'blackout' in it
1,Find all attendees $$who wrote at least one post with mention of Stacy Vaughn $$and liked any post 9 or more times $$and clickd on ad with text 'blackout' in it
2,Find all attendees $$who wrote a post about Stacy Vaughn $$and have at least nine likes $$and clickd on add that contained 'blackout'
3,Find all attendees $$who clicked on ad about blackout $$and also liked post 9 times $$and wrote post with Stacy Vaughn's mention
4,Find client $$who wrote Stacy Vaughn's post $$and liked it 9 times $$as well as clicked on ad that contains 'blackout'


In [36]:
def extract_splitter_ids(text):
    idxs = []
    subs = 0
    for match in re.finditer(r'\$\$', text):
        idxs.append(match.start() - subs)
        subs += 2
    return idxs

sdf = sdf.assign(idxs=sdf['text'].apply(extract_splitter_ids))
sdf = sdf.assign(clean = sdf['text'].apply(lambda t: re.sub(r'\$\$', '', t)))
cdf = sdf[['idxs', 'clean']]

### Tokenize

In [37]:
splitted

Unnamed: 0,idxs,clean
0,"[12, 52, 79]",Find people who wrote posts mentioning Stacy Vaughn and liked any post 9 times and clicked on ad with 'blackout' in it
1,"[19, 76, 111]",Find all attendees who wrote at least one post with mention of Stacy Vaughn and liked any post 9 or more times and clickd on ad with text 'blackout' in it
2,"[19, 55, 84]",Find all attendees who wrote a post about Stacy Vaughn and have at least nine likes and clickd on add that contained 'blackout'
3,"[19, 52, 80]",Find all attendees who clicked on ad about blackout and also liked post 9 times and wrote post with Stacy Vaughn's mention
4,"[12, 42, 63]",Find client who wrote Stacy Vaughn's post and liked it 9 times as well as clicked on ad that contains 'blackout'


In [45]:
tokenized = cdf.assign(tokens=cdf['clean'].apply(lambda text: nlp(text)))
tokenized.head()

Unnamed: 0,idxs,clean,tokens
0,"[12, 52, 79]",Find people who wrote posts mentioning Stacy Vaughn and liked any post 9 times and clicked on ad with 'blackout' in it,"(Find, people, who, wrote, posts, mentioning, Stacy, Vaughn, and, liked, any, post, 9, times, and, clicked, on, ad, with, ', blackout, ', in, it)"
1,"[19, 76, 111]",Find all attendees who wrote at least one post with mention of Stacy Vaughn and liked any post 9 or more times and clickd on ad with text 'blackout' in it,"(Find, all, attendees, who, wrote, at, least, one, post, with, mention, of, Stacy, Vaughn, and, liked, any, post, 9, or, more, times, and, clickd, on, ad, with, text, ', blackout, ', in, it)"
2,"[19, 55, 84]",Find all attendees who wrote a post about Stacy Vaughn and have at least nine likes and clickd on add that contained 'blackout',"(Find, all, attendees, who, wrote, a, post, about, Stacy, Vaughn, and, have, at, least, nine, likes, and, clickd, on, add, that, contained, ', blackout, ')"
3,"[19, 52, 80]",Find all attendees who clicked on ad about blackout and also liked post 9 times and wrote post with Stacy Vaughn's mention,"(Find, all, attendees, who, clicked, on, ad, about, blackout, and, also, liked, post, 9, times, and, wrote, post, with, Stacy, Vaughn, 's, mention)"
4,"[12, 42, 63]",Find client who wrote Stacy Vaughn's post and liked it 9 times as well as clicked on ad that contains 'blackout',"(Find, client, who, wrote, Stacy, Vaughn, 's, post, and, liked, it, 9, times, as, well, as, clicked, on, ad, that, contains, ', blackout, ')"


In [74]:
examples = []
for i,row in tokenized.iterrows():
    tokens = row['tokens']
    idxs = set(row['idxs'])
    row_labels = []
    #print(idxs)
    #print(list(zip(tokens, [token.idx for token in tokens])))
    for token in tokens:
        if token.idx in idxs:
            row_labels.append(1)
        else:
            row_labels.append(0)
    examples.append(list(zip([token.text for token in tokens], row_labels)))
            
with open('labled_tokenized.json', 'w') as f:
    json.dump(examples, f)

#### Train/test split

In [19]:
examples = []
with open('labled_tokenized.json', 'r') as f:
    examples = json.load(f)
train,test = train_test_split(examples)
with open('labled_tokenized.train.json', 'w') as trf, open('labled_tokenized.test.json', 'w') as tt:
    json.dump(train, trf)
    json.dump(test, tt)

### Feature extraction

In [5]:
train = json.load(open('labled_tokenized.train.json'))

In [6]:
def extract(tokens):
    N = len(tokens)
    all_features = []
    for i in range(N):
        features = {
            'text': tokens[i].text,
            'lemma': tokens[i].lemma_,
            'pos': tokens[i].tag_,
            'text-1': tokens[i-1].text if i-1 >= 0 else 'NONE',
            'lemma-1': tokens[i-1].lemma_ if i-1 >= 0 else 'NONE',
            'pos-1': tokens[i-1].pos_ if i-1 >= 0 else 'NONE',
            'text+1': tokens[i+1].text if i+1 < N else 'NONE',
            'lemma+1': tokens[i+1].lemma_ if i+1 < N else 'NONE',
            'pos+1': tokens[i+1].pos_ if i+1 < N else 'NONE'
        }
        all_features.append(features)
    return all_features

In [7]:
def extract_all(sentences):
    all_features = []
    ys = []
    for sent in sentences:
        tokens = spacy.tokens.Doc(nlp.vocab, words=[pair[0] for pair in sent])
        nlp.tagger(tokens)
        nlp.parser(tokens)
        sent_features = extract(tokens)
        ys += [pair[1] for pair in sent]
        all_features += sent_features
    return (all_features, ys)

In [8]:
all_features,ys = extract_all(train)

In [9]:
pipe = Pipeline([
    ('vectorizer', DictVectorizer()),
    ('classifier', MultinomialNB())
])
cv = cross_validate(pipe, all_features, ys, scoring='f1')

In [10]:
cv



{'fit_time': array([0.04539394, 0.04273796, 0.04250598]),
 'score_time': array([0.01965213, 0.01928902, 0.01919818]),
 'test_score': array([0.88038278, 0.88305489, 0.88361045]),
 'train_score': array([0.8973747 , 0.89663462, 0.9031477 ])}

In [11]:
pipe.fit(all_features, ys)
df = pd.DataFrame(all_features)
df = df.assign(label = ys)
df = df.assign(predicted = pipe.predict(all_features))

In [12]:
df[(df['label'] == 1) & (df['predicted'] == 0)]

Unnamed: 0,lemma,lemma+1,lemma-1,pos,pos+1,pos-1,text,text+1,text-1,label,predicted
30,biologist,NONE,every,NN,NONE,DET,biologist,NONE,every,1,0
272,with,',people,IN,PUNCT,NOUN,with,',people,1,0
292,attendee,name,for,NNS,VERB,ADP,attendees,named,for,1,0
642,click,5,user,VBD,NUM,NOUN,clicked,5,users,1,0
855,those,who,find,DT,NOUN,VERB,those,who,find,1,0
880,administrative,generalist,all,NNP,PROPN,DET,Administrative,Generalist,all,1,0
1944,vida,NONE,find,FW,NONE,VERB,vida,NONE,find,1,0
2349,mention,',who,VBD,PUNCT,NOUN,mentioned,',Who,1,0
4092,with,"""",someone,IN,PUNCT,NOUN,with,"""",someone,1,0
4446,rgarcia@optonline.net,NONE,find,.,NONE,VERB,rgarcia@optonline.net,NONE,find,1,0


In [57]:
test_features, test_ys = extract(json.load(open('labled_tokenized.train.json')))

In [58]:
test_predicted = pipe.predict(test_features)

In [60]:
f1_score(test_ys, test_predicted)

0.8997594226142742

In [13]:
from sklearn.externals import joblib
joblib.dump(pipe, 'splitter.pkl')

['splitter.pkl']

### Organize into tree

In [75]:
def split(cls, sent):
    tokens = nlp(sent)
    features = extract(tokens)
    stops = cls.predict(features)
    parts = []
    part = []
    for token,stop in zip(tokens, stops):
        if stop:
            parts.append(part)
            parts.append(token)
            part = []
        else:
            part.append(token)
    parts.append(part)
    return parts

split(pipe, "Find all attendees who clicked on ad about blackout and also liked post 9 times and wrote post with Stacy Vaughn's mention")

[[Find, all, attendees],
 who,
 [clicked, on, ad, about, blackout],
 and,
 [also, liked, post, 9, times],
 and,
 [wrote, post, with, Stacy, Vaughn, 's, mention]]

In [85]:
from sklearn.externals import joblib
spacy_tokenizer = lambda xs: [x.lemma_ for x in xs]
fclf = joblib.load('filter_classifier.pkl')

In [90]:
parts = split(pipe, "Find all attendees who clicked on ad about blackout and also liked post 9 times and wrote post with Stacy Vaughn's mention")

In [95]:
def organize_in_tree_(parts, tree):
    if parts: 
        part = parts[0]
        rest = parts[1:]
        (l,n,r) = tree
        if isinstance(part, list):
            if not l:
                return organize_in_tree_(rest, (part, n, r))
            elif not r:
                return organize_in_tree_(rest, (l,n,part))
            else:
                raise Exception("node in invalid position")
        else:
            if not n:
                return organize_in_tree_(rest, (l, part, r))
            else:
                return organize_in_tree_(rest, (tree, part, None))
    else:
        return tree
        
def organize_in_tree(parts):
    return organize_in_tree_(parts[2:], (None, None, None))

In [96]:
organize_in_tree(parts)

(([clicked, on, ad, about, blackout], and, [also, liked, post, 9, times]),
 and,
 [wrote, post, with, Stacy, Vaughn, 's, mention])