In [1]:
import en_core_web_lg
import pandas as pd
from tqdm import tqdm, tqdm_pandas
import qgrid
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import f1_score, accuracy_score
import json
import spacy
import spacy.tokenizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from kanren import run, eq, membero, var, conde, Relation, facts, fact, unifiable
import collections
import itertools

pd.set_option('display.max_colwidth', 300)

In [2]:
# Shared spaCy pipeline; prep() and normalize() call it to parse phrases.
nlp = en_core_web_lg.load()

In [14]:
def token_match(x):
    """Return True when ``x`` is wrapped in angle brackets, e.g. ``<number>``.

    ``str.startswith`` / ``str.endswith`` compare literal text, so the
    brackets are passed as plain characters rather than regex classes.
    """
    return x.startswith('<') and x.endswith('>')

In [337]:
#nlp.tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab, token_match=token_match )

In [5]:
from sklearn.externals import joblib
# NOTE(review): presumably the pickled model refers to `spacy_tokenizer` by name,
# so it must exist in this namespace before joblib.load — confirm.
spacy_tokenizer = lambda xs: [x.lemma_ for x in xs]
# Load the persisted object from 'filter_classifier.pkl'.
fclf = joblib.load('filter_classifier.pkl')

In [6]:
from sklearn.externals import joblib
# NOTE(review): same import and lambda as the cell that loads `fclf`; presumably
# only needed so the pickle can resolve `spacy_tokenizer` — confirm.
spacy_tokenizer = lambda xs: [x.lemma_ for x in xs]
# Load the persisted object from 'splitter.pkl'.
splitter = joblib.load('splitter.pkl')

In [170]:
def apply_replacements(df):
    """Fill the ``<key>`` placeholders of every phrase with its replacement values.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``phrase`` column and a ``replacements`` column holding a JSON
        object that maps placeholder names to their values.

    Returns
    -------
    list of str
        One substituted phrase per row, in row order.
    """
    filled = []
    for phrase, raw_replacements in zip(df['phrase'], df['replacements']):
        for key, value in json.loads(raw_replacements).items():
            # Literal substitution: re.sub would read regex metacharacters in
            # the key and backslashes in the value as pattern/template syntax.
            phrase = phrase.replace('<' + key + '>', str(value))
        filled.append(phrase)
    return filled

In [8]:
# Load the labelled phrases and add `final`, the phrase with placeholders filled in.
cl = pd.read_csv('dataset/with_labels_1522902786972.csv', usecols=['phrase', 'label', 'replacements'])
cl = cl.assign(final=apply_replacements(cl))
# 70/30 split; the fixed seed makes re-running this cell reproduce the same split.
train, test = train_test_split(cl, test_size=0.3, random_state=42)
#train.to_csv('phrases.arg_ex.train.csv')
#test.to_csv('phrases.arg_ex.test.csv')

In [4]:
# Load the saved train/test splits; the first CSV column is used as the index.
# This rebinds `train` and `test`, replacing any earlier in-memory split.
train = pd.read_csv('phrases.arg_ex.train.csv', index_col=0)
test = pd.read_csv('phrases.arg_ex.test.csv', index_col=0)

In [5]:
# Recompute `final` from phrase + replacements and overwrite the test CSV on disk.
test.assign(final=apply_replacements(test)).to_csv('phrases.arg_ex.test.csv')

#### Preprocessing

In [6]:
# Replace ' -> "
def replace_quotes_to_double_quotes(text):
    """Rewrite single quotes that act as quotation marks into double quotes.

    Two passes: first a quote that follows any character except ``s`` and
    precedes a non-word character or the end of the string, then a quote that
    sits between a non-word character and a word character.
    """
    closing_done = re.sub(r"([^s])'(\W|$)", r'\1"\2', text)
    return re.sub(r"(\W)'(\w)", r'\1"\2', closing_done)

In [7]:
def replace_search_strings(phrases):
    """Swap every double-quoted span for the placeholder ``" guitar "``.

    Parameters
    ----------
    phrases : iterable of str

    Returns
    -------
    (list of str, list of list of dict)
        The rewritten phrases and, per phrase, one dict per quoted span with
        ``match`` (the original quoted text including its quotes) and
        ``start`` / ``end`` (character offsets of the placeholder in the
        rewritten phrase).
    """
    quoted = re.compile(r"\"[^\"]*\"")
    placeholder = '" guitar "'
    final_with_guitar = []
    guitar_replacements = []
    for phrase in phrases:
        # Group 0 is the whole quoted span. The pattern has no capture groups,
        # so indexing the match by a loop counter fails after the first span.
        repl = [{'match': match[0]} for match in quoted.finditer(phrase)]
        phrase = quoted.sub(placeholder, phrase)
        for entry, match in zip(repl, re.finditer(re.escape(placeholder), phrase)):
            entry['start'] = match.start()
            entry['end'] = match.end()
        guitar_replacements.append(repl)
        final_with_guitar.append(phrase)
    return final_with_guitar, guitar_replacements

In [8]:
def prep(df):
    """Return a preprocessed copy of ``df`` ready for the extraction rules.

    The input frame is left untouched, so calling ``prep`` again on the same
    frame gives the same result. Columns of the returned frame:

    * ``orig_final`` - the incoming ``final`` text;
    * ``final`` - that text stripped, quote-normalised, with quoted spans
      replaced by the placeholder, then parsed with ``nlp``;
    * ``grepl`` - the per-row replacement records from ``replace_search_strings``.
    """
    # Work on a copy: writing the new columns into the caller's frame would
    # make a second call read already-processed text as the original.
    df = df.copy()
    df['orig_final'] = df['final'].copy()
    df['final'] = df['final'].apply(lambda x: x.strip()).apply(replace_quotes_to_double_quotes)
    final, grepls = replace_search_strings(df['final'])
    df['final'] = final
    df = df.assign(grepl=grepls)
    df['final'] = df['final'].apply(nlp)
    return df

In [9]:
# Preprocessed train/test frames (parsed `final`, `orig_final`, `grepl`) used by the rule cells.
ptrain = prep(train)
ptest = prep(test)

In [161]:
train['label'].unique()

array(['post', 'poll', 'reply', 'note', 'like', 'profile', 'rate', 'ad',
       'event', 'file', 'view', 'session'], dtype=object)

In [218]:
# Dependency labels for which gather_facts() creates a kanren Relation up front,
# so a rule can reference R[label] even when no token in the phrase carries it.
DEPS = [
    "acl",
    "acomp",
    "advcl",
    "advmod",
    "agent",
    "amod",
    "appos",
    "attr",
    "aux",
    "auxpass",
    "cc",
    "ccomp",
    "compound",
    "complm",
    "conj",
    "cop",
    "csubj",
    "csubjpass",
    "dep",
    "det",
    "dobj",
    "expl",
    "hmod",
    "hyph",
    "infmod",
    "intj",
    "iobj",
    "mark",
    "meta",
    "neg",
    "nmod",
    "nn",
    "npadvmod",
    "nsubj",
    "nsubjpass",
    "num",
    "nummod",
    "number",
    "oprd",
    "obj",
    "obl",
    "parataxis",
    "partmod",
    "pcomp",
    "pobj",
    "poss",
    "possessive",
    "preconj",
    "prep",
    "prt",
    "punct",
    "relcl",
    "quantmod",
    "rcmod",
    "root",
    "xcomp",
    "case",
    "dative"
]

In [10]:
def one_of(R, idx, lst):
    """Goal: the lemma of token ``idx`` (per ``R['LEMMA']``) is a member of ``lst``."""
    lemma = var('l' + str(idx))
    branch = (R['LEMMA'](idx, lemma), membero(lemma, lst))
    return conde(branch)

In [11]:
def gather_inside_outside_quote_facts(R, doc):
    """Add ``insideq`` / ``outsideq`` relations to ``R`` and fill them from ``doc``.

    Each ``"`` token flips the inside/outside state and is recorded in neither
    relation; every other token index goes into the relation for the current state.
    """
    R['insideq'] = Relation('insideq')
    R['outsideq'] = Relation('outsideq')
    in_quotes = False
    for token in doc:
        if token.text == '"':
            in_quotes = not in_quotes
            continue
        target = R['insideq'] if in_quotes else R['outsideq']
        fact(target, token.i)

In [225]:
def gather_facts(doc):
    """Build the kanren fact base describing a parsed phrase.

    Returns a dict ``R`` of relations keyed by name:

    * ``LEMMA``: ``(token index, lemma)``;
    * one relation per POS tag and per entity type: ``token index``;
    * one relation per dependency label, plus ``head``:
      ``(head index, token index)``; a token that is its own head is stored
      under ``root`` with head index ``-1``;
    * ``insideq`` / ``outsideq`` from ``gather_inside_outside_quote_facts``.

    Every label in ``DEPS`` gets a relation even if unused by this phrase.
    """
    R = {'LEMMA': Relation('LEMMA'),
         'root': Relation('root'),
         'head': Relation('head'),
         'PERSON': Relation('PERSON'),
         'PROPN': Relation('PROPN')}
    for rel in DEPS:
        R[rel] = Relation(rel)

    def relation(name):
        # Create relations on demand so a tag or label that was not declared
        # up front (e.g. a dependency label absent from DEPS) cannot KeyError.
        if name not in R:
            R[name] = Relation(name)
        return R[name]

    for tok in doc:
        is_root = tok.head.i == tok.i
        head_i = -1 if is_root else tok.head.i

        facts(R['LEMMA'], (tok.i, tok.lemma_))
        fact(relation(tok.pos_), tok.i)
        facts(relation('root' if is_root else tok.dep_), (head_i, tok.i))
        facts(R['head'], (head_i, tok.i))
        fact(relation(tok.ent_type_), tok.i)

    gather_inside_outside_quote_facts(R, doc)

    return R

In [107]:
# Surface pattern around a number -> comparison template for the extracted value.
quantifiers = {'at most {}': '<={}',
               'at least {}': '>={}',
               '{} or more': '>={}',
               '{} or less': '<={}',
               'more than {}': '>{}',
               'less than {}': '<{}'}
# Spelled-out numbers that are converted to digits.
s2n = {'one': "1", 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6',
       'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10', 'eleven': '11', 'twelve': '12'}
number_regex = re.compile(r'\d+')
def extract_count(doc, replacements, i):
    """Turn the number token ``doc[i]`` into a count expression.

    The text from two tokens before to three tokens after the number is
    searched for a quantifier phrase ("at least N", "N or more", ...). When one
    is found and the token is a digit string or a known number word, the
    matching comparison (e.g. ``>=5``) is returned. Otherwise the result is
    the digit form of a known number word, or the token text unchanged.

    ``replacements`` is unused; it is part of the shared extractor signature.
    """
    match = doc[i].text
    # Clamp both ends of the window: a negative index would wrap to the end
    # of the doc and cut the quantifier out of the searched text.
    first = doc[max(i - 2, 0)]
    last = doc[min(i + 3, len(doc) - 1)]
    window = doc.text[first.idx:last.idx + len(last)]

    number = match if number_regex.match(match) else s2n.get(match)
    if number:
        for phrase, template in quantifiers.items():
            if phrase.format(match) in window:
                return template.format(number)
    return s2n.get(match, match)

In [109]:
def extract_person(doc, replacements, i):
    """Return the text spanning ``doc[i]`` and its ``compound`` children.

    The span runs from the first to the last of those tokens in document
    order, sliced out of ``doc.text``. ``replacements`` is unused.
    """
    head = doc[i]
    parts = [child for child in head.children if child.dep_ == 'compound']
    parts.append(head)
    parts.sort(key=lambda tok: tok.i)
    first, last = parts[0], parts[-1]
    return doc.text[first.idx:last.idx + len(last.text)]

In [14]:
# Registry: class label -> OrderedDict of rule name -> rule factory, in registration order.
RULES = {}
def N(rule, cls):
    """Register ``rule`` for class label ``cls`` under the rule's function name."""
    RULES.setdefault(cls, collections.OrderedDict())[rule.__name__] = rule

In [15]:
def post_replies1(R, not_used):
    """Rule for ``post.repliesCount``: the ``nummod`` of a "reply"/"comment" noun.

    The noun must be the ``dobj`` or ``pobj`` of some token, and both the noun
    and the number must still be in ``not_used``.

    Returns ``(locate, extract_count)``. ``locate`` yields the query (the
    variable to bind plus the tokens to mark as used) and the kanren goals.
    """
    def locate():
        action = var('action')
        reply = var('reply')
        post_replies = var('post.repliesCount')
        reply_syns = {'reply', 'comment'}
        return (
            (post_replies, (reply,)),
            [
                conde((R['dobj'](action, reply),),
                      (R['pobj'](action, reply),)),
                one_of(R, reply, reply_syns),
                R['nummod'](reply, post_replies),
                membero(reply, not_used),
                membero(post_replies, not_used)
            ]
        )

    return locate, extract_count

N(post_replies1, 'post')

In [242]:
def post_text1(R, not_used):
    """Rule for ``post.text``: the ``pobj`` of the preposition "about".

    Returns ``(locate, extract_text)``. ``locate`` yields the query (the
    variable to bind plus the tokens to mark as used) and the kanren goals.
    """
    def locate():
        about = var('about')
        post_text = var('post.text')
        return (
            (post_text, (about,)),
            [
                R['LEMMA'](about, 'about'),
                R['pobj'](about, post_text),
                membero(post_text, not_used)
            ]
        )

    return locate, extract_text
    
N(post_text1, 'post')

In [243]:
def post_text2(R, not_used):
    """Rule for ``post.text``: the ``dobj`` of the verb "contain".

    Returns ``(locate, extract_text)``. ``locate`` yields the query (the
    variable to bind plus the tokens to mark as used) and the kanren goals.
    """
    def locate():
        action = var('action')
        post_text = var('post.text')
        return (
            (post_text, (action,)),
            [
                R['LEMMA'](action, 'contain'),
                R['dobj'](action, post_text),
                membero(post_text, not_used)
            ]
        )

    return locate, extract_text
N(post_text2, 'post')

In [244]:
def post_text3(R, not_used):
    """Rule for ``post.text``: a quoted ``pobj`` of an unquoted "with".

    The preposition must lie outside quotes and its object inside quotes;
    both must still be in ``not_used``.

    Returns ``(locate, extract_text)``.
    """
    def locate():
        post_text = var('post.text')
        prep = var('prep')
        return (
            (post_text, (prep,)),
            [
                one_of(R, prep, {'with'}),
                R['pobj'](prep, post_text),
                R['outsideq'](prep),
                R['insideq'](post_text),
                membero(prep, not_used),
                membero(post_text, not_used)
            ]
        )

    return locate, extract_text
N(post_text3, 'post')

In [245]:
def post_text4(R, not_used):
    """Rule for ``post.text``: the ``oprd`` of "contain" in a clause modifying a post noun.

    "contain" must be attached to "message"/"content"/"post" as ``relcl`` or
    ``acl``. Both the verb and the bound token are reported as used.

    Returns ``(locate, extract_text)``.
    """
    def locate():
        post_text = var('post.text')
        post = var('post')
        post_syns = {'message', 'content', 'post'}
        contain = var('contain')
        return (
            (post_text, (contain, post_text)),
            [
                one_of(R, post, post_syns),
                R['LEMMA'](contain, 'contain'),
                conde((R['relcl'](post, contain),),
                      (R['acl'](post, contain),)),
                R['oprd'](contain, post_text),
                membero(contain, not_used),
                membero(post_text, not_used)
            ]
        )

    return locate, extract_text
N(post_text4, 'post')

In [246]:
def post_likes1(R, not_used):
    """Rule for ``post.likes_count``: the ``nummod`` of "like" used as object of "with".

    Returns ``(locate, extract_count)``. ``locate`` yields the query (the
    variable to bind plus the tokens to mark as used) and the kanren goals.
    """
    def locate():
        with_prep = var('with')
        with_what = var('with_what')
        post_likes = var('post.likes_count')
        return (
            (post_likes, (with_what,)),
            [
                R['LEMMA'](with_prep, 'with'),
                R['pobj'](with_prep, with_what),
                R['LEMMA'](with_what, 'like'),
                R['nummod'](with_what, post_likes),
                membero(post_likes, not_used),
                membero(with_what, not_used)
            ]
        )

    return locate, extract_count
N(post_likes1, 'post')

In [247]:
def post_likes2(R, not_used):
    """Rule for ``post.likes_count``: the ``nummod`` of an ``npadvmod`` of "like".

    Returns ``(locate, extract_count)``. ``locate`` yields the query (the
    variable to bind plus the tokens to mark as used) and the kanren goals.
    """
    def locate():
        like_action = var('like')
        times = var('times')
        post_likes = var('post.likes_count')
        return (
            (post_likes, (like_action, times)),
            [
                R['LEMMA'](like_action, 'like'),
                R['npadvmod'](like_action, times),
                R['nummod'](times, post_likes),
                membero(like_action, not_used),
                membero(times, not_used),
                membero(post_likes, not_used)
            ]
        )

    return locate, extract_count
N(post_likes2, 'post')

In [248]:
def post_mention1(R, not_used):
    """Rule for ``post.mention``: the ``dobj`` of the lemma "mention".

    Returns ``(locate, extract_person)``.
    """
    def locate():
        verb = var('mention')
        target = var('post.mention')
        query = (target, (verb,))
        goals = [
            R['LEMMA'](verb, 'mention'),
            R['dobj'](verb, target),
            membero(verb, not_used),
            membero(target, not_used),
        ]
        return query, goals

    return locate, extract_person
N(post_mention1, 'post')

In [249]:
def post_mention2(R, not_used):
    """Rule for ``post.mention``: the ``poss`` (possessor) of the lemma "mention".

    Returns ``(locate, extract_person)``.
    """
    def locate():
        noun = var('mention')
        target = var('post.mention')
        query = (target, (noun,))
        goals = [
            R['LEMMA'](noun, 'mention'),
            R['poss'](noun, target),
            membero(noun, not_used),
            membero(target, not_used),
        ]
        return query, goals

    return locate, extract_person
N(post_mention2, 'post')

In [250]:
def post_mention3(R, not_used):
    """Rule for ``post.mention``: the ``pobj`` of "of" attached to "mention".

    Returns ``(locate, extract_person)``.
    """
    def locate():
        noun = var('mention')
        of_prep = var('prep')
        target = var('post.mention')
        query = (target, (noun, of_prep))
        goals = [
            one_of(R, noun, {'mention'}),
            R['prep'](noun, of_prep),
            one_of(R, of_prep, {'of'}),
            R['pobj'](of_prep, target),
            membero(noun, not_used),
            membero(target, not_used),
            membero(of_prep, not_used),
        ]
        return query, goals

    return locate, extract_person
N(post_mention3, 'post')

In [251]:
def post_sentiment1(R, not_used):
    """Rule for ``post.sentiment``: an unused ``amod`` of the lemma "post".

    Returns ``(locate, extract)``, where ``extract`` yields the token text.
    """
    def locate():
        sentiment_tok = var('post.sentiment')
        post_tok = var('post')
        query = (sentiment_tok, ())
        goals = [
            R['LEMMA'](post_tok, 'post'),
            R['amod'](post_tok, sentiment_tok),
            membero(sentiment_tok, not_used),
        ]
        return query, goals

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(post_sentiment1, 'post')

In [252]:
def post_sentiment2(R, not_used):
    """Rule for ``post.sentiment``: "positive"/"negative" as ``amod`` of "sentiment".

    Returns ``(locate, extract)``, where ``extract`` yields the token text.
    """
    def locate():
        polarity = var('post.sentiment')
        noun = var('sentiment')
        polarity_words = {'positive', 'negative'}
        query = (polarity, (noun,))
        goals = [
            R['LEMMA'](noun, 'sentiment'),
            R['amod'](noun, polarity),
            one_of(R, polarity, polarity_words),
            membero(polarity, not_used),
        ]
        return query, goals

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(post_sentiment2, 'post')

In [253]:
def post_count1(R, not_used):
    """Rule for ``post.count``: an unused ``nummod`` of "message"/"content"/"post".

    Returns ``(locate, extract_count)``. No extra tokens are marked as used.
    """
    def locate():
        post_count = var('post.count')
        post = var('post')
        post_syns = {'message', 'content', 'post'}
        return (
            (post_count, ()),
            [
                one_of(R, post, post_syns),
                R['nummod'](post, post_count),
                membero(post_count, not_used)
            ]
        )

    return locate, extract_count
N(post_count1, 'post')

### Likes rules

In [254]:
def like_count1(R, not_used):
    """Rule for ``like.count``: an unused ``nummod`` of the lemma "like".

    Returns ``(locate, extract_count)``.
    """
    def locate():
        count = var('like.count')
        like_tok = var('like')
        query = (count, ())
        goals = [
            one_of(R, like_tok, {'like'}),
            R['nummod'](like_tok, count),
            membero(count, not_used),
        ]
        return query, goals

    return locate, extract_count
    
N(like_count1, 'like')

In [255]:
def like_count2(R, not_used):
    """Rule for ``like.count``: the ``nummod`` of an ``npadvmod`` of "like".

    Returns ``(locate, extract_count)``.
    """
    def locate():
        like_tok = var('like')
        times_tok = var('times')
        count = var('like.count')
        query = (count, (like_tok, times_tok))
        goals = [
            R['LEMMA'](like_tok, 'like'),
            R['npadvmod'](like_tok, times_tok),
            R['nummod'](times_tok, count),
            membero(like_tok, not_used),
            membero(times_tok, not_used),
            membero(count, not_used),
        ]
        return query, goals

    return locate, extract_count
N(like_count2, 'like')

In [256]:
def like_count3(R, not_used):
    """Rule for ``like.count``: the ``nummod`` of a post noun that is ``nsubj`` of "like".

    Returns ``(locate, extract_count)``.
    """
    def locate():
        post_tok = var('post')
        post_words = {'message', 'content', 'post'}
        count = var('like.count')
        like_tok = var('like')
        query = (count, ())
        goals = [
            one_of(R, like_tok, {'like'}),
            one_of(R, post_tok, post_words),
            R['nsubj'](like_tok, post_tok),
            R['nummod'](post_tok, count),
            membero(count, not_used),
        ]
        return query, goals

    return locate, extract_count
    
N(like_count3, 'like')

In [257]:
def like_count4(R, not_used):
    """Rule for ``like.count``: an unused ``nummod`` of "message"/"content"/"post".

    Returns ``(locate, extract_count)``.
    """
    def locate():
        post = var('post')
        post_syns = {'message', 'content', 'post'}
        like_count = var('like.count')
        return (
            (like_count, ()),
            [
                # The goals tying the post noun to a "like" verb (as its dobj)
                # are disabled, so any counted post noun matches.
                one_of(R, post, post_syns),
                R['nummod'](post, like_count),
                membero(like_count, not_used)
            ]
        )

    return locate, extract_count
    
N(like_count4, 'like')

In [258]:
def extract_text(doc, replacements, i):
    """Return the original quoted text that token ``doc[i]`` stands for.

    ``replacements`` holds dicts with ``match`` and character offsets
    ``start`` / ``end``, so the token is located by its character offset
    (``doc[i].idx``), not by its token index. Lookup order:

    1. the replacement whose span contains the token;
    2. otherwise the first replacement that starts at or after the token
       (covers a matched head word directly before the quote, e.g.
       ``word "..."``);
    3. otherwise the token's own text.
    """
    pos = doc[i].idx
    containing = [repl for repl in replacements if repl['start'] <= pos < repl['end']]
    if containing:
        return containing[0]['match']
    following = [repl for repl in replacements if repl['start'] >= pos]
    if following:
        return following[0]['match']
    return doc[i].text
    

In [259]:
def like_post_text1(R, not_used):
    """Rule for ``like.postText``: the ``pobj`` of the preposition "about".

    Returns ``(locate, extract_text)``.
    """
    def locate():
        about_tok = var('about')
        text_tok = var('like.postText')
        query = (text_tok, (about_tok,))
        goals = [
            R['LEMMA'](about_tok, 'about'),
            R['pobj'](about_tok, text_tok),
            membero(text_tok, not_used),
        ]
        return query, goals

    return locate, extract_text
    
N(like_post_text1, 'like')

In [260]:
def like_post_text2(R, not_used):
    """Rule for ``like.postText``: the ``dobj`` of the verb "contain".

    Returns ``(locate, extract_text)``.
    """
    def locate():
        verb = var('action')
        text_tok = var('like.postText')
        query = (text_tok, (verb,))
        goals = [
            R['LEMMA'](verb, 'contain'),
            R['dobj'](verb, text_tok),
            membero(text_tok, not_used),
        ]
        return query, goals

    return locate, extract_text
N(like_post_text2, 'like')

In [261]:
def like_post_text3(R, not_used):
    """Rule for ``like.postText``: a quoted ``pobj`` of an unquoted "with".

    The preposition must lie outside quotes and its object inside quotes;
    both must still be in ``not_used``.

    Returns ``(locate, extract_text)``.
    """
    def locate():
        post_text = var('like.postText')
        prep = var('prep')
        return (
            (post_text, (prep,)),
            [
                one_of(R, prep, {'with'}),
                R['pobj'](prep, post_text),
                R['outsideq'](prep),
                R['insideq'](post_text),
                membero(prep, not_used),
                membero(post_text, not_used)
            ]
        )

    return locate, extract_text
N(like_post_text3, 'like')

In [262]:
def like_post_text4(R, not_used):
    """Rule for ``like.postText``: the ``oprd`` of "contain" in a clause modifying a post noun.

    "contain" must be attached to "message"/"content"/"post" as ``relcl`` or
    ``acl``. Both the verb and the bound token are reported as used.

    Returns ``(locate, extract_text)``.
    """
    def locate():
        post_text = var('like.postText')
        post = var('post')
        post_syns = {'message', 'content', 'post'}
        contain = var('contain')
        return (
            (post_text, (contain, post_text)),
            [
                one_of(R, post, post_syns),
                R['LEMMA'](contain, 'contain'),
                conde((R['relcl'](post, contain),),
                      (R['acl'](post, contain),)),
                R['oprd'](contain, post_text),
                membero(contain, not_used),
                membero(post_text, not_used)
            ]
        )

    return locate, extract_text
N(like_post_text4, 'like')

In [263]:
def like_post_sentiment1(R, not_used):
    """Rule for ``like.postSentiment``: "positive"/"negative" as ``amod`` of "post".

    Returns ``(locate, extract)``, where ``extract`` yields the token text.
    """
    def locate():
        polarity = var('like.postSentiment')
        post_tok = var('post')
        polarity_words = {'positive', 'negative'}
        query = (polarity, ())
        goals = [
            R['LEMMA'](post_tok, 'post'),
            R['amod'](post_tok, polarity),
            one_of(R, polarity, polarity_words),
            membero(polarity, not_used),
        ]
        return query, goals

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(like_post_sentiment1, 'like')

In [299]:
## This rule looks redundant: the first sentiment rule appears to cover the same case.
def like_post_sentiment2(R, not_used):
    """Rule for ``like.postSentiment``: an unused ``amod`` of the lemma "sentiment".

    Returns ``(locate, extract)``, where ``extract`` yields the token text.
    """
    def locate():
        polarity = var('like.postSentiment')
        noun = var('sentiment')
        query = (polarity, (noun,))
        goals = [
            R['LEMMA'](noun, 'sentiment'),
            R['amod'](noun, polarity),
            membero(polarity, not_used),
        ]
        return query, goals

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(like_post_sentiment2, 'like')

In [265]:
def like_post_author1(R, not_used):
    """Rule for ``like.postAuthor``: the ``pobj`` of any preposition attached to a post noun.

    Returns ``(locate, extract_person)``.
    """
    def locate():
        author = var('like.postAuthor')
        post_tok = var('post')
        post_words = {'message', 'content', 'post'}
        preposition = var('adp')
        query = (author, (preposition,))
        goals = [
            one_of(R, post_tok, post_words),
            R['prep'](post_tok, preposition),
            R['pobj'](preposition, author),
            membero(preposition, not_used),
            membero(author, not_used),
        ]
        return query, goals

    return locate, extract_person
    
N(like_post_author1, 'like')

In [266]:
def like_post_author2(R, not_used):
    """Rule for ``like.postAuthor``: the ``pobj`` of the agent "by" of "write"/"create".

    Returns ``(locate, extract_person)``.
    """
    def locate():
        verb = var('action')
        verb_words = {'write', 'create'}
        by_tok = var('adp')
        author = var('like.postAuthor')
        query = (author, (verb, by_tok))
        goals = [
            one_of(R, verb, verb_words),
            R['agent'](verb, by_tok),
            one_of(R, by_tok, {'by'}),
            R['pobj'](by_tok, author),
            membero(verb, not_used),
            membero(by_tok, not_used),
            membero(author, not_used),
        ]
        return query, goals

    return locate, extract_person

N(like_post_author2, 'like')

In [267]:
def like_post_author3(R, not_used):
    """Rule for ``like.postAuthor``: the ``poss`` (possessor) of a post noun.

    Does not consult ``not_used``. Returns ``(locate, extract_person)``.
    """
    def locate():
        author = var('like.postAuthor')
        post_tok = var('post')
        post_words = {'message', 'content', 'post'}
        query = (author, ())
        goals = [
            one_of(R, post_tok, post_words),
            R['poss'](post_tok, author),
        ]
        return query, goals

    return locate, extract_person

N(like_post_author3, 'like')

### Reply rules

In [268]:
# (replied, to, Hoe, Griffith, 's, post)
def reply_post_author1(R, not_used):
    """Rule for ``reply.postAuthor``: the ``poss`` (possessor) of a post noun.

    Does not consult ``not_used``. Returns ``(locate, extract_person)``.
    """
    def locate():
        author = var('reply.postAuthor')
        post_tok = var('post')
        post_words = {'message', 'content', 'post'}
        query = (author, ())
        goals = [
            one_of(R, post_tok, post_words),
            R['poss'](post_tok, author),
        ]
        return query, goals

    return locate, extract_person

N(reply_post_author1, 'reply')

In [269]:
# (replied, to, post, written, by, Andy, Chandler)
def reply_post_author2(R, not_used):
    """Rule for ``reply.postAuthor``: the ``pobj`` of the agent "by" of "write"/"create".

    Returns ``(locate, extract_person)``.
    """
    def locate():
        verb = var('action')
        verb_words = {'write', 'create'}
        by_tok = var('adp')
        author = var('reply.postAuthor')
        query = (author, (verb, by_tok))
        goals = [
            one_of(R, verb, verb_words),
            R['agent'](verb, by_tok),
            one_of(R, by_tok, {'by'}),
            R['pobj'](by_tok, author),
            membero(verb, not_used),
            membero(by_tok, not_used),
            membero(author, not_used),
        ]
        return query, goals

    return locate, extract_person

N(reply_post_author2, 'reply')

In [270]:
# (replied, to, a, post, by, Kristin, Hansen)
def reply_post_author3(R, not_used):
    """Rule for ``reply.postAuthor``: the ``pobj`` of "by" attached to a post noun.

    Returns ``(locate, extract_person)``.
    """
    def locate():
        author = var('reply.postAuthor')
        post_tok = var('post')
        post_words = {'message', 'content', 'post'}
        by_tok = var('adp')
        query = (author, (by_tok,))
        goals = [
            one_of(R, post_tok, post_words),
            R['prep'](post_tok, by_tok),
            one_of(R, by_tok, {'by'}),
            R['pobj'](by_tok, author),
            membero(by_tok, not_used),
            membero(author, not_used),
        ]
        return query, goals

    return locate, extract_person
    
N(reply_post_author3, 'reply')

In [271]:
# (replied, to, Sherman, Cooper)
def reply_post_author4(R, not_used):
    """Rule for ``reply.postAuthor``: a ``PROPN`` that is ``pobj`` of "to" under "reply"/"comment".

    Returns ``(locate, extract_person)``.
    """
    def locate():
        author = var('reply.postAuthor')
        verb = var('action')
        to_tok = var('adp')
        query = (author, (to_tok,))
        goals = [
            one_of(R, verb, {'reply', 'comment'}),
            R['prep'](verb, to_tok),
            one_of(R, to_tok, {'to'}),
            R['pobj'](to_tok, author),
            R['PROPN'](author),
            membero(to_tok, not_used),
        ]
        return query, goals

    return locate, extract_person

N(reply_post_author4, 'reply')

In [272]:
# (replied, to, posts, about, ", guitar, ")
def reply_post_text1(R, not_used):
    """Rule for ``reply.postText``: the ``pobj`` of "about" attached to a post noun.

    Returns ``(locate, extract_text)``, using the shared text extractor like
    the other ``*_text`` rules.
    """
    def locate():
        post = var('post')
        post_syns = {'message', 'content', 'post'}
        about = var('about')
        post_text = var('reply.postText')
        return (
            (post_text, (about,)),
            [
                one_of(R, post, post_syns),
                R['prep'](post, about),
                one_of(R, about, {'about'}),
                R['pobj'](about, post_text),
                membero(post_text, not_used)
            ]
        )

    return locate, extract_text
    
N(reply_post_text1, 'reply')

In [295]:
# (replied, to, the, post, that, contains, word, ", guitar, "
def reply_post_text2(R, not_used):
    """Rule for ``reply.postText``: the ``dobj`` of "contain" in a clause modifying a post noun.

    The verb must hang off the post noun as ``relcl`` or ``acl`` and still be
    unused. Returns ``(locate, extract_text)``.
    """
    def locate():
        post_tok = var('post')
        post_words = {'message', 'content', 'post'}
        verb = var('action')
        verb_words = {'contain'}
        text_tok = var('reply.postText')
        query = (text_tok, (verb,))
        goals = [
            one_of(R, post_tok, post_words),
            conde((R['relcl'](post_tok, verb),),
                  (R['acl'](post_tok, verb),)),
            one_of(R, verb, verb_words),
            R['dobj'](verb, text_tok),
            membero(verb, not_used),
        ]
        return query, goals

    return locate, extract_text

N(reply_post_text2, 'reply')

In [286]:
def reply_post_text3(R, not_used):
    """Rule for ``reply.postText``: a quoted ``pobj`` of an unquoted "with" on a post noun.

    The preposition must lie outside quotes and its object inside quotes;
    both must still be in ``not_used``.

    Returns ``(locate, extract_text)``.
    """
    def locate():
        post = var('post')
        post_syns = {'message', 'content', 'post'}
        post_text = var('reply.postText')
        adp = var('adp')
        return (
            # Report the matched preposition as used. The bare name `prep`
            # here would resolve to the module-level prep() function instead.
            (post_text, (adp,)),
            [
                one_of(R, post, post_syns),
                R['prep'](post, adp),
                one_of(R, adp, {'with'}),
                R['pobj'](adp, post_text),
                R['outsideq'](adp),
                R['insideq'](post_text),
                membero(adp, not_used),
                membero(post_text, not_used)
            ]
        )

    return locate, extract_text
N(reply_post_text3, 'reply')

In [298]:
def reply_post_sentiment1(R, not_used):
    """Rule for ``reply.postSentiment``: "positive"/"negative" as ``amod`` of "post".

    Returns ``(locate, extract)``, where ``extract`` yields the token text.
    """
    def locate():
        polarity = var('reply.postSentiment')
        post_tok = var('post')
        polarity_words = {'positive', 'negative'}
        query = (polarity, ())
        goals = [
            R['LEMMA'](post_tok, 'post'),
            R['amod'](post_tok, polarity),
            one_of(R, polarity, polarity_words),
            membero(polarity, not_used),
        ]
        return query, goals

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(reply_post_sentiment1, 'reply')

In [305]:
def reply_post_sentiment2(R, not_used):
    """Rule for ``reply.postSentiment``: post noun + "with" + polarity "sentiment".

    Binds the "positive"/"negative" ``amod`` of a "sentiment" noun that is the
    ``pobj`` of "with" attached to a post noun; the "sentiment" noun must be
    unused. Returns ``(locate, extract)``, where ``extract`` yields the token text.
    """
    def locate():
        polarity = var('reply.postSentiment')
        post_tok = var('post')
        post_words = {'message', 'content', 'post'}
        noun = var('sentiment')
        polarity_words = {'positive', 'negative'}
        with_tok = var('adp')
        query = (polarity, ())
        goals = [
            one_of(R, post_tok, post_words),
            R['prep'](post_tok, with_tok),
            one_of(R, with_tok, {'with'}),
            R['pobj'](with_tok, noun),
            one_of(R, noun, {'sentiment'}),
            R['amod'](noun, polarity),
            one_of(R, polarity, polarity_words),
            membero(noun, not_used),
        ]
        return query, goals

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(reply_post_sentiment2, 'reply')

In [287]:
def run_rule(R, not_used, locate_fn):
    """Solve one rule's query and split the first solution into bindings and used tokens.

    ``locate_fn()`` yields ``(query, goals)``. In the first solution, the
    leading non-tuple values are paired with the query items and keyed by
    ``variable.token``; the first tuple value is returned as the used tokens.

    Returns ``(bindings, used)``, or ``({}, [])`` when there is no solution.
    ``R`` and ``not_used`` are not read here.
    """
    query, goals = locate_fn()
    solutions = run(1, query, *goals)
    if not solutions:
        return {}, []

    def is_plain(item):
        return not isinstance(item, tuple)

    solution = solutions[0]
    values = itertools.takewhile(is_plain, solution)
    used = list(itertools.dropwhile(is_plain, solution))[0]
    return {variable.token: val for variable, val in zip(query, values)}, used

In [288]:
def normalize(doc):
    """Re-parse a verb-initial doc with "who " prepended; return any other doc unchanged."""
    if doc[0].pos_ != 'VERB':
        return doc
    return nlp('who ' + doc.text)

In [208]:
def run_rules_on_doc(doc, grepls, cls, rules=RULES, verbose=False):
    """Apply every rule registered for ``cls`` to one parsed phrase.

    Parameters
    ----------
    doc : parsed phrase (indexable by token position, with ``len``)
    grepls : replacement records for this phrase, passed on to the extractors
    cls : class label selecting the rule set in ``rules``
    rules : registry mapping class label -> ordered mapping of name -> rule
    verbose : print located bindings and applied rules

    Returns
    -------
    dict
        Variable name -> extracted text. When several rules bind the same
        variable, the value from the earliest rule is kept.
    """
    #doc = normalize(doc)
    R = gather_facts(doc)
    bindings = {}
    not_used = set(range(len(doc)))
    for name, rule in rules.get(cls, {}).items():
        locate_fn, extract_fn = rule(R, not_used)
        # `located` maps variable name -> token index; keep it separate from
        # the extracted text so the indices can be marked as used below.
        located, used = run_rule(R, not_used, locate_fn)
        if verbose:
            print("New binding:", located)
        new_bindings = {variable: extract_fn(doc, grepls, val) for variable, val in located.items()}
        if new_bindings and verbose:
            print('Applied rule', name, new_bindings)
        # Earlier bindings overwrite the new ones, i.e. the first rule wins.
        new_bindings.update(bindings)
        bindings = new_bindings
        # Remove token indices, not extracted strings: subtracting the text
        # values would leave every bound token available to later rules.
        not_used = not_used - (set(used) | set(located.values()))
    return bindings

In [209]:
def run_rules(docs, grepls_col, classes):
    """Run ``run_rules_on_doc`` over parallel docs, replacement lists and class labels.

    Returns one bindings dict per input triple, in order.
    """
    return [
        run_rules_on_doc(doc, grepls, cls)
        for doc, grepls, cls in zip(docs, grepls_col, classes)
    ]

In [301]:
# Inspect one training example: its row, the parsed doc, the dependency render,
# and text / head / dep / POS / entity type for every token.
idx = 238
dc1 = ptrain.loc[idx]['final']
#dc1 = nlp("liked a post with the keyword big")
r1 = ptrain.loc[idx]['grepl']
print(ptrain.loc[idx])
print(dc1)
spacy.displacy.render(dc1, jupyter=True)
for token in dc1:
    print("{} {} {} {} {}".format(token.text, token.head.text, token.dep_, token.pos_, token.ent_type_))

phrase               replied to a post with <sentiment> sentiment
replacements                            {"sentiment": "negative"}
label                                                       reply
ex                                                             {}
final           (replied, to, a, post, with, negative, sentiment)
orig_final              replied to a post with negative sentiment
grepl                                                          []
Name: 238, dtype: object
replied to a post with negative sentiment


replied replied ROOT VERB 
to replied prep ADP 
a post det DET 
post to pobj NOUN 
with post prep ADP 
negative sentiment amod ADJ 
sentiment with pobj NOUN 


In [289]:
run_rules_on_doc(dc1, r1, 'reply', verbose=True)

New binding: {}
New binding: {}
New binding: {}
New binding: {}
New binding: {}
New binding: {}
New binding: {'reply.postText': 8}
Applied rule reply_post_text3 {'reply.postText': '"biplane"'}


{'reply.postText': '"biplane"'}

In [306]:
# Show the parsed phrase next to the rule output for every training row of one class.
cls = 'reply'
posts = ptrain[ptrain['label'] == cls]
posts[['final']].assign(_=run_rules(posts['final'], posts['grepl'], [cls]*len(posts)))

Unnamed: 0,final,_
79,"(replied, to, Hoe, Griffith, 's, post)",{'reply.postAuthor': 'Griffith'}
417,"(replied, to, posts, about, "", guitar, "")","{'reply.postText': '""concussion""'}"
329,"(replied, to, a, post, by, Kristin, Hansen)",{'reply.postAuthor': 'Kristin Hansen'}
143,"(replied, to, post, written, by, Andy, Chandler)",{'reply.postAuthor': 'Andy Chandler'}
128,"(replied, to, the, post, that, contains, word, "", guitar, "")","{'reply.postText': '""blackout""'}"
514,"(post, with, 9, replies)",{}
230,"(left, a, comment, for, the, post, with, "", guitar, "")","{'reply.postText': '""biplane""'}"
219,"(replied, to, the, post, contains, "", guitar, "")",{}
99,"(replied, to, post, by, Sherman, Cooper)",{'reply.postAuthor': 'Sherman Cooper'}
422,"(replied, to, post, with, "", guitar, "", in, its, text)","{'reply.postText': '""concussion""'}"


In [817]:
#prep(test)

# Build a table of rule extractions for the test split, serialised as JSON.
# NOTE(review): this cell reads test['final'], test['grepl'] and
# test['orig_final'] while the prep() call above is commented out, so it
# relies on `test` already carrying those columns — confirm on a fresh kernel.
tc = test[['phrase', 'replacements', 'label']].copy()
# NOTE(review): other cells pass the labels as a third argument to
# run_rules — confirm that argument is optional.
ex = run_rules(test['final'], test['grepl'])
exdf = pd.DataFrame({'col': ex, 'label': tc['label']})
exdf['col'] = exdf['col'].apply(json.dumps)
# Keep extractions only for the 'post' class; every other row gets an empty
# JSON object. A single .loc assignment is used because the chained form
# (exdf['col'][mask] = ...) raises SettingWithCopyWarning and does not update
# exdf at all under pandas copy-on-write.
exdf.loc[exdf['label'] != 'post', 'col'] = '{}'
tc.assign(ex=exdf['col'], final=test['orig_final'])#.to_csv('phrases.arg_ex.test.csv')

In [187]:
# Reload the saved test split (first CSV column is the index) and run it
# through prep() to obtain `ptest`, the frame passed to evaluate_for_class.
test = pd.read_csv('phrases.arg_ex.test.csv', index_col=0)
ptest = prep(test)
def evaluate_for_class(test, cls):
    """Return the share of rows labelled `cls` whose rule output matches the stored one.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with 'final', 'grepl', 'label' and 'ex' columns. 'ex' holds the
        expected extraction as a JSON string.
    cls : str
        Label value selecting the rows to score.

    Returns
    -------
    float
        Fraction of `cls` rows for which the run_rules result equals
        json.loads(ex). NaN when `test` contains no row with that label.
    """
    in_class = test['label'] == cls
    n_in_class = int(in_class.sum())
    if n_in_class == 0:
        # Nothing to score: return NaN explicitly instead of dividing 0 by 0.
        return float('nan')

    # Rules are run over the whole frame, each row with its own label.
    ex = run_rules(test['final'], test['grepl'], test['label'])
    exdf = pd.DataFrame({'col': ex, 'label': test['label']})
    # Decode only the rows being scored, so the 'ex' values of other classes
    # are never parsed.
    expected = test.loc[in_class, 'ex'].apply(json.loads)
    predicted = exdf.loc[exdf['label'] == cls, 'col']
    return np.sum(expected == predicted) / n_in_class

In [274]:
# Share of 'like' test rows whose rule output equals the stored 'ex' value.
evaluate_for_class(ptest, 'like')

0.8461538461538461

In [190]:
# Share of 'post' test rows whose rule output equals the stored 'ex' value.
evaluate_for_class(ptest, 'post')

0.9583333333333334

In [172]:
# Re-read the saved test split and rebuild its 'final' column, then rerun the
# rules on every row and serialise each extraction as JSON.
test_wo_like = (
    pd.read_csv('phrases.arg_ex.test.csv', index_col=0)
    # assign() calls apply_replacements with the frame being built, so 'final'
    # is derived from these rows and not from whatever the global `test`
    # happens to hold in the kernel.
    .assign(final=apply_replacements)
)
ptest = prep(test_wo_like)
ex = run_rules(ptest['final'], ptest['grepl'], ptest['label'])
exdf = pd.DataFrame({'col': ex, 'label': ptest['label']})
exdf['col'] = exdf['col'].apply(json.dumps)
#exdf['col'][(exdf['label'] == 'post')] = test_wo_like[test_wo_like['label'] == 'post']['ex']


In [176]:
# Overwrite the stored 'ex' value with the freshly extracted JSON, but only
# for rows of the 'like' and 'post' classes; all other rows keep their 'ex'.
for updated_label in ('like', 'post'):
    ptest.loc[ptest['label'] == updated_label, 'ex'] = exdf.loc[exdf['label'] == updated_label, 'col']

In [183]:
# Preview the three columns selected for export; the to_csv call is left
# commented out, so nothing is written to disk here.
ptest[['orig_final', 'ex', 'label']]#.to_csv('phrases.arg_ex.test.csv')

In [185]:
# Display the prepared test frame for inspection.
ptest

Unnamed: 0,phrase,replacements,label,ex,final,orig_final,grepl
530,clicked on <number> ads,"{""number"": ""4""}",ad,{},"(clicked, on, 4, ads)",clicked on 4 ads,[]
109,<first-name> as first name,"{""first-name"": ""Chante""}",profile,{},"(Chante, as, first, name)",Chante as first name,[]
81,left a note on page <number> times,"{""number"": ""9""}",note,{},"(left, a, note, on, page, 9, times)",left a note on page 9 times,[]
206,located in <locatoin>,"{""location"":""North Korea""}",profile,{},"(located, in, <, locatoin, >)",located in <locatoin>,[]
103,"""<post-word>"" in description","{""post-word"": ""big""}",profile,{},"("", guitar, "", in, description)","""big"" in description","[{'match': '""big""', 'start': 0, 'end': 10}]"
52,replied to <sentiment> sentiment post,"{""sentiment"": ""negative""}",reply,{},"(replied, to, negative, sentiment, post)",replied to negative sentiment post,[]
474,voted in poll,{},poll,{},"(voted, in, poll)",voted in poll,[]
30,<post-word> in bio,"{""post-word"": ""'captive'""}",profile,{},"(', captive, "", in, bio)",'captive' in bio,[]
412,composed a <sentiment> post,"{""sentiment"": ""positive""}",post,"{""post.sentiment"": ""positive""}","(composed, a, positive, post)",composed a positive post,[]
529,attended <number> events,"{""number"": ""7""}",event,{},"(attended, 7, events)",attended 7 events,[]
