In [1]:
import en_core_web_lg
import pandas as pd
from tqdm import tqdm, tqdm_pandas
import qgrid
import re
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import f1_score
import json
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from kanren import run, eq, membero, var, conde, Relation, facts, fact, unifiable
import collections
import itertools

pd.set_option('display.max_colwidth', 300)

In [61]:
# Load the spaCy pipeline shipped in the `en_core_web_lg` package.
nlp = en_core_web_lg.load()

In [58]:
def token_match(x):
    """Return True when `x` is an angle-bracket placeholder such as ``<sentiment>``.

    Intended to be passed to the spaCy tokenizer as its ``token_match``
    callback.

    Args:
        x: Candidate token text.

    Returns:
        bool: True if `x` starts with ``<`` and ends with ``>``.
    """
    # str.startswith/endswith compare literal text, not regex patterns, so the
    # brackets have to be given as plain characters.
    return x.startswith('<') and x.endswith('>')

In [59]:
# Replace the pipeline's tokenizer with one built from only the vocab and
# `token_match`; no prefix, suffix or infix rules are passed here.
nlp.tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab, token_match=token_match )

In [5]:
# NOTE(review): `sklearn.externals.joblib` was deprecated in scikit-learn 0.21
# and removed in 0.23; newer environments need the standalone `joblib` package.
from sklearn.externals import joblib
# NOTE(review): presumably defined under this name because the pickled object
# refers to it when it is loaded -- confirm before removing.
spacy_tokenizer = lambda xs: [x.lemma_ for x in xs]
# joblib.load is pickle-based and can execute arbitrary code: only load
# files from a trusted source.
fclf = joblib.load('filter_classifier.pkl')

In [6]:
# NOTE(review): `sklearn.externals.joblib` was deprecated in scikit-learn 0.21
# and removed in 0.23; newer environments need the standalone `joblib` package.
from sklearn.externals import joblib
# NOTE(review): presumably defined under this name because the pickled object
# refers to it when it is loaded -- confirm before removing.
spacy_tokenizer = lambda xs: [x.lemma_ for x in xs]
# joblib.load is pickle-based and can execute arbitrary code: only load
# files from a trusted source.
splitter = joblib.load('splitter.pkl')

In [7]:
def apply_replacements(df):
    """Fill the ``<name>`` placeholders of every phrase with its replacement value.

    Args:
        df: DataFrame with a ``phrase`` column (text containing ``<key>``
            placeholders) and a ``replacements`` column (a JSON object string
            mapping each key to its value).

    Returns:
        list: One filled-in phrase per row, in row order.
    """
    orig = []
    for phrase, raw_replacements in zip(df['phrase'], df['replacements']):
        for key, value in json.loads(raw_replacements).items():
            # Plain string replacement: the key must not be interpreted as a
            # regex and the value must not be interpreted as a regex template
            # (backslashes, group references). str() keeps numeric JSON values
            # from raising.
            phrase = phrase.replace('<' + key + '>', str(value))
        orig.append(phrase)
    return orig

In [8]:
# Load the labelled phrases and add a `final` column holding each phrase with
# its placeholders filled in.
cl = pd.read_csv('dataset/with_labels_1522902786972.csv', usecols=['phrase', 'label', 'replacements'])
cl = cl.assign(final=apply_replacements(cl))
# No random_state is passed, so this 70/30 split differs on every run.
train, test = train_test_split(cl, test_size=0.3)
# Disabled: would write the split to the CSV files that the next cell reads.
#train.to_csv('phrases.arg_ex.train.csv')
#test.to_csv('phrases.arg_ex.test.csv')

In [639]:
# Reload the persisted split. Both files are read with index_col=0 so the
# saved row index is restored instead of surfacing as an extra "Unnamed: 0"
# column (previously only `train` was read this way).
train = pd.read_csv('phrases.arg_ex.train.csv', index_col=0)
test = pd.read_csv('phrases.arg_ex.test.csv', index_col=0)

#### Preprocessing

In [640]:
# Replace ' -> "
def replace_quotes_to_double_quotes(text):
    """Turn single-quote quotation marks into double quotes.

    A quote counts as *closing* when it is followed by a non-word character or
    the end of the text and is not preceded by ``s`` (so possessives such as
    ``users'`` are left alone). A quote counts as *opening* when it is
    followed by a word character and preceded by a non-word character or the
    start of the text. Apostrophes inside words (``don't``) are untouched.

    Args:
        text: Input string.

    Returns:
        str: `text` with the detected quotation marks replaced by ``"``.
    """
    closed = re.sub(r"([^s])'(\W|$)", r'\1"\2', text)
    # `(^|\W)` rather than `(\W)`: an opening quote that is the first
    # character of the text has nothing in front of it to match.
    return re.sub(r"(^|\W)'(\w)", r'\1"\2', closed)

In [641]:
# Trim surrounding whitespace, then normalise single quotes to double quotes.
train['final'] = train['final'].apply(
    lambda text: replace_quotes_to_double_quotes(text.strip())
)

In [642]:
# Replace every double-quoted span in `final` with the fixed text "guitar"
# (quotes kept) and remember, per row, what was replaced and where the
# stand-in sits in the rewritten text.
guitar_replacements = []
final_with_guitar = []
for _, row in train.iterrows():
    s = row['final']
    # group(0) is the whole quoted span. The pattern has no capture groups,
    # so indexing the match by the loop counter raised IndexError from the
    # second quoted span onwards.
    repl = [{'match': match.group(0)} for match in re.finditer(r"\"[^\"]*\"", s)]
    s = re.sub(r"\"[^\"]*\"", '"guitar"', s)
    # `start`/`end` are character offsets into the rewritten text.
    for entry, match in zip(repl, re.finditer(r'"guitar"', s)):
        entry['start'] = match.start()
        entry['end'] = match.end()
    guitar_replacements.append(repl)
    final_with_guitar.append(s)
train['final'] = final_with_guitar
train['grepl'] = guitar_replacements

In [643]:
# Run the spaCy pipeline on every phrase; the column now holds the objects
# returned by `nlp` instead of strings.
train['final'] = train['final'].apply(nlp)

In [644]:
# Distinct labels present in the training split.
train['label'].unique()

array(['post', 'poll', 'reply', 'note', 'like', 'profile', 'rate', 'ad',
       'event', 'file', 'view', 'session'], dtype=object)

In [645]:
# Dependency labels for which a relation is created up front by gather_facts
# (one Relation per entry), so rules can reference them even when a given
# document contains no such arc.
DEPS = [
    "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr",
    "aux", "auxpass", "cc", "ccomp", "compound", "complm", "conj", "cop",
    "csubj", "csubjpass", "dep", "det", "dobj", "expl", "hmod", "hyph",
    "infmod", "intj", "iobj", "mark", "meta", "neg", "nmod", "nn",
    "npadvmod", "nsubj", "nsubjpass", "num", "nummod", "number", "oprd", "obj",
    "obl", "parataxis", "partmod", "pcomp", "pobj", "poss", "possessive", "preconj",
    "prep", "prt", "punct", "relcl", "quantmod", "rcmod", "root", "xcomp",
    "case",
]

In [240]:
def one_of(R, idx, lst):
    """Build a goal that holds when the lemma of token `idx` is a member of `lst`.

    Args:
        R: Mapping of relation names to relations; only ``R['LEMMA']`` is used.
        idx: Token index, or a logic variable standing for one.
        lst: Collection of accepted lemmas.

    Returns:
        A goal: a `conde` with a single clause, so both the lemma lookup and
        the membership test have to hold.
    """
    lemma = var('l' + str(idx))
    clause = (R['LEMMA'](idx, lemma), membero(lemma, lst))
    return conde(clause)

In [241]:
def gather_inside_outside_quote_facts(R, doc):
    """Record for every non-quote token whether it lies between double quotes.

    Creates (or replaces) ``R['insideq']`` and ``R['outsideq']`` and adds one
    fact per token, keyed by ``token.i``. A token whose text is exactly ``"``
    toggles the inside/outside state and is itself recorded in neither
    relation.

    Args:
        R: Mapping of relation names to relations; updated in place.
        doc: Iterable of tokens exposing ``text`` and ``i``.
    """
    in_quotes = False
    R['insideq'] = Relation('insideq')
    R['outsideq'] = Relation('outsideq')
    for token in doc:
        if token.text == '"':
            in_quotes = not in_quotes
            continue
        target = R['insideq'] if in_quotes else R['outsideq']
        fact(target, token.i)

In [242]:
def gather_facts(doc):
    """Turn a parsed document into kanren relations keyed by name.

    Facts recorded per token:
      * ``R['LEMMA']``: ``(index, lemma)``
      * ``R[<pos tag>]``: ``(index,)``
      * ``R[<dependency label>]``: ``(head index, index)``; a token that is
        its own head is recorded under ``'root'`` with head index ``-1``
      * ``R['head']``: ``(head index, index)``, again ``-1`` for the root
      * ``R[<entity type>]``: ``(index,)`` (``tok.ent_type_`` is used as the
        key as-is)
    plus the ``insideq``/``outsideq`` relations added by
    `gather_inside_outside_quote_facts`.

    Every label in `DEPS` gets a relation even if the document has no such
    arc, so rules can refer to it safely.

    Args:
        doc: Parsed spaCy document.

    Returns:
        dict: Relation name -> Relation.
    """
    R = {'LEMMA': Relation('LEMMA'),
         'root': Relation('root'),
         'head': Relation('head')}
    for rel in DEPS:
        R[rel] = Relation(rel)

    def relation(name):
        # Create relations on demand: the parser can emit dependency labels
        # that are not listed in DEPS, which used to raise KeyError.
        if name not in R:
            R[name] = Relation(name)
        return R[name]

    for tok in doc:
        is_root = tok.head.i == tok.i
        head_i = -1 if is_root else tok.head.i

        facts(R['LEMMA'], (tok.i, tok.lemma_))
        fact(relation(tok.pos_), (tok.i))

        facts(relation('root' if is_root else tok.dep_), (head_i, tok.i))
        facts(R['head'], (head_i, tok.i))

        fact(relation(tok.ent_type_), (tok.i))

    gather_inside_outside_quote_facts(R, doc)

    return R

In [493]:
# Registry of extraction rules, kept in registration order.
RULES = collections.OrderedDict()


def N(rule):
    """Register `rule` in RULES under its function name.

    Registering a name again replaces the stored rule but keeps its original
    position. Returns None.
    """
    rule_name = rule.__name__
    RULES[rule_name] = rule

In [631]:
def extract_count(doc, replacements, i):
    """Return the text of token `i` as the extracted count.

    Args:
        doc: Indexable sequence of tokens exposing ``text``.
        replacements: Unused; present so that all extractors share one
            signature.
        i: Token index.

    Returns:
        str: ``doc[i].text``.
    """
    # The former debugging `print(doc)` was removed: it printed the whole
    # document on every extraction.
    return doc[i].text

In [646]:
def post_replies1(R, not_used):
    """Rule: `post.repliesCount` is the numeric modifier of "reply"/"comment".

    The reply token has to be the `dobj` or `pobj` of some token, and both the
    reply token and the number must still be in `not_used`.

    Args:
        R: Relations produced by `gather_facts`.
        not_used: Token indices not yet consumed by earlier rules.

    Returns:
        tuple: ``(locate, extract_count)``. ``locate()`` returns
        ``((result_var, (consumed_vars...)), goals)``; `extract_count` turns
        the located token index into the extracted value.
    """
    def locate():
        action = var('action')
        reply = var('reply')
        post_replies = var('post.repliesCount')
        reply_syns = {'reply', 'comment'}
        return (
            (post_replies,(reply,)),
            [
                conde((R['dobj'](action, reply),),
                      (R['pobj'](action, reply),)),
                one_of(R, reply, reply_syns),
                R['nummod'](reply, post_replies),
                membero(reply, not_used),
                membero(post_replies, not_used)
            ]
        )

    # The unused local `extract` was removed; the shared extract_count helper
    # is what this rule has always returned.
    return locate, extract_count

N(post_replies1)

run_rules_on_doc(dc1, r1)

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'i'

In [566]:
def post_text1(R, not_used):
    """Rule: `post.text` is the object of the preposition "about".

    Args:
        R: Relations produced by `gather_facts`.
        not_used: Token indices not yet consumed by earlier rules.

    Returns:
        tuple: ``(locate, extract)``. ``locate()`` returns
        ``((result_var, (consumed_vars...)), goals)``; ``extract(doc,
        replacements, i)`` maps the located token index to the extracted text.
    """
    def locate():
        about = var('about')
        post_text = var('post.text')
        return (
            (post_text,(about,)),
            [
                R['LEMMA'](about, 'about'),
                R['pobj'](about, post_text),
                membero(post_text, not_used)
            ]
        )

    def extract(doc, replacements, i):
        # `start`/`end` of a replacement are character offsets into the
        # document text, so they are compared with the token's character
        # offset (`idx`), not with its token index.
        offset = doc[i].idx
        for repl in replacements:
            if repl['start'] <= offset < repl['end']:
                # The token sits inside a replaced quoted span: return the
                # original quoted text.
                return repl['match']
        return doc[i].text

    return locate, extract
    
N(post_text1)

In [545]:
def post_text2(R, not_used):
    """Rule: `post.text` is the direct object of a token with lemma "contain".

    Args:
        R: Relations produced by `gather_facts`.
        not_used: Token indices not yet consumed by earlier rules.

    Returns:
        tuple: ``(locate, extract)``. ``locate()`` returns
        ``((result_var, (consumed_vars...)), goals)``; ``extract(doc,
        replacements, i)`` maps the located token index to the extracted text.
    """
    def locate():
        action = var('action')
        post_text = var('post.text')
        return (
            (post_text,(action,)),
            [
                R['LEMMA'](action, 'contain'),
                R['dobj'](action, post_text),
                membero(post_text, not_used)
            ]
        )

    def extract(doc, replacements, i):
        # `start`/`end` of a replacement are character offsets into the
        # document text, so they are compared with the token's character
        # offset (`idx`), not with its token index.
        offset = doc[i].idx
        for repl in replacements:
            if repl['start'] <= offset < repl['end']:
                # The token sits inside a replaced quoted span: return the
                # original quoted text.
                return repl['match']
        return doc[i].text

    return locate, extract
N(post_text2)

In [546]:
def post_text3(R, not_used):
    """Rule: `post.text` is a quoted object of the preposition "with".

    The preposition has to be outside double quotes and its `pobj` inside
    them; both must still be in `not_used`.

    Args:
        R: Relations produced by `gather_facts`.
        not_used: Token indices not yet consumed by earlier rules.

    Returns:
        tuple: ``(locate, extract)``. ``locate()`` returns
        ``((result_var, (consumed_vars...)), goals)``; ``extract(doc,
        replacements, i)`` maps the located token index to the extracted text.
    """
    def locate():
        post_text = var('post.text')
        prep = var('prep')
        return (
            (post_text, (prep,)),
            [
                one_of(R, prep, {'with'}),
                R['pobj'](prep, post_text),
                R['outsideq'](prep),
                R['insideq'](post_text),
                membero(prep, not_used),
                membero(post_text, not_used)
            ]
        )

    def extract(doc, replacements, i):
        # `start`/`end` of a replacement are character offsets into the
        # document text, so they are compared with the token's character
        # offset (`idx`), not with its token index.
        offset = doc[i].idx
        for repl in replacements:
            if repl['start'] <= offset < repl['end']:
                return repl['match']
        # No recorded replacement covers the token: fall back to its own text
        # (as the other post.text rules do) instead of binding None.
        return doc[i].text

    return locate, extract
N(post_text3)

In [549]:
def post_text4(R, not_used):
    """Rule: `post.text` is the `oprd` of "contain" in a clause modifying a post.

    A token with lemma "message", "content" or "post" must have a `relcl` or
    `acl` child with lemma "contain", whose `oprd` child is the text. The
    "contain" token and the text token must still be in `not_used`.

    Args:
        R: Relations produced by `gather_facts`.
        not_used: Token indices not yet consumed by earlier rules.

    Returns:
        tuple: ``(locate, extract)``. ``locate()`` returns
        ``((result_var, (consumed_vars...)), goals)``; ``extract(doc,
        replacements, i)`` maps the located token index to the extracted text.
    """
    def locate():
        post_text = var('post.text')
        post = var('post')
        post_syns = {'message', 'content', 'post'}
        contain = var('contain')
        return (
            (post_text,(contain, post_text)),
            [
                one_of(R, post, post_syns),
                R['LEMMA'](contain, 'contain'),
                conde((R['relcl'](post, contain),),
                      (R['acl'](post, contain),)),
                R['oprd'](contain, post_text),
                membero(contain, not_used),
                membero(post_text, not_used)
            ]
        )

    def extract(doc, replacements, i):
        # Debug prints were removed, as was the reference to the undefined
        # name `matches`, which made this extractor raise NameError.
        # `start`/`end` of a replacement are character offsets into the
        # document text, so they are compared with the token's character
        # offset (`idx`), not with its token index.
        offset = doc[i].idx
        for repl in replacements:
            if repl['start'] <= offset < repl['end']:
                return repl['match']
        # No recorded replacement covers the token: fall back to its own text
        # (as the other post.text rules do) instead of binding None.
        return doc[i].text

    return locate, extract
N(post_text4)

In [550]:
def post_likes1(R, not_used):
    """Rule: `post.likes_count` is the numeric modifier of "like" after "with".

    Matches a token with lemma "with" whose `pobj` has lemma "like" and a
    `nummod` child; the number and the "like" token must be in `not_used`.

    Returns ``(locate, extract)``: ``locate()`` yields
    ``((result_var, (consumed_vars...)), goals)`` and ``extract`` returns the
    text of the located token.
    """
    def locate():
        prep_tok = var('with')
        likes_tok = var('with_what')
        count_tok = var('post.likes_count')
        goals = [
            R['LEMMA'](prep_tok, 'with'),
            R['pobj'](prep_tok, likes_tok),
            R['LEMMA'](likes_tok, 'like'),
            R['nummod'](likes_tok, count_tok),
            membero(count_tok, not_used),
            membero(likes_tok, not_used),
        ]
        query = (count_tok, (likes_tok,))
        return (query, goals)

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(post_likes1)

In [551]:
def post_likes2(R, not_used):
    """Rule: `post.likes_count` is the number in "liked <number> times".

    Matches a token with lemma "like" that has an `npadvmod` child which in
    turn has a `nummod` child; all three tokens must be in `not_used`.

    Returns ``(locate, extract)``: ``locate()`` yields
    ``((result_var, (consumed_vars...)), goals)`` and ``extract`` returns the
    text of the located token.
    """
    def locate():
        verb_tok = var('like')
        times_tok = var('times')
        count_tok = var('post.likes_count')
        goals = [
            R['LEMMA'](verb_tok, 'like'),
            R['npadvmod'](verb_tok, times_tok),
            R['nummod'](times_tok, count_tok),
            membero(verb_tok, not_used),
            membero(times_tok, not_used),
            membero(count_tok, not_used),
        ]
        query = (count_tok, (verb_tok, times_tok))
        return (query, goals)

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(post_likes2)

In [609]:
def extract_mention(doc, replacements, i):
    """Return the text spanned by token `i` and its `compound` children.

    The token and those children are ordered by position and the slice of
    ``doc.text`` from the first token's start to the last token's end is
    returned (e.g. a first name attached to a surname as `compound`).

    Args:
        doc: Parsed document; tokens expose ``children``, ``dep_``, ``i``,
            ``idx`` and ``text``.
        replacements: Unused; present so that all extractors share one
            signature.
        i: Index of the head token of the mention.

    Returns:
        str: The covered substring of ``doc.text``.
    """
    head = doc[i]
    span_tokens = [child for child in head.children if child.dep_ == 'compound']
    span_tokens.append(head)
    span_tokens.sort(key=lambda tok: tok.i)
    first, last = span_tokens[0], span_tokens[-1]
    return doc.text[first.idx:last.idx + len(last.text)]

In [610]:
def post_mention1(R, not_used):
    """Rule: `post.mention` is the direct object of a token with lemma "mention".

    Both the "mention" token and its object must still be in `not_used`.

    Args:
        R: Relations produced by `gather_facts`.
        not_used: Token indices not yet consumed by earlier rules.

    Returns:
        tuple: ``(locate, extract_mention)``. ``locate()`` returns
        ``((result_var, (consumed_vars...)), goals)``; `extract_mention` turns
        the located token index into the mention text.
    """
    def locate():
        mention = var('mention')
        post_mention = var('post.mention')
        return (
            (post_mention,(mention,)),
            [
                R['LEMMA'](mention, 'mention'),
                R['dobj'](mention, post_mention),
                membero(mention, not_used),
                membero(post_mention, not_used)
            ]
        )

    # The unused local `extract` wrapper was removed; the shared
    # extract_mention helper is what this rule has always returned.
    return locate, extract_mention
N(post_mention1)

In [613]:
def post_mention2(R, not_used):
    """Rule: `post.mention` is the possessor (`poss`) of a token with lemma "mention".

    Both the "mention" token and the possessor must still be in `not_used`.

    Args:
        R: Relations produced by `gather_facts`.
        not_used: Token indices not yet consumed by earlier rules.

    Returns:
        tuple: ``(locate, extract_mention)``. ``locate()`` returns
        ``((result_var, (consumed_vars...)), goals)``; `extract_mention` turns
        the located token index into the mention text.
    """
    def locate():
        mention = var('mention')
        post_mention = var('post.mention')
        return (
            (post_mention,(mention,)),
            [
                R['LEMMA'](mention, 'mention'),
                R['poss'](mention, post_mention),
                membero(mention, not_used),
                membero(post_mention, not_used)
            ]
        )

    # The unused local `extract` was removed: it returned the bare token text
    # and was never used, since extract_mention is what this rule returns.
    return locate, extract_mention
N(post_mention2)

In [614]:
def post_mention3(R, not_used):
    """Rule: `post.mention` is the object in "mention of <someone>".

    Matches a token with lemma "mention" that has a `prep` child with lemma
    "of" whose `pobj` is the mention; all three tokens must be in `not_used`.

    Args:
        R: Relations produced by `gather_facts`.
        not_used: Token indices not yet consumed by earlier rules.

    Returns:
        tuple: ``(locate, extract_mention)``. ``locate()`` returns
        ``((result_var, (consumed_vars...)), goals)``; `extract_mention` turns
        the located token index into the mention text.
    """
    def locate():
        mention = var('mention')
        prep = var('prep')
        post_mention = var('post.mention')
        return (
            (post_mention,(mention,prep)),
            [
                one_of(R, mention, {'mention'}),
                R['prep'](mention, prep),
                one_of(R, prep, {'of'}),
                R['pobj'](prep, post_mention),
                membero(mention, not_used),
                membero(post_mention, not_used),
                membero(prep, not_used)
            ]
        )

    # The unused local `extract` was removed: it returned the bare token text
    # and was never used, since extract_mention is what this rule returns.
    return locate, extract_mention
N(post_mention3)

In [555]:
def post_sentiment1(R, not_used):
    """Rule: `post.sentiment` is an adjectival modifier (`amod`) of "post".

    Only the modifier has to be in `not_used`; no other token is reported as
    consumed.

    NOTE(review): any `amod` of "post" matches. The recorded output for
    "attendees posted at most one post with negative sentiment" shows this
    rule binding 'most' -- consider restricting the accepted words.

    Returns ``(locate, extract)``: ``locate()`` yields
    ``((result_var, ()), goals)`` and ``extract`` returns the text of the
    located token.
    """
    def locate():
        sentiment_tok = var('post.sentiment')
        post_tok = var('post')
        goals = [
            R['LEMMA'](post_tok, 'post'),
            R['amod'](post_tok, sentiment_tok),
            membero(sentiment_tok, not_used),
        ]
        query = (sentiment_tok, ())
        return (query, goals)

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(post_sentiment1)

In [556]:
def post_sentiment2(R, not_used):
    """Rule: `post.sentiment` is an adjectival modifier (`amod`) of "sentiment".

    Only the modifier has to be in `not_used`; the "sentiment" token is
    reported as consumed.

    Returns ``(locate, extract)``: ``locate()`` yields
    ``((result_var, (consumed_vars...)), goals)`` and ``extract`` returns the
    text of the located token.
    """
    def locate():
        value_tok = var('post.sentiment')
        noun_tok = var('sentiment')
        goals = [
            R['LEMMA'](noun_tok, 'sentiment'),
            R['amod'](noun_tok, value_tok),
            membero(value_tok, not_used),
        ]
        query = (value_tok, (noun_tok,))
        return (query, goals)

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(post_sentiment2)

In [557]:
def post_count1(R, not_used):
    """Rule: `post.count` is the numeric modifier of "message"/"content"/"post".

    Only the number has to be in `not_used`; no other token is reported as
    consumed.

    Returns ``(locate, extract)``: ``locate()`` yields
    ``((result_var, ()), goals)`` and ``extract`` returns the text of the
    located token.
    """
    def locate():
        count_tok = var('post.count')
        post_tok = var('post')
        accepted = {'message', 'content', 'post'}
        goals = [
            one_of(R, post_tok, accepted),
            R['nummod'](post_tok, count_tok),
            membero(count_tok, not_used),
        ]
        query = (count_tok, ())
        return (query, goals)

    def extract(doc, replacements, i):
        token = doc[i]
        return token.text

    return locate, extract
N(post_count1)

In [558]:
def run_rule(R, not_used, locate_fn):
    """Solve one rule's goals and split the first solution into bindings and used tokens.

    Args:
        R: Unused here; the goals built by `locate_fn` already refer to it.
        not_used: Unused here, for the same reason.
        locate_fn: Zero-argument callable returning ``(query, goals)`` where
            ``query`` is ``(result_var, ..., (consumed_vars...))``.

    Returns:
        tuple: ``(bindings, used)``. ``bindings`` maps each result variable's
        token name to its value, ``used`` is the tuple of values bound to the
        consumed variables. ``({}, [])`` when the goals have no solution.
    """
    query, goals = locate_fn()
    solutions = run(1, query, *goals)
    if not solutions:
        return {},[]

    first = solutions[0]

    def is_scalar(item):
        return not isinstance(item, tuple)

    # Leading non-tuple items are result values; the first tuple holds the
    # values of the consumed variables.
    values = itertools.takewhile(is_scalar, first)
    consumed = list(itertools.dropwhile(is_scalar, first))[0]
    bindings = {variable.token: value for variable, value in zip(query, values)}
    return bindings, consumed

In [559]:
def run_rules_on_doc(doc, grepls, rules=RULES):
    """Apply every rule, in order, to one parsed document.

    Args:
        doc: Parsed document.
        grepls: Replacement records for the document (dicts with ``match``,
            ``start`` and ``end``), passed through to the extractors.
        rules: Ordered mapping of rule name -> rule factory; defaults to the
            global registry.

    Returns:
        dict: Variable name -> extracted text. When several rules bind the
        same variable, the earliest rule's value is kept.
    """
    R = gather_facts(doc)
    bindings = {}
    not_used = set(range(len(doc)))
    for name, rule in rules.items():
        locate_fn, extract_fn = rule(R, not_used)
        located, used = run_rule(R, not_used, locate_fn)
        new_bindings = {variable: extract_fn(doc, grepls, token_i)
                        for variable, token_i in located.items()}
        if new_bindings:
            print('Applied rule', name, new_bindings)
        # Earlier rules win: existing bindings overwrite the new ones.
        new_bindings.update(bindings)
        bindings = new_bindings
        # Consume token *indices*. The extracted strings were subtracted here
        # before, so bound tokens were never removed from `not_used`.
        not_used = not_used - (set(used) | set(located.values()))
    return bindings

In [560]:
def run_rules(docs, grepls_col):
    """Run `run_rules_on_doc` on each document paired with its replacement records.

    Args:
        docs: Iterable of parsed documents.
        grepls_col: Iterable of replacement-record lists, aligned with `docs`.

    Returns:
        list: One bindings dict per document, in input order.
    """
    return [run_rules_on_doc(doc, grepls) for doc, grepls in zip(docs, grepls_col)]

In [650]:
# Run all registered rules on the sample document. `dc1` and `r1` are
# assigned in the following cell (In [649]), which has to be executed first.
run_rules_on_doc(dc1, r1)

Applied rule post_sentiment1 {'post.sentiment': 'most'}
Applied rule post_sentiment2 {'post.sentiment': 'negative'}
Applied rule post_count1 {'post.count': 'one'}


{'post.count': 'one', 'post.sentiment': 'most'}

In [649]:
# Pick one training row by index label and inspect it: the raw row, the parsed
# phrase, the rendered dependency tree, and lemma / head / dependency label
# for every token.
idx = 600
dc1 = train.loc[idx]['final']
r1 = train.loc[idx]['grepl']
print(train.loc[idx])
print(dc1)
spacy.displacy.render(dc1, jupyter=True)
for token in dc1:
    print("{} {} {}".format(token.lemma_, token.head.text, token.dep_))

phrase                 attendees posted at most one post with <sentiment> sentiment
replacements                                              {"sentiment": "negative"}
label                                                                          post
final           (attendees, posted, at, most, one, post, with, negative, sentiment)
grepl                                                                            []
Name: 600, dtype: object
attendees posted at most one post with negative sentiment


attendee posted nsubj
post posted ROOT
at posted prep
most post amod
one post nummod
post at pobj
with post prep
negative sentiment amod
sentiment with pobj


In [627]:
# Run the rules on every phrase labelled 'post' and show the resulting
# bindings (column `_`) next to the parsed phrase.
posts = train[train['label'] == 'post']
posts[['final']].assign(_=run_rules(posts['final'], posts['grepl']))

Applied rule post_text1 {'post.text': 'honey'}
Applied rule post_text3 {'post.text': '"easy"'}
Applied rule post_likes2 {'post.likes_count': '2'}
Applied rule post_likes1 {'post.likes_count': '8'}
Applied rule post_mention1 {'post.mention': 'Dorothy Larson'}
Applied rule post_text1 {'post.text': 'Larson'}
Applied rule post_text1 {'post.text': 'honey'}
Applied rule post_replies1 {'post.repliesCount': '5'}
Applied rule post_likes2 {'post.likes_count': '2'}
Applied rule post_text2 {'post.text': 'honey'}
Applied rule post_replies1 {'post.repliesCount': '7'}
Applied rule post_sentiment1 {'post.sentiment': 'positive'}
Applied rule post_mention1 {'post.mention': 'Stacy Vaughn'}
Applied rule post_sentiment2 {'post.sentiment': 'positive'}
Applied rule post_sentiment2 {'post.sentiment': 'negative'}
Applied rule post_text1 {'post.text': 'Vaughn'}
Applied rule post_likes2 {'post.likes_count': '2'}
Applied rule post_sentiment1 {'post.sentiment': 'positive'}
Applied rule post_replies1 {'post.replies

Unnamed: 0,final,_
253,"(posted, about, honey)",{'post.text': 'honey'}
137,"(created, a, post, with, word, "", guitar, "", in, text)","{'post.text': '""easy""'}"
469,"(posted, something, liked, 2, times)",{'post.likes_count': '2'}
492,"(posted, a, content, with, 8, likes)",{'post.likes_count': '8'}
362,"(mentioned, Dorothy, Larson, in, post)",{'post.mention': 'Dorothy Larson'}
364,"(posts, about, Dorothy, Larson)",{'post.text': 'Larson'}
262,"(wrote, something, about, honey)",{'post.text': 'honey'}
450,"(wrote, post, with, 5, comments)",{'post.repliesCount': '5'}
187,"(post, "", guitar, "")",{}
470,"(wrote, post, which, was, liked, 2, times)",{'post.likes_count': '2'}


In [523]:
# First 100 parsed phrases labelled 'post'.
train[train['label'] == 'post']['final'][:100]

253                                                  (posted, about, honey)
137                  (created, a, post, with, word, ", guitar, ", in, text)
469                                    (posted, something, liked, 2, times)
492                                    (posted, a, content, with, 8, likes)
362                                  (mentioned, Dorothy, Larson, in, post)
364                                         (posts, about, Dorothy, Larson)
262                                        (wrote, something, about, honey)
450                                        (wrote, post, with, 5, comments)
187                                                    (post, ", guitar, ")
470                              (wrote, post, which, was, liked, 2, times)
251                      (wrote, posts, that, contain, the, keyword, honey)
192                                               (post, has, ", guitar, ")
237                                 (wrote, a, post, that, got, 7, replies)
407         