In [120]:
import en_core_web_lg
import pandas as pd
from tqdm import tqdm, tqdm_pandas
import qgrid
import re
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import f1_score
import json
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from kanren import run, eq, membero, var, conde, Relation, facts, fact, unifiable
import collections
import itertools

# Show up to 300 characters per cell so that whole phrases stay readable.
pd.options.display.max_colwidth = 300

In [2]:
# Load the en_core_web_lg spaCy model; `nlp` is the pipeline used by later cells.
nlp = en_core_web_lg.load()

In [178]:
def token_match(x):
    """Return True when ``x`` looks like a bracketed or quoted token.

    A string qualifies when it starts with ``<``, ``'`` or ``"`` and ends with
    ``>``, ``'`` or ``"`` (e.g. ``<post-word>`` or ``"rare"``).

    ``str.startswith`` / ``str.endswith`` compare literal text and do not
    understand regular-expression character classes, so the alternatives are
    passed as tuples of literal characters.
    """
    return x.startswith(('<', "'", '"')) and x.endswith(('>', "'", '"'))

In [179]:
# Swap the pipeline's tokenizer for one built from the shared vocab and the
# custom token_match predicate; no prefix/suffix/infix rules are passed here.
nlp.tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab, token_match=token_match )

In [4]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Presumably the pickled object refers to this callable by name, so it has to
# exist before loading — TODO confirm.
spacy_tokenizer = lambda xs: [x.lemma_ for x in xs]
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that come from a trusted source.
fclf = joblib.load('filter_classifier.pkl')

In [5]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Presumably the pickled object refers to this callable by name, so it has to
# exist before loading — TODO confirm.
spacy_tokenizer = lambda xs: [x.lemma_ for x in xs]
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that come from a trusted source.
splitter = joblib.load('splitter.pkl')

In [6]:
def apply_replacements(df):
    """Fill the ``<name>`` placeholders of every phrase in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``phrase`` column (template text) and a
        ``replacements`` column holding a JSON object that maps placeholder
        names to their values.

    Returns
    -------
    list of str
        One filled-in phrase per row, in row order.
    """
    orig = []
    for _, row in df.iterrows():
        p = row['phrase']
        for k, v in json.loads(row['replacements']).items():
            # Literal substitution: with re.sub the key was interpreted as a
            # regular expression and the value as a replacement template, so
            # metacharacters or backslashes in either were misread or raised.
            # str() also lets non-string JSON values (numbers) be inserted.
            p = p.replace('<' + k + '>', str(v))
        orig.append(p)
    return orig

In [4]:
# Read the labelled phrases, add the placeholder-free text as a `final` column
# and hold out 30% of the rows for testing.
# NOTE: apply_replacements must be defined before this cell runs; the NameError
# recorded under this cell comes from executing it before that definition.
cl = pd.read_csv('dataset/with_labels_1522902786972.csv', usecols=['phrase', 'label', 'replacements'])
cl = cl.assign(final=apply_replacements(cl))
# Fixed seed so that the split is identical on every run.
train, test = train_test_split(cl, test_size=0.3, random_state=42)
#train.to_csv('phrases.arg_ex.train.csv')
#test.to_csv('phrases.arg_ex.test.csv')

NameError: name 'apply_replacements' is not defined

In [181]:
# Load the previously saved train/test splits from disk.
# NOTE(review): only the train file is read with index_col=0. If the test file
# was written the same way, its saved index will show up as an extra unnamed
# column — confirm whether index_col=0 is wanted here as well.
train = pd.read_csv('phrases.arg_ex.train.csv', index_col=0)
test = pd.read_csv('phrases.arg_ex.test.csv')

In [182]:
# Run every `final` phrase through the spaCy pipeline, overwriting the column.
# NOTE: not idempotent — re-running this cell would pass the already-parsed
# values back into nlp; reload `train` first.
train['final'] = train['final'].apply(nlp)

In [7]:
# Distinct label values present in the training split.
train['label'].unique()

array(['post', 'poll', 'reply', 'note', 'like', 'profile', 'rate', 'ad',
       'event', 'file', 'view', 'session'], dtype=object)

In [9]:
# Dependency-relation names for which gather_facts creates a kanren Relation up
# front. 'root' is listed here and is also created explicitly in gather_facts.
DEPS = [
    "acl",
    "acomp",
    "advcl",
    "advmod",
    "agent",
    "amod",
    "appos",
    "attr",
    "aux",
    "auxpass",
    "cc",
    "ccomp",
    "compound",
    "complm",
    "conj",
    "cop",
    "csubj",
    "csubjpass",
    "dep",
    "det",
    "dobj",
    "expl",
    "hmod",
    "hyph",
    "infmod",
    "intj",
    "iobj",
    "mark",
    "meta",
    "neg",
    "nmod",
    "nn",
    "npadvmod",
    "nsubj",
    "nsubjpass",
    "num",
    "nummod",
    "number",
    "oprd",
    "obj",
    "obl",
    "parataxis",
    "partmod",
    "pcomp",
    "pobj",
    "poss",
    "possessive",
    "preconj",
    "prep",
    "prt",
    "punct",
    "relcl",
    "quantmod",
    "rcmod",
    "root",
    "xcomp",
]

In [10]:
def one_of(R, idx, lst):
    """Goal that holds when the lemma of token ``idx`` is a member of ``lst``.

    A logic variable named ``'l' + str(idx)`` stands for the lemma: it is tied
    to ``idx`` through ``R['LEMMA']`` and constrained with ``membero``. Both
    goals form the single branch handed to ``conde``.
    """
    lemma = var('l' + str(idx))
    lemma_of_token = R['LEMMA'](idx, lemma)
    lemma_allowed = membero(lemma, lst)
    return conde((lemma_of_token, lemma_allowed))

In [11]:
def gather_facts(doc):
    """Turn a parsed document into a dictionary of kanren relations.

    For every token ``tok`` in ``doc`` the following facts are recorded:

    * ``LEMMA(tok.i, tok.lemma_)``
    * ``<tok.pos_>(tok.i)``
    * ``<tok.dep_>(tok.head.i, tok.i)``, or ``root(-1, tok.i)`` when the token
      is its own head
    * ``<tok.ent_type_>(tok.i)``

    Relations for ``LEMMA``, ``root`` and every name in ``DEPS`` always exist
    in the result; any other relation is created the first time a token needs
    it. That includes dependency labels missing from ``DEPS`` (it has no
    ``case``, ``dative`` or ``predet`` entry, for instance), which previously
    raised ``KeyError``.

    Returns
    -------
    dict
        Maps relation name to its ``Relation`` object.
    """
    R = {'LEMMA': Relation('LEMMA'),
         'root': Relation('root')}
    for rel in DEPS:
        R[rel] = Relation(rel)

    def relation(name):
        # Look the relation up, creating it on first use.
        if name not in R:
            R[name] = Relation(name)
        return R[name]

    for tok in doc:
        facts(R['LEMMA'], (tok.i, tok.lemma_))
        fact(relation(tok.pos_), tok.i)

        if tok.head.i != tok.i:
            facts(relation(tok.dep_), (tok.head.i, tok.i))
        else:
            # A token that is its own head is the sentence root; -1 stands in
            # for the missing parent.
            facts(R['root'], (-1, tok.i))

        fact(relation(tok.ent_type_), tok.i)

    return R

In [12]:
def run_rules(doc, vs, rules_fns):
    """Return the first non-empty ``run`` result produced by ``rules_fns``.

    Facts are gathered from ``doc`` once. Each entry of ``rules_fns`` is then
    called with those relations to obtain goals, and ``run(1, vs, *goals)`` is
    evaluated; the first truthy result is returned. An empty list is returned
    when no rule yields anything.

    NOTE: a later cell defines another ``run_rules(doc, rules=RULES)``, which
    replaces this one once that cell is executed.
    """
    relations = gather_facts(doc)
    for make_goals in rules_fns:
        found = run(1, vs, *make_goals(relations))
        if not found:
            continue
        return found
    return []

In [102]:
# Registry of extraction rules, kept in registration order and keyed by the
# rule function's name.
RULES = collections.OrderedDict()


def N(rule):
    """Register ``rule`` in ``RULES`` under ``rule.__name__``."""
    key = rule.__name__
    RULES[key] = rule

In [116]:
def post_replies1(R, not_used):
    """Rule for ``post.repliesCount``: the ``nummod`` of a reply/comment noun.

    The noun must be the ``dobj`` or ``pobj`` of some token and its lemma must
    be ``reply`` or ``comment``. Both the noun and its numeric modifier must be
    members of ``not_used``.

    Returns ``(outputs, goals)``. ``outputs`` is the result variable followed
    by a tuple holding the noun variable; ``goals`` is the list of kanren goals.
    """
    head = var('action')
    noun = var('reply')
    count = var('post.repliesCount')
    synonyms = {'reply', 'comment'}

    is_object = conde((R['dobj'](head, noun),),
                      (R['pobj'](head, noun),))
    goals = [
        is_object,
        one_of(R, noun, synonyms),
        R['nummod'](noun, count),
        membero(noun, not_used),
        membero(count, not_used),
    ]
    outputs = (count, (noun,))
    return (outputs, goals)
N(post_replies1)

In [131]:
def post_text1(R, not_used):
    """Rule for ``post.text``: the ``pobj`` of a token whose lemma is ``about``.

    The matched object must be a member of ``not_used``.

    Returns ``(outputs, goals)``. ``outputs`` is the result variable followed
    by a tuple holding the ``about`` variable; ``goals`` is the goal list.
    """
    prep = var('about')
    topic = var('post.text')
    goals = [
        R['LEMMA'](prep, 'about'),
        R['pobj'](prep, topic),
        membero(topic, not_used),
    ]
    return ((topic, (prep,)), goals)
N(post_text1)

In [152]:
def post_text2(R, not_used):
    """Rule for ``post.text``: the ``dobj`` of a token whose lemma is ``contain``.

    The matched object must be a member of ``not_used``.

    Returns ``(outputs, goals)``. ``outputs`` is the result variable followed
    by a tuple holding the verb variable; ``goals`` is the goal list.
    """
    verb = var('action')
    topic = var('post.text')
    goals = [
        R['LEMMA'](verb, 'contain'),
        R['dobj'](verb, topic),
        membero(topic, not_used),
    ]
    return ((topic, (verb,)), goals)
N(post_text2)

In [154]:
def post_likes1(R, not_used):
    """Rule for ``post.likes_count``: the number in "with N likes".

    Looks for a token with lemma ``with`` whose ``pobj`` has lemma ``like``;
    the ``nummod`` of that object is the result. Both the object and the
    number must be members of ``not_used``.

    Returns ``(outputs, goals)``. ``outputs`` is the result variable followed
    by a tuple holding the object variable; ``goals`` is the goal list.
    """
    prep = var('with')
    noun = var('with_what')
    count = var('post.likes_count')
    goals = [
        R['LEMMA'](prep, 'with'),
        R['pobj'](prep, noun),
        R['LEMMA'](noun, 'like'),
        R['nummod'](noun, count),
        membero(count, not_used),
        membero(noun, not_used),
    ]
    return ((count, (noun,)), goals)
N(post_likes1)

In [155]:
def post_likes2(R, not_used):
    """Rule for ``post.likes_count``: the number in "liked N times".

    Looks for a token with lemma ``like`` that has an ``npadvmod`` child; the
    ``nummod`` of that child is the result. All three tokens must be members
    of ``not_used``.

    Returns ``(outputs, goals)``. ``outputs`` is the result variable followed
    by a tuple holding the verb and child variables; ``goals`` is the goal list.
    """
    verb = var('like')
    unit = var('times')
    count = var('post.likes_count')
    goals = [
        R['LEMMA'](verb, 'like'),
        R['npadvmod'](verb, unit),
        R['nummod'](unit, count),
        membero(verb, not_used),
        membero(unit, not_used),
        membero(count, not_used),
    ]
    return ((count, (verb, unit)), goals)
N(post_likes2)

In [156]:
def post_mention1(R, not_used):
    """Rule for ``post.mention``: the ``dobj`` of a token with lemma ``mention``.

    Both the verb and its object must be members of ``not_used``.

    Returns ``(outputs, goals)``. ``outputs`` is the result variable followed
    by a tuple holding the verb variable; ``goals`` is the goal list.
    """
    verb = var('mention')
    target = var('post.mention')
    goals = [
        R['LEMMA'](verb, 'mention'),
        R['dobj'](verb, target),
        membero(verb, not_used),
        membero(target, not_used),
    ]
    return ((target, (verb,)), goals)
N(post_mention1)

In [158]:
def post_sentiment1(R, not_used):
    """Rule for ``post.sentiment``: the ``amod`` of a token with lemma ``post``.

    Only the modifier is required to be a member of ``not_used``.

    Returns ``(outputs, goals)``. ``outputs`` is the result variable followed
    by an empty tuple; ``goals`` is the goal list.
    """
    mood = var('post.sentiment')
    noun = var('post')
    goals = [
        R['LEMMA'](noun, 'post'),
        R['amod'](noun, mood),
        membero(mood, not_used),
    ]
    return ((mood, ()), goals)
N(post_sentiment1)

In [159]:
def post_sentiment2(R, not_used):
    """Rule for ``post.sentiment``: the ``amod`` of a token with lemma ``sentiment``.

    Only the modifier is required to be a member of ``not_used``.

    Returns ``(outputs, goals)``. ``outputs`` is the result variable followed
    by a tuple holding the noun variable; ``goals`` is the goal list.
    """
    mood = var('post.sentiment')
    noun = var('sentiment')
    goals = [
        R['LEMMA'](noun, 'sentiment'),
        R['amod'](noun, mood),
        membero(mood, not_used),
    ]
    return ((mood, (noun,)), goals)
N(post_sentiment2)

In [165]:
def run_rule(R, not_used, rule):
    """Evaluate one rule and split its first solution into bindings and used tokens.

    ``rule(R, not_used)`` must return ``(outputs, goals)``. ``run`` is asked
    for a single solution. In that solution the leading non-tuple items are
    paired with the corresponding output variables (keyed by ``.token``), and
    the first tuple item is returned as the collection of used values.

    Returns
    -------
    (dict, tuple or list)
        ``({token: value, ...}, used)``, or ``({}, [])`` when there is no
        solution.
    """
    outputs, goals = rule(R, not_used)
    solutions = run(1, outputs, *goals)
    if not solutions:
        return {}, []

    first = solutions[0]

    def is_plain(item):
        return not isinstance(item, tuple)

    values = itertools.takewhile(is_plain, first)
    consumed = list(itertools.dropwhile(is_plain, first))[0]
    bindings = {}
    for variable, val in zip(outputs, values):
        bindings[variable.token] = val
    return bindings, consumed

In [166]:
def run_rules(doc, rules=RULES):
    """Apply every rule in ``rules`` to ``doc`` and collect the extracted values.

    Rules run in mapping order. A binding made by an earlier rule is never
    overwritten by a later one. After each rule, the values it reported as
    used and all values bound so far are removed from the set of token
    indices still available to the next rules.

    Returns
    -------
    dict
        Maps each bound variable name to the text of the token it was bound to.
    """
    R = gather_facts(doc)
    bindings = {}
    not_used = set(range(len(doc)))
    for rule in rules.values():
        found, consumed = run_rule(R, not_used, rule)
        # Existing entries win over the ones this rule just produced.
        found.update(bindings)
        bindings = found
        not_used = not_used.difference(consumed, bindings.values())
    return {name: doc[position].text for name, position in bindings.items()}

In [167]:
# Try just two rules on the sample doc. The rules are given as a list of pairs
# so that their order does not depend on dict-literal ordering, which is
# unspecified before Python 3.7.
# NOTE: `dc1` is assigned in a later cell (the one that sets `idx`).
run_rules(dc1, rules=collections.OrderedDict([('a', post_replies1), ('b', post_text1)]))

{'post.repliesCount': '5'}

In [183]:
# Pick one training row by index label and keep its `final` value as `dc1`.
idx = 192
dc1 = train.loc[idx]['final']
print(train.loc[idx])
# NOTE: the sentence rendered here is a hand-written variant ("rare thing"),
# not the contents of dc1.
spacy.displacy.render(nlp("post has \"rare thing\""), jupyter=True)

phrase          post has "<post-word>"
replacements     {"post-word": "rare"}
label                             post
final              (post, has, "rare")
Name: 192, dtype: object


In [174]:
# Apply run_rules (default rule set) to every training row labelled 'post' and
# show the resulting bindings in column `_` next to the `final` value.
posts = train[train['label'] == 'post']
posts[['final']].assign(_=posts['final'].apply(run_rules))

Unnamed: 0,final,_
253,"(posted, about, honey)",{'post.text': 'honey'}
137,"(created, a, post, with, word, 'easy', in, text)",{}
469,"(posted, something, liked, 2, times)",{'post.likes_count': '2'}
492,"(posted, a, content, with, 8, likes)",{'post.likes_count': '8'}
362,"(mentioned, Dorothy, Larson, in, post)",{'post.mention': 'Larson'}
364,"(posts, about, Dorothy, Larson)",{'post.text': 'Larson'}
262,"(wrote, something, about, honey)",{'post.text': 'honey'}
450,"(wrote, post, with, 5, comments)",{'post.repliesCount': '5'}
187,"(post, ""rare"")",{}
470,"(wrote, post, which, was, liked, 2, times)",{'post.likes_count': '2'}


In [58]:
# Show up to the first 100 `final` values of training rows labelled 'post'.
train[train['label'] == 'post']['final'][:100]

253                                                  (posted, about, honey)
137                        (created, a, post, with, word, 'easy', in, text)
469                                    (posted, something, liked, 2, times)
492                                    (posted, a, content, with, 8, likes)
362                                  (mentioned, Dorothy, Larson, in, post)
364                                         (posts, about, Dorothy, Larson)
262                                        (wrote, something, about, honey)
450                                        (wrote, post, with, 5, comments)
187                                                          (post, "rare")
470                              (wrote, post, which, was, liked, 2, times)
251                      (wrote, posts, that, contain, the, keyword, honey)
192                                                     (post, has, "rare")
237                                 (wrote, a, post, that, got, 7, replies)
407         