In [348]:
import os
import json
import pandas as pd
import numpy as np
from turing.features import nlp
from turing.features.freq import top_words, top_lemmas, word_counts, lemma_counts
import xgboost as xgb
from scipy.stats import spearmanr
from random import shuffle, randint
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (30,15)

In [30]:
doc = json.load(open("../../turing-data/train_20170724.json"))
doc2 = json.load(open("../../turing-data/train_20170725.json"))

diags = doc + doc2

In [39]:
diags[2]

{u'context': u'Though much of Enlightenment political thought was dominated by social contract theorists, both David Hume and Adam Ferguson criticized this camp. Hume\'s essay Of the Original Contract argues that governments derived from consent are rarely seen, and civil government is grounded in a ruler\'s habitual authority and force. It is precisely because of the ruler\'s authority over-and-against the subject, that the subject tacitly consents; Hume says that the subjects would "never imagine that their consent made him sovereign", rather the authority did so. Similarly, Ferguson did not believe citizens built the state, rather polities grew out of social development. In his 1767 An Essay on the History of Civil Society, Ferguson uses the four stages of progress, a theory that was very popular in Scotland at the time, to explain how humans advance from a hunting and gathering society to a commercial and civil society without "signing" a social contract.',
 u'dialogId': -155769874

In [31]:
def is_bot(diag, user):
    text = " ".join([x["text"] for x in diag["thread"] if x["userId"] == user])
    
    if len(text) == 0:
        return False
    
    if "avilable" in text:
        return True
    
    if "Hint: first" in text:
        return True
    
    if " ." in text or " ," in text or " '" in text:
        return True
    
    if "\n" in text:
        return True
    
    return False

In [43]:
def get_score(diag, user):
    return filter(lambda x: x["userId"] != user, diag["evaluation"])[0]["quality"]

In [355]:
def correlation(diags, bst):
    df = make_features(diags, True)
    
    label_column = df["label"]
    features = df.drop(["label", "dialogId", "user"], axis=1)
    dtest = xgb.DMatrix(features.values, feature_names=features.columns)
    
    preds = bst.predict(dtest)
    
#     for i, diag in enumerate(diags):
#         for j, name in enumerate(["Bob","Alice"]):
#             if is_bot(diag, name):
#                 preds[2*i+j] = 0

    df["prediction"] = preds
    
    print df[["label","prediction"]]
    
    return spearmanr(df["label"].values, df["prediction"].values)

# Features

In [32]:
def lengths(diag, user):
    res = pd.Series()
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    res["self_phrases_cnt"] = len(self_thread)
    res["other_phrases_cnt"] = len(other_thread)
    
    res["self_words_cnt"] = len([w for s in self_thread for w in s])
    res["other_words_cnt"] = len([w for s in other_thread for w in s])
    
    res["self_avg_words"] = 0 if res["self_phrases_cnt"] == 0 else float(res["self_words_cnt"])/res["self_phrases_cnt"] 
    res["other_avg_words"] = 0 if res["other_phrases_cnt"] == 0 else float(res["other_words_cnt"])/res["other_phrases_cnt"]
    
    return res

In [35]:
def freq_stat(diag, user):

    def topN_count(parsed_text):
        return len(filter(lambda x: x.orth_.lower() in top_words, parsed_text))

    def topN_count_lemma(parsed_text):
        return len(filter(lambda x: x.lemma_ in top_lemmas, parsed_text))

    def no_vocab_tokens(parsed_text):
        return len(filter(lambda x: x.lemma_ not in lemma_counts, parsed_text))

    def avg_index_lemma(parsed_text):
        freqs = [freq_idx_lemmas.get(token.lemma_, None) for token in parsed_text]
        logs = [math.log(x) for x in filter(lambda x: x, freqs)]
        if logs:
            return np.mean(logs)
        else:
            return None

    res = pd.Series()
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    
    res["self_topN_count"] = np.mean([
        topN_count(nlp(unicode(x))) for x in self_thread
    ])
    res["other_topN_count"] = np.mean([
        topN_count(nlp(unicode(x))) for x in other_thread
    ])
    
    res["self_no_vocab_tokens"] = np.mean([
        no_vocab_tokens(nlp(unicode(x))) for x in self_thread
    ])
    res["other_no_vocab_tokens"] = np.mean([
        no_vocab_tokens(nlp(unicode(x))) for x in other_thread
    ])
    
    res["self_avg_index_lemma"] = np.mean([
        topN_count(nlp(unicode(x))) for x in self_thread
    ])
    res["other_avg_index_lemma"] = np.mean([
        topN_count(nlp(unicode(x))) for x in other_thread
    ])
    
    return res

In [171]:
def make_features(diags,labeled=False):
    observations = []
    for d in diags:
        for name in ("Alice","Bob"):
            obs = freq_stat(d,name).append(lengths(d,name))
            obs["user"] = name
            obs["dialogId"] = d["dialogId"]
            
            if labeled:
                obs["label"] = get_score(d,name)
            
            observations.append(obs)
    
    return pd.DataFrame(observations)

In [300]:
shuffle(diags)
df_train = make_features(diags[:400], True)
label_column = df_train["label"]
features = df_train.drop(["label", "dialogId", "user"], axis=1)

train = xgb.DMatrix(features.values, label_column.values, feature_names=features.columns)

In [409]:
param = {'eta':0.1, 'max_depth':3, 'min_child_weight':1, 'gamma':0.1,
         'silent':0, 'subsample':0.8, 'colsample_bytree': 0.8,  
         'objective':'reg:linear', 'eval_metric':'rmse'}
bst = xgb.train(param, train, num_boost_round=100)

In [410]:
correlation(diags[400:500], bst)

     label  prediction
0        0    0.255618
1        1    1.120812
2        1    1.935809
3        0    0.222018
4        0   -0.292325
5        1    1.880636
6        1    1.208047
7        0    0.036530
8        1    2.037408
9        5    2.677605
10       2    1.958603
11       0    1.754847
12       0    2.321277
13       3    1.569914
14       4    2.795381
15       3    2.201808
16       5    2.381652
17       3    3.069283
18       1    1.682814
19       0    1.241106
20       5    2.300791
21       5    1.841551
22       4    1.031413
23       4    2.479464
24       0   -0.051920
25       3    1.143492
26       0    0.992837
27       1    2.089266
28       1    0.887477
29       0    0.501445
..     ...         ...
170      0    0.290598
171      1    1.419168
172      1    1.334366
173      0    0.286471
174      5    3.314591
175      4    2.892007
176      2    2.191095
177      0    1.234050
178      3    2.198762
179      0    3.153715
180      2    1.422367
181      0 

SpearmanrResult(correlation=0.50950642132879842, pvalue=1.310030563307665e-14)