In [129]:
import os
import json
import pandas as pd
import numpy as np
from turing.features import nlp
from turing.features.freq import top_words, top_lemmas, word_counts, lemma_counts
import xgboost as xgb
from scipy.stats import spearmanr
from random import shuffle, randint
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
plt.rcParams["figure.figsize"] = (30,15)

In [2]:
doc = json.load(open("../../turing-data/train_20170724.json"))
doc2 = json.load(open("../../turing-data/train_20170725.json"))

diags = doc + doc2

In [3]:
diags[2]

{u'context': u'Though much of Enlightenment political thought was dominated by social contract theorists, both David Hume and Adam Ferguson criticized this camp. Hume\'s essay Of the Original Contract argues that governments derived from consent are rarely seen, and civil government is grounded in a ruler\'s habitual authority and force. It is precisely because of the ruler\'s authority over-and-against the subject, that the subject tacitly consents; Hume says that the subjects would "never imagine that their consent made him sovereign", rather the authority did so. Similarly, Ferguson did not believe citizens built the state, rather polities grew out of social development. In his 1767 An Essay on the History of Civil Society, Ferguson uses the four stages of progress, a theory that was very popular in Scotland at the time, to explain how humans advance from a hunting and gathering society to a commercial and civil society without "signing" a social contract.',
 u'dialogId': -155769874

In [4]:
def is_bot(diag, user):
    text = " ".join([x["text"] for x in diag["thread"] if x["userId"] == user])
    
    if len(text) == 0:
        return False
    
    if "avilable" in text:
        return True
    
    if "Hint: first" in text:
        return True
    
    if " ." in text or " ," in text or " '" in text:
        return True
    
    if "\n" in text:
        return True
    
    return False

In [5]:
def get_score(diag, user):
    return filter(lambda x: x["userId"] != user, diag["evaluation"])[0]["quality"]

In [52]:
def correlation(diags, bst):
    df = make_features(diags, True)
    
    label_column = df["label"]
    features = df.drop(["label", "dialogId", "user"], axis=1)
    dtest = xgb.DMatrix(features.values, feature_names=features.columns)
    
    preds = bst.predict(dtest)
    
    for i, diag in enumerate(diags):
        for j, name in enumerate(["Bob","Alice"]):
            if is_bot(diag, name):
                preds[2*i+j] = 0

    df["prediction"] = preds
    
#     print df[["label","prediction"]]
    
    return spearmanr(df["label"].values, df["prediction"].values)

In [212]:
def correlation2(test_diags, test_set, bst):
    label_column = test_set["label"]
    features = test_set.drop(["label", "dialogId", "user"], axis=1)
    dtest = xgb.DMatrix(features.values, feature_names=features.columns)
    
    preds = bst.predict(dtest)

    for i, diag in enumerate(test_diags):
        for j, name in enumerate(["Bob","Alice"]):
            if is_bot(diag, name):
                preds[2*i+j] = 0

    test_set["prediction"] = preds
    
    return spearmanr(test_set["label"].values, test_set["prediction"].values)

In [210]:
def cv(diags, test_ratio=0.2, folds=5, params=None):
    scores = []
    df_feats = make_features(diags, True)
    
    for i in range(0,len(diags),len(diags)/folds)[:-1]:
        test_set = df_feats[2*i:2*i+2*int(test_ratio*len(diags))]
        test_diags = diags[i:i+int(test_ratio*len(diags))]
        train_set = pd.concat([df_feats[0:2*i],df_feats[2*i+2*int(test_ratio*len(diags)):]])

        label_column = train_set["label"]
        features = train_set.drop(["label", "dialogId", "user"], axis=1)

        dtrain = xgb.DMatrix(features.values, label_column.values, feature_names=features.columns)
        bst = xgb.train(param, dtrain, num_boost_round=30)
        
        
        scores.append(correlation2(test_diags, test_set, bst).correlation)
        print scores[-1]
        
    return np.mean(scores)

In [112]:
def flatten_dialogs(diags):
    res = []
    for d in diags:
        for t in d["thread"]:
            if len(t["text"].strip(" *")) > 10 and "\n" not in t["text"]:
                res.append((d["dialogId"],t["userId"],t["text"].encode("utf-8")))
    
    return pd.DataFrame.from_records(res, columns=["dialogId","userId","text"])

# Features

In [62]:
df = flatten_dialogs(diags)
df.to_csv("data/texts", sep="\t",encoding="utf-8")
ppl_df = pd.DataFrame.from_csv("data/ppl_scores", index_col=0).reset_index()
ppl_df = pd.concat([df,ppl_df], axis=1)

In [101]:
def ngram_ppl(diag, user):
    res = pd.Series()
    
    res["self_ngram_logprob"] = np.mean(ppl_df[ppl_df.dialogId == diag["dialogId"]][ppl_df.userId == user]["logprob"])
    res["self_ngram_ppl"] = np.mean(ppl_df[ppl_df.dialogId == diag["dialogId"]][ppl_df.userId == user]["ppl"])
    res["other_ngram_logprob"] = np.mean(ppl_df[ppl_df.dialogId == diag["dialogId"]][ppl_df.userId != user]["logprob"])
    res["other_ngram_ppl"] = np.mean(ppl_df[ppl_df.dialogId == diag["dialogId"]][ppl_df.userId != user]["ppl"])
    
    return res

In [7]:
def lengths(diag, user):
    res = pd.Series()
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    res["self_phrases_cnt"] = len(self_thread)
    res["other_phrases_cnt"] = len(other_thread)
    
    res["self_words_cnt"] = len([w for s in self_thread for w in s])
    res["other_words_cnt"] = len([w for s in other_thread for w in s])
    
    res["self_avg_words"] = 0 if res["self_phrases_cnt"] == 0 else float(res["self_words_cnt"])/res["self_phrases_cnt"] 
    res["other_avg_words"] = 0 if res["other_phrases_cnt"] == 0 else float(res["other_words_cnt"])/res["other_phrases_cnt"]
    
    return res

In [161]:
def context_similarity(diag, user):
    context_vector = np.mean([
        word.vector for word in nlp(diag["context"])
    ], axis=0).reshape((1,-1))
    
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    
    if self_thread and other_thread:
        self_vector = np.mean([
                word.vector for word in nlp(unicode(self_thread[0]["text"]))
            ],axis=0).reshape((1,-1))

        other_vector = np.mean([
                word.vector for word in nlp(unicode(other_thread[0]["text"]))
            ],axis=0).reshape((1,-1))

        res = pd.Series()

        res["self_context_cosine"] = 0 if not self_thread else cosine_similarity(self_vector, context_vector)
        res["other_context_cosine"] = 0 if not other_thread else cosine_similarity(other_vector, context_vector)
        return res
    else:
        return pd.Series()

In [214]:
def w2v(diag, user):
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    
    if self_thread and other_thread:
        self_vector = np.mean([
                word.vector for word in nlp(unicode(" ".join([x["text"] for x in self_thread])))
            ],axis=0)

        other_vector = np.mean([
                word.vector for word in nlp(unicode(" ".join([x["text"] for x in other_thread])))
            ],axis=0)
        
        context_vector = np.mean([
            word.vector for word in nlp(diag["context"])
        ], axis=0)

        self_ser = pd.Series(data=self_vector, index=["self_w2v_{}".format(i) for i in range(0, self_vector.shape[0])])
        other_ser = pd.Series(data=other_vector, index=["other_w2v_{}".format(i) for i in range(0, other_vector.shape[0])])
        return self_ser.append(other_ser)
    else:
        return pd.Series()

In [8]:
def freq_stat(diag, user):

    def topN_count(parsed_text):
        return len(filter(lambda x: x.orth_.lower() in top_words, parsed_text))

    def topN_count_lemma(parsed_text):
        return len(filter(lambda x: x.lemma_ in top_lemmas, parsed_text))

    def no_vocab_tokens(parsed_text):
        return len(filter(lambda x: x.lemma_ not in lemma_counts, parsed_text))

    def avg_index_lemma(parsed_text):
        freqs = [freq_idx_lemmas.get(token.lemma_, None) for token in parsed_text]
        logs = [math.log(x) for x in filter(lambda x: x, freqs)]
        if logs:
            return np.mean(logs)
        else:
            return None

    res = pd.Series()
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    
    res["self_topN_count"] = np.mean([
        topN_count(nlp(unicode(x))) for x in self_thread
    ])
    res["other_topN_count"] = np.mean([
        topN_count(nlp(unicode(x))) for x in other_thread
    ])
    
    res["self_no_vocab_tokens"] = np.mean([
        no_vocab_tokens(nlp(unicode(x))) for x in self_thread
    ])
    res["other_no_vocab_tokens"] = np.mean([
        no_vocab_tokens(nlp(unicode(x))) for x in other_thread
    ])
    
    res["self_avg_index_lemma"] = np.mean([
        topN_count(nlp(unicode(x))) for x in self_thread
    ])
    res["other_avg_index_lemma"] = np.mean([
        topN_count(nlp(unicode(x))) for x in other_thread
    ])
    
    return res

In [174]:
def make_features(diags,labeled=False):
    observations = []
    for d in diags:
        for name in ("Alice","Bob"):
            obs = freq_stat(d,name)\
            .append(lengths(d,name))\
            .append(ngram_ppl(d,name))\
            .append(context_similarity(d,name))\
            .append(w2v(d,name))
            
            obs["user"] = name
            obs["dialogId"] = d["dialogId"]
            
            if labeled:
                obs["label"] = get_score(d,name)
            
            observations.append(obs)
    
    return pd.DataFrame(observations)

In [146]:
param = {'eta':0.1, 'max_depth':3, 'min_child_weight':1, 'gamma':0.1,
         'silent':0, 'subsample':0.8, 'colsample_bytree': 0.8,  
         'objective':'reg:linear', 'eval_metric':'rmse'}

In [215]:
#+context_similarity
cv(diags,folds=10,params=param)

  after removing the cwd from sys.path.
  """
  
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


0.674480759818
0.727287984159
0.711365050089
0.716084322192
0.736449876131
0.688406255062
0.665303592635
0.691496486581
0.719814835325
0.718144848455


0.70488340104471636

In [175]:
cv(diags,folds=10,params=param)

  after removing the cwd from sys.path.
  """
  
  import sys


0.674480759818
0.727287984159
0.711365050089
0.716084322192
0.736449876131
0.688406255062
0.665303592635
0.691496486581
0.719814835325
0.718144848455
0.730267799491


0.70719107363069889

In [213]:
cv(diags,folds=10,params=param)

  after removing the cwd from sys.path.
  """
  
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


0.671341318063
0.716341404884
0.718474613098
0.691076095935
0.74375222679
0.673787903602
0.666707137732
0.689726211683
0.709204525515
0.700727290407


0.69811387277101589