In [47]:
import datetime
import itertools
import os
import copy
import json
import pandas as pd
import numpy as np
from turing.features import nlp
from turing.features.freq import top_words, top_lemmas, word_counts, lemma_counts
import xgboost as xgb
from scipy.stats import spearmanr
from random import shuffle, randint
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, rbf_kernel, chi2_kernel, additive_chi2_kernel, manhattan_distances

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from turing import DATA_DIR

In [2]:
%matplotlib inline
plt.rcParams["figure.figsize"] = (30,15)

  from IPython.lib.inputhook import _use_appnope
  @inputhook_manager.register('osx')
  @inputhook_manager.register('wx')
  @inputhook_manager.register('qt', 'qt4')
  @inputhook_manager.register('qt5')
  @inputhook_manager.register('gtk')
  @inputhook_manager.register('tk')
  @inputhook_manager.register('glut')
  @inputhook_manager.register('pyglet')
  @inputhook_manager.register('gtk3')


In [3]:
xgb.__version__

'0.6'

In [4]:
def load_dialog(datestring):
    with open(os.path.join(DATA_DIR, "onsite", "train_{}.json".format(datestring))) as f:
        doc = json.load(f)
        for d in doc:
            d["date"] = datestring
        
        return doc

In [5]:
# doc = json.load(open(os.path.join(DATA_DIR, "onsite", "train_20170724.json")))
# doc2 = json.load(open(os.path.join(DATA_DIR, "onsite", "train_20170725.json")))
# # doc3 = json.load(open(os.path.join(DATA_DIR, "onsite", "train_20170726.json")))

# diags = doc + doc2
diags = []

days = ["20170724", "20170725", "20170726"]
# days = ["20170724", "20170725"]

for day in days:
    diags.extend(load_dialog(day))

In [6]:
diags[20]

{u'context': u"Detroit (/d\u1d7b\u02c8tr\u0254\u026at/) is the most populous city in the U.S. state of Michigan, the fourth-largest city in the Midwest and the largest city on the United States\u2013Canada border. It is the seat of Wayne County, the most populous county in the state. Detroit's metropolitan area, known as Metro Detroit, is home to 5.3 million people, making it the fourteenth-most populous metropolitan area in the United States and the second-largest in the Midwestern United States (behind Chicago). It is a major port on the Detroit River, a strait that connects the Great Lakes system to the Saint Lawrence Seaway. The City of Detroit anchors the second-largest economic region in the Midwest, behind Chicago, and the thirteenth-largest in the United States.",
 'date': '20170724',
 u'dialogId': -709063062,
 u'evaluation': [{u'quality': 1, u'userId': u'Bob'},
  {u'quality': 0, u'userId': u'Alice'}],
 u'thread': [{u'text': u'I went to States once, I really enjoyed visiting it

In [7]:
ALL_DOCUMENTS = []
ALL_CONTEXTS = []
ALL_UTTERANCES = []

for d in diags:
    context = d["context"]
    utterances = []
    for t in d["thread"]:
        utterances.append(t["text"])
    ALL_DOCUMENTS.append(context)
    ALL_DOCUMENTS.extend(utterances)
    
    ALL_CONTEXTS.append(context)
    ALL_UTTERANCES.extend(utterances)
    

### Global vocabulary/text collection features

In [8]:
ANALYZER = "char"
# ANALYZER = "word"
NGRAM_RANGE = (2, 2)

In [9]:
ngrams_count_vectorizer = lambda: CountVectorizer(analyzer=ANALYZER, ngram_range=NGRAM_RANGE)
tfidf_vectorizer = lambda: TfidfVectorizer(analyzer=ANALYZER, ngram_range=NGRAM_RANGE)

In [10]:
all_docs_count_ngrams = ngrams_count_vectorizer().fit(ALL_DOCUMENTS)
utterances_count_ngrams = ngrams_count_vectorizer().fit(ALL_UTTERANCES)

In [11]:
all_docs_tfidf_ngrams = tfidf_vectorizer().fit(ALL_DOCUMENTS)
utterances_tfidf_ngrams = tfidf_vectorizer().fit(ALL_UTTERANCES)

In [12]:
def make_vector_features(mode="count", scope="utterance"):
    if mode == "count":
        if scope == "all":
            vectorizer = all_docs_count_ngrams
        else:
            vectorizer = utterances_count_ngrams
    else:
        if scope == "all":
            vectorizer = all_docs_tfidf_ngrams
        else:
            vectorizer = utterances_tfidf_ngrams
    
    def count_vector_all_replics(diag, user):
        self, other = [], []
        for msg in diag["thread"]:
            if msg["userId"] == user:
                self.append(msg["text"])
            else:
                other.append(msg["text"])

        if self and other:
            sim_metrics = pd.Series()
            self_vec, other_vec = vectorizer.transform([" ".join(self), " ".join(other)]).toarray()
            self_vec = pd.Series(data=self_vec, index=["self_countvec_{}".format(i) for i in range(0, self_vec.shape[0])])
            other_vec = pd.Series(data=other_vec, index=["self_countvec_{}".format(i) for i in range(0, other_vec.shape[0])])

#             sim_metrics.append(self_vec)
#             sim_metrics.append(other_vec)

            self_vec = self_vec.reshape((1,-1))
            other_vec = other_vec.reshape((1,-1))
            sim_metrics["cosine_sim"] = cosine_similarity(self_vec, other_vec).flatten()[0]
#             sim_metrics["linear_sim"] = linear_kernel(self_vec, other_vec)[0]
#             sim_metrics["polynomial_sim"] = polynomial_kernel(self_vec, other_vec)[0]
            sim_metrics["rbf_sim"] = rbf_kernel(self_vec, other_vec).flatten()[0]
#             sim_metrics["chi2_sim"] = chi2_kernel(self_vec, other_vec)[0]
            sim_metrics["additive_chi2_sim"] = additive_chi2_kernel(self_vec, other_vec).flatten()[0]
#             sim_metrics["manhattan_sim"] = manhattan_distances(self_vec, other_vec)[0]
            return sim_metrics
        else:
            return pd.Series()

    return count_vector_all_replics
# count_vector_all_replics(diags[2], "Alice")

### Utterances helpers

In [907]:
def is_question(u):
    u = u.lower()
    return (("?" in u) or 
            ("what" in u) or 
            ("why" in u) or 
            ("how" in u) or 
            ("when" in u) or
            ("do you" in u) or
            ("did you" in u) or
            ("have you" in u))

### Other features

In [13]:
def _get_threads(diag, user):
    """ Return threads of self and other, relative to given user """
    self_thread = [u["text"] for u in filter(lambda x: x["userId"] == user, diag["thread"])]
    other_thread = [u["text"] for u in filter(lambda x: x["userId"] != user, diag["thread"])]
    return self_thread, other_thread

In [14]:
def dataset_day(diag, user):
    res = pd.Series()
    date = (datetime.datetime.strptime(diag["date"], "%Y%m%d") - datetime.datetime(2017, 7, 24)).days
    
    res["dataset_day"] = date
    return res

In [15]:
def is_bot(diag, user):
    text = " ".join([x["text"] for x in diag["thread"] if x["userId"] == user])
    
    if len(text) == 0:
        return False
    
    if "avilable" in text:
        return True
    
    if "Hint: first" in text:
        return True
    
    if " ." in text or " ," in text or " '" in text:
        return True
    
    if "\n" in text:
        return True
    
    return False

In [16]:
def get_score(diag, user):
    return filter(lambda x: x["userId"] == user, diag["evaluation"])[0]["quality"]

In [17]:
def correlation(diags, bst):
    df = make_features(diags, True)
    
    label_column = df["label"]
    features = df.drop(["label", "dialogId", "user"], axis=1)
    dtest = xgb.DMatrix(features.values, feature_names=features.columns)
    
    preds = bst.predict(dtest)
    
    for i, diag in enumerate(diags):
        for j, name in enumerate(["Alice", "Bob"]):
            if is_bot(diag, name):
                preds[2*i+j] = 0

    df["prediction"] = preds
    
#     print df[["label","prediction"]]
    
    return spearmanr(df["label"].values, df["prediction"].values)

In [18]:
def correlation2(test_diags, test_set, bst):
    label_column = test_set["label"]
    features = test_set.drop(["label", "dialogId", "user"], axis=1)
    dtest = xgb.DMatrix(features.values, feature_names=features.columns)
    
    preds = bst.predict(dtest)

    for i, diag in enumerate(test_diags):
        for j, name in enumerate(["Alice", "Bob"]):
            if is_bot(diag, name):
                preds[2*i+j] = 0

    test_set["prediction"] = preds
    
    return (
        spearmanr(test_set["label"].values, test_set["prediction"].values), 
        test_set[test_set["label"] != test_set["prediction"]]
    )

In [19]:
def flatten_dialogs(diags):
    res = []
    for d in diags:
        for t in d["thread"]:
            if len(t["text"].strip(" *")) > 10 and "\n" not in t["text"]:
                res.append((d["dialogId"],t["userId"],t["text"].encode("utf-8")))
    
    return pd.DataFrame.from_records(res, columns=["dialogId","userId","text"])

# Features

In [38]:
df = flatten_dialogs(diags)
# df.to_csv("data/texts", sep="\t",encoding="utf-8")
ppl_df = pd.DataFrame.from_csv(os.path.join(DATA_DIR, "ppl_scores.crash"), index_col=0).reset_index()
ppl_df = pd.concat([df,ppl_df], axis=1)

In [39]:
ppl_df.head()

Unnamed: 0,dialogId,userId,text,logprob,ppl
0,-315877751,Alice,"Hi! As for me, I thought Spain got united much...",-8.808901,25377.6
1,-155769874,Alice,Who uses the four stages of civil society ?,-19.16043,546.015
2,-155769874,Alice,Ehh its incorrect. Hint: first 3 answer letter...,-13.5943,184.3812
3,-155769874,Bob,What is your name?,-14.51081,68696.95
4,-155769874,Alice,"Please, speak with me.",-11.92122,955.6641


In [20]:
def ngram_ppl(diag, user):
    res = pd.Series()
    
    res["self_ngram_logprob"] = np.mean(ppl_df[ppl_df.dialogId == diag["dialogId"]][ppl_df.userId == user]["logprob"])
    res["self_ngram_ppl"] = np.mean(ppl_df[ppl_df.dialogId == diag["dialogId"]][ppl_df.userId == user]["ppl"])
    res["other_ngram_logprob"] = np.mean(ppl_df[ppl_df.dialogId == diag["dialogId"]][ppl_df.userId != user]["logprob"])
    res["other_ngram_ppl"] = np.mean(ppl_df[ppl_df.dialogId == diag["dialogId"]][ppl_df.userId != user]["ppl"])
    
    return res

In [21]:
def lengths(diag, user):
    res = pd.Series()
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    res["self_phrases_cnt"] = len(self_thread)
    res["other_phrases_cnt"] = len(other_thread)
    
    res["self_words_cnt"] = len([w for s in self_thread for w in s])
    res["other_words_cnt"] = len([w for s in other_thread for w in s])
    
    res["self_avg_words"] = 0 if res["self_phrases_cnt"] == 0 else float(res["self_words_cnt"])/res["self_phrases_cnt"] 
    res["other_avg_words"] = 0 if res["other_phrases_cnt"] == 0 else float(res["other_words_cnt"])/res["other_phrases_cnt"]
    
    return res

In [22]:
def context_similarity(diag, user):
    context_vector = np.mean([
        word.vector for word in nlp(diag["context"])
    ], axis=0).reshape((1,-1))
    
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    
    if self_thread and other_thread:
        self_vector = np.mean([
                word.vector for word in nlp(unicode(self_thread[0]["text"]))
            ],axis=0).reshape((1,-1))

        other_vector = np.mean([
                word.vector for word in nlp(unicode(other_thread[0]["text"]))
            ],axis=0).reshape((1,-1))
        
        res = pd.Series()
        
        if not any(np.isnan(self_vector).flatten()):
            res["self_context_cosine"] = cosine_similarity(self_vector, context_vector).flatten()[0]
        else:
            res["self_context_cosine"] = 0
        if not any(np.isnan(other_vector).flatten()):
            res["other_context_cosine"] = cosine_similarity(other_vector, context_vector).flatten()[0]
        else:
            res["other_context_cosine"] = 0
            
        return res
    else:
        return pd.Series()

In [23]:
def w2v(diag, user):
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    
    res = pd.Series()
    
    if self_thread and other_thread:
        self_vector = np.mean([
                word.vector for word in nlp(unicode(" ".join([x["text"] for x in self_thread])))
            ],axis=0)

        other_vector = np.mean([
                word.vector for word in nlp(unicode(" ".join([x["text"] for x in other_thread])))
            ],axis=0)
        
        avg_utterances_vector = np.mean([self_vector, other_vector], axis=0)
        
        context_vector = np.mean([
            word.vector for word in nlp(diag["context"])
        ], axis=0)

        self_ser = pd.Series(data=self_vector, index=["self_w2v_{}".format(i) for i in range(0, self_vector.shape[0])])
        other_ser = pd.Series(data=other_vector, index=["other_w2v_{}".format(i) for i in range(0, other_vector.shape[0])])
        res = self_ser.append(other_ser)
        
        self_vector = self_vector.reshape((1,-1))
        other_vector = other_vector.reshape((1,-1))
        avg_utterances_vector = avg_utterances_vector.reshape((1,-1))
        context_vector = context_vector.reshape((1,-1))
        
        res["w2v_sim"] = cosine_similarity(self_vector, other_vector).flatten()[0]
        w2v_sim_ctx_self = cosine_similarity(self_vector, context_vector).flatten()[0]
        w2v_sim_ctx_other = cosine_similarity(other_vector, context_vector).flatten()[0]
        res["w2v_other_better_ctx"] = w2v_sim_ctx_other / w2v_sim_ctx_self if w2v_sim_ctx_self else 1
#         res["w2v_sim_ctx_utterances"] = cosine_similarity(avg_utterances_vector, context_vector)[0]
        return res
    else:
        return pd.Series()

In [24]:
def questions_count(diag, user):
    """ Counts number of questions in the dialog """
    self, other = _get_threads(diag, user)
    
    self_questions = 0
    other_questions = 0

    for q in self:
        if is_question(q):
            self_questions += 1
    
    for q in other:
        if is_question(q):
            other_questions += 1
    
    res = pd.Series()
    res["questions_self"] = float(self_questions) / len(self) if self else 0
    res["questions_other"] = float(other_questions) / len(other) if other else 0
    res["questions_total"] = float(self_questions + other_questions) / (len(self) + len(other)) if self and other else 0
    return res
    

In [25]:
def token_info_gain(diag, user):
    """ Computes information gain metrics """
    self, other = _get_threads(diag, user)
    
    self_tokens = set()
    for u in self:
        self_tokens.update(u.split(" "))
    
    other_tokens = set()
    for u in other:
        other_tokens.update(u.split(" "))
        
    total = len(self_tokens) + len(other_tokens)
    res = pd.Series()
    res["extra_tokens_self"] = (float(len(self_tokens ^ other_tokens)) / total) if total else 0
    res["extra_tokens_other"] = (float(len(other_tokens ^ self_tokens)) / total) if total else 0
    return res
    

In [26]:
def dialog_flow_metrics(diag, user):
    """ Computes dialog flow metrics """
    self, other = _get_threads(diag, user)
    ctx = diag["context"]
    
    res = pd.Series()
    
    avg_ctx = np.mean([
        word.vector for word in nlp(ctx)
    ], axis=0).reshape((1,-1))
    
    self_flow = []
    other_flow = []
    for s, o in itertools.izip_longest(self, other):
        if s:
            vector = np.mean([word.vector for word in nlp(s)], axis=0).reshape((1,-1))
            sim = cosine_similarity(avg_ctx, vector).flatten()[0]
            self_flow.append(sim)
        else:
            self_flow.append(0)
        
        if o:
            vector = np.mean([word.vector for word in nlp(o)], axis=0).reshape((1,-1))
            sim = cosine_similarity(avg_ctx, vector).flatten()[0]
            other_flow.append(sim)
        else:
            other_flow.append(0)
    
    if self_flow and other_flow:
        corr = spearmanr(self_flow, other_flow).correlation
        res["dialog_flow_ctx_corr"] = corr
    return res

In [27]:
def freq_stat(diag, user):

    def topN_count(parsed_text):
        return len(filter(lambda x: x.orth_.lower() in top_words, parsed_text))

    def topN_count_lemma(parsed_text):
        return len(filter(lambda x: x.lemma_ in top_lemmas, parsed_text))

    def no_vocab_tokens(parsed_text):
        return len(filter(lambda x: x.lemma_ not in lemma_counts, parsed_text))

    def avg_index_lemma(parsed_text):
        freqs = [freq_idx_lemmas.get(token.lemma_, None) for token in parsed_text]
        logs = [math.log(x) for x in filter(lambda x: x, freqs)]
        if logs:
            return np.mean(logs)
        else:
            return None

    res = pd.Series()
    self_thread = filter(lambda x: x["userId"] == user, diag["thread"])
    other_thread = filter(lambda x: x["userId"] != user, diag["thread"])
    
    self_thread = [nlp(unicode(x)) for x in self_thread]
    other_thread = [nlp(unicode(x)) for x in other_thread]
    
    res["self_topN_count"] = np.mean([
        topN_count(x) for x in self_thread
    ])
    res["other_topN_count"] = np.mean([
        topN_count(x) for x in other_thread
    ])
    
    res["self_no_vocab_tokens"] = np.mean([
        no_vocab_tokens(x) for x in self_thread
    ])
    res["other_no_vocab_tokens"] = np.mean([
        no_vocab_tokens(x) for x in other_thread
    ])
    
    res["self_avg_index_lemma"] = np.mean([
        topN_count(x) for x in self_thread
    ])
    res["other_avg_index_lemma"] = np.mean([
        topN_count(x) for x in other_thread
    ])
    
    return res

In [28]:
def show_model_features(model, num=30):
    return sorted(model.get_fscore().items(), key=lambda x: -x[1])[:num]

In [29]:
def make_features(diags,labeled=False):
    observations = []
    for d in diags:
        for name in ("Bob", "Alice"):
            obs = (
                freq_stat(d,name)
                    .append(lengths(d,name))
                    .append(context_similarity(d,name))
                    .append(w2v(d,name))
                    .append(make_vector_features("count", "all")(d,name))
                    .append(dataset_day(d, name))
                    .append(token_info_gain(d, name))
                    .append(dialog_flow_metrics(d, name))
#                     .append(questions_count(d, name))
#             .append(ngram_ppl(d,name))
            )
            
            obs["user"] = name
            obs["dialogId"] = d["dialogId"]
            
            if labeled:
                obs["label"] = get_score(d,name)
            
            observations.append(obs)
    
    return pd.DataFrame(observations)

In [1003]:
def custom_objective(preds, dtrain):
    labels = dtrain.get_label()
    
    grad = (preds - labels) * (preds > labels)
    hess = 
    return grad, hess

In [30]:
shuffle(diags)

In [31]:
df_feats = make_features(diags, True)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [44]:
def cv(diags, df_features, test_ratio=0.2, folds=5, params=None):
    scores = []
    fails_cv = []    
    
    for i in range(0,len(diags),len(diags)/folds)[:-1]:
        test_set = df_feats[2*i:2*i+2*int(test_ratio*len(diags))]
        test_diags = diags[i:i+int(test_ratio*len(diags))]
        train_set = pd.concat([df_feats[0:2*i],df_feats[2*i+2*int(test_ratio*len(diags)):]])

        label_column = train_set["label"]
        features = train_set.drop(["label", "dialogId", "user"], axis=1)

        dtrain = xgb.DMatrix(features.values, label_column.values, feature_names=features.columns)
        dtrain.set_group([features.shape[0]])
        bst = xgb.train(param, dtrain, num_boost_round=150)
        corr, fails = correlation2(test_diags, test_set, bst)
        scores.append(corr.correlation)
        fails_cv.append(fails)
#         print scores[-1]
        
    return np.mean(scores), bst, fails_cv

In [51]:
# param = {'eta':0.1, 'max_depth':3, 'min_child_weight':1, 'gamma':0.1,
#          'silent':0, 'subsample':0.8, 'colsample_bytree': 0.8,  
#          'objective':'reg:linear'}
# param = {'eta':0.15, 'max_depth':4, 'min_child_weight':1, 'gamma':0.1,
#          'silent':0, 'subsample':0.8, 'colsample_bytree': 0.8, 'n_estimators': 50,
#          'objective':'reg:linear', 'eval_metric':'rmse'}
param = {'eta':0.15, 'max_depth':8, 'min_child_weight':1, 'gamma':0.1,
         'silent':0, 'subsample':0.8, 'colsample_bytree': 0.8, 'n_estimators': 50,
         'objective': 'reg:linear', 'eval_metric':'rmse'}

The history saving thread hit an unexpected error (OperationalError('unable to open database file',)).History will not be written to the database.


In [40]:
# START: 0.669125953157

# param = {'eta':0.1, 'max_depth':4, 'min_child_weight':1, 'gamma':0.1,
#          'silent':0, 'subsample':0.8, 'colsample_bytree': 0.8, 'n_estimators': 50,
#          'objective':'reg:linear', 'eval_metric':'rmse'}: 0.672739057296

# param = {'eta':0.15, 'max_depth':4, 'min_child_weight':1, 'gamma':0.1,
#          'silent':0, 'subsample':0.8, 'colsample_bytree': 0.8, 'n_estimators': 50,
#          'objective':'reg:linear', 'eval_metric':'rmse'}: 0.673189943236



In [52]:
#+context_similarity
for md in [4,8,12]:
    for eta in [0.1, 0.2, 0.3]:
        for n_est in [20, 30,40,50,60, 70]:
            p = copy.copy(param)
            p.update({
                'max_depth': md,
                'eta': eta,
                'n_estimators': n_est
            })
            print p
            res, model, fails = cv(diags, df_feats, folds=5, params=p)
            print res, '\n\n'

{'colsample_bytree': 0.8, 'silent': 0, 'eval_metric': 'rmse', 'min_child_weight': 1, 'n_estimators': 20, 'subsample': 0.8, 'eta': 0.1, 'objective': 'reg:linear', 'max_depth': 4, 'gamma': 0.1}
0.691654902156 


{'colsample_bytree': 0.8, 'silent': 0, 'eval_metric': 'rmse', 'min_child_weight': 1, 'n_estimators': 30, 'subsample': 0.8, 'eta': 0.1, 'objective': 'reg:linear', 'max_depth': 4, 'gamma': 0.1}
0.691654902156 


{'colsample_bytree': 0.8, 'silent': 0, 'eval_metric': 'rmse', 'min_child_weight': 1, 'n_estimators': 40, 'subsample': 0.8, 'eta': 0.1, 'objective': 'reg:linear', 'max_depth': 4, 'gamma': 0.1}
0.691654902156 


{'colsample_bytree': 0.8, 'silent': 0, 'eval_metric': 'rmse', 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.8, 'eta': 0.1, 'objective': 'reg:linear', 'max_depth': 4, 'gamma': 0.1}
0.691654902156 


{'colsample_bytree': 0.8, 'silent': 0, 'eval_metric': 'rmse', 'min_child_weight': 1, 'n_estimators': 60, 'subsample': 0.8, 'eta': 0.1, 'objective': 'reg:linear'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 

In [805]:
test_diags = load_dialog("20170726")
test_set = make_features(test_diags, True)

res = correlation2(test_diags, test_set, model)



In [806]:
res

(SpearmanrResult(correlation=0.40748362869971538, pvalue=2.0499227336345831e-23),
      additive_chi2_sim  cosine_sim  dataset_day    dialogId  \
 0          -177.088034    0.563971          2.0   524099220   
 1          -177.088034    0.563971          2.0   524099220   
 2          -159.415466    0.534078          2.0  1721190162   
 4          -173.782540    0.624292          2.0 -1952432306   
 7           -87.666667    0.158193          2.0   190425739   
 8                  NaN         NaN          2.0 -1397268762   
 9                  NaN         NaN          2.0 -1397268762   
 10         -141.361905    0.521311          2.0    69154797   
 11         -141.361905    0.521311          2.0    69154797   
 13         -217.204762    0.500918          2.0   314998699   
 15         -111.104762    0.404853          2.0 -1482843909   
 16         -205.475783    0.646677          2.0  1573121824   
 17         -205.475783    0.646677          2.0  1573121824   
 18         -158.30202

In [898]:
show_model_features(model, 30)

[('other_context_cosine', 181),
 ('self_context_cosine', 171),
 ('additive_chi2_sim', 157),
 ('self_avg_index_lemma', 156),
 ('dialog_flow_ctx_corr', 148),
 ('cosine_sim', 145),
 ('w2v_other_better_ctx', 138),
 ('other_no_vocab_tokens', 137),
 ('other_avg_index_lemma', 126),
 ('w2v_sim', 115),
 ('self_no_vocab_tokens', 111),
 ('rbf_sim', 104),
 ('other_phrases_cnt', 81),
 ('extra_tokens_other', 74),
 ('self_phrases_cnt', 48),
 ('dataset_day', 39),
 ('other_topN_count', 36),
 ('self_topN_count', 29),
 ('extra_tokens_self', 21),
 ('self_words_cnt', 10),
 ('other_words_cnt', 7),
 ('other_avg_words', 1)]

In [456]:
features_ds = make_features(diags, True)



In [349]:
fails[0].count()

dialogId                 290
label                    290
other_avg_index_lemma    223
other_avg_words          290
other_context_cosine     177
other_no_vocab_tokens    223
other_phrases_cnt        290
other_topN_count         223
other_w2v_0              177
other_w2v_1              177
other_w2v_10             177
other_w2v_100            177
other_w2v_101            177
other_w2v_102            177
other_w2v_103            177
other_w2v_104            177
other_w2v_105            177
other_w2v_106            177
other_w2v_107            177
other_w2v_108            177
other_w2v_109            177
other_w2v_11             177
other_w2v_110            177
other_w2v_111            177
other_w2v_112            177
other_w2v_113            177
other_w2v_114            177
other_w2v_115            177
other_w2v_116            177
other_w2v_117            177
                        ... 
self_w2v_75              177
self_w2v_76              177
self_w2v_77              177
self_w2v_78   

In [None]:
sorted(model.get_fscore().items(), key=lambda x: -x[1])[:30]

In [None]:
def make_ds(diags, features, labeled=False):
    """ Composes a model with given features """
    observations = []
    for d in diags:
        for name in ("Bob", "Alice"):
            obs = pd.Series()
            for func in features:
                obs = obs.append(func(d, name))
            obs["user"] = name
            obs["dialogId"] = d["dialogId"]
            
            if labeled:
                obs["label"] = get_score(d,name)
            
            observations.append(obs)
    
    return pd.DataFrame(observations)

In [None]:
def run_cv(diags, features_df, test_ratio=0.2, folds=5, params=None):
    scores = []
    fails_cv = []
    train_test = []
    df_feats = features_df
    
    for i in range(0,len(diags),len(diags)/folds)[:-1]:
        test_set = df_feats[2*i:2*i+2*int(test_ratio*len(diags))]
        test_diags = diags[i:i+int(test_ratio*len(diags))]
        train_set = pd.concat([df_feats[0:2*i],df_feats[2*i+2*int(test_ratio*len(diags)):]])

        label_column = train_set["label"]
        features = train_set.drop(["label", "dialogId", "user"], axis=1)

        dtrain = xgb.DMatrix(features.values, label_column.values, feature_names=features.columns)
        bst = xgb.train(param, dtrain, num_boost_round=150)
        corr, fails = correlation2(test_diags, test_set, bst)
        scores.append(corr.correlation)
        fails_cv.append(fails)
        train_test.append((train_set, test_set))
        print scores[-1]
        
    return np.mean(scores), bst, fails_cv, train_test

In [None]:
# def correlation2(test_diags, test_set, bst):
#     label_column = test_set["label"]
#     features = test_set.drop(["label", "dialogId", "user"], axis=1)
#     dtest = xgb.DMatrix(features.values, feature_names=features.columns)
    
#     preds = bst.predict(dtest)

#     for i, diag in enumerate(test_diags):
#         for j, name in enumerate(["Alice", "Bob"]):
#             if is_bot(diag, name):
#                 preds[2*i+j] = 0

#     test_set["prediction"] = preds
    
#     return (
#         spearmanr(test_set["label"].values, test_set["prediction"].values), 
#         test_set[test_set["label"] != test_set["prediction"]]
#     )

### Pasha's features

In [686]:
all_train = pd.read_csv(os.path.join(DATA_DIR, "all_own_train.csv"), index_col=0)
all_test = pd.read_csv(os.path.join(DATA_DIR, "all_own_test.csv"), index_col=0)

metric_train = pd.read_csv(os.path.join(DATA_DIR, "metric_own_train.csv"), index_col=0)
metric_test = pd.read_csv(os.path.join(DATA_DIR, "metric_own_test.csv"), index_col=0)

In [687]:
all_train.head()

Unnamed: 0,is_bot,is_bot_other,score,score_other,label,is_bot_real,dialog_id,user
0,0.9999983,3.698927e-11,3.087171,0.561719,5,1,-945444646,Alice
1,2.990065e-11,1.0,0.291393,3.151923,0,0,-945444646,Bob
2,2.378284e-14,2.343623e-11,2.931613,4.739244,3,0,1219840152,Alice
3,1.961205e-19,1.103536e-08,4.849874,3.019071,5,0,1219840152,Bob
4,1.473477e-11,0.9999994,-0.1562,4.002214,0,0,-813477735,Alice


In [688]:
metric_train.head()

Unnamed: 0,is_bot,score,label,is_bot_real,dialog_id,user
0,4.543962e-12,1.411948,1,0,-383961695,Alice
1,7.338488e-09,0.645948,1,0,407487443,Alice
2,4.448519e-18,1.421263,1,0,407487443,Bob
3,6.534568e-14,3.929897,4,0,-1254951162,Alice
4,0.9684952,-0.192897,0,1,-1254951162,Bob


In [689]:
def merge_diags_with_new(main_ds, new_ds):
    _ds = new_ds.drop(["label", "user"])
    return pd.merge(main_ds, new_ds, left_on="dialogId", right_on="dialog_id", how='left')

# def merge_diags_with_metric(main_ds, new_ds):
#     _ds = new_ds.drop(["label", "user"])
#     return pd.merge(main_ds, new_ds, left_on="dialogId", right_on="dialog_id", how='left')

### Run

In [690]:
features = [
    freq_stat, 
    lengths,
    context_similarity,
    w2v,
    make_vector_features("count", "all"),
    dataset_day,
    token_info_gain,
    dialog_flow_metrics,
#     questions_count,
#     ngram_ppl
]

In [691]:
main_ds = make_ds(
    diags, features, True
)



In [1008]:
w2v_ds = make_ds(diags, [w2v, context_similarity], True)

In [1009]:
freq_ds = make_ds(
    diags, 
    [freq_stat, lengths, make_vector_features("count", "all"), token_info_gain, dialog_flow_metrics],
    True
)



In [1011]:
w2v_results = run_cv(diags, w2v_ds, params=param)

0.670931082664
0.679240272281
0.591759702003
0.63532892902
0.651660553082


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [1012]:
freq_results = run_cv(diags, freq_ds, params=param)

0.668197015897
0.599121176117
0.622375320341
0.640056386201
0.652028588059


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [1013]:
_, w2v_model, _, w2v_train_test = w2v_results
_, freq_model, _, freq_train_test = freq_results