In [1]:
import ujson
import os
from os.path import join
from collections import Counter

In [2]:
from functools import partial

In [3]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import keras
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [6]:
from numpy.random import random

In [7]:
from sklearn.preprocessing import MinMaxScaler

In [8]:
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error, log_loss, accuracy_score, roc_auc_score

In [9]:
from random import shuffle, randint
import pandas as pd

In [10]:
file_format = "train_{}.json"

In [11]:
DATADIR = "/workspace/data/bots/turing-data/"

In [12]:
#0.62

In [13]:
DATES = ["20170724", "20170725", "20170726"]
VAL_DATES = ["20170726"]

In [14]:
DATA = [join(DATADIR, file_format.format(x)) for x in DATES]
VAL_DATA = [join(DATADIR, file_format.format(x)) for x in VAL_DATES]

In [15]:
all_data = [ujson.load(open(x)) for x in DATA]
val_data = [ujson.load(open(x)) for x in VAL_DATA]

In [17]:
def get_replies(diag, user):
    replies = list(filter(lambda x: x["userId"] == user, diag["thread"]))
    return replies

In [18]:
for r in get_replies(all_data[0][2], "Bob"):
    print(r["text"])

Hi
What is your name?


In [19]:
TRAIN_PART = 0.8

In [20]:
MAX_PAD = 200

In [21]:
def get_score(diag, user):
    score = list(filter(lambda x: x["userId"] == user, diag["evaluation"]))[0]["quality"]
#     score = 1 if score >= 2 else 0
    return score

In [22]:
def get_replies(diag, user):
    replies = list(filter(lambda x: x["userId"] == user, diag["thread"]))
    return replies

In [23]:
def get_type(diag, user):
    u = list(filter(lambda x: x["id"] == user, diag["users"]))[0]['userType']
    return 1 if u == "Bot" else 0

In [38]:
def make_features(diags, labeled=False):
    observations = []
    
    for d in diags:
        
        for name in ["Alice", "Bob"]:
            
            replies = get_replies(d, name)
            other_replies = get_replies(d, "Alice" if name == "Bob" else "Bob")
            
            if len(replies) > 0 and len(other_replies) > 0:
                    
                obs = {}
                obs["reply"] = "$".join([r["text"] for r in replies])
                obs["other_reply"] = "$".join([r["text"] for r in other_replies])
                obs["user"] = name
                obs["dialogId"] = d["dialogId"]
                
                if labeled:
                    obs["label"] = get_score(d, name)
                    obs["userType"] = get_type(d, name)
                    obs["other_label"] = get_score(d, "Alice" if name == "Bob" else "Bob")
                    obs["other_userType"] = get_type(d, "Alice" if name == "Bob" else "Bob")
                
                observations.append(obs)
    
    return pd.DataFrame(observations)

In [39]:
df_val = make_features(val_data[0], True)

In [40]:
def get_train_test(all_data, split=TRAIN_PART):
    train_data = []
    test_data = []
    for al in all_data:
        for d in al:
            if random() < split:
                train_data.append(d)
            else:
                test_data.append(d)
    return train_data, test_data

In [41]:
def get_vocab(data):
    vocab = Counter()
    for r in data.iterrows():
        vocab.update(r[1]["reply"])
    return vocab

In [42]:
def text_to_seq(text, char_to_ind):
    return [char_to_ind.get(c, 0) for c in text]

In [43]:
def add_sequences(df, char_to_ind, max_pad=MAX_PAD):
    for j in ["reply", "other_reply"]:
        df["{}_seq".format(j)] = df[j].apply(lambda x: text_to_seq(x, char_to_ind))
        df["{}_seq_len".format(j)] = df["{}_seq".format(j)].apply(len)
        df["{}_seq_padded".format(j)] = df["{}_seq".format(j)].apply(lambda x: sequence.pad_sequences([x], max_pad, padding="post"))
    return df

In [63]:
def get_chat_model(n_words,
              vocab_len,
              emb_size,
              filters,
              kernel_size,
              d_size1,
              d_size2,
              l_r,
              need_more_dense1,
              need_more_dense2):
    
    embedding_layer = Embedding(vocab_len, emb_size) # shared
    
    reply = Input(shape = (n_words,), name='n_reply')   
    reply_e = embedding_layer(reply)
    x = Conv1D(filters, kernel_size, activation='relu', strides=1)(reply_e) 
    xg = GlobalMaxPooling1D()(x)
    
    other_reply = Input(shape = (n_words,), name='n_other_reply')
    other_reply_e = embedding_layer(other_reply)
    xj = Conv1D(filters, kernel_size, activation='relu', strides=1)(other_reply_e)   
    xj = GlobalMaxPooling1D()(xj)
    
    xg = concatenate([xg, xj])
    
    if need_more_dense1:
        
        x = Dense(d_size1, activation="linear")(xg)
        x = Dropout(0.1)(x)
        
        z = Dense(d_size1, activation="linear")(xg)
        z = Dropout(0.1)(z)
    else:
        x = xg
        z = xg
    
    metric = Dense(1, activation="linear", name="metric")(x)
    other_metric = Dense(1, activation="linear", name="other_metric")(z)
    
    if need_more_dense2:
        
        y = Dense(d_size2, activation="relu")(xg)
        y = Dropout(0.1)(y)
        
        z = Dense(d_size2, activation="relu")(xg)
        z = Dropout(0.1)(z)
    else:
        y = xg
        z = xg
        
    bot = Dense(1, activation="sigmoid", name="bot")(y)
    other_bot = Dense(1, activation="sigmoid", name="other_bot")(z)
    
    model = Model(inputs=[reply, other_reply], outputs=[metric, bot, other_metric, other_bot])
    
    model.compile(loss={"metric": "mse", "other_metric": "mse",
                        "bot": "binary_crossentropy", "other_bot": "binary_crossentropy"},
                  optimizer=Adam(l_r), metrics=["accuracy"], loss_weights={"metric": 1,
                                                                           "bot": 10,
                                                                           "other_metric": 1,
                                                                           "other_bot": 10})
    
    return model

In [67]:
def score(all_data, k_fold, params):
    
    print(params)
    
    mses = []
    rrs = []
    mses_o = []
    rrs_o = []
    
    for i in range(k_fold):
        
        train_data, test_data = get_train_test(all_data)
        shuffle(train_data)
        shuffle(test_data)
        df_train = make_features(train_data, True)
        df_test = make_features(test_data, True)

        vocab = get_vocab(df_train)  
        char_to_ind = {c[0]: i + 1 for i, c in enumerate(vocab.most_common())}
        ind_to_char = {i: c for c, i in char_to_ind.items()}

        df_train = add_sequences(df_train, char_to_ind, int(params["pad_size"]))
        df_test = add_sequences(df_test, char_to_ind, int(params["pad_size"]))

        y_train_score = df_train["label"].values
        y_test_score = df_test["label"].values
        
        y_train_score_o = df_train["other_label"].values
        y_test_score_o = df_test["other_label"].values

        y_train_bot = df_train["userType"].values
        y_test_bot = df_test["userType"].values
        
        y_train_bot_o = df_train["other_userType"].values
        y_test_bot_o = df_test["other_userType"].values

        x_train_r = np.vstack(df_train["reply_seq_padded"].values)
        x_train_o = np.vstack(df_train["other_reply_seq_padded"].values)
        
        x_test_r = np.vstack(df_test["reply_seq_padded"].values)
        x_test_o = np.vstack(df_test["other_reply_seq_padded"].values)

        model = get_chat_model(int(params["pad_size"]),
                               len(char_to_ind) + 1,
                               int(params["emb_size"]),
                               int(params["filters"]),
                               int(params["kernel_size"]),
                               int(params.get("d_size1", 4)),
                               int(params.get("d_size2", 4)),
                               params["lr"],
                               params["need_more_dense1"],
                               params["need_more_dense2"])

        n_epochs = int(params["n_epochs"])

        model.fit([x_train_r, x_train_o], [y_train_score, y_train_bot, y_train_score_o, y_train_bot_o], batch_size=16, epochs=n_epochs, verbose=0)

        train_preds = model.predict([x_train_r, x_train_o])
        test_preds = model.predict([x_test_r, x_test_o])

        mse_train = mean_squared_error(train_preds[0], y_train_score)
        mse_test = mean_squared_error(test_preds[0], y_test_score)
        
        rr_train = roc_auc_score(y_train_bot, train_preds[1])
        rr_test = roc_auc_score(y_test_bot, test_preds[1])
        
        mse_train_o = mean_squared_error(train_preds[2], y_train_score_o)
        mse_test_o = mean_squared_error(test_preds[2], y_test_score_o)
        
        rr_train_o = roc_auc_score(y_train_bot_o, train_preds[3])
        rr_test_o = roc_auc_score(y_test_bot_o, test_preds[3])
        
        mses.append(mse_test)
        rrs.append(rr_test)
        mses_o.append(mse_test_o)
        rrs_o.append(rr_test_o)
    
    mean_mse = np.mean(mse_test)
    mean_rr = np.mean(rr_test)
    mean_mse_o = np.mean(mse_test_o)
    mean_rr_o = np.mean(rr_test_o)
    
    return {'loss': mean_mse + 10 * mean_rr + mean_mse_o + 10 * mean_rr_o,
            'bot_auc': mean_rr,
            'metric_loss': mean_mse,
            'other_bot_auc': mean_rr_o,
            'other_metric_loss': mean_mse_o,
            'status': STATUS_OK}

In [68]:
K_FOLD = 3

In [75]:
def optimize(trials):
    space = {
             'n_epochs' : hp.quniform('n_epochs', 200, 200, 1),
             'emb_size' : hp.quniform('emb_size', 4, 64, 1),
             'filters' : hp.quniform('filters', 4, 64, 4),
             'kernel_size' : hp.quniform('kernel_size', 4, 4, 1),
             'pad_size': hp.quniform('pad_size', 40, 200, 1),
             'lr': hp.uniform('lr', 0.0001, 0.01),
             'need_more_dense1': hp.choice("need_more_dense1",
                                           [True, False]),
             'need_more_dense2': hp.choice("need_more_dense2",
                                           [True, False]),
             }
    best = fmin(partial(score, all_data, K_FOLD), space, algo=tpe.suggest, trials=trials, max_evals=100)
    return best

In [None]:
%%time
trials = Trials()
res = optimize(trials)

{'n_epochs': 200.0, 'need_more_dense1': True, 'lr': 0.0055435120757885556, 'emb_size': 43.0, 'kernel_size': 4.0, 'need_more_dense2': False, 'filters': 44.0, 'pad_size': 168.0}


In [77]:
res

{'emb_size': 45.0,
 'filters': 52.0,
 'kernel_size': 4.0,
 'lr': 0.0008896005810278546,
 'n_epochs': 200.0,
 'need_more_dense1': 0,
 'need_more_dense2': 1,
 'pad_size': 166.0}

In [78]:
trials.best_trial

{'book_time': datetime.datetime(2017, 7, 27, 0, 21, 2, 227000),
 'exp_key': None,
 'misc': {'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'idxs': {'emb_size': [2],
   'filters': [2],
   'kernel_size': [2],
   'lr': [2],
   'n_epochs': [2],
   'need_more_dense1': [2],
   'need_more_dense2': [2],
   'pad_size': [2]},
  'tid': 2,
  'vals': {'emb_size': [12.0],
   'filters': [8.0],
   'kernel_size': [4.0],
   'lr': [0.0003582588013982851],
   'n_epochs': [200.0],
   'need_more_dense1': [0],
   'need_more_dense2': [1],
   'pad_size': [70.0]},
  'workdir': None},
 'owner': None,
 'refresh_time': datetime.datetime(2017, 7, 27, 0, 30, 44, 552000),
 'result': {'bot_auc': 0.83649706457925643,
  'loss': 22.104022975446043,
  'metric_loss': 2.4526946763942812,
  'other_bot_auc': 0.87162426614481414,
  'other_metric_loss': 2.5701149918110553,
  'status': 'ok'},
 'spec': None,
 'state': 2,
 'tid': 2,
 'version': 0}

backup
====

In [1331]:
test_res = pd.DataFrame({"score": test_preds[0].reshape(1, -1)[0], "is_bot": test_preds[1].reshape(1, -1)[0]})
train_res = pd.DataFrame({"score": train_preds[0].reshape(1, -1)[0], "is_bot": train_preds[1].reshape(1, -1)[0]})

In [1332]:
test_res["label"] = df_test["label"]
test_res["is_bot_real"] = df_test["userType"]
test_res["dialog_id"] = df_test["dialogId"]
test_res["user"] = df_test["user"]
train_res["label"] = df_train["label"]
train_res["is_bot_real"] = df_train["userType"]
train_res["dialog_id"] = df_train["dialogId"]
train_res["user"] = df_train["user"]

In [1346]:
accuracy_score(test_res.is_bot_real, test_res.is_bot > 0.5)

0.90393013100436681

In [1348]:
roc_auc_score(test_res.is_bot_real, test_res.is_bot)

0.92551594746716692

In [1333]:
val_preds = model.predict(x_val)

In [1334]:
val_res = pd.DataFrame({"score": val_preds[0].reshape(1, -1)[0], "is_bot": val_preds[1].reshape(1, -1)[0]})

In [1335]:
val_res["dialog_id"] = df_val["dialogId"]
val_res["user"] = df_val["user"]

In [1336]:
train_res.to_csv("/workspace/data/bots/24_train.csv")
test_res.to_csv("/workspace/data/bots/25_test.csv")

In [1337]:
val_res.to_csv("/workspace/data/bots/26_val.csv")