In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import time
import pickle
import threading
import numpy as np
from progressbar import *
from models import Model
from models.ExpGlm import ExpGlm
from models.WblGlm import WblGlm
from models.NpGlm import NpGlm
from models.RayGlm import RayGlm
from features.delicious.extraction import run as delicious_run
from features.movielens.extraction import run as movielens_run
from features.dblp.extraction import run as dblp_run
from features.utils import timestamp_delta_generator
from features.autoencoder import encode
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error
from sklearn.preprocessing import MinMaxScaler

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
np.random.seed(0)

In [5]:
def get_model(dist):
    return {
        'np': NpGlm(),
        'wbl': WblGlm(),
        'exp': ExpGlm(),
        'ray': RayGlm(),
#         'pow': PowGlm(),
#         'gom': GomGlm()
    }[dist]

In [6]:
def generate_c_index(T_true, T_pred, Y):
    total_number_of_pairs = 0
    number_of_correct_predictions = 0

    for i in range(len(T_true)):
        for j in range(len(T_true) - 1, i, -1):
            if Y[i] != 0 or Y[j] != 0:  # if one or both of the samples are in observation window
                total_number_of_pairs += 1
                if T_true[i] > T_true[j] and T_pred[i] > T_pred[j]:
                    number_of_correct_predictions += 1
                if T_true[i] < T_true[j] and T_pred[i] < T_pred[j]:
                    number_of_correct_predictions += 1
                if T_true[i] == T_true[j] and T_pred[i] == T_pred[j]:
                    number_of_correct_predictions += 1

    return number_of_correct_predictions / total_number_of_pairs

In [7]:
def prepare_data(X, Y, T, convert_to_month=False):
    T = T.astype(np.float64)
    if convert_to_month:
        T /= timestamp_delta_generator(months=1)
    T += np.random.rand(len(T)) * Y

    index = np.argsort(T, axis=0).ravel()
    X = X[index, :]
    Y = Y[index]
    T = T[index]

    return X, Y, T

In [8]:
def evaluate(model: Model, X_train: np.ndarray, Y_train: np.ndarray, T_train: np.ndarray, X_test: np.ndarray,
             Y_test: np.ndarray, T_test: np.ndarray, acc_thresholds):
    model.fit(X_train, Y_train, T_train)

    # T_pred = model.mean(X_test)
    T_pred = model.quantile(X_test, .5).ravel()
    #     T_pred = np.fmin(T_pred, max(T_test))

    c_index = generate_c_index(T_test, np.fmin(T_pred, max(T_test)), Y_test)

    k = Y_test.sum()
    # X_test = X_test[:k, :]
    T_test = T_test[:k]
    T_pred = T_pred[:k]

    res = np.abs(T_pred - T_test)

    distance = np.zeros((len(acc_thresholds)))
    for i in range(len(acc_thresholds)):
        distance[i] = (res <= acc_thresholds[i]).sum() / len(res)

    #     ev = explained_variance_score(T_test, T_pred)
    mae = mean_absolute_error(T_test, T_pred)
    rmse = mean_squared_error(T_test, T_pred) ** .5
    msle = mean_squared_log_error(T_test, T_pred)
    mre = (res / T_test).mean()
    mad = median_absolute_error(T_test, T_pred)
    #     r2 = r2_score(T_test, T_pred)

    return (mae, mre, rmse, msle, mad, c_index) + tuple(distance)

In [9]:
def cross_validate(dists, X_stat, X, Y, T, cv=5, acc_thresholds:
    threads = []
    results = {dist+pos: [] for dist in dists for pos in ['', '_stat']}
    k_fold = StratifiedKFold(n_splits=cv, shuffle=True)

    widget = [Bar('=', '[', ']'), ' ', Percentage()]
    bar = ProgressBar(maxval=cv*len(dists)*2, widgets=widget)
    
    for training_indices, test_indices in k_fold.split(X=X, y=Y):
        X_stat_train = X_stat[training_indices, :]
        X_train = X[training_indices, :]
        Y_train = Y[training_indices]
        T_train = T[training_indices]

        X_stat_test = X_stat[test_indices, :]
        X_test = X[test_indices, :]
        Y_test = Y[test_indices]
        T_test = T[test_indices]

        def worker():
            for dist in dists:
                model = get_model(dist)
                scores = evaluate(model, X_train, Y_train, T_train, X_test, Y_test, T_test, acc_thresholds)
                results[dist].append(scores)
                bar.update(bar.currval+1)
                scores_stat = evaluate(model, X_stat_train, Y_train, T_train, X_stat_test, Y_test, T_test, acc_thresholds)
                results[dist+'_stat'].append(scores_stat)
                bar.update(bar.currval+1)

        job = threading.Thread(target=worker)
        threads.append(job)
        
    bar.start()

    for t in threads:
        t.start()
    for t in threads:
        t.join()
    
    bar.finish()
    
    return results

In [10]:
def get_name(dist):
    return {
        'np': 'NP-Glm',
        'wbl': 'Wbl-Glm',
        'exp': 'Exp-Glm',
        'ray': 'Ray-Glm',
        'gom': 'Gom-Glm'
    }[dist]

In [18]:
X_list, Y_raw, T_raw = dblp_run(delta=1, observation_window=6, n_snapshots=9)
# X_list, Y_raw, T_raw = delicious_run(delta=1, observation_window=6, n_snapshots=9)
# X_list, Y_raw, T_raw = movielens_run(delta=1, observation_window=6, n_snapshots=9)

20:27:27: generating papers ...
20:27:55: parsing dataset ...
20:27:55: generating samples ...
20:27:55: extracting ...


6834


20:28:03: parsing dataset ...
20:28:03: extracting ...




20:28:11: parsing dataset ...
20:28:11: extracting ...




20:28:17: parsing dataset ...
20:28:17: extracting ...




20:28:22: parsing dataset ...
20:28:23: extracting ...




20:28:27: parsing dataset ...
20:28:27: extracting ...




20:28:32: parsing dataset ...
20:28:32: extracting ...




20:28:36: parsing dataset ...
20:28:36: extracting ...




20:28:39: parsing dataset ...
20:28:39: extracting ...




20:28:43: parsing dataset ...
20:28:43: extracting ...




20:28:47: done.


In [19]:
limit = 4000
if len(Y_raw) > limit:
    X = np.stack(X_list, axis=1)  # X.shape = (n_samples, timesteps, n_features)
    X, _, Y_raw, _, T_raw, _ = train_test_split(X, Y_raw, T_raw, train_size=limit, stratify=Y_raw, shuffle=True)
    for i in range(len(X_list)):
        X_list[i] = X[:,i,:]



In [20]:
X_raw = encode(X_list, epochs=100, latent_factor=2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [21]:
start_time = time.time()
X, Y, T = prepare_data(X_raw, Y_raw, T_raw)
scaler = MinMaxScaler(copy=True)
X_stat = scaler.fit_transform(X_list[0])

dists = [
    'np',
    'wbl',
#     'exp',
#     'ray',
    # 'gom'
]

print(len(T))

results = cross_validate(dists, X_stat, X, Y, T, cv=5)
print("--- %s seconds ---" % (time.time() - start_time))


4000




--- 117.1062262058258 seconds ---




In [22]:
from tabulate import tabulate
table = []
row = []
header = ['MAE', 'MRE', 'RMSE', 'MSLE', 'MDAE', 'CI', 'ACC-1', 'ACC-2', 'ACC-3', 'ACC-4', 'ACC-5', 'ACC-6']
for pos in ['', '_stat']:
    for dist in dists:
        row.append(get_name(dist)+pos)
        result = np.array(results[dist+pos])
        mean = result.mean(axis=0)
        table.append(mean)
print(tabulate(table, showindex=row, floatfmt=".2f", headers=header))

                MAE    MRE    RMSE    MSLE    MDAE    CI    ACC-1    ACC-2    ACC-3    ACC-4    ACC-5    ACC-6
------------  -----  -----  ------  ------  ------  ----  -------  -------  -------  -------  -------  -------
NP-Glm         1.65   0.81    2.02    0.24    1.47  0.70     0.17     0.34     0.51     0.66     0.77     0.86
Wbl-Glm        3.57   2.02   25.43    0.44    1.63  0.70     0.16     0.30     0.45     0.59     0.69     0.78
NP-Glm_stat    2.86   1.42    3.13    0.46    3.08  0.40     0.05     0.10     0.18     0.28     0.38     0.47
Wbl-Glm_stat   2.89   1.43    3.17    0.47    3.05  0.50     0.04     0.10     0.18     0.28     0.36     0.48
