In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from zipfile import ZipFile
import pandas as pd
import torch
import random
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm, trange

from HP import PointProcessStorage, DirichletMixtureModel, EM_clustering
from metrics import consistency, purity
from data_utils import load_data

### IPTV Data

In [3]:
path = Path('../..', 'data', 'IPTV_Data')
ss, Ts, class2idx = load_data(path, nfiles=300, ext='txt', datetime=True, maxlen=500)

In [6]:
N = len(ss)

In [7]:
D = 3
basis_fs = [lambda x: torch.exp(- x**2 / (2.*(k+1)**2) ) for k in range(D)]

In [8]:
C = len(class2idx)
K = 6

In [9]:
ntrials = 10
niter = 5

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in trange(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    train_ids = np.sort(np.random.choice(np.arange(len(ss)), size=len(ss) // 2, replace=False))
    train_fold = [ss[i] for i in range(len(ss)) if i in train_ids]
    train_Ts = Ts[train_ids]
    
    # learn
    hp = PointProcessStorage(train_fold, train_Ts, basis_fs)
    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model, n_inner=5)

    _, nll_history, _ = EM.learn_hp(niter=niter, ninner=[2,4,6,7,8])

    # validate
    EM.hp = PointProcessStorage(ss, Ts, basis_fs)
    EM.N = len(ss)
    EM.int_g = []
    EM.g = []
    r = EM.e_step()
    
    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'\nConsistency of clustering: {consistency(labels[:i+1]).item():.4f}')

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:30, 37.65s/it][A
NLL / N: 537.3705

 40%|████      | 2/5 [01:03<01:42, 34.08s/it][A
NLL / N: 596.5760

 60%|██████    | 3/5 [01:36<01:07, 33.93s/it][A
NLL / N: 587.5982

 80%|████████  | 4/5 [02:15<00:35, 35.21s/it][A
NLL / N: 601.1137

100%|██████████| 5/5 [02:58<00:00, 35.79s/it]
NLL / N: 645.3842

 10%|█         | 1/10 [03:50<34:37, 230.82s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
Consistency of clustering: nan

 20%|██        | 1/5 [00:38<02:32, 38.19s/it][A
NLL / N: 522.6370

 40%|████      | 2/5 [01:05<01:44, 34.94s/it][A
NLL / N: 516.6924

 60%|██████    | 3/5 [01:42<01:11, 35.68s/it][A
NLL / N: 482.7739

 80%|████████  | 4/5 [02:23<00:37, 37.17s/it][A
NLL / N: 487.5942

100%|██████████| 5/5 [03:09<00:00, 37.82s/it]
NLL / N: 559.7852

 20%|██        | 2/10 [07:49<31:05, 233.22s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
Consistency of clustering: 0.3361

 

In [10]:
assert (model.A >= 0).all()
assert (model.mu > 0).all()

In [11]:
# plt.figure(figsize=(9, 5))
# plt.grid()
# plt.plot(np.arange(niter)+1, nlls.mean(0).numpy() / len(train_ids))
# plt.fill_between(np.arange(niter)+1, (nlls.mean(0).numpy() - nlls.std(0).numpy()) / len(train_ids), (nlls.mean(0).numpy() + nlls.std(0).numpy()) / len(train_ids), alpha=0.2)
# plt.title('Mixing DMMHP', fontsize=15)
# plt.xlabel(r'$n$ outer iterations', fontsize=15)
# plt.ylabel(r'$\sim$ NLL / $N$', fontsize=15)
# plt.show()

In [12]:
print(f'Consistency of clustering: {consistency(labels).item():.4f}')

Consistency of clustering: 0.3033


### Synthetic data

In [255]:
path = Path('../..', 'data', 'simulated_Hawkes', 'K5_C5')
ss, Ts, class2idx = load_data(path, nfiles=2000, maxlen=-1, ext='csv')

In [258]:
gt_ids = pd.read_csv(Path(path, 'clusters.csv'))['cluster_id'].to_numpy()

In [259]:
gt_ids = torch.LongTensor(gt_ids)

In [260]:
N = len(ss)
D = 5
basis_fs = [lambda x: torch.exp(- x**2 / (3.*(k+1)**2) ) for k in range(D)]

hp = PointProcessStorage(ss, Ts, basis_fs)

C = len(class2idx)
K = 5


In [261]:
ntrials = 4
niter = 20

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in range(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model)
    r, nll_history, r_history = EM.learn_hp(niter=niter, ninner=[2,2,3,3,4,4,5,5,6,6,7,7] + (niter - 12)*[8])

    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'Purity: {purity(labels[i], gt_ids):.4f}')


1it [00:55, 55.29s/it]
NLL / N: 113.6334
2it [01:07, 42.34s/it]
NLL / N: 109.6858
3it [01:23, 34.44s/it]
NLL / N: 112.1539
4it [01:39, 28.99s/it]
NLL / N: 112.9830
5it [02:00, 26.53s/it]
NLL / N: 107.6991
6it [02:21, 24.77s/it]
NLL / N: 113.6205
7it [02:45, 24.51s/it]
NLL / N: 114.3848
8it [03:05, 23.22s/it]
NLL / N: 112.4740
9it [03:29, 23.45s/it]
NLL / N: 110.9315
10it [03:56, 24.70s/it]
NLL / N: 110.1535
11it [04:29, 27.04s/it]
NLL / N: 109.8683
12it [05:01, 28.49s/it]
NLL / N: 109.7370
13it [05:38, 31.10s/it]
NLL / N: 109.6369
14it [06:15, 32.75s/it]
NLL / N: 109.5524
15it [06:51, 33.91s/it]
NLL / N: 109.4811
16it [07:28, 34.82s/it]
NLL / N: 109.4183
17it [08:06, 35.66s/it]
NLL / N: 109.3717
18it [08:42, 35.83s/it]
NLL / N: 109.3340
19it [09:18, 35.99s/it]
NLL / N: 109.3059
20it [09:55, 29.77s/it]
0it [00:00, ?it/s]
NLL / N: 109.2871
Purity: 0.6380
1it [00:54, 54.32s/it]
NLL / N: 111.1096
2it [01:05, 41.51s/it]
NLL / N: 118.6358
3it [01:21, 33.75s/it]
NLL / N: 120.2586
4it [01:37, 

In [263]:
assert (model.A >= 0).all()
assert (model.mu > 0).all()

In [264]:
# plt.figure(figsize=(9, 5))
# plt.grid()
# plt.plot(np.arange(niter)+1, nlls.numpy() / len(ss))
# plt.title('Mixing of DMMHP', fontsize=15)
# plt.xlabel(r'$n$ outer iterations', fontsize=15)
# plt.ylabel(r'$\sim$ NLL / $N$', fontsize=15)
# plt.show()

In [265]:
print([purity(x, gt_ids) for x in r_history.argmax(-1)])

[0.24850000000000003, 0.278, 0.314, 0.37400000000000005, 0.42700000000000005, 0.4555, 0.5265, 0.6135, 0.6325, 0.6280000000000001, 0.6335000000000001, 0.6380000000000001, 0.6380000000000001, 0.6460000000000001, 0.6540000000000001, 0.6585000000000001, 0.6635, 0.6675, 0.671, 0.6735]


In [266]:
pur_val_mean = np.mean([purity(x, gt_ids) for x in labels])
pur_val_std = np.std([purity(x, gt_ids) for x in labels])

In [267]:
print(f'Purity: {pur_val_mean:.4f}+-{pur_val_std:.4f}')

Purity: 0.6450+-0.0168


In [268]:
labels[::10]

tensor([[3., 3., 3.,  ..., 1., 1., 1.]])

In [3]:
path = Path('../..', 'data', 'simulated_Hawkes', 'K4_C5')
ss, Ts, class2idx = load_data(path, nfiles=1600, maxlen=-1, ext='csv')

gt_ids = pd.read_csv(Path(path, 'clusters.csv'))['cluster_id'].to_numpy()

gt_ids = torch.LongTensor(gt_ids)

In [4]:
N = len(ss)
D = 5
basis_fs = [lambda x: torch.exp(- x**2 / (3.*(k+1)**2) ) for k in range(D)]

hp = PointProcessStorage(ss, Ts, basis_fs)

C = len(class2idx)
K = 4

ntrials = 4
niter = 20

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in range(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model)
    r, nll_history, r_history = EM.learn_hp(niter=niter, ninner=[2,2,3,3,4,4,5,5,6,6,7,7] + (niter - 12)*[8])

    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'Purity: {purity(labels[i], gt_ids):.4f}')


0it [00:02, ?it/s]


KeyboardInterrupt: 

In [None]:
pur_val_mean = np.mean([purity(x, gt_ids) for x in labels])
pur_val_std = np.std([purity(x, gt_ids) for x in labels])

In [None]:
print(f'Purity: {pur_val_mean:.4f}+-{pur_val_std:.4f}')

In [3]:
path = Path('../..', 'data', 'simulated_Hawkes', 'K3_C1')
ss, Ts, class2idx = load_data(path, nfiles=300, maxlen=-1, ext='csv')

gt_ids = pd.read_csv(Path(path, 'clusters.csv'))['cluster_id'].to_numpy()
gt_ids = torch.LongTensor(gt_ids)

In [4]:
N = len(ss)
D = 3
basis_fs = [lambda x: torch.exp(- x**2 / (3.*(k+1)**2) ) for k in range(D)]

hp = PointProcessStorage(ss, Ts, basis_fs)

C = len(class2idx)
K = 3

ntrials = 4
niter = 20

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in range(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model)
    r, nll_history, r_history = EM.learn_hp(niter=niter, ninner=[2,2,3,3,4,4,5,5,6,6,7,7] + (niter - 12)*[8])

    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'Purity: {purity(labels[i], gt_ids):.4f}')


1it [04:32, 272.51s/it]
NLL / N: 821.9668
2it [05:48, 213.68s/it]
NLL / N: -21.0775
3it [06:51, 168.34s/it]
NLL / N: -25.2455
4it [07:52, 136.04s/it]
NLL / N: 10.4621
5it [09:08, 118.03s/it]
NLL / N: 11.2603
6it [10:24, 105.58s/it]
NLL / N: 10.7322
7it [11:50, 99.54s/it] 
NLL / N: 10.5084
8it [13:14, 95.13s/it]
NLL / N: 10.4957
9it [14:53, 96.14s/it]
NLL / N: 10.5146
10it [16:32, 97.00s/it]
NLL / N: 10.4436
11it [18:17, 99.54s/it]
NLL / N: 10.3164
12it [20:07, 102.40s/it]
NLL / N: 10.7432
13it [22:05, 107.34s/it]
NLL / N: 10.7802
14it [24:06, 111.20s/it]
NLL / N: 10.5157
15it [26:06, 113.99s/it]
NLL / N: 10.5320
16it [28:04, 115.03s/it]
NLL / N: 11.0811
17it [29:58, 114.78s/it]
NLL / N: 10.9054
18it [31:59, 116.58s/it]
NLL / N: 10.1535
19it [33:58, 117.60s/it]
NLL / N: 10.5953
20it [35:58, 107.94s/it]
0it [00:00, ?it/s]
NLL / N: 10.6457
Purity: 0.6633
1it [01:59, 119.20s/it]
NLL / N: 291.6378
2it [02:44, 96.90s/it] 
NLL / N: 16.0954
3it [03:42, 85.38s/it]
NLL / N: 11.6632
4it [04:38, 7

KeyboardInterrupt: 

In [None]:
pur_val_mean = np.mean([purity(x, gt_ids) for x in labels])
pur_val_std = np.std([purity(x, gt_ids) for x in labels])

In [None]:
print(f'Purity: {pur_val_mean:.4f}+-{pur_val_std:.4f}')