In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from zipfile import ZipFile
import pandas as pd
import torch
import random
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm, trange

import sys
sys.path.append('..')

from HP import PointProcessStorage, DirichletMixtureModel, EM_clustering
from metrics import consistency, purity
from Cohortney.data_utils import load_data

### IPTV Data

In [3]:
path = Path('../..', 'data', 'IPTV_Data')
ss, Ts, class2idx, _ = load_data(path, ext='txt', datetime=True, maxlen=500)

In [4]:
N = len(ss)

In [5]:
D = 5
basis_fs = [lambda x: torch.exp(- x**2 / (2.*(k+1)**2) ) for k in range(D)]

In [6]:
C = len(class2idx)
K = 6

In [7]:
ntrials = 10
niter = 10

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in trange(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    train_ids = np.sort(np.random.choice(np.arange(len(ss)), size=len(ss) // 2, replace=False))
    train_fold = [ss[i] for i in range(len(ss)) if i in train_ids]
    train_Ts = Ts[train_ids]
    
    # learn
    hp = PointProcessStorage(train_fold, train_Ts, basis_fs)
    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model, n_inner=5)

    _, nll_history, _ = EM.learn_hp(niter=niter, ninner=[2,3,4,5,6,7] + (niter-6)*[8])

    # validate
    EM.hp = PointProcessStorage(ss, Ts, basis_fs)
    EM.N = len(ss)
    EM.int_g = []
    EM.g = []
    r = EM.e_step()
    
    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'\nConsistency of clustering: {consistency(labels[:i+1]).item():.4f}')

  0%|          | 0/10 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [01:18, 78.20s/it][A
NLL / N: 432.7707

2it [01:48, 63.93s/it][A
NLL / N: 502.6920

3it [02:36, 59.14s/it][A
NLL / N: 566.7167

4it [03:38, 59.95s/it][A
NLL / N: 492.0925

5it [04:52, 64.19s/it][A
NLL / N: 492.3028

6it [06:12, 69.02s/it][A
NLL / N: 514.4466

7it [07:41, 74.77s/it][A
NLL / N: 537.4928

8it [08:47, 72.20s/it][A
NLL / N: 564.8536

9it [09:44, 67.65s/it][A
NLL / N: 570.9984

10it [10:59, 65.95s/it]
NLL / N: 575.5531

 10%|█         | 1/10 [12:22<1:51:18, 742.10s/it]
0it [00:00, ?it/s][A
Consistency of clustering: nan

1it [01:14, 74.91s/it][A
NLL / N: 379.3969

2it [01:53, 64.15s/it][A
NLL / N: 397.7283

3it [02:44, 59.98s/it][A
NLL / N: 414.1325

4it [03:37, 58.09s/it][A
NLL / N: 446.1228

5it [04:48, 61.75s/it][A
NLL / N: 445.5093

6it [06:08, 67.26s/it][A
NLL / N: 519.6938

7it [07:35, 73.17s/it][A
NLL / N: 539.3990

8it [09:01, 77.15s/it][A
NLL / N: 559.1468

9it [10:00, 71.70s/it][A
N

In [8]:
assert (model.A >= 0).all()
assert (model.mu > 0).all()

In [9]:
# plt.figure(figsize=(9, 5))
# plt.grid()
# plt.plot(np.arange(niter)+1, nlls.mean(0).numpy() / len(train_ids))
# plt.fill_between(np.arange(niter)+1, (nlls.mean(0).numpy() - nlls.std(0).numpy()) / len(train_ids), (nlls.mean(0).numpy() + nlls.std(0).numpy()) / len(train_ids), alpha=0.2)
# plt.title('Mixing DMMHP', fontsize=15)
# plt.xlabel(r'$n$ outer iterations', fontsize=15)
# plt.ylabel(r'$\sim$ NLL / $N$', fontsize=15)
# plt.show()

In [10]:
print(f'Consistency of clustering: {consistency(labels).item():.4f}')

Consistency of clustering: 0.3660


### Synthetic data

#### 5 cluster, 5 classes

In [11]:
path = Path('../..', 'data', 'simulated_Hawkes', 'K5_C5')
ss, Ts, class2idx, _ = load_data(path, maxlen=-1, ext='csv', datetime=False)

In [12]:
gt_ids = pd.read_csv(Path(path, 'clusters.csv'))['cluster_id'].to_numpy()

In [13]:
gt_ids = torch.LongTensor(gt_ids)

In [14]:
N = len(ss)
D = 5
basis_fs = [lambda x: torch.exp(- x**2 / (2.*(k+1)**2) ) for k in range(D)]

hp = PointProcessStorage(ss, Ts, basis_fs)

C = len(class2idx)
K = 5


In [15]:
ntrials = 5
niter = 10

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in range(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model)
    r, nll_history, r_history = EM.learn_hp(niter=niter, ninner=[2,3,4,5,6,7] + (niter - 6)*[8])

    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'Purity: {purity(labels[i], gt_ids):.4f}')


1it [00:50, 50.66s/it]
NLL / N: 114.6237
2it [01:04, 39.70s/it]
NLL / N: 112.4859
3it [01:22, 33.22s/it]
NLL / N: 107.7913
4it [01:44, 29.71s/it]
NLL / N: 107.6673
5it [02:09, 28.32s/it]
NLL / N: 110.5551
6it [02:38, 28.54s/it]
NLL / N: 112.7791
7it [03:11, 29.99s/it]
NLL / N: 111.4655
8it [03:43, 30.40s/it]
NLL / N: 110.0676
9it [04:16, 31.33s/it]
NLL / N: 109.5276
10it [04:48, 28.81s/it]
0it [00:00, ?it/s]
NLL / N: 109.4583
Purity: 0.6370
1it [00:50, 50.08s/it]
NLL / N: 110.6662
2it [01:03, 38.95s/it]
NLL / N: 110.4590
3it [01:19, 32.21s/it]
NLL / N: 109.8640
4it [01:40, 28.86s/it]
NLL / N: 106.5325
5it [02:05, 27.67s/it]
NLL / N: 112.3647
6it [02:33, 27.83s/it]
NLL / N: 110.9341
7it [03:04, 28.84s/it]
NLL / N: 109.2221
8it [03:36, 29.55s/it]
NLL / N: 108.0501
9it [04:07, 30.13s/it]
NLL / N: 107.5931
10it [04:38, 27.84s/it]
0it [00:00, ?it/s]
NLL / N: 107.3807
Purity: 0.5150
1it [00:51, 51.33s/it]
NLL / N: 107.9458
2it [01:05, 40.25s/it]
NLL / N: 105.7468
3it [01:24, 33.83s/it]
NLL /

In [16]:
assert (model.A >= 0).all()
assert (model.mu > 0).all()

In [17]:
# plt.figure(figsize=(9, 5))
# plt.grid()
# plt.plot(np.arange(niter)+1, nlls.numpy() / len(ss))
# plt.title('Mixing of DMMHP', fontsize=15)
# plt.xlabel(r'$n$ outer iterations', fontsize=15)
# plt.ylabel(r'$\sim$ NLL / $N$', fontsize=15)
# plt.show()

In [18]:
print([purity(x, gt_ids) for x in r_history.argmax(-1)])

[0.36000000000000004, 0.40199999999999997, 0.5175000000000001, 0.5995, 0.596, 0.6105, 0.636, 0.6365000000000001, 0.635, 0.6385000000000001]


In [19]:
pur_val_mean = np.mean([purity(x, gt_ids) for x in labels])
pur_val_std = np.std([purity(x, gt_ids) for x in labels])

In [20]:
print(f'Purity: {pur_val_mean:.4f}+-{pur_val_std:.4f}')

Purity: 0.6108+-0.0495


In [21]:
labels[::10]

tensor([[3., 3., 3.,  ..., 0., 0., 0.]])

#### 4 cluster, 5 classes

In [22]:
path = Path('../..', 'data', 'simulated_Hawkes', 'K4_C5')
ss, Ts, class2idx, _ = load_data(path, maxlen=-1, ext='csv', datetime=False)

gt_ids = pd.read_csv(Path(path, 'clusters.csv'))['cluster_id'].to_numpy()

gt_ids = torch.LongTensor(gt_ids)

In [23]:
N = len(ss)
D = 5
basis_fs = [lambda x: torch.exp(- x**2 / (2.*(k+1)**2) ) for k in range(D)]

hp = PointProcessStorage(ss, Ts, basis_fs)

C = len(class2idx)
K = 4

ntrials = 5
niter = 10

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in range(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model)
    r, nll_history, r_history = EM.learn_hp(niter=niter, ninner=[2,3,4,5,6,7] + (niter - 6)*[8])

    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'Purity: {purity(labels[i], gt_ids):.4f}')


1it [00:39, 39.30s/it]
NLL / N: 112.3002
2it [00:51, 31.06s/it]
NLL / N: 113.9484
3it [01:03, 25.56s/it]
NLL / N: 110.4884
4it [01:21, 23.07s/it]
NLL / N: 109.6833
5it [01:41, 22.12s/it]
NLL / N: 110.3378
6it [02:02, 21.91s/it]
NLL / N: 110.3131
7it [02:28, 23.22s/it]
NLL / N: 110.4250
8it [02:52, 23.29s/it]
NLL / N: 110.5777
9it [03:15, 23.28s/it]
NLL / N: 110.6202
10it [03:40, 22.07s/it]
0it [00:00, ?it/s]
NLL / N: 110.6629
Purity: 0.7450
1it [00:39, 39.82s/it]
NLL / N: 114.2860
2it [00:50, 31.12s/it]
NLL / N: 113.1868
3it [01:03, 25.74s/it]
NLL / N: 107.6603
4it [01:19, 22.85s/it]
NLL / N: 107.9281
5it [01:39, 21.98s/it]
NLL / N: 111.7562
6it [02:01, 21.98s/it]
NLL / N: 111.9735
7it [02:28, 23.46s/it]
NLL / N: 111.6347
8it [02:54, 24.12s/it]
NLL / N: 111.5501
9it [03:19, 24.43s/it]
NLL / N: 111.5162
10it [03:45, 22.51s/it]
0it [00:00, ?it/s]
NLL / N: 111.5071
Purity: 0.7350
1it [00:39, 39.98s/it]
NLL / N: 113.2184
2it [00:51, 31.37s/it]
NLL / N: 106.9172
3it [01:04, 25.83s/it]
NLL /

In [24]:
pur_val_mean = np.mean([purity(x, gt_ids) for x in labels])
pur_val_std = np.std([purity(x, gt_ids) for x in labels])

In [25]:
print(f'Purity: {pur_val_mean:.4f}+-{pur_val_std:.4f}')

Purity: 0.7505+-0.0212


#### 3 cluster, 5 classes

In [26]:
path = Path('../..', 'data', 'simulated_Hawkes', 'K3_C5')
ss, Ts, class2idx, _ = load_data(path, maxlen=-1, ext='csv', datetime=False)

gt_ids = pd.read_csv(Path(path, 'clusters.csv'))['cluster_id'].to_numpy()
gt_ids = torch.LongTensor(gt_ids)

In [27]:
N = len(ss)
D = 5
basis_fs = [lambda x: torch.exp(- x**2 / (3.*(k+1)**2) ) for k in range(D)]

hp = PointProcessStorage(ss, Ts, basis_fs)

C = len(class2idx)
K = 3

ntrials = 5
niter = 10

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in range(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model)
    r, nll_history, r_history = EM.learn_hp(niter=niter, ninner=[2,3,4,5,6,7] + (niter - 6)*[8])

    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'Purity: {purity(labels[i], gt_ids):.4f}')


1it [00:28, 28.83s/it]
NLL / N: 112.3995
2it [00:36, 22.56s/it]
NLL / N: 107.3360
3it [00:46, 18.57s/it]
NLL / N: 117.0898
4it [00:58, 16.87s/it]
NLL / N: 116.8130
5it [01:13, 16.31s/it]
NLL / N: 107.0087
6it [01:31, 16.60s/it]
NLL / N: 116.9097
7it [01:50, 17.33s/it]
NLL / N: 116.2267
8it [02:09, 17.94s/it]
NLL / N: 113.6239
9it [02:28, 18.22s/it]
NLL / N: 112.0471
10it [02:47, 16.73s/it]
0it [00:00, ?it/s]
NLL / N: 111.6934
Purity: 0.8808
1it [00:28, 28.66s/it]
NLL / N: 112.1850
2it [00:35, 22.21s/it]
NLL / N: 106.7475
3it [00:45, 18.50s/it]
NLL / N: 103.8981
4it [00:58, 16.67s/it]
NLL / N: 102.8498
5it [01:11, 15.81s/it]
NLL / N: 105.6443
6it [01:28, 16.11s/it]
NLL / N: 111.2864
7it [01:47, 16.82s/it]
NLL / N: 109.3345
8it [02:04, 17.05s/it]
NLL / N: 110.3924
9it [02:23, 17.43s/it]
NLL / N: 111.2633
10it [02:41, 16.11s/it]
0it [00:00, ?it/s]
NLL / N: 111.3966
Purity: 0.8392
1it [00:28, 28.22s/it]
NLL / N: 114.1269
2it [00:36, 22.12s/it]
NLL / N: 122.7714
3it [00:46, 18.69s/it]
NLL /

In [28]:
pur_val_mean = np.mean([purity(x, gt_ids) for x in labels])
pur_val_std = np.std([purity(x, gt_ids) for x in labels])

In [29]:
print(f'Purity: {pur_val_mean:.4f}+-{pur_val_std:.4f}')

Purity: 0.8723+-0.0298
