In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path
from zipfile import ZipFile
import pandas as pd
import torch
import random
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm, trange

import sys
sys.path.append('..')

from HP import PointProcessStorage, DirichletMixtureModel, EM_clustering
from metrics import consistency, purity
from Cohortney.data_utils import load_data

### IPTV Data

In [5]:
path = Path('../..', 'data', 'IPTV_Data')
ss, Ts, class2idx, _ = load_data(path, ext='txt', datetime=True, maxlen=500)

In [6]:
N = len(ss)

In [9]:
D = 3
basis_fs = [lambda x: torch.exp(- x**2 / (2.*(k+1)**2) ) for k in range(D)]

In [10]:
C = len(class2idx)
K = 6

In [7]:
ntrials = 10
niter = 5

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in trange(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    train_ids = np.sort(np.random.choice(np.arange(len(ss)), size=len(ss) // 2, replace=False))
    train_fold = [ss[i] for i in range(len(ss)) if i in train_ids]
    train_Ts = Ts[train_ids]
    
    # learn
    hp = PointProcessStorage(train_fold, train_Ts, basis_fs)
    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model, n_inner=5)

    _, nll_history, _ = EM.learn_hp(niter=niter, ninner=[2,4,6,7,8])

    # validate
    EM.hp = PointProcessStorage(ss, Ts, basis_fs)
    EM.N = len(ss)
    EM.int_g = []
    EM.g = []
    r = EM.e_step()
    
    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'\nConsistency of clustering: {consistency(labels[:i+1]).item():.4f}')

  0%|          | 0/10 [00:00<?, ?it/s]
0it [00:04, ?it/s]
  0%|          | 0/10 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
assert (model.A >= 0).all()
assert (model.mu > 0).all()

In [None]:
# plt.figure(figsize=(9, 5))
# plt.grid()
# plt.plot(np.arange(niter)+1, nlls.mean(0).numpy() / len(train_ids))
# plt.fill_between(np.arange(niter)+1, (nlls.mean(0).numpy() - nlls.std(0).numpy()) / len(train_ids), (nlls.mean(0).numpy() + nlls.std(0).numpy()) / len(train_ids), alpha=0.2)
# plt.title('Mixing DMMHP', fontsize=15)
# plt.xlabel(r'$n$ outer iterations', fontsize=15)
# plt.ylabel(r'$\sim$ NLL / $N$', fontsize=15)
# plt.show()

In [None]:
print(f'Consistency of clustering: {consistency(labels).item():.4f}')

### Synthetic data

In [8]:
path = Path('../..', 'data', 'simulated_Hawkes', 'K5_C5')
ss, Ts, class2idx, _ = load_data(path, maxlen=-1, ext='csv', datetime=False)

In [19]:
gt_ids = pd.read_csv(Path(path, 'clusters.csv'))['cluster_id'].to_numpy()

In [20]:
gt_ids = torch.LongTensor(gt_ids)

In [21]:
N = len(ss)
D = 5
basis_fs = [lambda x: torch.exp(- x**2 / (3.*(k+1)**2) ) for k in range(D)]

hp = PointProcessStorage(ss, Ts, basis_fs)

C = len(class2idx)
K = 5


In [22]:
ntrials = 4
niter = 20

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in range(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model)
    r, nll_history, r_history = EM.learn_hp(niter=niter, ninner=[2,2,3,3,4,4,5,5,6,6,7,7] + (niter - 12)*[8])

    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'Purity: {purity(labels[i], gt_ids):.4f}')


0it [00:00, ?it/s]

In [263]:
assert (model.A >= 0).all()
assert (model.mu > 0).all()

In [264]:
# plt.figure(figsize=(9, 5))
# plt.grid()
# plt.plot(np.arange(niter)+1, nlls.numpy() / len(ss))
# plt.title('Mixing of DMMHP', fontsize=15)
# plt.xlabel(r'$n$ outer iterations', fontsize=15)
# plt.ylabel(r'$\sim$ NLL / $N$', fontsize=15)
# plt.show()

In [265]:
print([purity(x, gt_ids) for x in r_history.argmax(-1)])

[0.24850000000000003, 0.278, 0.314, 0.37400000000000005, 0.42700000000000005, 0.4555, 0.5265, 0.6135, 0.6325, 0.6280000000000001, 0.6335000000000001, 0.6380000000000001, 0.6380000000000001, 0.6460000000000001, 0.6540000000000001, 0.6585000000000001, 0.6635, 0.6675, 0.671, 0.6735]


In [266]:
pur_val_mean = np.mean([purity(x, gt_ids) for x in labels])
pur_val_std = np.std([purity(x, gt_ids) for x in labels])

In [267]:
print(f'Purity: {pur_val_mean:.4f}+-{pur_val_std:.4f}')

Purity: 0.6450+-0.0168


In [268]:
labels[::10]

tensor([[3., 3., 3.,  ..., 1., 1., 1.]])

In [3]:
path = Path('../..', 'data', 'simulated_Hawkes', 'K4_C5')
ss, Ts, class2idx, _ = load_data(path, maxlen=-1, ext='csv', datetime=False)

gt_ids = pd.read_csv(Path(path, 'clusters.csv'))['cluster_id'].to_numpy()

gt_ids = torch.LongTensor(gt_ids)

In [4]:
N = len(ss)
D = 5
basis_fs = [lambda x: torch.exp(- x**2 / (3.*(k+1)**2) ) for k in range(D)]

hp = PointProcessStorage(ss, Ts, basis_fs)

C = len(class2idx)
K = 4

ntrials = 4
niter = 20

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in range(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model)
    r, nll_history, r_history = EM.learn_hp(niter=niter, ninner=[2,2,3,3,4,4,5,5,6,6,7,7] + (niter - 12)*[8])

    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'Purity: {purity(labels[i], gt_ids):.4f}')


0it [00:02, ?it/s]


KeyboardInterrupt: 

In [None]:
pur_val_mean = np.mean([purity(x, gt_ids) for x in labels])
pur_val_std = np.std([purity(x, gt_ids) for x in labels])

In [None]:
print(f'Purity: {pur_val_mean:.4f}+-{pur_val_std:.4f}')

In [3]:
path = Path('../..', 'data', 'simulated_Hawkes', 'K3_C1')
ss, Ts, class2idx, _ = load_data(path, maxlen=-1, ext='csv', datetime=False)

gt_ids = pd.read_csv(Path(path, 'clusters.csv'))['cluster_id'].to_numpy()
gt_ids = torch.LongTensor(gt_ids)

In [4]:
N = len(ss)
D = 3
basis_fs = [lambda x: torch.exp(- x**2 / (3.*(k+1)**2) ) for k in range(D)]

hp = PointProcessStorage(ss, Ts, basis_fs)

C = len(class2idx)
K = 3

ntrials = 4
niter = 20

labels = torch.zeros(ntrials, len(ss))
nlls = torch.zeros(ntrials, niter)

for i in range(ntrials):
    Sigma = torch.rand(C, C, D, K)
    B = torch.rand(C, K)
    alpha = 1.

    model = DirichletMixtureModel(K, C, D, alpha, B, Sigma)
    EM = EM_clustering(hp, model)
    r, nll_history, r_history = EM.learn_hp(niter=niter, ninner=[2,2,3,3,4,4,5,5,6,6,7,7] + (niter - 12)*[8])

    labels[i] = r.argmax(-1)
    nlls[i] = torch.FloatTensor(nll_history)

    print(f'Purity: {purity(labels[i], gt_ids):.4f}')


1it [01:36, 96.71s/it]
NLL / N: 679.9168
2it [02:03, 75.61s/it]
NLL / N: -60.2518
3it [02:37, 63.36s/it]
NLL / N: -0.6981
4it [03:11, 54.53s/it]
NLL / N: 20.1141
5it [03:54, 50.84s/it]
NLL / N: 20.1951
6it [04:35, 48.15s/it]
NLL / N: 20.4390
7it [05:24, 48.29s/it]
NLL / N: 20.7928
8it [06:17, 49.75s/it]
NLL / N: 20.9816
9it [07:30, 56.76s/it]
NLL / N: 21.3264
10it [08:50, 63.62s/it]
NLL / N: 21.2528
11it [10:13, 69.36s/it]
NLL / N: 21.3383
12it [11:17, 67.84s/it]
NLL / N: 21.4435
13it [12:55, 76.87s/it]
NLL / N: 21.5729
14it [14:07, 75.39s/it]
NLL / N: 21.5506
15it [15:28, 77.27s/it]
NLL / N: 21.3079
16it [17:19, 87.37s/it]
NLL / N: 21.3306
17it [18:55, 89.80s/it]
NLL / N: 21.3939
18it [21:04, 101.70s/it]
NLL / N: 21.3338
19it [22:16, 92.67s/it] 
NLL / N: 21.5600
20it [23:47, 71.37s/it]
0it [00:00, ?it/s]
NLL / N: 21.3110
Purity: 0.9033
1it [01:43, 103.57s/it]
NLL / N: 665.3323
2it [02:11, 80.93s/it] 
NLL / N: -51.3180
3it [03:05, 72.69s/it]
NLL / N: -3.0699
4it [03:52, 65.07s/it]
NLL 

In [5]:
pur_val_mean = np.mean([purity(x, gt_ids) for x in labels])
pur_val_std = np.std([purity(x, gt_ids) for x in labels])

In [6]:
print(f'Purity: {pur_val_mean:.4f}+-{pur_val_std:.4f}')

Purity: 0.8433+-0.1039
