In [1]:
import time
import hdbscan
import numpy as np
import pandas as pd
from tqdm import tqdm 
# import seaborn as sns
from src.fABBA_test import fABBA
import matplotlib.pyplot as plt
from src.cABBA_test import fABBA as cABBA
from sklearn import cluster, mixture
from src.cagg_memview import aggregate as aggregate_fc
from sklearn.metrics import mean_squared_error
from tslearn.metrics import dtw as dtw
from threadpoolctl import threadpool_limits
import warnings
plt.style.use('bmh')
warnings.filterwarnings('ignore')
np.random.seed(0)

In [2]:
N = 5000
M = 100
TOL = 0.15
ALPHA = 0.5

compression_rate =  np.zeros(M)
clustering_mse = {
    'Aggregation': np.zeros(M),
    'K-means++': np.zeros(M),
    'Spectral Clustering': np.zeros(M),
    'Gaussian Mixture': np.zeros(M),
    #'BIRCH': np.zeros(M),
    'DBSCAN': np.zeros(M),
    'HDBSCAN': np.zeros(M)
}

clustering_dtw = {
    'Aggregation': np.zeros(M),
    'K-means++': np.zeros(M),
    'Spectral Clustering': np.zeros(M),
    'Gaussian Mixture': np.zeros(M),
    #'BIRCH': np.zeros(M),
    'DBSCAN': np.zeros(M),
    'HDBSCAN': np.zeros(M)
}

clustering_time = {
    'Aggregation': np.zeros(M),
    'K-means++': np.zeros(M),
    'Spectral Clustering': np.zeros(M),
    'Gaussian Mixture': np.zeros(M),
    #'BIRCH': np.zeros(M),
    'DBSCAN': np.zeros(M),
    'HDBSCAN': np.zeros(M)
}

clustering_symbols = {
    'Aggregation': np.zeros(M, dtype=int),
    'K-means++': np.zeros(M, dtype=int),
    'Spectral Clustering': np.zeros(M, dtype=int),
    'Gaussian Mixture': np.zeros(M, dtype=int),
    #'BIRCH': np.zeros(M, dtype=int),
    'DBSCAN': np.zeros(M, dtype=int),
    'HDBSCAN': np.zeros(M, dtype=int)
}

with threadpool_limits(limits=1, user_api='blas'):
    for i in tqdm(range(M)):
        ts = np.random.rand(N)
        
        st = time.time()
        fabba = fABBA(tol=TOL, alpha=ALPHA, sorting='2-norm', scl=1, verbose=1, max_len=np.inf) 
        strings = fabba.fit_transform(ts)
        inverse_ts = fabba.inverse_transform(strings, ts[0])
        et = time.time()
        
        clustering_mse['Aggregation'][i] = mean_squared_error(ts, inverse_ts)
        clustering_dtw['Aggregation'][i] = dtw(ts, inverse_ts)
        clustering_time['Aggregation'][i] = et - st
        clustering_symbols['Aggregation'][i] = len(fabba.splist)
        
        clusters_num = fabba.centers.shape[0]

        kmeans = cluster.KMeans(n_clusters=clusters_num, random_state=1, init='k-means++')
        spectral = cluster.SpectralClustering(n_clusters=clusters_num, eigen_solver='arpack', affinity="nearest_neighbors")
        gmm = mixture.GaussianMixture(n_components=clusters_num, covariance_type='full', init_params='kmeans')
        # birch = cluster.Birch(n_clusters=2*clusters_num)
        dbscan = cluster.DBSCAN(eps=TOL, min_samples=2)
        h_dbscan = hdbscan.HDBSCAN(algorithm='best', min_samples=2, min_cluster_size=12, alpha=ALPHA)
        
        clustering_algorithms = (
            ('K-means++', kmeans),
            ('Spectral Clustering', spectral),
            ('Gaussian Mixture', gmm),
            # ('BIRCH', birch),
            ('DBSCAN', dbscan),
            ('HDBSCAN', h_dbscan)
        )
        
        compression_rate[i] = fabba.compression_rate
        
        for name, clustering in clustering_algorithms:
            try:
                st = time.time()
                cabba = cABBA(clustering=clustering.fit_predict, tol=TOL, scl=1, verbose=0, max_len=np.inf) 
                strings = cabba.fit_transform(ts)
                inverse_ts = cabba.inverse_transform(strings, ts[0])
                et = time.time()
                
                clustering_mse[name][i] = mean_squared_error(ts, inverse_ts)
                clustering_dtw[name][i] = dtw(ts, inverse_ts)
                clustering_time[name][i] = et - st
                clustering_symbols[name][i] = cabba.centers.shape[0]
                # print(name, cabba.centers.shape[0])
            except Exception as e:
                print(name, "error!", e)

  0%|          | 0/100 [00:00<?, ?it/s]

Compression: Reduced series of length 5000 to 1554 segments. Digitization: Reduced 1554 pieces to 45 symbols.


  1%|          | 1/100 [00:03<06:06,  3.70s/it]

Compression: Reduced series of length 5000 to 1393 segments. Digitization: Reduced 1393 pieces to 51 symbols.


  2%|▏         | 2/100 [00:06<05:34,  3.41s/it]

Compression: Reduced series of length 5000 to 1574 segments. Digitization: Reduced 1574 pieces to 48 symbols.


  3%|▎         | 3/100 [00:10<05:22,  3.32s/it]

Compression: Reduced series of length 5000 to 1587 segments. Digitization: Reduced 1587 pieces to 51 symbols.


  4%|▍         | 4/100 [00:13<05:17,  3.30s/it]

Compression: Reduced series of length 5000 to 1546 segments. Digitization: Reduced 1546 pieces to 46 symbols.


  5%|▌         | 5/100 [00:16<05:09,  3.26s/it]

Compression: Reduced series of length 5000 to 1581 segments. Digitization: Reduced 1581 pieces to 48 symbols.


  6%|▌         | 6/100 [00:19<05:03,  3.23s/it]

Compression: Reduced series of length 5000 to 1598 segments. Digitization: Reduced 1598 pieces to 45 symbols.


  7%|▋         | 7/100 [00:22<04:57,  3.19s/it]

Compression: Reduced series of length 5000 to 1621 segments. Digitization: Reduced 1621 pieces to 52 symbols.


  8%|▊         | 8/100 [00:26<05:01,  3.27s/it]

Compression: Reduced series of length 5000 to 1448 segments. Digitization: Reduced 1448 pieces to 50 symbols.


  9%|▉         | 9/100 [00:29<04:51,  3.20s/it]

Compression: Reduced series of length 5000 to 1496 segments. Digitization: Reduced 1496 pieces to 49 symbols.


 10%|█         | 10/100 [00:32<04:46,  3.18s/it]

Compression: Reduced series of length 5000 to 1521 segments. Digitization: Reduced 1521 pieces to 51 symbols.


 11%|█         | 11/100 [00:35<04:45,  3.21s/it]

Compression: Reduced series of length 5000 to 1515 segments. Digitization: Reduced 1515 pieces to 49 symbols.


 12%|█▏        | 12/100 [00:39<04:46,  3.26s/it]

Compression: Reduced series of length 5000 to 1621 segments. Digitization: Reduced 1621 pieces to 53 symbols.


 13%|█▎        | 13/100 [00:42<04:49,  3.33s/it]

Compression: Reduced series of length 5000 to 1532 segments. Digitization: Reduced 1532 pieces to 46 symbols.


 14%|█▍        | 14/100 [00:46<04:48,  3.35s/it]

Compression: Reduced series of length 5000 to 1486 segments. Digitization: Reduced 1486 pieces to 49 symbols.


 15%|█▌        | 15/100 [00:49<04:53,  3.45s/it]

Compression: Reduced series of length 5000 to 1465 segments. Digitization: Reduced 1465 pieces to 51 symbols.


 16%|█▌        | 16/100 [00:53<04:56,  3.53s/it]

Compression: Reduced series of length 5000 to 1556 segments. Digitization: Reduced 1556 pieces to 46 symbols.


 17%|█▋        | 17/100 [00:57<05:01,  3.64s/it]

Compression: Reduced series of length 5000 to 1513 segments. Digitization: Reduced 1513 pieces to 46 symbols.


 18%|█▊        | 18/100 [01:01<05:00,  3.67s/it]

Compression: Reduced series of length 5000 to 1511 segments. Digitization: Reduced 1511 pieces to 51 symbols.


 19%|█▉        | 19/100 [01:05<05:05,  3.77s/it]

Compression: Reduced series of length 5000 to 1531 segments. Digitization: Reduced 1531 pieces to 46 symbols.


 20%|██        | 20/100 [01:08<05:02,  3.78s/it]

Compression: Reduced series of length 5000 to 1560 segments. Digitization: Reduced 1560 pieces to 45 symbols.


 21%|██        | 21/100 [01:12<04:58,  3.77s/it]

Compression: Reduced series of length 5000 to 1688 segments. Digitization: Reduced 1688 pieces to 52 symbols.


 22%|██▏       | 22/100 [01:16<04:46,  3.68s/it]

Compression: Reduced series of length 5000 to 1572 segments. Digitization: Reduced 1572 pieces to 52 symbols.


 23%|██▎       | 23/100 [01:19<04:30,  3.51s/it]

Compression: Reduced series of length 5000 to 1626 segments. Digitization: Reduced 1626 pieces to 49 symbols.


 24%|██▍       | 24/100 [01:22<04:19,  3.41s/it]

Compression: Reduced series of length 5000 to 1605 segments. Digitization: Reduced 1605 pieces to 49 symbols.


 25%|██▌       | 25/100 [01:25<04:10,  3.35s/it]

Compression: Reduced series of length 5000 to 1569 segments. Digitization: Reduced 1569 pieces to 47 symbols.


 26%|██▌       | 26/100 [01:28<04:05,  3.32s/it]

Compression: Reduced series of length 5000 to 1463 segments. Digitization: Reduced 1463 pieces to 50 symbols.


 27%|██▋       | 27/100 [01:32<03:59,  3.28s/it]

Compression: Reduced series of length 5000 to 1543 segments. Digitization: Reduced 1543 pieces to 48 symbols.


 28%|██▊       | 28/100 [01:35<03:52,  3.22s/it]

Compression: Reduced series of length 5000 to 1457 segments. Digitization: Reduced 1457 pieces to 48 symbols.


 29%|██▉       | 29/100 [01:38<03:44,  3.17s/it]

Compression: Reduced series of length 5000 to 1531 segments. Digitization: Reduced 1531 pieces to 49 symbols.


 30%|███       | 30/100 [01:41<03:44,  3.21s/it]

Compression: Reduced series of length 5000 to 1577 segments. Digitization: Reduced 1577 pieces to 49 symbols.


 31%|███       | 31/100 [01:44<03:41,  3.21s/it]

Compression: Reduced series of length 5000 to 1544 segments. Digitization: Reduced 1544 pieces to 49 symbols.


 32%|███▏      | 32/100 [01:47<03:38,  3.22s/it]

Compression: Reduced series of length 5000 to 1563 segments. Digitization: Reduced 1563 pieces to 47 symbols.


 33%|███▎      | 33/100 [01:51<03:39,  3.27s/it]

Compression: Reduced series of length 5000 to 1498 segments. Digitization: Reduced 1498 pieces to 47 symbols.


 34%|███▍      | 34/100 [01:54<03:42,  3.37s/it]

Compression: Reduced series of length 5000 to 1575 segments. Digitization: Reduced 1575 pieces to 51 symbols.


 35%|███▌      | 35/100 [01:58<03:46,  3.49s/it]

Compression: Reduced series of length 5000 to 1551 segments. Digitization: Reduced 1551 pieces to 47 symbols.


 36%|███▌      | 36/100 [02:02<03:43,  3.49s/it]

Compression: Reduced series of length 5000 to 1456 segments. Digitization: Reduced 1456 pieces to 47 symbols.


 37%|███▋      | 37/100 [02:05<03:34,  3.40s/it]

Compression: Reduced series of length 5000 to 1596 segments. Digitization: Reduced 1596 pieces to 47 symbols.


 38%|███▊      | 38/100 [02:08<03:29,  3.38s/it]

Compression: Reduced series of length 5000 to 1630 segments. Digitization: Reduced 1630 pieces to 55 symbols.


 39%|███▉      | 39/100 [02:12<03:27,  3.41s/it]

Compression: Reduced series of length 5000 to 1459 segments. Digitization: Reduced 1459 pieces to 49 symbols.


 40%|████      | 40/100 [02:15<03:21,  3.36s/it]

Compression: Reduced series of length 5000 to 1584 segments. Digitization: Reduced 1584 pieces to 44 symbols.


 41%|████      | 41/100 [02:18<03:14,  3.29s/it]

Compression: Reduced series of length 5000 to 1647 segments. Digitization: Reduced 1647 pieces to 53 symbols.


 42%|████▏     | 42/100 [02:22<03:15,  3.37s/it]

Compression: Reduced series of length 5000 to 1699 segments. Digitization: Reduced 1699 pieces to 55 symbols.


 43%|████▎     | 43/100 [02:25<03:12,  3.38s/it]

Compression: Reduced series of length 5000 to 1510 segments. Digitization: Reduced 1510 pieces to 46 symbols.


 44%|████▍     | 44/100 [02:28<03:05,  3.30s/it]

Compression: Reduced series of length 5000 to 1634 segments. Digitization: Reduced 1634 pieces to 49 symbols.


 45%|████▌     | 45/100 [02:31<03:02,  3.32s/it]

Compression: Reduced series of length 5000 to 1498 segments. Digitization: Reduced 1498 pieces to 46 symbols.


 46%|████▌     | 46/100 [02:35<02:59,  3.32s/it]

Compression: Reduced series of length 5000 to 1544 segments. Digitization: Reduced 1544 pieces to 47 symbols.


 47%|████▋     | 47/100 [02:38<02:54,  3.29s/it]

Compression: Reduced series of length 5000 to 1582 segments. Digitization: Reduced 1582 pieces to 46 symbols.


 48%|████▊     | 48/100 [02:41<02:52,  3.31s/it]

Compression: Reduced series of length 5000 to 1533 segments. Digitization: Reduced 1533 pieces to 43 symbols.


 49%|████▉     | 49/100 [02:45<02:47,  3.28s/it]

Compression: Reduced series of length 5000 to 1484 segments. Digitization: Reduced 1484 pieces to 50 symbols.


 50%|█████     | 50/100 [02:48<02:44,  3.28s/it]

Compression: Reduced series of length 5000 to 1507 segments. Digitization: Reduced 1507 pieces to 49 symbols.


 51%|█████     | 51/100 [02:51<02:41,  3.29s/it]

Compression: Reduced series of length 5000 to 1481 segments. Digitization: Reduced 1481 pieces to 47 symbols.


 52%|█████▏    | 52/100 [02:55<02:41,  3.37s/it]

Compression: Reduced series of length 5000 to 1583 segments. Digitization: Reduced 1583 pieces to 48 symbols.


 53%|█████▎    | 53/100 [02:59<02:48,  3.58s/it]

Compression: Reduced series of length 5000 to 1467 segments. Digitization: Reduced 1467 pieces to 49 symbols.


 54%|█████▍    | 54/100 [03:03<02:48,  3.67s/it]

Compression: Reduced series of length 5000 to 1614 segments. Digitization: Reduced 1614 pieces to 49 symbols.


 55%|█████▌    | 55/100 [03:07<02:48,  3.74s/it]

Compression: Reduced series of length 5000 to 1499 segments. Digitization: Reduced 1499 pieces to 50 symbols.


 56%|█████▌    | 56/100 [03:11<02:47,  3.80s/it]

Compression: Reduced series of length 5000 to 1496 segments. Digitization: Reduced 1496 pieces to 48 symbols.


 57%|█████▋    | 57/100 [03:14<02:44,  3.82s/it]

Compression: Reduced series of length 5000 to 1693 segments. Digitization: Reduced 1693 pieces to 53 symbols.


 58%|█████▊    | 58/100 [03:19<02:46,  3.97s/it]

Compression: Reduced series of length 5000 to 1553 segments. Digitization: Reduced 1553 pieces to 50 symbols.


 59%|█████▉    | 59/100 [03:23<02:44,  4.02s/it]

Compression: Reduced series of length 5000 to 1500 segments. Digitization: Reduced 1500 pieces to 44 symbols.


 60%|██████    | 60/100 [03:27<02:39,  3.98s/it]

Compression: Reduced series of length 5000 to 1647 segments. Digitization: Reduced 1647 pieces to 48 symbols.


 61%|██████    | 61/100 [03:31<02:37,  4.03s/it]

Compression: Reduced series of length 5000 to 1590 segments. Digitization: Reduced 1590 pieces to 52 symbols.


 62%|██████▏   | 62/100 [03:35<02:35,  4.08s/it]

Compression: Reduced series of length 5000 to 1601 segments. Digitization: Reduced 1601 pieces to 48 symbols.


 63%|██████▎   | 63/100 [03:39<02:31,  4.08s/it]

Compression: Reduced series of length 5000 to 1622 segments. Digitization: Reduced 1622 pieces to 53 symbols.


 64%|██████▍   | 64/100 [03:43<02:27,  4.09s/it]

Compression: Reduced series of length 5000 to 1610 segments. Digitization: Reduced 1610 pieces to 50 symbols.


 65%|██████▌   | 65/100 [03:47<02:22,  4.06s/it]

Compression: Reduced series of length 5000 to 1578 segments. Digitization: Reduced 1578 pieces to 46 symbols.


 66%|██████▌   | 66/100 [03:51<02:15,  3.98s/it]

Compression: Reduced series of length 5000 to 1537 segments. Digitization: Reduced 1537 pieces to 45 symbols.


 67%|██████▋   | 67/100 [03:55<02:11,  3.97s/it]

Compression: Reduced series of length 5000 to 1500 segments. Digitization: Reduced 1500 pieces to 53 symbols.


 68%|██████▊   | 68/100 [03:59<02:06,  3.94s/it]

Compression: Reduced series of length 5000 to 1641 segments. Digitization: Reduced 1641 pieces to 56 symbols.


 69%|██████▉   | 69/100 [04:03<02:03,  3.98s/it]

Compression: Reduced series of length 5000 to 1549 segments. Digitization: Reduced 1549 pieces to 49 symbols.


 70%|███████   | 70/100 [04:07<01:57,  3.91s/it]

Compression: Reduced series of length 5000 to 1508 segments. Digitization: Reduced 1508 pieces to 45 symbols.


 71%|███████   | 71/100 [04:10<01:50,  3.82s/it]

Compression: Reduced series of length 5000 to 1545 segments. Digitization: Reduced 1545 pieces to 46 symbols.


 72%|███████▏  | 72/100 [04:14<01:48,  3.87s/it]

Compression: Reduced series of length 5000 to 1565 segments. Digitization: Reduced 1565 pieces to 49 symbols.


 73%|███████▎  | 73/100 [04:18<01:45,  3.89s/it]

Compression: Reduced series of length 5000 to 1684 segments. Digitization: Reduced 1684 pieces to 48 symbols.


 74%|███████▍  | 74/100 [04:22<01:40,  3.87s/it]

Compression: Reduced series of length 5000 to 1512 segments. Digitization: Reduced 1512 pieces to 50 symbols.


 75%|███████▌  | 75/100 [04:26<01:33,  3.73s/it]

Compression: Reduced series of length 5000 to 1590 segments. Digitization: Reduced 1590 pieces to 47 symbols.


 76%|███████▌  | 76/100 [04:29<01:26,  3.60s/it]

Compression: Reduced series of length 5000 to 1555 segments. Digitization: Reduced 1555 pieces to 43 symbols.


 77%|███████▋  | 77/100 [04:32<01:21,  3.55s/it]

Compression: Reduced series of length 5000 to 1408 segments. Digitization: Reduced 1408 pieces to 50 symbols.


 78%|███████▊  | 78/100 [04:35<01:15,  3.44s/it]

Compression: Reduced series of length 5000 to 1573 segments. Digitization: Reduced 1573 pieces to 47 symbols.


 79%|███████▉  | 79/100 [04:39<01:11,  3.40s/it]

Compression: Reduced series of length 5000 to 1583 segments. Digitization: Reduced 1583 pieces to 46 symbols.


 80%|████████  | 80/100 [04:42<01:06,  3.34s/it]

Compression: Reduced series of length 5000 to 1572 segments. Digitization: Reduced 1572 pieces to 45 symbols.


 81%|████████  | 81/100 [04:45<01:03,  3.32s/it]

Compression: Reduced series of length 5000 to 1575 segments. Digitization: Reduced 1575 pieces to 48 symbols.


 82%|████████▏ | 82/100 [04:48<00:59,  3.29s/it]

Compression: Reduced series of length 5000 to 1611 segments. Digitization: Reduced 1611 pieces to 52 symbols.


 83%|████████▎ | 83/100 [04:52<00:56,  3.34s/it]

Compression: Reduced series of length 5000 to 1584 segments. Digitization: Reduced 1584 pieces to 49 symbols.


 84%|████████▍ | 84/100 [04:55<00:53,  3.34s/it]

Compression: Reduced series of length 5000 to 1665 segments. Digitization: Reduced 1665 pieces to 52 symbols.


 85%|████████▌ | 85/100 [04:59<00:50,  3.36s/it]

Compression: Reduced series of length 5000 to 1600 segments. Digitization: Reduced 1600 pieces to 52 symbols.


 86%|████████▌ | 86/100 [05:02<00:47,  3.38s/it]

Compression: Reduced series of length 5000 to 1619 segments. Digitization: Reduced 1619 pieces to 54 symbols.


 87%|████████▋ | 87/100 [05:05<00:43,  3.37s/it]

Compression: Reduced series of length 5000 to 1556 segments. Digitization: Reduced 1556 pieces to 52 symbols.


 88%|████████▊ | 88/100 [05:09<00:40,  3.34s/it]

Compression: Reduced series of length 5000 to 1620 segments. Digitization: Reduced 1620 pieces to 41 symbols.


 89%|████████▉ | 89/100 [05:12<00:36,  3.29s/it]

Compression: Reduced series of length 5000 to 1523 segments. Digitization: Reduced 1523 pieces to 49 symbols.


 90%|█████████ | 90/100 [05:15<00:33,  3.38s/it]

Compression: Reduced series of length 5000 to 1531 segments. Digitization: Reduced 1531 pieces to 48 symbols.


 91%|█████████ | 91/100 [05:19<00:29,  3.33s/it]

Compression: Reduced series of length 5000 to 1579 segments. Digitization: Reduced 1579 pieces to 52 symbols.


 92%|█████████▏| 92/100 [05:22<00:26,  3.34s/it]

Compression: Reduced series of length 5000 to 1528 segments. Digitization: Reduced 1528 pieces to 36 symbols.


 93%|█████████▎| 93/100 [05:25<00:22,  3.28s/it]

Compression: Reduced series of length 5000 to 1516 segments. Digitization: Reduced 1516 pieces to 50 symbols.


 94%|█████████▍| 94/100 [05:28<00:19,  3.27s/it]

Compression: Reduced series of length 5000 to 1563 segments. Digitization: Reduced 1563 pieces to 55 symbols.


 95%|█████████▌| 95/100 [05:32<00:16,  3.29s/it]

Compression: Reduced series of length 5000 to 1656 segments. Digitization: Reduced 1656 pieces to 48 symbols.


 96%|█████████▌| 96/100 [05:35<00:13,  3.29s/it]

Compression: Reduced series of length 5000 to 1612 segments. Digitization: Reduced 1612 pieces to 47 symbols.


 97%|█████████▋| 97/100 [05:38<00:09,  3.27s/it]

Compression: Reduced series of length 5000 to 1554 segments. Digitization: Reduced 1554 pieces to 46 symbols.


 98%|█████████▊| 98/100 [05:41<00:06,  3.23s/it]

Compression: Reduced series of length 5000 to 1565 segments. Digitization: Reduced 1565 pieces to 43 symbols.


 99%|█████████▉| 99/100 [05:44<00:03,  3.18s/it]

Compression: Reduced series of length 5000 to 1603 segments. Digitization: Reduced 1603 pieces to 49 symbols.


100%|██████████| 100/100 [05:48<00:00,  3.48s/it]


In [3]:
print("Average compression rate:", np.mean(compression_rate))

Average compression rate: 0.3117


In [4]:
pd.DataFrame(clustering_mse).to_csv("results/COM_MSE.csv",index=False)
pd.DataFrame(clustering_dtw).to_csv("results/COM_DTW.csv",index=False)
pd.DataFrame(clustering_time).to_csv("results/COM_RUNTIME.csv",index=False)
pd.DataFrame(clustering_symbols).to_csv("results/COM_SYMBOLS.csv",index=False)