In [1]:
import os
import csv
import time
import pickle
import warnings
import numpy as np
import pandas as pd
from src.fABBA_test import fABBA
from src.ABBA import ABBA
from src.mydefaults import mydefaults
from collections import defaultdict
from tslearn.metrics import dtw as dtw
warnings.filterwarnings('ignore')

# Root folder that is searched recursively for .tsv data files.
datadir = 'UCRArchive_2018/'

# Candidate compression tolerances: the first ten multiples of 0.05.
tol = [0.05 * step for step in range(1, 11)]

# Every line of every .tsv file below datadir is counted as one time series.
ts_count = 0
for folder, _subfolders, filenames in os.walk(datadir):
    for filename in filenames:
        if not filename.endswith('tsv'):
            continue
        with open(os.path.join(folder, filename)) as handle:
            ts_count += len(handle.readlines())

print('Number of time series:', ts_count)


# Values of the fABBA parameter alpha that are run for every time series.
alphas = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

# Sorting strategies passed to fABBA; one result file is written per entry.
sorting_methods = ['2-norm']


def _normalize_series(column):
    """Turn one row of a .tsv file into a centred, scaled series.

    All fields are converted to float, the first field is dropped
    (presumably the class label -- TODO confirm against the data format),
    and NaN entries are removed.  The remaining values are centred and
    divided by their sample standard deviation (ddof=1); a standard
    deviation not larger than machine epsilon is replaced by 1.

    Returns None when fewer than 100 values remain, so that the caller
    skips the series.
    """
    ts = np.array([float(field) for field in column][1:])
    ts = ts[~np.isnan(ts)]
    if len(ts) < 100:
        return None

    norm_ts = ts - np.mean(ts)
    std = np.std(norm_ts, ddof=1)
    std = std if std > np.finfo(float).eps else 1
    norm_ts /= std
    return norm_ts


def _compress_with_first_valid_tol(norm_ts):
    """Compress norm_ts with ABBA using the first sufficient tolerance.

    The tolerances in the module-level list `tol` are tried in order and
    the first one that yields at most len(norm_ts)/5 pieces is accepted.

    Returns a tuple (pieces, tolerance), or None when no tolerance in the
    list reaches that compression rate.
    """
    for candidate in tol:
        pieces = ABBA(tol=candidate, verbose=0).compress(norm_ts)
        if len(pieces) <= len(norm_ts)/5:
            return pieces, candidate
    return None


for sortname in sorting_methods:
    print('\n sorting method:', sortname)

    # One slot per (time series, alpha) pair.  Skipped series leave unused
    # slots at the end, which keep their NaN / '' fill values.
    # np.nan is used because the alias np.NaN no longer exists in NumPy 2.0.
    n_slots = len(alphas)*ts_count
    D_fABBA_2 = n_slots*[np.nan]
    D_fABBA_DTW = n_slots*[np.nan]
    D_fABBA_time = n_slots*[np.nan]
    alphalist = n_slots*[np.nan]
    countlist = n_slots*[np.nan]

    ts_name = n_slots*['']  # time series name for debugging
    tol_used = n_slots*[np.nan]  # Store tol used

    csymbolicNum = n_slots*[np.nan]  # Store amount of symbols
    cpiecesNum = n_slots*[np.nan]  # Store amount of pieces after compression
    ctsname = n_slots*['']

    index = 0
    for root, dirs, files in os.walk(datadir):
        for file in files:
            if not file.endswith('tsv'):
                continue
            print(' file:', file)
            with open(os.path.join(root, file)) as tsvfile:
                for ind, column in enumerate(csv.reader(tsvfile, delimiter='\t')):
                    # Parsing and compression do not depend on alpha, so
                    # they are done once per series rather than per alpha.
                    norm_ts = _normalize_series(column)
                    if norm_ts is None:
                        continue

                    compressed = _compress_with_first_valid_tol(norm_ts)
                    if compressed is None:
                        continue  # uniform to performance profiles test!
                    pieces, series_tol = compressed

                    series_id = str(file) + '_' + str(ind)

                    for alpha in alphas:
                        fabba = fABBA(verbose=0, alpha=alpha, scl=1, sorting=sortname)

                        # Only the digitization step is timed.  perf_counter
                        # is the clock intended for short intervals.
                        st = time.perf_counter()
                        symbolic_tsf = fabba.digitize(pieces[:,:2])
                        ed = time.perf_counter()
                        time_fabba = ed - st

                        ts_fABBA = fabba.inverse_transform(symbolic_tsf, norm_ts[0])

                        # Plain assignment: names of skipped series must not
                        # accumulate in the slot of the next stored row.
                        ts_name[index] = series_id
                        ctsname[index] = series_id
                        tol_used[index] = series_tol
                        csymbolicNum[index] = len(set(symbolic_tsf))
                        cpiecesNum[index] = len(pieces)

                        D_fABBA_2[index] = np.linalg.norm(norm_ts - ts_fABBA)
                        D_fABBA_DTW[index] = dtw(norm_ts, ts_fABBA)
                        D_fABBA_time[index] = time_fabba
                        alphalist[index] = alpha
                        # NOTE(review): nr_dist is presumably the number of
                        # distance evaluations done by fABBA -- confirm.
                        countlist[index] = fabba.nr_dist

                        index += 1

    Datastore = pd.DataFrame({
        'ts name': ctsname,
        'number of pieces': cpiecesNum,
        'number of symbols': csymbolicNum,
        'tol': tol_used,
        'fABBA_2': D_fABBA_2,
        'fABBA_DTW': D_fABBA_DTW,
        'fABBA_time': D_fABBA_time,
        'alpha': alphalist,
        'count': countlist,
    })

    # Create the output folder so a fresh checkout does not fail on to_csv.
    os.makedirs('results', exist_ok=True)
    Datastore.to_csv('results/count_rate'+sortname+'.csv',index=False)

Number of time series: 201161

 sorting method: 2-norm
 file: DodgerLoopWeekend_TRAIN.tsv
 file: DodgerLoopWeekend_TEST.tsv
 file: Wine_TRAIN.tsv
 file: Wine_TEST.tsv
 file: Strawberry_TRAIN.tsv
 file: Strawberry_TEST.tsv
 file: ItalyPowerDemand_TRAIN.tsv
 file: ItalyPowerDemand_TEST.tsv
 file: PLAID_TRAIN.tsv
 file: PLAID_TEST.tsv
 file: BirdChicken_TEST.tsv
 file: BirdChicken_TRAIN.tsv
 file: ProximalPhalanxTW_TEST.tsv
 file: ProximalPhalanxTW_TRAIN.tsv
 file: Symbols_TRAIN.tsv
 file: Symbols_TEST.tsv
 file: FiftyWords_TRAIN.tsv
 file: FiftyWords_TEST.tsv
 file: ChlorineConcentration_TEST.tsv
 file: ChlorineConcentration_TRAIN.tsv
 file: Lightning2_TRAIN.tsv
 file: Lightning2_TEST.tsv
 file: UWaveGestureLibraryX_TEST.tsv
 file: UWaveGestureLibraryX_TRAIN.tsv
 file: FacesUCR_TRAIN.tsv
 file: FacesUCR_TEST.tsv
 file: Plane_TEST.tsv
 file: Plane_TRAIN.tsv
 file: RefrigerationDevices_TRAIN.tsv
 file: RefrigerationDevices_TEST.tsv
 file: DodgerLoopWeekend_TRAIN.tsv
 file: DodgerLoopWeeken