In [1]:
# Nestor Cabello, Elham Naghizade, Jianzhong Qi, Lars Kulik

# Cabello N, Naghizade E, Qi J, Kulik L (2021) Fast, Accurate and Interpretable Time Series Classification 
# Through Randomization.


from rSTSF_functions import *

In [2]:
## 85 benchmark datasets from the UCR repository (http://timeseriesclassification.com)
# dset_names = ["Adiac","ArrowHead","Beef","BeetleFly","BirdChicken","Car","CBF",
#               "ChlorineConcentration","CinCECGTorso","Coffee","Computers","CricketX",
#               "CricketY","CricketZ","DiatomSizeReduction",
#               "DistalPhalanxOutlineAgeGroup","DistalPhalanxOutlineCorrect",
#               "DistalPhalanxTW","Earthquakes","ECG200","ECG5000","ECGFiveDays","ElectricDevices",
#               "FaceAll","FaceFour","FacesUCR","FiftyWords","Fish","FordA","FordB",
#               "GunPoint","Ham","HandOutlines",
#               "Haptics","Herring","InlineSkate",
#               "InsectWingbeatSound","ItalyPowerDemand","LargeKitchenAppliances",
#               "Lightning2","Lightning7","Mallat","Meat","MedicalImages",
#               "MiddlePhalanxOutlineAgeGroup","MiddlePhalanxOutlineCorrect","MiddlePhalanxTW",
#               "MoteStrain","NonInvasiveFetalECGThorax1","NonInvasiveFetalECGThorax2","OliveOil","OSULeaf",
#               "PhalangesOutlinesCorrect",
#               "Phoneme","Plane",
#               "ProximalPhalanxOutlineAgeGroup","ProximalPhalanxOutlineCorrect","ProximalPhalanxTW",
#               "RefrigerationDevices","ScreenType",
#               "ShapeletSim","ShapesAll","SmallKitchenAppliances",
#               "SonyAIBORobotSurface1","SonyAIBORobotSurface2","StarLightCurves",
#               "Strawberry","SwedishLeaf","Symbols","SyntheticControl",
#               "ToeSegmentation1","ToeSegmentation2","Trace","TwoLeadECG","TwoPatterns",
#               "UWaveGestureLibraryAll","UWaveGestureLibraryX","UWaveGestureLibraryY","UWaveGestureLibraryZ",
#               "Wafer","Wine","WordSynonyms","Worms","WormsTwoClass","Yoga"]

## 43 additional datasets from the UCR repository (http://timeseriesclassification.com)
# dset_names = ["ACSF1", "AllGestureWiimoteX","AllGestureWiimoteY","AllGestureWiimoteZ",
#               "BME","Chinatown","Crop", "DodgerLoopDay","DodgerLoopGame","DodgerLoopWeekend",
#               "EOGHorizontalSignal","EOGVerticalSignal","EthanolLevel","FreezerRegularTrain","FreezerSmallTrain",
#               "Fungi", "GestureMidAirD1", "GestureMidAirD2", "GestureMidAirD3", "GesturePebbleZ1", "GesturePebbleZ2",
#               "GunPointAgeSpan","GunPointMaleVersusFemale","GunPointOldVersusYoung",
#               "HouseTwenty","InsectEPGRegularTrain","InsectEPGSmallTrain",
#               "MelbournePedestrian","MixedShapesRegularTrain","MixedShapesSmallTrain",
#               "PLAID","PickupGestureWiimoteZ","PigAirwayPressure","PigArtPressure",
#               "PigCVP","PowerCons","Rock","SemgHandGenderCh2","SemgHandMovementCh2","SemgHandSubjectCh2",
#               "ShakeGestureWiimoteZ","SmoothSubspace","UMD"]

In [10]:
dset_names = ["ItalyPowerDemand"]
# dset_names = ["ItalyPowerDemand","LargeKitchenAppliances","SonyAIBORobotSurface2","ECG200"]

agg_fns = [np.mean, np.std, np.polyfit, np.median, np.min, np.max, iqr, np.percentile, np.quantile]
# np.percentile and np.quantile are just used as identifiers for 
# count mean-crossings and count of values above mean statistics. See function getIntervalFeature(...)

nruns = 10
repr_types = [1,2,3,4] # 1: Raw series, 2: Periodogram, 3: First-order Difference, 4: Autoregressive
d = 50 # Number of sets of candidate discriminatory interval features to compute
r = 500 # Number of trees

accuracies = np.zeros((len(dset_names),nruns))
training_times = []
testing_times = []

# ---------------------------------------------------------------------------
# Main experiment loop
#
# For every dataset, repeat `nruns` times:
#   1. build the training representations and d sets of candidate interval
#      features,
#   2. fit an ExtraTrees ensemble on the merged interval-based representation,
#   3. transform the test set using only the features the trees actually split
#      on, predict, and record accuracy plus training/testing wall-clock time.
# Results are written into `accuracies`, `training_times` and `testing_times`.
# ---------------------------------------------------------------------------
for cont_dsets, dset_name in enumerate(dset_names):
    print("Dataset: ", dset_name)

    X_train_ori, y_train_ori, X_test, y_test = getTrainTestSets(dset_name)

    # Per-run timings for this dataset; averaged after the last run.
    inner_training_time = []
    inner_testing_time = []

    for nrun in range(nruns):
        print('run ',str(nrun+1))
        train_start = time.perf_counter()

        # For unbalanced datasets --> oversampling. The call also returns the
        # periodogram, difference and autoregressive representations of the
        # training series alongside the raw one.
        X_train, per_X_train, diff_X_train, ar_X_train, y_train = dataAugmented(X_train_ori,y_train_ori)

        # Candidate interval features are ranked with the FisherScore metric,
        # which requires all features to be z-normalized.
        X_train_norm = zscore(X_train, axis=0, ddof=1)
        X_train_norm[np.isnan(X_train_norm)] = 0 # NaN values (if any) are set to zero
        per_X_train_norm = getPeriodogramRepr(X_train_norm)
        diff_X_train_norm = np.diff(X_train_norm)

        ar_X_train_norm = ar_coefs(X_train_norm)
        ar_X_train_norm[np.isnan(ar_X_train_norm)] = 0 # NaN values (if any) are set to zero

        # Collect the d interval-based representations in a list and stack
        # them once at the end: calling np.hstack inside the loop re-copies
        # the whole accumulated matrix on every iteration.
        interval_feature_blocks = []
        all_candidate_agg_feats = []

        for _ in range(d): # Compute d sets of candidate discriminatory interval features
            candidate_agg_feats, X_train_T = getAllCandidateAggFeats(X_train, y_train, agg_fns, repr_types,
                                                                     per_X_train, diff_X_train, ar_X_train,
                                                                     X_train_norm, per_X_train_norm, diff_X_train_norm,
                                                                     ar_X_train_norm)

            interval_feature_blocks.append(X_train_T)
            all_candidate_agg_feats.extend(candidate_agg_feats)

        # Merge all computed interval-based representations column-wise.
        all_X_train_T = np.hstack(interval_feature_blocks)

        clf = ExtraTreesClassifier(n_estimators=r,criterion='entropy',class_weight='balanced',max_features='sqrt')
        clf.fit(all_X_train_T, y_train) # Train the ensemble of ET classifiers

        current_training_time = time.perf_counter()-train_start
        inner_training_time.append(current_training_time)
        print("training time: ", current_training_time)

        test_start = time.perf_counter()

        per_X_test = getPeriodogramRepr(X_test)
        diff_X_test = np.diff(X_test)
        ar_X_test = ar_coefs(X_test)
        ar_X_test[np.isnan(ar_X_test)] = 0

        # The testing set has to be transformed into an interval-based
        # representation. Only the features actually used by the trained trees
        # (i.e., those appearing in tree nodes) are needed; negative entries of
        # tree_.feature are filtered out because they do not index a feature.
        relevant_caf_idx = []
        for dt_tree in clf.estimators_:
            caf_idx_to_train = dt_tree.tree_.feature
            relevant_caf_idx.extend(caf_idx_to_train[caf_idx_to_train>=0])
        relevant_caf_idx = np.unique(relevant_caf_idx)

        X_test_T = getIntervalBasedTransform(X_test, per_X_test, diff_X_test, ar_X_test, all_candidate_agg_feats, relevant_caf_idx)
        y_pred = clf.predict(X_test_T)

        current_testing_time = time.perf_counter()-test_start
        inner_testing_time.append(current_testing_time)
        print("testing time: ", current_testing_time)

        # Fraction of test series whose predicted label matches the true label.
        accu = np.sum(y_pred==y_test)/len(y_test)
        print('accuracy: ', accu)
        accuracies[cont_dsets,nrun] = accu

    avg_accuracy_this_dataset = np.mean(accuracies[cont_dsets,:])
    print('avg accuracy for ' + str(nruns) + ' runs: ' , avg_accuracy_this_dataset)

    training_times.append(np.mean(inner_training_time))
    testing_times.append(np.mean(inner_testing_time))

    print("\n")


## Build the results table and write it to a timestamped CSV file.
# The per-run columns ('run1', 'run2', ...) are generated from `nruns`, so
# nothing needs to be commented/uncommented when the number of runs changes.
import os

run_columns = ['run' + str(run_idx + 1) for run_idx in range(nruns)]

columns = {'Dataset': dset_names}
for run_idx, run_column in enumerate(run_columns):
    columns[run_column] = accuracies[:, run_idx]
columns['avgAccu'] = np.mean(accuracies, axis=1)
columns['avgTrainTime'] = np.array(training_times)
columns['avgTestTime'] = np.array(testing_times)

dfResults = pd.DataFrame(columns)
# Fix the column order explicitly: dataset name, one column per run, averages.
dfResults = dfResults[['Dataset'] + run_columns + ['avgAccu','avgTrainTime','avgTestTime']]

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs("output", exist_ok=True)
dfResults.to_csv("output/r-STSF_experiment_run_at_" + time.strftime("%d-%m-%Y %H%M%S") + ".csv",encoding='utf-8' ,index=False)

Dataset:  ItalyPowerDemand
run  1
training time:  1.581384989000071
testing time:  0.6833054840026307
accuracy:  0.9727891156462585
run  2
training time:  1.814018474000477
testing time:  0.608274900001561
accuracy:  0.9737609329446064
run  3
training time:  1.5461253920002491
testing time:  1.705918808998831
accuracy:  0.9718172983479106
run  4
training time:  1.5608163290016819
testing time:  0.6141648250013532
accuracy:  0.9737609329446064
run  5
training time:  1.5199255789993913
testing time:  0.6116699260019232
accuracy:  0.9737609329446064
run  6
training time:  1.6070880990009755
testing time:  1.0241273009996803
accuracy:  0.9737609329446064
run  7
training time:  3.4471930920008163
testing time:  1.0309765969977889
accuracy:  0.9737609329446064
run  8
training time:  2.059693645001971
testing time:  1.4476360280023073
accuracy:  0.9727891156462585
run  9
training time:  2.4981720299983863
testing time:  0.7756279839995841
accuracy:  0.9718172983479106
run  10
training time:  