# Test surrogate augmentation on InceptionTimeClassifier


In [21]:
# make some imports 
import warnings
warnings.filterwarnings("ignore")  # hide some annoying deprication warnings

import numpy as np
import pandas as pd

import random

import torch
import torch.utils.data as data_utils  

import os
import tqdm


random.seed(24569)

## Download Data

In [30]:
# Load Data

import load_data as dataloader
from pyts.datasets import ucr_dataset_list

dataset_name_list = ucr_dataset_list()
print(dataset_name_list)  
CACHED_DATA_FOLDER = os.path.join(os.path.dirname(os.getcwd()), "Data")
dataset_list = []
for dataset_name in tqdm.tqdm(dataset_name_list):
    cache_path = os.path.join(CACHED_DATA_FOLDER, dataset_name)
    datset = dataloader.fetch_ucr_dataset(dataset=dataset_name, use_cache=True, data_home=cache_path)
    dataset_list.append(datset)


## Create pandas dataframe
dataset_list_binary = []
dataset_train_size = []
dataset_test_size = []
datset_length = []
binary_dataset_name = []
test_balance = []
num_classes = []

for i,dataset_object in enumerate(dataset_list):
    # Filter the datasets depending on number of classes
    nclasses = len(np.unique(dataset_object['target_train']))
    #if num_clases < 3:
    
    name = dataset_name_list[i]
    dataset_list_binary.append(dataset_object)
    data_length = dataset_object['data_train'].shape[1]
    train_size = dataset_object['data_train'].shape[0]
    test_size = dataset_object['data_test'].shape[0]
    (labels,counts) = np.unique(dataset_object['target_test'],return_counts=True)
    test_proportion = counts[0]/(counts[0]+counts[1])

    datset_length.append(data_length)
    dataset_train_size.append(train_size)
    dataset_test_size.append(test_size)
    binary_dataset_name.append(name)
    test_balance.append(test_proportion)
    num_classes.append(nclasses)

meta_data = {'name': binary_dataset_name, 'train_size': dataset_train_size, 'test_size': dataset_test_size,'length':datset_length, 'test_balance':test_balance, "nr_classes":num_classes}
meta_df = pd.DataFrame(data=meta_data)
print(meta_df)



['ACSF1', 'Adiac', 'AllGestureWiimoteX', 'AllGestureWiimoteY', 'AllGestureWiimoteZ', 'ArrowHead', 'BME', 'Beef', 'BeetleFly', 'BirdChicken', 'CBF', 'Car', 'Chinatown', 'ChlorineConcentration', 'CinCECGtorso', 'Coffee', 'Computers', 'CricketX', 'CricketY', 'CricketZ', 'Crop', 'DiatomSizeReduction', 'DistalPhalanxOutlineAgeGroup', 'DistalPhalanxOutlineCorrect', 'DistalPhalanxTW', 'DodgerLoopDay', 'DodgerLoopGame', 'DodgerLoopWeekend', 'ECG200', 'ECG5000', 'ECGFiveDays', 'EOGHorizontalSignal', 'EOGVerticalSignal', 'Earthquakes', 'ElectricDevices', 'EthanolLevel', 'FaceAll', 'FaceFour', 'FacesUCR', 'FiftyWords', 'Fish', 'FordA', 'FordB', 'FreezerRegularTrain', 'FreezerSmallTrain', 'Fungi', 'GestureMidAirD1', 'GestureMidAirD2', 'GestureMidAirD3', 'GesturePebbleZ1', 'GesturePebbleZ2', 'GunPoint', 'GunPointAgeSpan', 'GunPointMaleVersusFemale', 'GunPointOldVersusYoung', 'Ham', 'HandOutlines', 'Haptics', 'Herring', 'HouseTwenty', 'InlineSkate', 'InsectEPGRegularTrain', 'InsectEPGSmallTrain', 'I

  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 128/128 [01:31<00:00,  1.40it/s]


                   name  train_size  test_size  length  test_balance   
0                 ACSF1         100        100    1460      0.500000  \
1                 Adiac         390        391     176      0.350000   
2    AllGestureWiimoteX         300        700     500      0.500000   
3    AllGestureWiimoteY         300        700     500      0.500000   
4    AllGestureWiimoteZ         300        700     500      0.500000   
..                  ...         ...        ...     ...           ...   
123                Wine          57         54     234      0.500000   
124        WordSynonyms         267        638     270      0.090909   
125               Worms         181         77     900      0.717391   
126       WormsTwoClass         181         77     900      0.428571   
127                Yoga         300       3000     426      0.464333   

     nr_classes  
0            10  
1            37  
2            10  
3            10  
4            10  
..          ...  
123      

### Filter out datasets of reasonable sizes

In [33]:


reasonable_datasets = meta_df.loc[(meta_df['train_size']>200)&(meta_df['train_size']<2000)& (meta_df['test_size']<2*meta_df['train_size']) & (meta_df['length']<1000)]
print(reasonable_datasets.shape[0])
print(reasonable_datasets)


31
                               name  train_size  test_size  length   
1                             Adiac         390        391     176  \
16                        Computers         250        250     720   
17                         CricketX         390        390     300   
18                         CricketY         390        390     300   
19                         CricketZ         390        390     300   
22     DistalPhalanxOutlineAgeGroup         400        139      80   
23      DistalPhalanxOutlineCorrect         600        276      80   
24                  DistalPhalanxTW         400        139      80   
33                      Earthquakes         322        139     512   
39                       FiftyWords         450        455     270   
46                  GestureMidAirD1         208        130     360   
47                  GestureMidAirD2         208        130     360   
48                  GestureMidAirD3         208        130     360   
65           Larg

### Create Training/Testing Datasets

In [34]:

indx = 91    #corresponds to ProximalPhalanxOutlineAgeGroup dataset

dataset_obj = dataset_list[indx]

x_train = dataset_obj['data_train']
y_train = dataset_obj['target_train']
x_test = dataset_obj['data_test']
y_test = dataset_obj['target_test']



### Create Training/Testing Datasets

### Load data using sktime.datasets

In [2]:

from sktime.datasets import load_UCR_UEA_dataset
from sklearn.metrics import accuracy_score

dataset = "Car"
X_train, y_train = load_UCR_UEA_dataset(name=dataset, split="train", return_X_y=True, return_type="numpy2d")
X_test, y_test = load_UCR_UEA_dataset(name=dataset, split="test", return_X_y=True, return_type="numpy2d")

print("Training data shape is", X_train.shape)


Training data shape is (60, 577)


## Train and evaluate Model

In [None]:
def stop_training(val_losses):
    """This function is to be used for early stopping during training. 
    Input: val_loss, an array containing the calculated validation losses over epochs. 
    Output: returns True if the training should stop, False if the training should continue"""

    # If function is called in the early stages of training
    if len(val_losses)<10:
        return False
    
    # Check if validation loss or accuracy loss have basically stopped changing the last 5 epochs
    tolerance = 1e-3    # saw docs for stopping_tolerance by h20.ai, they used this as default tolerance
    n_epochs = len(val_losses)
    loss_avg = np.mean(val_losses[n_epochs-7:n_epochs-2])

    if np.abs(val_losses[-1]-loss_avg) < tolerance:
        return True
    else:
        return False


In [50]:
from sktime.classification.deep_learning import InceptionTimeClassifier
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
from keras.callbacks import Callback
from sklearn.model_selection import cross_validate

# Train InceptionTimeClassifier on normal data
N_EPOCHS = 100


# class MyCallback(keras.callback.Callback):
#     def on_train_begin(self, logs={}):
#         self.history = {"loss":[], "val_loss":[]}
#     def on_batch_end()


early_stopping_callback = EarlyStopping(monitor="loss", verbose=1, patience=5, start_from_epoch=50, min_delta=1e-3)
callbacks=[early_stopping_callback]

ITC = InceptionTimeClassifier(n_epochs=N_EPOCHS, verbose=1, kernel_size=16, n_filters=16, bottleneck_size=16)
cross_val_obj = cross_validate(ITC, X=x_test, y=y_test)
ITC.fit_predict(x_train, y_train, cv=cross_val_obj)


print(ITC.summary())

# Predict and evaluate on test set
y_preditions = ITC.predict(x_test)


accuracy_score(y_test, y_preditions)

Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_16 (InputLayer)          [(None, 80, 1)]      0           []                               
                                                                                                  
 max_pooling1d_90 (MaxPooling1D  (None, 80, 1)       0           ['input_16[0][0]']               
 )                                                                                                
                                                                                                  
 conv1d_465 (Conv1D)            (None, 80, 16)       256         ['input_16[0][0]']               
                                                                                                  
 conv1d_466 (Conv1D)            (None, 80, 16)       128         ['input_16[0][0]']        

AttributeError: 'dict' object has no attribute 'split'

## Test surrogate augmentation on InceptionTime

In [None]:
# Create a subset of the training data to be augmented

from copy import deepcopy

N = X_train.shape[0]
p = 0.2     # percentage of data to augment
rows_to_augment = random.sample(range(0,N), int(N*p))

X_subset = deepcopy(X_test[rows_to_augment,:])
y_subset = deepcopy(y_train[rows_to_augment])



In [None]:
from surrogate_augmentation import correlated_noise_surrogates, AAFT_surrogates, refined_AAFT_surrogates

# Correlated noise surrogate
X_subset_augmented = correlated_noise_surrogates(X_subset)
X_train_augmented = np.concatenate((X_train, X_subset_augmented))
y_train_augmented = np.concatenate((y_train, y_subset))


ITC2 = InceptionTimeClassifier(n_epochs=N_EPOCHS, verbose=1)
ITC2.fit(X_train_augmented, y_train_augmented)
y_preditions = ITC2.predict(X_test)
accuracy_score(y_test, y_preditions)


In [None]:
# AFFT surrogates

X_subset_augmented2 = AAFT_surrogates(X_subset)
X_train_augmented2 = np.concatenate((X_train, X_subset_augmented2))

ITC3 = InceptionTimeClassifier(n_epochs=N_EPOCHS, verbose=1)
ITC3.fit(X_train_augmented2, y_train_augmented)
y_preditions = ITC3.predict(X_test)
accuracy_score(y_test, y_preditions)


In [None]:
# AAFT refined surrogates
n_iters = 20

X_subset_augmented3 = refined_AAFT_surrogates(X_subset, n_iterations=n_iters)
X_train_augmented3 = np.concatenate((X_train, X_subset_augmented3), axis=0)

ITC4 = InceptionTimeClassifier(n_epochs=N_EPOCHS, verbose=1, batch_size=BATCH_SIZE)
ITC4.fit(X_train_augmented3, y_train_augmented)
y_preditions = ITC4.predict(X_test)
accuracy_score(y_test, y_preditions)


