# DD2430 Project: Data Augmentation Using Surrogates

In [None]:
# Imports


import tqdm
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import sys
import os
import pyts

### Functions for loading or downloading data

In [None]:

#!pip install pyts
#!pip install sktime

#@title UCR auxiliary functions
"""
Utility functions for the UCR multivariate time series classification
archive.
"""
# Author: Johann Faouzi <johann.faouzi@gmail.com>
# License: BSD-3-Clause

import numpy as np
import os
import pickle
from scipy.io.arff import loadarff
from sklearn.utils import Bunch
from urllib.request import urlretrieve
import zipfile


def _correct_ucr_name_download(dataset):
    """Return the spelling of ``dataset`` used when building download URLs.

    A handful of datasets are listed under one name but stored under
    another; any name without a special spelling is returned unchanged.
    """
    # (listed name, name used for the download)
    renamed = (
        ('CinCECGtorso', 'CinCECGTorso'),
        ('MixedShapes', 'MixedShapesRegularTrain'),
        ('NonInvasiveFetalECGThorax1', 'NonInvasiveFatalECGThorax1'),
        ('NonInvasiveFetalECGThorax2', 'NonInvasiveFatalECGThorax2'),
        ('StarlightCurves', 'StarLightCurves'),
    )
    for listed_name, download_name in renamed:
        if dataset == listed_name:
            return download_name
    return dataset


def _correct_ucr_name_description(dataset):
    """Return the listed spelling of a dataset given its download spelling.

    This is the inverse of the renaming applied for downloads; any name
    without a special spelling is returned unchanged.
    """
    # (name used for the download, listed name)
    renamed = (
        ('CinCECGTorso', 'CinCECGtorso'),
        ('MixedShapesRegularTrain', 'MixedShapes'),
        ('NonInvasiveFatalECGThorax1', 'NonInvasiveFetalECGThorax1'),
        ('NonInvasiveFatalECGThorax2', 'NonInvasiveFetalECGThorax2'),
        ('StarLightCurves', 'StarlightCurves'),
    )
    for download_name, listed_name in renamed:
        if dataset == download_name:
            return listed_name
    return dataset


def ucr_dataset_list():
    """List of available UCR datasets.

    The names are the keys of the ``info/ucr.pickle`` dictionary stored
    next to this module, returned in sorted order.

    Returns
    -------
    datasets : list
        List of available datasets from the UCR Time Series
        Classification Archive.

    References
    ----------
    .. [1] `List of datasets on the UEA & UCR archive
           <http://www.timeseriesclassification.com/dataset.php>`_

    Examples
    --------
    >>> from pyts.datasets import ucr_dataset_list
    >>> ucr_dataset_list()[:3]
    ['ACSF1', 'Adiac', 'AllGestureWiimoteX']

    """
    # NOTE(review): relies on ``__file__``, so this only works when the code
    # lives in a module file that has an ``info/ucr.pickle`` beside it.
    module_path = os.path.dirname(__file__)
    finfo = os.path.join(module_path, 'info', 'ucr.pickle')
    # Use a context manager so the file handle is closed deterministically.
    with open(finfo, 'rb') as f:
        dictionary = pickle.load(f)
    datasets = sorted(dictionary.keys())
    return datasets


def ucr_dataset_info(dataset=None):
    """Information about the UCR datasets.

    The information is read from the ``info/ucr.pickle`` dictionary stored
    next to this module.

    Parameters
    ----------
    dataset : str, list of str or None (default = None)
        The data sets for which the information will be returned.
        If None, the information for all the datasets is returned.

    Returns
    -------
    dictionary : dict
        Dictionary with the information for each dataset. For a single
        name, the entry of that dataset is returned directly. For any
        other argument type, None is returned.

    Raises
    ------
    ValueError
        If a requested name is not a key of the information dictionary.

    References
    ----------
    .. [1] `List of datasets on the UEA & UCR archive
           <http://www.timeseriesclassification.com/dataset.php>`_

    Examples
    --------
    >>> from pyts.datasets import ucr_dataset_info
    >>> ucr_dataset_info('Adiac')['n_classes']
    37

    """
    module_path = os.path.dirname(__file__)
    finfo = os.path.join(module_path, 'info', 'ucr.pickle')
    # Use a context manager so the file handle is closed deterministically.
    with open(finfo, 'rb') as f:
        dictionary = pickle.load(f)
    datasets = list(dictionary.keys())

    if dataset is None:
        return dictionary

    if isinstance(dataset, str):
        if dataset not in datasets:
            raise ValueError(
                "{0} is not a valid name. The list of available names "
                "can be obtained by calling the "
                "'pyts.datasets.ucr_dataset_list' function."
                .format(dataset)
            )
        return dictionary[dataset]

    if isinstance(dataset, (list, tuple, np.ndarray)):
        dataset = np.asarray(dataset)
        # Names requested by the caller that are unknown to the archive.
        invalid_datasets = np.setdiff1d(dataset, datasets)
        if invalid_datasets.size > 0:
            raise ValueError(
                "The following names are not valid: {0}. The list of "
                "available names can be obtained by calling the "
                "'pyts.datasets.ucr_dataset_list' function."
                .format(invalid_datasets)
            )
        return {data: dictionary[data] for data in dataset}

    # Unsupported argument types fall through without an error, as before.
    return None


def fetch_ucr_dataset(dataset, use_cache=True, data_home=None,
                      return_X_y=False):
    r"""Fetch dataset from UCR TSC Archive by name.

    Fetched data sets are saved in a cache folder (see ``data_home``).
    To avoid downloading the same data set several times, it is
    highly recommended not to change the default value of ``use_cache``.

    Parameters
    ----------
    dataset : str
        Name of the dataset.

    use_cache : bool (default = True)
        If True, look if the data set has already been fetched
        and load the fetched version if it is the case. If False,
        download the data set from the UCR Time Series Classification
        Archive.

    data_home : None or str (default = None)
        The path of the folder containing the cached data set.
        If None, the ``datasets/cached_datasets/UCR`` folder inside the
        installed ``pyts`` package is used. If the data set is not found,
        it is downloaded and cached in this path.

    return_X_y : bool (default = False)
        If True, returns ``(data_train, data_test, target_train, target_test)``
        instead of a Bunch object. See below for more information about the
        `data` and `target` object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array of integers
            The classification labels in the training set.
        target_test : array of integers
            The classification labels in the test set.
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the names returned by
        ``ucr_dataset_list()``.

    Notes
    -----
    Missing values are represented as NaN's.

    References
    ----------
    .. [1] H. A. Dau et al, "The UCR Time Series Archive".
           arXiv:1810.07758 [cs, stat], 2018.

    .. [2] A. Bagnall et al, "The UEA & UCR Time Series Classification
           Repository", www.timeseriesclassification.com.

    """  # noqa: E501
    if dataset not in ucr_dataset_list():
        raise ValueError(
            "{0} is not a valid name. The list of available names "
            "can be obtained with ``pyts.datasets.ucr_dataset_list()``"
            .format(dataset)
        )
    if data_home is None:
        import pyts
        # Build the default cache folder with os.path so that it does not
        # depend on '/' being the path separator.
        home = os.path.dirname(os.path.dirname(pyts.__file__))
        path = os.path.join(home, 'pyts', 'datasets', 'cached_datasets', 'UCR')
    else:
        path = data_home
    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs(path, exist_ok=True)

    correct_dataset = _correct_ucr_name_download(dataset)
    dataset_dir = os.path.join(path, correct_dataset)
    if not (use_cache and os.path.exists(dataset_dir)):
        # The archive has been served from different locations over time
        # (".../Downloads/" and ".../ClassificationDownloads/" were tried
        # before); this is the URL currently used by this notebook.
        url = ("https://www.timeseriesclassification.com/aeon-toolkit/{0}.zip"
               .format(correct_dataset))
        temp_file_path = os.path.join(path, 'temp_{}'.format(correct_dataset))
        try:
            urlretrieve(url, temp_file_path)
            # Close the archive explicitly instead of leaking the handle.
            with zipfile.ZipFile(temp_file_path) as archive:
                archive.extractall(dataset_dir)
        finally:
            # Never leave the temporary archive behind, even when the
            # download or the extraction fails.
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

    bunch = _load_ucr_dataset(correct_dataset, path=path)

    if return_X_y:
        return (bunch.data_train, bunch.data_test,
                bunch.target_train, bunch.target_test)
    return bunch


def _load_ucr_dataset(dataset, path):
    """Read an already-extracted UCR data set from disk.

    Parameters
    ----------
    dataset : str
        Name of the dataset; also the name of its sub-folder in ``path``
        and the prefix of every file inside that sub-folder.

    path : str
        The path of the folder containing the cached data set.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array
            The classification labels in the training set (integers when
            they can be converted, strings otherwise).
        target_test : array
            The classification labels in the test set (same rule).
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    Notes
    -----
    Padded values are represented as NaN's.

    """
    # Every file of the data set is named "<path>/<dataset>/<dataset><suffix>".
    file_prefix = os.path.join(path, dataset) + '/' + dataset

    # Description: try UTF-8 first, fall back to Latin-1 on decoding errors.
    description_file = file_prefix + '.txt'
    try:
        with open(description_file, encoding='utf-8') as handle:
            description = handle.read()
    except UnicodeDecodeError:
        with open(description_file, encoding='ISO-8859-1') as handle:
            description = handle.read()

    try:
        # Plain-text layout: first column is the label, the rest the series.
        train_table = np.genfromtxt(file_prefix + '_TRAIN.txt')
        test_table = np.genfromtxt(file_prefix + '_TEST.txt')

        y_train = train_table[:, 0]
        X_train = train_table[:, 1:]
        y_test = test_table[:, 0]
        X_test = test_table[:, 1:]

    except IndexError:
        # The text tables could not be sliced as 2-D arrays: use the ARFF
        # files instead, where the last attribute holds the label.
        train_records, train_meta = loadarff(file_prefix + '_TRAIN.arff')
        test_records, test_meta = loadarff(file_prefix + '_TEST.arff')

        train_columns = np.asarray(
            [train_records[attribute] for attribute in train_meta.names()]
        )
        X_train = train_columns[:-1].T.astype('float64')
        y_train = train_columns[-1]

        test_columns = np.asarray(
            [test_records[attribute] for attribute in test_meta.names()]
        )
        X_test = test_columns[:-1].T.astype('float64')
        y_test = test_columns[-1]

    # Labels become integers when possible, otherwise strings.
    try:
        y_train = y_train.astype('float64').astype('int64')
        y_test = y_test.astype('float64').astype('int64')
    except ValueError:
        y_train = y_train.astype(str)
        y_test = y_test.astype(str)

    dataset_url = ("http://www.timeseriesclassification.com/"
                   "description.php?Dataset={}".format(dataset))
    return Bunch(
        data_train=X_train, target_train=y_train,
        data_test=X_test, target_test=y_test,
        DESCR=description,
        url=dataset_url,
    )

### Load and prepare datasets 

In [None]:
# Load Data

from pyts.datasets import ucr_dataset_list

# Work with the first 20 names of the archive listing.
dataset_name_list = ucr_dataset_list()[:20]
print(dataset_name_list)

# Downloads are cached in a "Data" folder located in the parent of the
# current working directory.
CACHED_DATA_FOLDER = os.path.join(os.path.dirname(os.getcwd()), "Data")

dataset_list = []
for dataset_name in tqdm.tqdm(dataset_name_list):
    # Each data set gets its own cache sub-folder named after it.
    dataset_list.append(
        fetch_ucr_dataset(
            dataset=dataset_name,
            use_cache=True,
            data_home=os.path.join(CACHED_DATA_FOLDER, dataset_name),
        )
    )


## Create pandas dataframe
# One entry per data set; the lists become the columns of ``meta_df``.
dataset_list_binary = []
dataset_train_size = []
dataset_test_size = []
datset_length = []
binary_dataset_name = []
test_balance = []
num_classes = []

for name, dataset_object in zip(dataset_name_list, dataset_list):
    # Number of distinct labels seen in the training set.
    nclasses = len(np.unique(dataset_object['target_train']))
    # NOTE: filtering on the number of classes (e.g. nclasses < 3) is
    # currently disabled, so every data set is kept despite the "binary"
    # naming of some of the lists.
    dataset_list_binary.append(dataset_object)

    train_size, data_length = dataset_object['data_train'].shape[:2]
    test_size = dataset_object['data_test'].shape[0]

    # Share of the first (smallest) label in the test set. Dividing by the
    # total count is identical to counts[0]/(counts[0]+counts[1]) for binary
    # data, but also works for single-class and multi-class test sets.
    _, counts = np.unique(dataset_object['target_test'], return_counts=True)
    test_proportion = counts[0] / counts.sum()

    datset_length.append(data_length)
    dataset_train_size.append(train_size)
    dataset_test_size.append(test_size)
    binary_dataset_name.append(name)
    test_balance.append(test_proportion)
    num_classes.append(nclasses)

meta_data = {
    'name': binary_dataset_name,
    'train_size': dataset_train_size,
    'test_size': dataset_test_size,
    'length': datset_length,
    'test_balance': test_balance,
    "nr_classes": num_classes,
}
meta_df = pd.DataFrame(data=meta_data)
print(meta_df)

In [None]:

from InceptionTimeModule_torch import Inception, InceptionBlock
import torch.nn as nn
from torch.utils.data import DataLoader as DL
import torch
import torch.utils.data as data_utils  
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# Position (in dataset_list / meta_df) of the data set used for training.
indx = 11
dataset_obj = dataset_list[indx]

# The classifier is trained with torch.nn.CrossEntropyLoss, which expects
# class indices in [0, C). Raw labels are not guaranteed to have that form,
# so map them to their rank among the sorted training labels. Labels that are
# already 0..C-1 are left unchanged by this mapping.
# NOTE(review): a test label that never occurs in the training set would be
# mapped to a neighbouring index - confirm this cannot happen for the data used.
class_labels = np.unique(dataset_obj['target_train'])

x_train = torch.tensor(dataset_obj['data_train'], dtype=torch.float)
y_train = torch.tensor(
    np.searchsorted(class_labels, dataset_obj['target_train']),
    dtype=torch.long,
)
x_test = torch.tensor(dataset_obj['data_test'], dtype=torch.float)
y_test = torch.tensor(
    np.searchsorted(class_labels, dataset_obj['target_test']),
    dtype=torch.long,
)
train_tensor = data_utils.TensorDataset(x_train, y_train)


# What parameters?
T = meta_df.loc[indx, 'length']            # length of each time series
N_train = meta_df.loc[indx, 'train_size']  # number of training series
bottleneck_dim = 0  # Paper suggests this
# Open question from the authors: is this the filter length? If so, the paper
# suggests a higher value for longer time series; 40 is a starting value.
base_kernel_size = 40
residual_bool = False   # Paper suggests so


# Number of classes of the selected data set.
C = meta_df.loc[indx, "nr_classes"]
print(C)
num_in_channels = 1


class Flatten(nn.Module):
    """Module that views its input as a 2-D tensor with a fixed last dimension."""

    def __init__(self, out_features):
        """Store ``out_features`` as the size of the trailing dimension."""
        super().__init__()
        self.output_dim = out_features

    def forward(self, x):
        """Return ``x`` viewed as ``(-1, output_dim)``; the first size is inferred."""
        flattened = x.view(-1, self.output_dim)
        return flattened
    
class Reshape(nn.Module):
    """Module that views its input with fixed trailing dimensions."""

    def __init__(self, out_shape):
        """Store ``out_shape``, the sizes that follow the inferred first one."""
        super().__init__()
        self.out_shape = out_shape

    def forward(self, x):
        """Return ``x`` viewed as ``(-1, *out_shape)``; the first size is inferred."""
        target_shape = (-1, *self.out_shape)
        return x.view(*target_shape)



# Init Inceptiontime obj
# Pipeline: add a channel axis -> two inception blocks -> average over time
# -> flatten -> linear layer producing one logit per class.
InceptionTime = nn.Sequential(
    Reshape(out_shape=(1, T)),
    InceptionBlock(
        in_channels=1,
        n_filters=32,
        kernel_sizes=[5, 11, 23],
        bottleneck_channels=32,
        use_residual=True,
        activation=nn.ReLU()
    ),
    InceptionBlock(
        in_channels=32*4,
        n_filters=32,
        kernel_sizes=[5, 11, 23],
        bottleneck_channels=32,
        use_residual=True,
        activation=nn.ReLU()
    ),
    nn.AdaptiveAvgPool1d(output_size=1),
    Flatten(out_features=32*4*1),
    # Size the output layer from the data set's class count instead of a
    # hard-coded 4, so that other data sets can be selected.
    nn.Linear(in_features=4*32*1, out_features=int(C)),
)


# Create dataloader
# Author's note: default from the InceptionTime paper. The max() guards
# against a batch size of 0 when there are fewer than 6 training series.
BATCH_SIZE = max(1, int(N_train/6))
training_loader = DL(train_tensor, batch_size=BATCH_SIZE, shuffle=True)


# Define loss function
loss_fn = torch.nn.CrossEntropyLoss()


# Define optimizer
optimizer = torch.optim.Adam(InceptionTime.parameters())


# Write training loop
# NOTE(review): running_loss is never updated below; kept in case it is read
# elsewhere in the notebook.
running_loss = 0
for epoch in range(40):
    for i, (inputs, labels) in enumerate(training_loader):
        optimizer.zero_grad()
        outputs = InceptionTime(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
    # Reports the loss of the last batch of the epoch and its batch index.
    print(epoch, loss.item(), i)


# Evaluation: switch to inference mode and disable gradient tracking.
InceptionTime.eval()
with torch.no_grad():
    # x_train / x_test are already float tensors, so they are passed directly
    # (re-wrapping them in torch.tensor would copy them and emit a warning).
    y_train_pred = torch.argmax(InceptionTime(x_train), dim=1)
    y_pred = torch.argmax(InceptionTime(x_test), dim=1)

print(accuracy_score(y_true=y_train, y_pred=y_train_pred))

print(accuracy_score(y_true=y_test, y_pred=y_pred))
      