In [18]:
"""
Created on July 19, 2018
Revised on Oct 29, 2018

@author: steven.cy.chuang
"""

import numpy as np

In [None]:
def dummy(y, num_class):
    """
    Convert numerical class labels into dummy (one-hot) variables.
    For example, [1, 0, 1, 2] with num_class=3 becomes [[0,1,0],[1,0,0],[0,1,0],[0,0,1]].
    For categorical variables, please refer to pandas.get_dummies().
    Args:
        y (numpy array): the class indices to convert, one entry per instance.
            Float arrays are accepted as long as every value is a whole number.
            Negative labels count from the last column, following NumPy indexing.
        num_class (int): the number of distinct classes (columns of the result).
    Returns:
        y_dummy (numpy ndarray): 2d float array of shape (len(y), num_class) holding the dummy variables.
    Raises:
        IndexError: if a label is not integer-valued or does not fit into num_class columns.
    """
    num_inst = len(y)
    y_dummy = np.zeros([num_inst, num_class])
    if num_inst == 0:
        # nothing to mark; also avoids indexing with an empty (float) array
        return y_dummy

    labels = np.asarray(y)
    if np.issubdtype(labels.dtype, np.floating):
        # float labels are only usable as indices when they are whole numbers
        whole = np.isfinite(labels).all() and (labels == np.floor(labels)).all()
        if not whole:
            raise IndexError("dummy() labels must be integer-valued class indices")
        labels = labels.astype(np.intp)

    # one row of column indices per instance, so that row i marks exactly the
    # columns listed in y[i] (a single column for the usual 1d input)
    labels = labels.reshape(num_inst, -1)
    rows = np.arange(num_inst)[:, None]
    y_dummy[rows, labels] = 1
    return y_dummy

In [None]:
def inv_dummy(y):
    """
    Invert dummy variables into a one dimensional numerical numpy array.
    For example, [[0,1,0],[1,0,0],[0,1,0],[0,0,1],[0,0,0]] becomes [1, 0, 1, 2, 3].
    Each row maps to the index of its first element equal to 1.
    A row without any 1 gets an extra label equal to the number of dummy
    variables (3 in the example above), i.e. one past the largest regular label.
    Args:
        y (numpy ndarray): 2d array (or nested list) of dummy variables.
    Returns:
        y_inv (numpy array): 1d integer numpy array with one label per row of y.
    """
    y = np.asarray(y)
    num_inst = y.shape[0]
    num_dum = y.shape[1]

    # start from the "no 1 in this row" label and overwrite the rows that have one
    y_inv = np.full(num_inst, num_dum, dtype='int')
    if num_dum > 0:
        is_one = (y == 1)
        has_one = is_one.any(axis=1)
        # argmax on booleans yields the position of the first True in each row
        y_inv[has_one] = is_one.argmax(axis=1)[has_one]
    return y_inv

In [39]:
def split_data(data, label=None, ratio=(0.8, 0.1)):
    """
    Randomly split the data into training, testing and validation datasets.
    The instances are shuffled with np.random before being split, so seed
    np.random beforehand to get a reproducible split.
    The training set takes the first ratio[0] share of the shuffled instances,
    the testing set takes the following ones up to a share of sum(ratio), and
    whatever is left (if the summation is less than 1) is the validation set.
    Args:
        data (numpy ndarray): the original dataset, it will be split by the 1st dimension.
        label (numpy array): the labels of the instances, aligned with the 1st dimension of data.
            It is optional to input.
        ratio (list[float]): the ratios of the training and testing datasets.
            Default (0.8, 0.1) means 80% for training, 10% for testing, and rest 10% for validation.
    Returns:
        the split datasets: training, testing, validation.
            If input with labels, that will be (data_train, lab_train), (data_test, lab_test), (data_valid, lab_valid)
    Raises:
        ValueError: if label is given but its length differs from the number of instances in data.
    """
    num_inst = data.shape[0]
    if label is not None and len(label) != num_inst:
        raise ValueError(
            "label has %d entries but data has %d instances" % (len(label), num_inst)
        )

    # shuffle the instance indices; the inputs themselves are never modified
    indices = np.arange(num_inst)
    np.random.shuffle(indices)

    # determine the cut points between the three datasets once
    end_train = round(num_inst * ratio[0])
    end_test = round(num_inst * sum(ratio))
    ind_train = indices[:end_train]
    ind_test = indices[end_train:end_test]
    ind_valid = indices[end_test:]

    # indexing with an index array already returns new arrays, so no explicit
    # copy of the inputs is needed to keep the results independent of them
    data_train = data[ind_train]
    data_test = data[ind_test]
    data_valid = data[ind_valid]

    # return the split datasets directly if there is no label
    if label is None:
        return data_train, data_test, data_valid

    # split the labels with the same indices so they stay paired with the data
    lab_train = label[ind_train]
    lab_test = label[ind_test]
    lab_valid = label[ind_valid]
    return (data_train, lab_train), (data_test, lab_test), (data_valid, lab_valid)

In [None]:
def add_noise(x, fact_noise=0.5, std=1., mean=0):
    """
    @ Deprecated function, need to rebuild.

    Add scaled Gaussian noise to x and clip the result into [0, 1].
    Args:
        x (numpy ndarray): the values to perturb; the noise is drawn with the same shape.
        fact_noise (float): the factor the drawn noise is multiplied by before it is added.
        std (float): the standard deviation of the normal distribution the noise is drawn from.
        mean (float): the mean of the normal distribution the noise is drawn from.
    Returns:
        x_noise (numpy ndarray): the noisy values, limited to the range [0, 1].
    """
    perturbation = np.random.normal(mean, std, x.shape)
    return np.clip(x + fact_noise * perturbation, 0., 1.)