In [1]:
import numpy as np
import random
# Single seed reused by every shuffle and split below so runs are repeatable.
seed = 0

# Task identifier; it selects the input pickle and is embedded in output file names.
task = '3E'
file_name = "processed_file_%s.pkl" % task

# Input and output directories (currently the same location).
data_path = "/Users/stevenliu/time-series-adaption/time-series-domain-adaptation/data_unzip/"
save_path = "/Users/stevenliu/time-series-adaption/time-series-domain-adaptation/data_unzip/"

# Load data

In [2]:
# NOTE(review): allow_pickle=True unpickles the file, which can execute
# arbitrary code -- only load files from a trusted source.
total_data = np.load(data_path + file_name, allow_pickle=True)

# 'tr_*' entries are used as the source domain, 'te_*' entries as the target domain.
source_all_x, source_all_y = total_data['tr_data'], total_data['tr_lbl']
target_all_x, target_all_y = total_data['te_data'], total_data['te_lbl']

# Convert one-hot labels to integer class indices

In [3]:
# Reduce each label row to the index of its largest entry along axis 1
# (the class id, assuming the rows are one-hot encoded).
source_all_y_not_one_hot, target_all_y_not_one_hot = (
    np.argmax(label_rows, axis=1) for label_rows in (source_all_y, target_all_y)
)

# Shuffle Data

In [4]:
# Re-seed before each draw so the source and target permutations are each
# reproducible independently of the other.
np.random.seed(seed)
shuffled_indice_source = np.random.permutation(len(source_all_x))
np.random.seed(seed)
shuffled_indice_target = np.random.permutation(len(target_all_x))

# Apply the same permutation to samples and labels so the pairs stay aligned.
source_all_x_shuffled, source_all_y_shuffled = (
    source_all_x[shuffled_indice_source],
    source_all_y_not_one_hot[shuffled_indice_source],
)
target_all_x_shuffled, target_all_y_shuffled = (
    target_all_x[shuffled_indice_target],
    target_all_y_not_one_hot[shuffled_indice_target],
)

# Construct class dict

In [5]:
def get_class_data_dict(data, lbl, num_class):
    '''
    Group samples by integer class label.

    Args:
        data: array of samples indexed along axis 0.
        lbl: integer class labels (not one-hot), one per sample. Any
            array-like is accepted; it is converted with np.asarray so that
            a plain Python list is compared element-wise instead of as a
            single object.
        num_class: number of classes; the result has keys 0..num_class-1.

    Returns:
        dict {label: data[rows with that label]}, keeping the original row
        order. A label with no samples maps to an empty selection of data.
    '''
    labels = np.asarray(lbl)
    # np.argwhere returns one row per match with one column per label axis;
    # column 0 is the sample index.
    return {
        label: data[np.argwhere(labels == label)[:, 0]]
        for label in range(num_class)
    }

In [6]:
# The width of the one-hot label matrix gives the number of classes per domain.
source_dict = get_class_data_dict(
    data=source_all_x_shuffled,
    lbl=source_all_y_shuffled,
    num_class=source_all_y.shape[1],
)
target_dict = get_class_data_dict(
    data=target_all_x_shuffled,
    lbl=target_all_y_shuffled,
    num_class=target_all_y.shape[1],
)

In [7]:
# Quick look at the target samples grouped under class 0 (first axis = sample count).
target_dict[0].shape

(109, 1600, 2)

# Separate Percentage

In [8]:
def separate_data_by_percentage(data_dict, known_percentage, random_seed=None):
    '''
    Split every class into a "known" (labelled) and "unknown" (unlabelled) part.

    For each class, int(known_percentage * n) samples are drawn without
    replacement as the known subset; the remaining samples, in their original
    order, form the unknown subset. Because the count is truncated, small
    classes may contribute zero known samples.

    Args:
        data_dict: mapping {label: ndarray} whose axis 0 indexes samples.
        known_percentage: fraction of each class to place in the known
            subset; expected to lie in [0, 1].
        random_seed: seed used for the draw of every class. None (default)
            falls back to the module-level `seed`, which is what the
            function always used before this parameter existed.

    Returns:
        ((known_x, known_y), (unknown_x, unknown_y)) as numpy arrays, with
        classes concatenated in the iteration order of data_dict.
    '''
    if random_seed is None:
        random_seed = seed  # module-level default

    known_x_chunks, unknown_x_chunks = [], []
    known_label_y, unknown_label_y = [], []
    for label, data in data_dict.items():
        num_in_this_label = data.shape[0]
        known_num_in_this_label = int(known_percentage * num_in_this_label)
        unknown_num_in_this_label = num_in_this_label - known_num_in_this_label

        # A private RandomState seeded per class yields the same draw as
        # np.random.seed(s) followed by np.random.choice(...), but leaves the
        # global NumPy RNG state untouched for the rest of the notebook.
        rng = np.random.RandomState(random_seed)
        known_indices = rng.choice(num_in_this_label, known_num_in_this_label, replace=False)
        unknown_indices = np.delete(np.arange(num_in_this_label), known_indices)

        # The two index sets must partition the class exactly.
        assert unknown_indices.shape[0] + known_indices.shape[0] == num_in_this_label
        assert known_indices.shape[0] == known_num_in_this_label
        assert unknown_indices.shape[0] == unknown_num_in_this_label
        assert not np.isin(unknown_indices, known_indices).any()

        # Keep one array chunk per class instead of extending a Python list
        # sample by sample; the chunks are joined once at the end.
        known_x_chunks.append(data[known_indices])
        unknown_x_chunks.append(data[unknown_indices])
        known_label_y.extend([label] * known_num_in_this_label)
        unknown_label_y.extend([label] * unknown_num_in_this_label)

    def _join(chunks):
        # np.concatenate rejects an empty list, so an empty data_dict still
        # yields an empty array as before.
        return np.concatenate(chunks) if chunks else np.array([])

    return (
        (_join(known_x_chunks), np.array(known_label_y)),
        (_join(unknown_x_chunks), np.array(unknown_label_y)),
    )
        

# Separate Percentage for target and source

In [16]:
# For each labelled fraction, split the target domain per class and write the
# four resulting arrays (known/unknown x and y) next to the input data.
for labeled_target_percentage in (0.3, 0.5, 0.7):
    (
        (target_known_label_x, target_known_label_y),
        (target_unknown_label_x, target_unknown_label_y),
    ) = separate_data_by_percentage(target_dict, labeled_target_percentage)

    # The split must cover every sample, and x/y must stay aligned.
    assert len(target_known_label_x) + len(target_unknown_label_x) == len(target_all_x_shuffled)
    assert len(target_unknown_label_x) == len(target_unknown_label_y)
    assert len(target_known_label_x) == len(target_known_label_y)
    assert len(target_all_x_shuffled) == len(target_all_y_shuffled)

    np.save(
        save_path + "processed_file_not_one_hot_%s_%1.1f_target_known_label_x.npy" % (task, labeled_target_percentage),
        target_known_label_x,
    )
    np.save(
        save_path + "processed_file_not_one_hot_%s_%1.1f_target_known_label_y.npy" % (task, labeled_target_percentage),
        target_known_label_y,
    )
    np.save(
        save_path + "processed_file_not_one_hot_%s_%1.1f_target_unknown_label_x.npy" % (task, labeled_target_percentage),
        target_unknown_label_x,
    )
    np.save(
        save_path + "processed_file_not_one_hot_%s_%1.1f_target_unknown_label_y.npy" % (task, labeled_target_percentage),
        target_unknown_label_y,
    )


In [9]:
# For each labelled fraction, split the source domain per class and write the
# four resulting arrays (known/unknown x and y) next to the input data.
for labeled_source_percentage in (0.3, 0.5, 0.7):
    (
        (source_known_label_x, source_known_label_y),
        (source_unknown_label_x, source_unknown_label_y),
    ) = separate_data_by_percentage(source_dict, labeled_source_percentage)

    # The split must cover every sample, and x/y must stay aligned.
    assert len(source_known_label_x) + len(source_unknown_label_x) == len(source_all_x_shuffled)
    assert len(source_unknown_label_x) == len(source_unknown_label_y)
    assert len(source_known_label_x) == len(source_known_label_y)
    assert len(source_all_x_shuffled) == len(source_all_y_shuffled)

    np.save(
        save_path + "processed_file_not_one_hot_%s_%1.1f_source_known_label_x.npy" % (task, labeled_source_percentage),
        source_known_label_x,
    )
    np.save(
        save_path + "processed_file_not_one_hot_%s_%1.1f_source_known_label_y.npy" % (task, labeled_source_percentage),
        source_known_label_y,
    )
    np.save(
        save_path + "processed_file_not_one_hot_%s_%1.1f_source_unknown_label_x.npy" % (task, labeled_source_percentage),
        source_unknown_label_x,
    )
    np.save(
        save_path + "processed_file_not_one_hot_%s_%1.1f_source_unknown_label_y.npy" % (task, labeled_source_percentage),
        source_unknown_label_y,
    )


In [10]:
# Loop variable left over from the source split; holds the last fraction processed.
labeled_source_percentage

0.7

In [17]:
# Number of unknown-label source samples from the final loop iteration (still in memory).
source_unknown_label_x.shape[0]

4290

In [16]:
# Reload the array written for the last fraction to check its shape on disk.
np.load(save_path+'processed_file_not_one_hot_%s_%1.1f_source_unknown_label_x.npy'%(task, labeled_source_percentage)).shape

(4290, 1600, 2)