# prepare

In [None]:
import os
import sys
import math
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


sys.path.append('../..')
import model.Baseline as baseline
import model.my_model as mymodel
import model.util_loss as util_ls
import model.util_dataloader as util_dl
import model.util_model as util_md
import data_process.util_data as util_dt


# Pick the compute device: the first CUDA GPU when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("The model will be running on", device, "device")

# Directory configuration.
# Local workstation paths, kept for reference:
# file_save_dir = r'C:\Users\29492\Desktop\exp12'
# file_load_dir = r'D:\dataset\DataFrames\REDD'
# Server paths: results are saved to, and data is loaded from, the same folder.
file_save_dir = r'../../data'
file_load_dir = r'../../data'

# Project helpers rooted at the save directory; they are used later in this
# notebook via ``dm.save_dict`` and ``mwm.save_model_weight``.
dm = util_md.DictManager(file_save_dir)
mwm = util_md.ModelWeightManager(file_save_dir)

In [None]:
# Load the house-1 table and index it by its 'time' column.
dataframe_path = os.path.join(file_load_dir, r'house_1.csv')
df = pd.read_csv(dataframe_path)
df.set_index('time', inplace=True)
# df.describe(percentiles=[0.10,0.25,0.50,0.75,0.8,0.85,0.9,0.95])


# Appliance channels used as disaggregation targets (names must match the CSV header exactly).
house_1_cols = ['oven-3', 'oven-4', 'refrigerator-5', 'dishwaser-6', 'lighting-9', 'washer_dryer-10', 'microwave-11',
                'bathroom_gfi-12', 'electric_heat-13', 'stove-14', 'lighting-17', 'lighting-18', 'washer_dryer-19', 'washer_dryer-20']
# Take an explicit copy: a column is added below, and writing into a selection
# of ``df`` triggers pandas' chained-assignment (SettingWithCopy) warning.
filtered_columns = df[house_1_cols].copy()
# filtered_columns.describe(percentiles=[0.10,0.25,0.50,0.6,0.75,0.8,0.85,0.9,0.95])


# Record the appliance names before the synthetic 'sum' column is appended.
apps_name = filtered_columns.columns.to_list()
print(apps_name)


# The aggregate signal is the row-wise sum of the selected appliance channels.
filtered_columns['sum'] = filtered_columns.sum(axis=1)
# Normalise all columns together; the call is given the bounds 0 and 1 and
# also returns the min/max it used (reused later for thresholds and loss scaling).
norm_data, min_val, max_val = util_dt.normalization_interval(filtered_columns.to_numpy(), 0, 1)
main = norm_data[:,-1]   # last column: the 'sum' (aggregate) signal
apps = norm_data[:,:-1]  # remaining columns: one per appliance, in ``apps_name`` order
print(f'min: {min_val}; max: {max_val}')
print(np.min(main), np.max(main))
print(np.min(apps), np.max(apps))
util_dt.print_shape(main, apps)


# Per-appliance "on" thresholds in the original (un-normalised) units, in ``apps_name`` order.
on_threshold = [50, 50, 50, 50, 20, 50, 50, 50, 50, 50, 20, 20, 50, 50]
# Length of the aggregate input window fed to the model.
sliding_window_len = 599

In [None]:
# Model dimensions: input sequence length and one output per appliance.
in_seq_l = sliding_window_len
out_dim = len(apps_name)
print(f'in_seq_l: {in_seq_l}; out_dim: {out_dim}')

# Width of the normalisation range; passed to training as ``loss_reg_factor``.
loss_factor = max_val - min_val
print(f'loss_factor:{loss_factor}')


# Metrics for a model that predicts a single appliance.
metric_dict_one2one = {
    'MAE(apps)': util_ls.MAE(single_class=True),
    'SAE': util_ls.SignalAggregateError(single_class=True, period_len=450),
    'MAE(offon)': util_ls.MAE_off_on(single_class=True),
}
# Metrics for a model that predicts all appliances at once; the keys suggest
# both per-appliance ("apps") and averaged ("mean") variants.
metric_dict_one2all = {
    'MAE(apps)': util_ls.MAE(single_class=False),
    'MAE(mean)': util_ls.MAE(single_class=True),
    'SAE(apps)': util_ls.SignalAggregateError(single_class=False, period_len=450),
    'SAE(mean)': util_ls.SignalAggregateError(single_class=True, period_len=450),
    'MAE(offon)': util_ls.MAE_off_on(single_class=False),
}


# Show the column index of each appliance of interest.
apps_to_train = [
    'refrigerator-5',
    'dishwaser-6',
    'washer_dryer-10',
    'microwave-11',
    'bathroom_gfi-12',
]
for app in apps_to_train:
    print(apps_name.index(app), app)

In [None]:
# Build sliding-window samples: windows of ``sliding_window_len`` over the
# aggregate and windows of length 1 over the appliances, offset by half a
# window (presumably so each target sits at the window midpoint — confirm in
# ``util_dt.generate_window_samples``).
w_main, w_apps = util_dt.generate_window_samples(input_1=main, input_2=apps, window_size_1=sliding_window_len,
                                     window_size_2=1, offset=math.floor(sliding_window_len/2))
# Insert a singleton axis at position 1 of the aggregate windows.
w_main = np.expand_dims(w_main, axis=1)
# Sum of the appliance targets over the last axis, with a singleton axis added at position 1.
w_apps_sum = w_apps.sum(axis=-1)
w_apps_sum = np.expand_dims(w_apps_sum, axis=1)
# Map the raw-unit thresholds into the normalised range before comparing.
# ``on_threshold`` is a plain list, so convert it explicitly: ``list - scalar``
# only works when the scalar happens to be a NumPy type.
on_threshold_norm = (np.asarray(on_threshold) - min_val) / (max_val - min_val)
w_apps_on = (w_apps > on_threshold_norm).astype('int')
util_dt.print_shape(w_main, w_apps, w_apps_sum, w_apps_on)

In [None]:
# Total number of windowed samples.
total_len = len(w_main)
# Samples per 24 h; the divisor 8 is presumably the sampling period in seconds — TODO confirm.
num_sample_per_day = int(24*60*60/8)
# Fractional number of days covered by the samples.
num_days = total_len/num_sample_per_day
print(total_len, num_sample_per_day, num_days)

# Generate corrupted (mislabeled) on/off labels

In [None]:
def mislabel_accurate_to_minute(data, timestap):
    """Coarsen labels to one-minute resolution.

    Every row is replaced by the column-wise maximum of all rows that fall
    in the same calendar minute, so for 0/1 labels an event anywhere in a
    minute marks the whole minute.

    Args:
        data: 2-D array-like of labels, one row per timestamp.
        timestap: timestamps (anything ``pd.to_datetime`` accepts), one per
            row of ``data``, in ascending order (``merge_asof`` requires
            sorted keys).

    Returns:
        NumPy array with the same number of rows and columns as ``data``.
    """
    labels = pd.DataFrame(data, index=timestap)
    labels = labels.set_index(pd.DatetimeIndex(pd.to_datetime(labels.index)))
    # Per-minute maximum. 'min' is the supported minute alias; the older 'T'
    # spelling is deprecated in recent pandas releases.
    per_minute = labels.resample('1min').max()
    # Column-less frame that carries only the original timestamps.
    original_times = pd.DataFrame(index=labels.index)
    # A backward as-of join maps each timestamp to the start of its own minute bin.
    merged_df = pd.merge_asof(original_times, per_minute, left_index=True, right_index=True, direction='backward')
    return merged_df.to_numpy()


def mislabel_random_shift(data):
    """Return a copy of the on/off labels with every event's edges jittered.

    For each column, the activation intervals reported by
    ``util_dt.get_activation_index_from_states`` are redrawn with their start
    and end indices shifted independently by a truncated N(0, 5) sample and
    clipped to the valid index range. A draw is repeated until the shifted
    start lies strictly before the shifted end.

    Args:
        data: 2-D array of labels, one column per appliance.

    Returns:
        Array shaped and typed like ``data``, holding 1 inside the shifted
        intervals and 0 elsewhere.
    """
    corrupted = np.zeros_like(data)
    last_index = len(data) - 1
    for col in range(data.shape[-1]):
        for start, end in util_dt.get_activation_index_from_states(data[:, col]):
            accepted = False
            while not accepted:
                # Draw order (start first, then end) is kept so seeded runs reproduce.
                start_jitter = int(np.random.normal(loc=0, scale=5))
                end_jitter = int(np.random.normal(loc=0, scale=5))
                shifted_start = np.clip(start + start_jitter, 0, last_index)
                shifted_end = np.clip(end + end_jitter, 0, last_index)
                accepted = shifted_start < shifted_end
            corrupted[shifted_start:shifted_end, col] = 1
    return corrupted

def mislabel_event_missing(data, possibility=0.1):
    """Return a copy of the on/off labels with some events erased.

    For each column, the activation intervals reported by
    ``util_dt.get_activation_index_from_states`` are redrawn one by one; each
    interval is dropped when a uniform [0, 1) draw falls below ``possibility``.

    Args:
        data: 2-D array of labels, one column per appliance.
        possibility: probability of dropping each individual event.

    Returns:
        Array shaped and typed like ``data``, holding 1 inside the retained
        intervals and 0 elsewhere.
    """
    corrupted = np.zeros_like(data)
    for col in range(data.shape[-1]):
        for start, end in util_dt.get_activation_index_from_states(data[:, col]):
            dropped = np.random.uniform(0, 1) < possibility
            if not dropped:
                corrupted[start:end, col] = 1
    return corrupted


In [None]:
# Batch sizes for training and for evaluation.
b_s = 450*20
eval_b_s = 450*20

# Chronological split: the final days are held out for testing, the days
# before them for validation, and whole days from the start are used for training.
num_day_for_test = 7
num_day_for_val = 7
num_day_for_train = int(num_days) - num_day_for_test - num_day_for_val

# Negative start offsets derived from the day counts above (previously
# hard-coded as 7 and 14 days). The trailing ``-1`` keeps the existing
# convention of leaving out the very last window.
test_start = -1 - num_sample_per_day*num_day_for_test
val_start = test_start - num_sample_per_day*num_day_for_val

# Test set: the last ``num_day_for_test`` days.
subset = util_dl.VariableDataset(*(torch.from_numpy(arr[test_start:-1]).to(torch.float32)
                                   for arr in (w_main, w_apps_on, w_apps_sum, w_apps)))
subset.describe()
testset = util_dl.just_to_DataLoader(dataset=subset, batch_size=eval_b_s, shuffle_flag=False)


# Validation set: the ``num_day_for_val`` days immediately before the test set.
subset = util_dl.VariableDataset(*(torch.from_numpy(arr[val_start:test_start]).to(torch.float32)
                                   for arr in (w_main, w_apps_on, w_apps_sum, w_apps)))
subset.describe()
valset = util_dl.just_to_DataLoader(dataset=subset, batch_size=eval_b_s, shuffle_flag=False)


# Keep only the training portion in the working arrays.
# NOTE: this truncates in place, so re-running the cell shrinks nothing further
# but the held-out slices above must be taken before this point.
train_len = num_sample_per_day*num_day_for_train
w_main = w_main[0:train_len]
w_apps_on = w_apps_on[0:train_len]
w_apps_sum = w_apps_sum[0:train_len]
w_apps = w_apps[0:train_len]
util_dt.print_shape(w_main, w_apps, w_apps_sum, w_apps_on)

In [None]:
# Number of repeated runs per strategy and number of training epochs per run.
exp_num = 5
epoch_n = 50

# Timestamps aligned with the training label rows: labels start half a window
# into the original series.
half_window = int(sliding_window_len/2)
label_timestamps = df.index[half_window:half_window+len(w_apps_on)]

# Every available label-corruption strategy, keyed by name. The two combined
# strategies apply event dropping on top of another corruption.
corruption_strategies = {
    'mislabel_accurate_to_minute': lambda labels: mislabel_accurate_to_minute(labels, label_timestamps),
    'mislabel_random_shift': mislabel_random_shift,
    'mislabel_event_missing': mislabel_event_missing,
    'mislabel_minute_missing': lambda labels: mislabel_event_missing(
        mislabel_accurate_to_minute(labels, label_timestamps)),
    'mislabel_shift_missing': lambda labels: mislabel_event_missing(
        mislabel_random_shift(labels)),
}

# 'mislabel_accurate_to_minute', 'mislabel_random_shift', 'mislabel_event_missing'
for strategy in ['mislabel_minute_missing', 'mislabel_shift_missing']:

    # Fail loudly on an unknown name instead of printing and carrying on
    # with undefined or stale labels.
    if strategy not in corruption_strategies:
        raise ValueError(f'unknown mislabel strategy: {strategy}')
    print(f'matched {strategy}')
    w_apps_on_corrupted = corruption_strategies[strategy](w_apps_on)

    # Train on the corrupted on/off labels (previously the clean labels were
    # passed, so the strategy had no effect on training).
    dataset = util_dl.VariableDataset(torch.from_numpy(w_main).to(torch.float32),
                    torch.from_numpy(w_apps_on_corrupted).to(torch.float32),
                    torch.from_numpy(w_apps_sum).to(torch.float32),
                    torch.from_numpy(w_apps).to(torch.float32))
    dataset.describe()
    # Build the loader from ``dataset`` (previously the leftover validation
    # ``subset`` was used here by mistake).
    trainset = util_dl.just_to_DataLoader(dataset=dataset, batch_size=b_s, shuffle_flag=True)

    for i in range(exp_num):

        # Fresh model for every repetition.
        model = mymodel.Model_final(in_seq_l, out_dim)

        # Train with validation on ``valset``.
        score_train_vali, score_train_vali_clas = util_md.train_val_Mark_DoubleTask_AutomaticWeightedLoss(device, model, epoch_n, trainset, valset,
                                            loss_reg_factor=loss_factor, return_metric_reg_dict=metric_dict_one2all, decay_params=[10,0.5])

        # Called without epochs or a training set; presumably evaluation-only
        # on the test set — confirm in ``util_md``.
        score_test, score_test_clas  = util_md.train_val_Mark_DoubleTask_AutomaticWeightedLoss(device, model, valset=testset, loss_reg_factor=loss_factor,
                                            return_metric_reg_dict=metric_dict_one2all)

        # Persist the scores and model weights under a name that encodes strategy and run index.
        total_record = {'train_vali':score_train_vali, 'test':score_test,
                    'train_vali_clas':score_train_vali_clas, 'test_clas':score_test_clas}
        filename = '{dataset}_{model}_{appname}_{date}'.format(dataset='REDD',model='Final',appname='all',
                                                               date=f'0109-{strategy}-{i}')
        dm.save_dict(total_record, filename)
        mwm.save_model_weight(model, filename)

    # Release the per-strategy training data before building the next one.
    del trainset, dataset
