In [3]:
import sys

sys.path.append('../')

import os
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm, trange
import random

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

import torch
from torch.utils.data import Dataset, DataLoader

import gzip
import json

from infrastructure.randutils import *
from infrastructure.misc import *

def process_server(t_min: float, t_max: float, folds: int, max_samples: int,
                   test_split: float, extrap: bool) -> dict:
    """Build time-split train/test folds for the Server data and pickle them.

    The four arrays ``train_idxs.npy``, ``train_vals.npy``, ``test_idxs.npy``
    and ``test_vals.npy`` are read from ``raw/Server``, merged into a single
    table (index columns followed by one value column), truncated to the first
    ``max_samples`` rows, and rescaled: the second-to-last column (treated as
    the timestamp) is min-max scaled to ``[t_min, t_max]`` and the last column
    (the observation) is standardized.

    For each of the ``folds`` K-fold partitions, only the *training* index set
    produced by ``KFold`` is kept; those rows are sorted by time and then
    divided into train/test parts by time thresholds.

    Args:
        t_min: Lower bound of the scaled time range.
        t_max: Upper bound of the scaled time range; also used to compute the
            time thresholds of the split.
        folds: Number of K-fold partitions (``n_splits`` of ``KFold``).
        max_samples: Only the first ``max_samples`` rows of the merged table
            are used.
        test_split: Fraction of ``t_max`` that defines the width of the test
            window.
        extrap: If true, the test set is every row with scaled time greater
            than ``t_max - t_max * test_split`` and the result is saved as
            ``ServerExtrap.pickle``. Otherwise the test set is the window
            centred on ``0.5 * t_max`` and the result is saved as
            ``ServerInterp.pickle``.

    Returns:
        The dictionary that is also pickled to ``processed/``, with keys
        ``'nvec'``, ``'nmod'``, ``'train_folds'``, ``'test_folds'``,
        ``'t_min'`` and ``'t_max'``.
    """
    raw_path = os.path.join('raw', 'Server')

    train_idxs = np.load(os.path.join(raw_path, 'train_idxs.npy'))
    train_vals = np.load(os.path.join(raw_path, 'train_vals.npy'))

    test_idxs = np.load(os.path.join(raw_path, 'test_idxs.npy'))
    test_vals = np.load(os.path.join(raw_path, 'test_vals.npy'))

    # One row per observation: index columns first, observed value last.
    all_idxs = np.vstack([train_idxs, test_idxs])
    all_vals = np.concatenate([train_vals, test_vals]).reshape([-1, 1])
    data = np.hstack([all_idxs, all_vals])

    # Shift the first three columns down by one.
    # presumably converts 1-based indices to 0-based -- confirm against the raw files
    for col in (0, 1, 2):
        data[:, col] = data[:, col] - 1

    data = data[:max_samples, :]

    timestamp = data[:, -2].reshape([-1, 1])
    y = data[:, -1].reshape([-1, 1])

    # Normalize/scale time and observations.
    scaler_t = MinMaxScaler(feature_range=(t_min, t_max))
    scaler_t.fit(timestamp)
    timestamp_scaled = scaler_t.transform(timestamp)

    scaler_y = StandardScaler()
    scaler_y.fit(y)
    y_scaled = scaler_y.transform(y)

    data[:, -2] = timestamp_scaled.squeeze()
    data[:, -1] = y_scaled.squeeze()

    kf = KFold(n_splits=folds)

    train_list = []
    test_list = []

    # Only the K-fold *training* indices are used; the held-out part of each
    # partition is discarded and the train/test division is made by time.
    for fold_idx, _ in kf.split(data):
        data_fold = data[fold_idx, :]

        # Order the fold chronologically before splitting.
        sort_index = np.argsort(data_fold[:, -2])
        data_fold = data_fold[sort_index, :]
        fold_time = data_fold[:, -2]

        # NOTE(review): the thresholds below are fractions of t_max and do not
        # take t_min into account; confirm this is intended when t_min is not
        # close to zero.
        if extrap:
            # Extrapolation: test on the tail of the time axis.
            t_split = t_max - t_max*test_split
            tr_idx = fold_time <= t_split
            te_idx = fold_time > t_split
        else:
            # Interpolation: test on a window centred on 0.5 * t_max.
            t_split1 = (0.5-0.5*test_split)*t_max
            t_split2 = (0.5+0.5*test_split)*t_max
            tr_idx = (fold_time < t_split1) | (fold_time >= t_split2)
            te_idx = (fold_time >= t_split1) & (fold_time < t_split2)

        data_tr = data_fold[tr_idx]
        data_te = data_fold[te_idx]

        cprint('r', data_tr.shape)
        cprint('b', data_te.shape)

        train_list.append(data_tr)
        test_list.append(data_te)

    D = {}
    D['nvec'] = [3, 3, 34]
    D['nmod'] = 2
    D['train_folds'] = train_list
    D['test_folds'] = test_list
    D['t_min'] = t_min
    D['t_max'] = t_max

    save_path = os.path.join('processed')
    if extrap:
        pickle_name = 'ServerExtrap' + '.pickle'
    else:
        pickle_name = 'ServerInterp' + '.pickle'

    create_path(save_path)

    with open(os.path.join(save_path, pickle_name), 'wb') as handle:
        pickle.dump(D, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Hand the processed dataset back so callers do not have to re-read the
    # pickle they just wrote.
    return D

# Generate the extrapolation variant of the Server dataset: time scaled to
# [0.001, 10.0], 5 folds over the first 15000 rows, and rows whose scaled
# time exceeds t_max * (1 - test_split) = 6.0 held out for testing.
Data = process_server(t_min=0.001, t_max=10.0, folds=5,
                      max_samples=15000, test_split=0.4, extrap=True)

[31m(6325, 5)[0m
[34m(5675, 5)[0m
[31m(6269, 5)[0m
[34m(5731, 5)[0m
[31m(6299, 5)[0m
[34m(5701, 5)[0m
[31m(6290, 5)[0m
[34m(5710, 5)[0m
[31m(6305, 5)[0m
[34m(5695, 5)[0m
Directory 'processed' created successfully
