In [12]:
import sys

sys.path.append('../')

import os
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm, trange
import random

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

import torch
from torch.utils.data import Dataset, DataLoader

import gzip
import json



from infrastructure.randutils import *
from infrastructure.misc import *
# from utils import *

# import gdown


# def download_by_name(fname, output=None, quiet=False):
#     url = 'https://drive.google.com/uc?id=12ymlWEcKhVuQ3syNb92zVMmowAsZwSZ4&export=download'
#     gdown.download(url, output=fname, quiet=quiet)

# raw_path = os.path.join('/workspace','data', 'raw', 'FitRec')
# create_path(raw_path)
# os.chdir(raw_path)

# raw_fname = 'endomondoHR_proper.json'

# download_by_name(fname=raw_fname, output=raw_path)

def load_raw_fitrec(test_split, max_workouts=1000, min_duration=4500, max_duration=5500):
    """Flatten the raw FitRec workout dump into a gzip-compressed CSV.

    Reads ``raw/FitRec/endomondoHR_proper.json`` line by line (one dict
    literal per line), keeps workouts whose total duration falls inside
    ``[min_duration, max_duration]`` and writes one row per recorded sample
    to ``raw/FitRec/heart_rate.csv.gz`` with the columns
    ``user_id, sport_id, altitude, timestamp, heart_rate``.

    Args:
        test_split: Unused; kept so existing callers keep working.
        max_workouts: Stop reading once this many workouts have been kept.
            ``None`` reads the whole file.
        min_duration: Smallest accepted value of the last zero-based
            timestamp (presumably seconds -- TODO confirm against the data).
        max_duration: Largest accepted value of the last zero-based timestamp.

    Raises:
        ValueError: If a kept record has per-sample arrays of different
            lengths, or if no record passes the duration filter.

    Note:
        The ``user_id`` column is filled from the record's ``'id'`` field.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    raw_path = os.path.join('raw', 'FitRec', 'endomondoHR_proper.json')

    # sport name -> 1-based integer id, assigned in order of first appearance.
    sport_ids = {}
    workouts = []

    with open(raw_path) as f:
        for line in tqdm(f):
            if not line.strip():
                continue

            # The file holds Python dict literals rather than strict JSON.
            # literal_eval parses them without executing arbitrary code,
            # which the previous eval() call would have done.
            record = ast.literal_eval(line)

            # Sports are registered before the duration filter, so the ids
            # also count sports that only occur in discarded workouts.
            sport_id = sport_ids.setdefault(record['sport'], len(sport_ids) + 1)

            timestamp = np.array(record['timestamp'])
            heart_rate = np.array(record['heart_rate'])
            altitude = np.array(record['altitude'])

            if timestamp.size == 0:
                # Nothing to anchor the time axis on; skip instead of crashing.
                continue

            # Re-base the time axis so every workout starts at zero.
            timestamp = timestamp - timestamp[0]

            duration = timestamp[-1]
            if duration < min_duration or duration > max_duration:
                continue

            nsamples = timestamp.shape[0]
            if not (heart_rate.shape[0] == nsamples == altitude.shape[0]):
                raise ValueError(
                    'record {!r}: timestamp, heart_rate and altitude differ in length'.format(
                        record['id']
                    )
                )

            workouts.append(np.column_stack([
                np.full(nsamples, record['id']),
                np.full(nsamples, sport_id),
                altitude,
                timestamp,
                heart_rate,
            ]))

            if max_workouts is not None and len(workouts) >= max_workouts:
                break

    if not workouts:
        raise ValueError(
            'no workout in {!r} has a duration within [{}, {}]'.format(
                raw_path, min_duration, max_duration
            )
        )

    df = pd.DataFrame(
        np.vstack(workouts),
        columns=['user_id', 'sport_id', 'altitude', 'timestamp', 'heart_rate'],
    )

    save_path = os.path.join('raw', 'FitRec')
    create_path(save_path)

    df.to_csv(os.path.join(save_path, 'heart_rate.csv.gz'), compression='gzip', index=False)


def process_fit_record(K_user, K_sport, K_alt, t_min, t_max, folds, max_samples, test_split, extrap):
    """Build train/test folds from the flattened FitRec CSV and pickle them.

    Reads ``raw/FitRec/heart_rate.csv.gz``. For each fold it selects
    ``K_user`` distinct ``user_id`` values, keeps the ``K_sport`` most
    frequent sports among them, re-indexes both id columns from zero, bins
    altitude into ``K_alt`` equal-width bins, min-max scales the timestamps
    into ``[t_min, t_max]``, standardises the heart rate, and splits the rows
    by scaled time. The result is written to ``processed/FitRecExtrap.pickle``
    or ``processed/FitRecInterp.pickle``.

    Args:
        K_user: Number of distinct ``user_id`` values drawn per fold.
        K_sport: Maximum number of sports kept per fold.
        K_alt: Number of altitude bins.
        t_min: Lower bound of the scaled time axis.
        t_max: Upper bound of the scaled time axis.
        folds: Number of folds; the fold index is also the sampling seed.
        max_samples: Unused; kept so existing callers keep working.
        test_split: Fraction of the time axis assigned to the test set.
        extrap: If true, test on the tail of the time axis (``t`` above
            ``t_max - t_max * test_split``); otherwise test on a window
            centred on ``0.5 * t_max``.

    Returns:
        The dictionary that was pickled, with keys ``nvec``, ``nmod``,
        ``train_folds``, ``test_folds``, ``t_min`` and ``t_max``. Each fold
        is a float array with the columns
        ``user_id, sport_id, altitude_bin, scaled_time, scaled_heart_rate``.
    """
    load_path = os.path.join('raw', 'FitRec', 'heart_rate.csv.gz')
    df = pd.read_csv(load_path, compression='gzip')

    unique_users = df['user_id'].unique()

    train_list = []
    test_list = []

    for fold in range(folds):
        # Project helper; its result is used as an index into unique_users
        # (presumably K_user distinct positions, seeded by the fold -- TODO confirm).
        K_choice = generate_random_choice(a=unique_users.size, N=K_user, seed=fold)
        topK_user = unique_users[K_choice]

        df_fold = df.loc[df['user_id'].isin(topK_user)]

        # value_counts() sorts by frequency, so this keeps the most common sports.
        topK_sport = df_fold['sport_id'].value_counts().index[:K_sport]

        # Explicit copy: the id columns are overwritten below and must not
        # be assigned onto a slice of ``df``.
        df_fold = df_fold.loc[df_fold['sport_id'].isin(topK_sport)].copy()

        # Re-index both categorical columns to contiguous ids starting at zero.
        df_fold['sport_id'] = pd.factorize(df_fold['sport_id'])[0]
        df_fold['user_id'] = pd.factorize(df_fold['user_id'])[0]

        # Column order: user_id, sport_id, altitude, timestamp, heart_rate.
        data = df_fold.to_numpy().astype(float)

        # K_alt + 1 evenly spaced edges minus the two outer ones leave
        # K_alt - 1 interior edges, so digitize yields bins 0 .. K_alt - 1.
        altitude = data[:, -3].reshape([-1, 1])
        bins_altitude = np.linspace(
            start=altitude.min(),
            stop=altitude.max(),
            num=K_alt + 1,
        )[1:-1]
        data[:, -3] = np.digitize(altitude, bins=bins_altitude).squeeze()

        timestamp = data[:, -2].reshape([-1, 1])
        y = data[:, -1].reshape([-1, 1])

        # Normalize/scale time and observations; the scalers are fitted on
        # the whole fold, before the train/test split.
        scaler_t = MinMaxScaler(feature_range=(t_min, t_max))
        scaler_t.fit(timestamp)

        scaler_y = StandardScaler()
        scaler_y.fit(y)

        data[:, -2] = scaler_t.transform(timestamp).squeeze()
        data[:, -1] = scaler_y.transform(y).squeeze()

        scaled_t = data[:, -2]
        if extrap:
            # Train on the early part of the time axis, test on the tail.
            t_split = t_max - t_max * test_split
            tr_idx = scaled_t <= t_split
            te_idx = scaled_t > t_split
        else:
            # Test on a centred window, train on everything outside it.
            t_split1 = (0.5 - 0.5 * test_split) * t_max
            t_split2 = (0.5 + 0.5 * test_split) * t_max
            tr_idx = (scaled_t < t_split1) | (scaled_t >= t_split2)
            te_idx = (scaled_t >= t_split1) & (scaled_t < t_split2)

        data_tr = data[tr_idx]
        data_te = data[te_idx]

        cprint('r', data_tr.shape)
        cprint('b', data_te.shape)

        print(data_tr[:, -2].min(), data_tr[:, -2].max())
        print(data_te[:, -2].min(), data_te[:, -2].max())

        train_list.append(data_tr)
        test_list.append(data_te)

    D = {}
    # NOTE(review): these are the requested sizes; a fold may contain fewer
    # distinct sports than K_sport -- confirm consumers tolerate that.
    D['nvec'] = [K_user, K_sport, K_alt]
    D['nmod'] = 3
    D['train_folds'] = train_list
    D['test_folds'] = test_list
    D['t_min'] = t_min
    D['t_max'] = t_max

    save_path = os.path.join('processed')

    if extrap:
        pickle_name = 'FitRecExtrap' + '.pickle'
    else:
        pickle_name = 'FitRecInterp' + '.pickle'

    create_path(save_path)

    with open(os.path.join(save_path, pickle_name), 'wb') as handle:
        pickle.dump(D, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Hand the result back so ``Data = process_fit_record(...)`` is usable.
    return D

# Build the extrapolation variant of the dataset: 5 folds of 30 sampled
# user_ids each, at most 20 sports and 20 altitude bins, time rescaled to
# [0.001, 10.0]. With extrap=True and test_split=0.4 the test set is every
# sample whose scaled time is above 10.0 - 10.0 * 0.4 = 6.0.
# max_samples is accepted by process_fit_record but not read in its body.
# Running this statement reads raw/FitRec/heart_rate.csv.gz and writes
# processed/FitRecExtrap.pickle.
Data = process_fit_record(
    K_user=30,
    K_sport=20,
    K_alt=20,
    t_min=0.001, 
    t_max=10.0, 
    folds=5, 
    max_samples=12000,
    test_split=0.4,
    extrap=True,
)

[31m(9867, 5)[0m
[34m(5133, 5)[0m
0.001 5.999296357615894
6.0011357615894045 10.0
[31m(10199, 5)[0m
[34m(4801, 5)[0m
0.001 5.999672667757775
6.00149099836334 10.0
[31m(10110, 5)[0m
[34m(4890, 5)[0m
0.001 5.9996646442360735
6.001503033645892 10.0
[31m(10226, 5)[0m
[34m(4774, 5)[0m
0.001 5.999307412128939
6.001128391914042 10.0
[31m(10060, 5)[0m
[34m(4940, 5)[0m
0.001 5.998215222141297
6.000035870356884 10.0
Directory 'processed' created successfully
