# Running Checks

In [2]:
%load_ext autoreload
%autoreload 2
%cd /mnt/c/Users/resha/Documents/Github/balancing_framework/

from gluonts.dataset.repository import get_dataset, dataset_names
from gluonts.dataset.util import to_pandas

import pickle
import pandas as pd
import numpy as np
import json
import time
import argparse
from tqdm import tqdm

from framework import run_measurements, viz
from fracdiff import frac_diff_bestd
from monash_data_utils import convert_tsf_to_dataframe, monash_df_to_gluonts_train_datasets
import os

from statsmodels.tsa.stattools import adfuller

def run_adf(series, adf_chunk_size=100_000, significance=0.05):
    """Classify a series as stationary by majority vote over windowed ADF tests.

    NaNs are dropped, the remaining observations are cut into consecutive
    windows of at most ``adf_chunk_size`` values, and ``adfuller`` is run on
    each window. A window votes "stationary" when its p-value is below
    ``significance``.

    Args:
        series: pandas Series of observations.
        adf_chunk_size: maximum number of observations per ADF window.
        significance: p-value threshold below which a window counts as stationary.

    Returns:
        bool: True when at least half of the tested windows are stationary.
        Constant windows and windows on which ``adfuller`` raises are skipped
        and do not vote; if no window could be tested the result is True
        (0 >= 0), so e.g. a constant series is reported as stationary.
    """
    # Drop NaNs once and window over the cleaned length; ranging over the
    # original length would produce empty trailing windows whenever NaNs exist.
    clean = series.dropna()
    tested_windows = 0
    stationary_windows = 0
    for start in range(0, len(clean), adf_chunk_size):
        data_chunk = clean.iloc[start:start + adf_chunk_size]
        if data_chunk.nunique() == 1:
            # Report the value of this window, not the first raw observation
            # (which may be NaN or belong to a different window).
            print(f'series has only one unique value: {data_chunk.iloc[0]}')
            continue
        try:
            p_value = adfuller(data_chunk)[1]
        except Exception as e:
            # Best effort: a window the test cannot handle is reported and skipped.
            print(e)
            continue
        tested_windows += 1
        if p_value < significance:
            stationary_windows += 1
    # Stationary when at least half of the tested windows reject the unit root.
    return stationary_windows >= tested_windows / 2


/mnt/c/Users/resha/Documents/Github/balancing_framework




In [None]:
# Summarise, for every file in the Monash directory, how many of its series the
# windowed ADF check classifies as stationary, plus basic series-length statistics.
# NOTE: this cell uses the `run_adf` variant that returns a bare bool; the later
# redefinition in this notebook returns a tuple, which would always be truthy here.

monash_dir = "monash_data"
summary_rows = []
results = pd.DataFrame()
# sorted() makes the row order (and therefore the saved index) deterministic.
for dataset_name in tqdm(sorted(os.listdir(monash_dir))):
    print(dataset_name)
    loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe(os.path.join(monash_dir, dataset_name))
    dataset = monash_df_to_gluonts_train_datasets(loaded_data, frequency)
    series_lengths = []
    num_stat = 0

    for entry in tqdm(dataset.test):
        row = pd.Series(entry['target'])
        series_lengths.append(len(row))
        if run_adf(row):
            num_stat += 1

    num_series = len(series_lengths)
    if num_series == 0:
        # Nothing to aggregate: avoid dividing by zero / reducing empty arrays.
        print(f'{dataset_name}: no series found, skipping')
        continue

    summary_rows.append({
        'dataset_name': dataset_name,
        'num_series': num_series,
        'num_stat': num_stat,
        'pct_stat': num_stat / num_series,
        'mean_series_len': np.mean(series_lengths),
        'std_series_len': np.std(series_lengths),
        'min_series_len': np.min(series_lengths),
        'max_series_len': np.max(series_lengths),
    })
    # Rebuild from the record list (instead of concat-in-a-loop) and checkpoint
    # after every dataset so partial results survive an interrupted run.
    results = pd.DataFrame(summary_rows)
    results.to_csv('results_stat_checks.csv')


results



In [None]:
# Reload the saved Monash summary and list datasets from the lowest stationary share upwards.
results = pd.read_csv('results_stat_checks.csv', index_col=0)
results.sort_values(by='pct_stat')

Unnamed: 0,dataset_name,num_series,num_stat,pct_stat,mean_series_len,std_series_len,min_series_len,max_series_len
41,tourism_monthly_dataset.tsf,366,3,0.008197,298.5792,55.57541,91,333
42,tourism_quarterly_dataset.tsf,427,4,0.009368,99.63466,20.312835,30,130
36,solar_weekly_dataset.tsf,137,5,0.036496,52.0,0.0,52,52
21,m4_daily_dataset.tsf,4227,195,0.046132,2371.383,1756.361126,107,9933
5,cif_2016_dataset.tsf,72,4,0.055556,98.72222,31.63956,28,120
16,m1_quarterly_dataset.tsf,203,14,0.068966,48.98522,16.870261,18,114
19,m3_quarterly_dataset.tsf,756,61,0.080688,48.94709,10.628396,24,72
43,tourism_yearly_dataset.tsf,518,46,0.088803,24.62741,5.331318,11,47
24,m4_quarterly_dataset.tsf,24000,2198,0.091583,100.2545,51.128442,24,874
26,m4_yearly_dataset.tsf,23000,2116,0.092,37.32426,24.523433,19,841


In [None]:
# Try to fetch every dataset listed in gluonts' `dataset_names`, keeping the ones that load.
datasets = []
for dataset_name in dataset_names:
    try:
        datasets.append(get_dataset(dataset_name))
    except Exception as e:
        # Best effort: report which dataset failed (the bare message alone does
        # not always identify it) and carry on with the rest.
        print(f'{dataset_name}: {e}')


[Errno 2] No such file or directory: '/home/reshawn/.gluonts/datasets/wind_farms_without_missing/metadata.json'


Download car_parts_dataset_without_missing_values.zip:: 40.0kB [00:02, 20.1kB/s]
creating json files: 100%|██████████| 2674/2674 [00:00<00:00, 140853.10it/s]
Download dominick_dataset.zip:: 11.8MB [00:33, 367kB/s]                            
creating json files: 100%|██████████| 115704/115704 [00:01<00:00, 104559.16it/s]
Download fred_md_dataset.zip:: 168kB [00:01, 113kB/s]
creating json files: 100%|██████████| 107/107 [00:00<00:00, 149397.65it/s]
Download pedestrian_counts_dataset.zip:: 4.38MB [00:16, 280kB/s]                            
creating json files: 100%|██████████| 66/66 [00:00<00:00, 131883.78it/s]
Download hospital_dataset.zip:: 80.0kB [00:01, 47.0kB/s]
creating json files: 100%|██████████| 767/767 [00:00<00:00, 174440.47it/s]
Download covid_deaths_dataset.zip:: 32.0kB [00:01, 23.7kB/s]
creating json files: 100%|██████████| 266/266 [00:00<00:00, 146396.12it/s]
Download kdd_cup_2018_dataset_without_missing_values.zip:: 2.32MB [00:06, 354kB/s]
creating json files: 100%|█████

The m3 data is available at https://forecasters.org/resources/time-series-data/m3-competition/ Please download the file and copy the files to this location: /home/reshawn/.gluonts/datasets/M3C.xls
The m3 data is available at https://forecasters.org/resources/time-series-data/m3-competition/ Please download the file and copy the files to this location: /home/reshawn/.gluonts/datasets/M3C.xls
The m3 data is available at https://forecasters.org/resources/time-series-data/m3-competition/ Please download the file and copy the files to this location: /home/reshawn/.gluonts/datasets/M3C.xls
The m3 data is available at https://forecasters.org/resources/time-series-data/m3-competition/ Please download the file and copy the files to this location: /home/reshawn/.gluonts/datasets/M3C.xls
M5 data is available on Kaggle (https://www.kaggle.com/c/m5-forecasting-accuracy/data). You first need to agree to the terms of the competition before being able to download the data. After you have done that, pl

Download australian_electricity_demand_dataset.zip:: 5.51MB [00:14, 396kB/s]                            
creating json files: 100%|██████████| 5/5 [00:00<00:00, 29289.83it/s]
Download electricity_hourly_dataset.zip:: 11.3MB [00:20, 577kB/s]                            
creating json files: 100%|██████████| 321/321 [00:00<00:00, 133489.15it/s]
Download electricity_weekly_dataset.zip:: 152kB [00:01, 106kB/s]
creating json files: 100%|██████████| 321/321 [00:00<00:00, 142820.79it/s]
Download rideshare_dataset_without_missing_values.zip:: 1.03MB [00:02, 372kB/s]
creating json files: 100%|██████████| 2304/2304 [00:00<00:00, 8530.86it/s]


[Errno 2] No such file or directory: '/home/reshawn/.gluonts/datasets/saugeenday/metadata.json'
[Errno 2] No such file or directory: '/home/reshawn/.gluonts/datasets/solar_10_minutes/metadata.json'


Download temperature_rain_dataset_without_missing_values.zip:: 24.0MB [00:39, 633kB/s]                            
creating json files: 100%|██████████| 32072/32072 [00:00<00:00, 76236.04it/s]
Download vehicle_trips_dataset_without_missing_values.zip:: 48.0kB [00:01, 42.3kB/s]
creating json files: 100%|██████████| 329/329 [00:00<00:00, 149020.09it/s]


In [None]:
# Summarise, for every dataset in gluonts' registry that can be loaded, how many of
# its series the windowed ADF check classifies as stationary, plus length statistics.
# NOTE: this cell uses the `run_adf` variant that returns a bare bool; the later
# redefinition in this notebook returns a tuple, which would always be truthy here.
summary_rows = []
results = pd.DataFrame()
for dataset_name in tqdm(dataset_names):
    try:
        dataset = get_dataset(dataset_name)
    except Exception as e:
        # Best effort: datasets that cannot be loaded are reported and skipped.
        print(e)
        continue
    print(dataset_name)
    series_lengths = []
    num_stat = 0

    for entry in dataset.test:
        row = pd.Series(entry['target'])
        series_lengths.append(len(row))
        if run_adf(row):
            num_stat += 1

    num_series = len(series_lengths)
    if num_series == 0:
        # Nothing to aggregate: avoid dividing by zero / reducing empty arrays.
        print(f'{dataset_name}: no series found, skipping')
        continue

    summary_rows.append({
        'dataset_name': dataset_name,
        'num_series': num_series,
        'num_stat': num_stat,
        'pct_stat': num_stat / num_series,
        'mean_series_len': np.mean(series_lengths),
        'std_series_len': np.std(series_lengths),
        'min_series_len': np.min(series_lengths),
        'max_series_len': np.max(series_lengths),
    })
    # Rebuild from the record list (instead of concat-in-a-loop) and checkpoint
    # after every dataset so partial results survive an interrupted run.
    results = pd.DataFrame(summary_rows)
    results.to_csv('results_stat_checks_gluonts.csv')


results



In [18]:
# Reload the saved gluonts summary and round the length statistics to 3 decimals
# (the original note names results_stat_checks as the alternative input file).
results = (
    pd.read_csv('results_stat_checks_gluonts.csv', index_col=0)
    .assign(
        mean_series_len=lambda frame: frame['mean_series_len'].round(3),
        std_series_len=lambda frame: frame['std_series_len'].round(3),
    )
)

In [19]:
# Build a display label "<mean> ± <std>" of the series length for every dataset row.
series_len = [
    f'{mean_len} ± {std_len}'
    for mean_len, std_len in zip(results['mean_series_len'], results['std_series_len'])
]

results['mean_series_len_string'] = series_len
results

Unnamed: 0,dataset_name,num_series,num_stat,pct_stat,mean_series_len,std_series_len,min_series_len,max_series_len,pct_nonstat,mean_series_len_string
0,constant,10,10,1.0,30.0,0.0,30,30,0.0,30.0 ± 0.0
1,exchange_rate,40,0,0.0,6161.0,42.426,6101,6221,1.0,6161.0 ± 42.426
2,solar-energy,959,959,1.0,7105.0,48.0,7033,7177,0.0,7105.0 ± 48.0
3,electricity,2247,2177,0.968847,21140.0,48.0,21068,21212,0.031153,21140.0 ± 48.0
4,traffic,6034,6034,1.0,14132.0,48.0,14060,14204,0.0,14132.0 ± 48.0
5,exchange_rate_nips,40,0,0.0,6161.0,42.426,6101,6221,1.0,6161.0 ± 42.426
6,electricity_nips,2590,2179,0.841313,3992.37,146.584,1105,4000,0.158687,3992.37 ± 146.584
7,traffic_nips,6741,6628,0.983237,4000.0,0.0,4000,4000,0.016763,4000.0 ± 0.0
8,solar_nips,959,959,1.0,7105.0,48.0,7033,7177,0.0,7105.0 ± 48.0
9,wiki2000_nips,10000,7107,0.7107,852.0,42.426,792,912,0.2893,852.0 ± 42.426


# Series Selection

In [10]:
# Load both saved summaries: results1 is the Monash run, results2 the gluonts run.
results1, results2 = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('results_stat_checks.csv', 'results_stat_checks_gluonts.csv')
)

In [12]:
# Store the non-stationary share (the complement of `pct_stat`) in both summaries...
results1['pct_nonstat'] = results1['pct_stat'].rsub(1)
results2['pct_nonstat'] = results2['pct_stat'].rsub(1)
# ...and write each summary back over the file it was loaded from.
results1.to_csv('results_stat_checks.csv')
results2.to_csv('results_stat_checks_gluonts.csv')


In [1]:
# Re-run the checks, this time recording per-series details for each non-stationary series

%load_ext autoreload
%autoreload 2
%cd /mnt/c/Users/resha/Documents/Github/balancing_framework/

from gluonts.dataset.repository import get_dataset, dataset_names
from gluonts.dataset.util import to_pandas

import pickle
import pandas as pd
import numpy as np
import json
import time
import argparse
from tqdm import tqdm

from framework import run_measurements, viz
from fracdiff import frac_diff_bestd
from monash_data_utils import convert_tsf_to_dataframe, monash_df_to_gluonts_train_datasets
import os

from statsmodels.tsa.stattools import adfuller

def run_adf(series, adf_chunk_size=100_000, significance=0.05):
    """Windowed ADF stationarity vote that also reports the mean p-value.

    NaNs are dropped, the remaining observations are cut into consecutive
    windows of at most ``adf_chunk_size`` values, and ``adfuller`` is run on
    each window. A window votes "stationary" when its p-value is below
    ``significance``.

    Note: this redefines the earlier ``run_adf`` of this notebook, which
    returns only the bool; callers of this variant must unpack a tuple.

    Args:
        series: pandas Series of observations.
        adf_chunk_size: maximum number of observations per ADF window.
        significance: p-value threshold below which a window counts as stationary.

    Returns:
        tuple: ``(stationary, mean_p_value)``. ``stationary`` is True when at
        least half of the tested windows are stationary. Constant windows and
        windows on which ``adfuller`` raises are skipped; if no window could be
        tested the result is ``(True, nan)``.
    """
    # Drop NaNs once and window over the cleaned length; ranging over the
    # original length would produce empty trailing windows whenever NaNs exist.
    clean = series.dropna()
    p_values = []
    stationary_windows = 0
    for start in range(0, len(clean), adf_chunk_size):
        data_chunk = clean.iloc[start:start + adf_chunk_size]
        if data_chunk.nunique() == 1:
            # Report the value of this window, not the first raw observation
            # (which may be NaN or belong to a different window).
            print(f'series has only one unique value: {data_chunk.iloc[0]}')
            continue
        try:
            p_value = adfuller(data_chunk)[1]
        except Exception as e:
            # Best effort: a window the test cannot handle is reported and skipped.
            print(e)
            continue
        p_values.append(p_value)
        if p_value < significance:
            stationary_windows += 1
    # Stationary when at least half of the tested windows reject the unit root.
    stationary = stationary_windows >= len(p_values) / 2
    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_p_value = np.mean(p_values) if p_values else np.nan
    return stationary, mean_p_value

/mnt/c/Users/resha/Documents/Github/balancing_framework




In [None]:
# Record every Monash series that the windowed ADF check flags as non-stationary.
# Uses the `run_adf` variant that returns (stationary, mean p-value).

monash_dir = "monash_data"
pick_csv = 'series_pick_checks.csv'
picked_rows = []
# sorted() makes the order of the appended rows deterministic.
for dataset_name in tqdm(sorted(os.listdir(monash_dir))):
    print(dataset_name)
    loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe(os.path.join(monash_dir, dataset_name))
    dataset = monash_df_to_gluonts_train_datasets(loaded_data, frequency)

    dataset_rows = []
    # Wrap the dataset (not the enumerate object) so tqdm can still see a length.
    for index, entry in enumerate(tqdm(dataset.test)):
        row = pd.Series(entry['target'])
        stat, pva = run_adf(row)
        if not stat:
            dataset_rows.append({
                'dataset_name': dataset_name,
                'series_idx': index,
                'avg_pvalue': pva,
                'series_length': len(row),
            })

    if dataset_rows:
        # Append one batch per dataset. header=False is required: in append mode
        # the default would write the column names before every batch, and this
        # file is read back with header=None.
        pd.DataFrame(dataset_rows).to_csv(pick_csv, mode='a', header=False, index=False)
        picked_rows.extend(dataset_rows)


# Rows appended during this run (the CSV may also hold rows from earlier runs).
results = pd.DataFrame(picked_rows, columns=['dataset_name', 'series_idx', 'avg_pvalue', 'series_length'])
results



In [None]:
# Record every series from gluonts' registry that the windowed ADF check flags as
# non-stationary. Uses the `run_adf` variant that returns (stationary, mean p-value).
pick_csv = 'series_pick_checks_gluonts.csv'
picked_rows = []
for dataset_name in tqdm(dataset_names):
    try:
        dataset = get_dataset(dataset_name)
    except Exception as e:
        # Best effort: datasets that cannot be loaded are reported and skipped.
        print(e)
        continue
    print(dataset_name)

    dataset_rows = []
    for index, entry in enumerate(dataset.test):
        row = pd.Series(entry['target'])
        stat, pva = run_adf(row)
        if not stat:
            dataset_rows.append({
                'dataset_name': dataset_name,
                'series_idx': index,
                'avg_pvalue': pva,
                'series_length': len(row),
            })

    if dataset_rows:
        # Append one batch per dataset. header=False is required: in append mode
        # the default would write the column names before every batch, and this
        # file is read back with header=None.
        pd.DataFrame(dataset_rows).to_csv(pick_csv, mode='a', header=False, index=False)
        picked_rows.extend(dataset_rows)


# Rows appended during this run (the CSV may also hold rows from earlier runs).
results = pd.DataFrame(picked_rows, columns=['dataset_name', 'series_idx', 'avg_pvalue', 'series_length'])
results



# Series Selection Results

In [2]:
%load_ext autoreload
%autoreload 2
%cd /mnt/c/Users/resha/Documents/Github/balancing_framework/

import pandas as pd
pd.set_option('display.max_rows', None)

PICK_COLUMNS = ['dataset_name', 'series_idx', 'avg_pvalue', 'series_length']


def _load_pick_checks(csv_path):
    """Read a header-less pick-check CSV into a frame with named, numeric columns.

    The pick-check cells in this notebook append rows with ``to_csv(mode='a')``,
    which by default also writes the column names on every append. Any such
    embedded header lines are dropped here and the numeric columns are coerced
    back to numbers, so the file loads correctly whether or not it contains them.
    """
    # low_memory=False: infer one dtype per column instead of per parsed chunk.
    picks = pd.read_csv(csv_path, header=None, low_memory=False).rename(
        columns=dict(enumerate(PICK_COLUMNS))
    )
    # A row whose first field is the literal column name is an embedded header.
    picks = picks[picks['dataset_name'] != 'dataset_name'].reset_index(drop=True)
    for column in PICK_COLUMNS[1:]:
        picks[column] = pd.to_numeric(picks[column])
    return picks


mres = _load_pick_checks('series_pick_checks.csv')
gres = _load_pick_checks('series_pick_checks_gluonts.csv')
# Tag gluonts datasets so their names stay distinguishable from the Monash files.
gres['dataset_name'] = gres['dataset_name'] + ' (g)'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/mnt/c/Users/resha/Documents/Github/balancing_framework


In [3]:
# Stack the Monash and gluonts picks; print the row count before and after de-duplication.
res = pd.concat((mres, gres), ignore_index=True)
print(len(res))
res = res.drop_duplicates()
print(len(res))

396433
396433


In [34]:
# Order the picks from longest to shortest series and persist the combined table.
res = res.sort_values(by='series_length', ascending=False)
res.to_csv('combined_pick_checks.csv')

In [4]:
# Dataset names to exclude when browsing the picks. Names in `res` are either Monash
# file names ('<name>.tsf') or gluonts names with a ' (g)' suffix, so the bare
# 'm4_*' entries can never match on their own. They are kept for compatibility and
# their ' (g)' variants are added.
# NOTE(review): assumes the bare m4 entries were meant for the gluonts datasets — confirm.
ignore = [
    'm4_weekly',
    'm4_daily',
    'm4_hourly',
    'm4_weekly (g)',
    'm4_daily (g)',
    'm4_hourly (g)',
    'london_smart_meters_without_missing (g)',
    'electricity_hourly_dataset.tsf',
    'electricity_hourly (g)',
    'electricity (g)',
    'exchange_rate (g)',
    'exchange_rate_nips (g)',
    'bitcoin_dataset_with_missing_values.tsf',
    'm4_daily_dataset.tsf',
]

In [None]:
# Picks longer than 30,000 observations, highest average p-value first.
res.loc[res['series_length'].gt(30_000)].sort_values(by='avg_pvalue', ascending=False)

  res[res['series_length']>30_000][~res['dataset_name'].isin(ignore)].sort_values('avg_pvalue', ascending=False)


Unnamed: 0,dataset_name,series_idx,avg_pvalue,series_length


In [None]:
# Picks from datasets not on the ignore list, longest series first.
res.loc[~res['dataset_name'].isin(ignore)].sort_values(by='series_length', ascending=False)