In [13]:
%load_ext autoreload
%autoreload 2
%cd /mnt/c/Users/resha/Documents/Github/balancing_framework/

from gluonts.dataset.repository import get_dataset, dataset_names
from gluonts.dataset.util import to_pandas

import pickle
import pandas as pd
import numpy as np
import json
import time
import argparse
from tqdm import tqdm

from framework import run_measurements, viz
from fracdiff import frac_diff_bestd
from monash_data_utils import convert_tsf_to_dataframe, monash_df_to_gluonts_train_datasets
import os


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/mnt/c/Users/resha/Documents/Github/balancing_framework


In [2]:

# print(f"Available datasets: {dataset_names}")
dataset = None
monash_dir = "monash_data"

# dataset_name = 'london_smart_meters_without_missing' # m4_daily_dataset ; london_smart_meters_without_missing
# series_to_pull = 867 # m4, 1165 ; london, 1610 867
dataset_name = 'm4_daily_dataset' # m4_daily_dataset ; london_smart_meters_without_missing
series_to_pull = 1165 # m4, 1165 ; london, 1610 867

# A local Monash .tsf file takes precedence over the gluonts repository.
tsf_path = f"{monash_dir}/{dataset_name}.tsf"
if os.path.exists(tsf_path):
    loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe(tsf_path)
    if forecast_horizon is None:
        # Default used when convert_tsf_to_dataframe reports no horizon.
        forecast_horizon = 24
    dataset = monash_df_to_gluonts_train_datasets(loaded_data, frequency, forecast_horizon)

# No local file: fall back to the gluonts dataset repository.
if dataset is None:
    if dataset_name in dataset_names:
        dataset = get_dataset(dataset_name)
    else:
        raise ValueError(f"Dataset {dataset_name} not found in gluonts availables or local monash files.")

# Pull the `series_to_pull`-th entry (0-based) of the test split as a pandas Series.
# `row` is reset first so that a stale value from an earlier run of this cell can
# never be mistaken for the requested series.
row = None
series_num = -1
for series_num, entry in enumerate(tqdm(dataset.test)):
    if series_num == series_to_pull:
        row = pd.Series(entry['target'])
        break

if row is None:
    # The loop finished without reaching the requested position.
    raise IndexError(
        f"series_to_pull={series_to_pull} is out of range: "
        f"dataset '{dataset_name}' has {series_num + 1} test series."
    )


 28%|██▊       | 1166/4227 [00:00<00:00, 1064321.75it/s]


In [3]:
# Keep an independent copy of the series selected by the loader cell under a
# dataset-specific name, so it is not lost when `row` is reassigned.
# NOTE(review): the plotting cell also reads `london_row`, which is presumably
# created the same way after re-running the loader for the London dataset — confirm.
m4_row = row.copy()

In [14]:

# Fractionally difference the selected series; the helper is given a
# single-column DataFrame built from `row`.
df_fd, fd_change_pct = frac_diff_bestd(row.to_frame())

# Discard rows containing NaN and renumber the remaining rows 0..n-1.
# Chained (non-inplace) so re-running this cell always starts from the helper's output.
df_fd = df_fd.dropna().reset_index(drop=True)

0
0 stationary with d=0.9500000000000001 thresh=0.01 stat windows =1 out of 1 p-values = [0.006812139404067202]
changed 1 out of 1 columns; 100.0%


In [24]:
import numpy as np
import pandas as pd

def get_weights(diff_amt, size):
    """Return the fractional-differencing weights as a column vector.

    The weights follow the iterative rule w_0 = 1,
    w_k = -w_{k-1} * (diff_amt - k + 1) / k  (section 5.4.2, page 78).

    Parameters
    ----------
    diff_amt : float
        Differencing order d.
    size : int
        Number of weights to generate. At least one weight (w_0) is always
        returned, even when ``size`` is 0 or negative.

    Returns
    -------
    numpy.ndarray
        Array of shape (max(size, 1), 1) in REVERSED lag order: the last row is
        w_0 (= 1.0) and the first row is the weight of the largest lag.
    """
    weights = [1.]  # w_0
    for k in range(1, size):
        weights_ = -weights[-1] * (diff_amt - k + 1) / k  # compute the next weight
        weights.append(weights_)

    # Reverse so the largest lag comes first, then shape as a column vector.
    weights = np.array(weights[::-1]).reshape(-1, 1)
    return weights


def frac_undiff(y_df, diff_amt, init_history, thresh=0.01):
    """Invert an expanding-window fractional differencing.

    Solves, one time step at a time and independently per column,

        y_t = sum_{k=0..t} w_k * x_{t-k}    =>    x_t = (y_t - sum_{k>=1} w_k * x_{t-k}) / w_0

    where ``w_k`` are the weights produced by ``get_weights`` and the first
    ``skip`` values of ``x`` are taken from ``init_history``.
    NOTE(review): this assumes the forward transform used exactly this
    expanding-window form with the same ``thresh`` — confirm against the
    differencing routine that produced ``y_df``.

    Parameters
    ----------
    y_df : pandas.DataFrame
        Fractionally differenced values, one column per series, rows in time order.
    diff_amt : float
        Differencing order d that was applied.
    init_history : pandas.DataFrame or dict
        Original (undifferenced) values preceding ``y_df``. A dict maps column
        name -> sequence of values. Must provide at least ``skip`` rows and every
        column of ``y_df``; only the first ``skip`` rows are used.
    thresh : float, default 0.01
        Threshold on the normalised cumulative absolute weights that determines
        ``skip`` (the number of seed rows).

    Returns
    -------
    pandas.DataFrame
        float64 frame with ``skip + len(y_df)`` rows: the seed rows followed by
        the reconstructed values. Its index is the seed index followed by the
        index of ``y_df`` (labels may repeat), its columns are ``y_df.columns``.

    Raises
    ------
    ValueError
        If ``init_history`` has fewer than ``skip`` rows or lacks a column of ``y_df``.

    Notes
    -----
    ``skip`` is derived from ``len(y_df) + len(init_history)`` weights, so passing
    a history longer than ``skip`` changes the weight count ``skip`` is computed from.
    """
    # Normalise the seed history to a DataFrame first so that dict and
    # DataFrame inputs contribute the same number of rows to the weight count.
    if isinstance(init_history, dict):
        init_df = pd.concat({k: pd.Series(v).squeeze() for k, v in init_history.items()}, axis=1)
    else:
        init_df = init_history.copy()

    n_total = y_df.shape[0] + init_df.shape[0]
    # get_weights returns the weights largest-lag-first (w_0 is the LAST entry).
    weights_reversed = np.asarray(get_weights(diff_amt, n_total), dtype="float64").ravel()

    # Number of seed rows: entries whose normalised cumulative |weight| exceeds thresh.
    cum = np.cumsum(np.abs(weights_reversed))
    cum /= cum[-1]
    skip = int(np.count_nonzero(cum > thresh))

    if init_df.shape[0] > skip:
        # allow full original series: take its first 'skip' rows
        init_df = init_df.iloc[:skip]
    elif init_df.shape[0] < skip:
        raise ValueError(f"init_history must have at least {skip} rows; got {init_df.shape[0]}")

    cols = y_df.columns
    for col in cols:
        if col not in init_df:
            raise ValueError(f"Initial history missing column {col}")

    # Natural lag order for the recursion: lag_weights[k] == w_k, lag_weights[0] == 1.
    lag_weights = weights_reversed[::-1]

    n_out = skip + y_df.shape[0]
    values = np.empty((n_out, len(cols)), dtype="float64")

    for j, col in enumerate(cols):
        x = values[:, j]  # view: writes land directly in `values`
        x[:skip] = np.asarray(init_df[col], dtype="float64").ravel()
        y_vals = np.asarray(y_df.iloc[:, j], dtype="float64")

        # Rows are addressed by position (seed rows first, then y rows), never by
        # label, so overlapping index labels between the two inputs are harmless.
        for i, y_t in enumerate(y_vals):
            pos = skip + i
            if pos > 0:
                # x[pos-1::-1] is x_{t-1}, x_{t-2}, ..., x_0, matching w_1, w_2, ..., w_t.
                acc = float(np.dot(lag_weights[1:pos + 1], x[pos - 1::-1]))
            else:
                acc = 0.0
            x[pos] = (y_t - acc) / lag_weights[0]

    combined_index = list(init_df.index) + list(y_df.index)
    return pd.DataFrame(values, index=combined_index, columns=cols)



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Create a figure with two subplots, arranged vertically.
# sharex=False: the panels do not share an x-axis, so each series is drawn
# against its own index range.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=False)

# Top panel: the fractionally differenced M4 series (column 0 of df_fd).
ax1.plot(df_fd[0], color='b')
ax1.set_title('M4 Daily Series 1165 (FD)', fontsize=14)
# This panel shows differenced values, so it is not labelled as the original scale.
ax1.set_ylabel('Fractionally Differenced Value', fontsize=12)
ax1.grid(True)

# Bottom panel: the London series.
# NOTE(review): `london_row` is not assigned in any visible cell of this notebook;
# on a fresh kernel this line raises NameError. It presumably comes from running the
# loader cell with the London dataset and copying `row` — define it explicitly.
ax2.plot(london_row, color='r')
ax2.set_title('London Series 867', fontsize=14)
ax2.set_xlabel('Index', fontsize=12)
ax2.set_ylabel('Original Value', fontsize=12)
ax2.grid(True)

# Adjust the layout to prevent titles and labels from overlapping
plt.tight_layout()
# plt.savefig(f'compare_series.png')