In [5]:
import sys
sys.path.append("../")

import numpy as np
import pandas as pd
import pprint
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from kats.consts import TimeSeriesData
from statsmodels.tsa.seasonal import STL
from kats.utils.simulator import Simulator
from sklearn.preprocessing import StandardScaler
from kats.tsfeatures.tsfeatures import TsFeatures

import warnings
warnings.simplefilter(action='ignore')

In [6]:
# Build 30 synthetic daily series (3 families x 10 series each) used as example input below.
sim = Simulator(n=90, freq="D", start = "2021-01-01") # simulate 90 days of data
random_seed = 100

# generate 10 TimeSeriesData with arima_sim
# NOTE: everything below draws from NumPy's global RNG, so the statement order in this
# cell determines the generated values.
np.random.seed(random_seed) # setting numpy seed
arima_sim_list = [sim.arima_sim(ar=[0.1, 0.05], ma = [0.04, 0.1], d = 1) for _ in range(10)]

# generate 10 TimeSeriesData with trend shifts
# Three change points and four trend values (presumably one slope per segment -- confirm
# against the Simulator documentation). The same random_seed is passed to every call; the
# only argument that differs between the 10 series is seasonal_magnitude, drawn from the
# global NumPy RNG.
trend_sim_list = [
    sim.trend_shift_sim(
        cp_arr = [30, 60, 75],
        trend_arr=[3, 15, 2, 8],
        intercept=30,
        noise=50,
        seasonal_period=7,
        seasonal_magnitude=np.random.uniform(10, 100),
        random_seed=random_seed
    ) for _ in range(10)
]


# generate 10 TimeSeriesData with level shifts
# Same layout as above: three change points, four level values, a shared random_seed and
# a per-series seasonal_magnitude drawn from the global NumPy RNG.
level_shift_list = [
    sim.level_shift_sim(
        cp_arr = [30, 60, 75],
        level_arr=[1.35, 1.05, 1.35, 1.2],
        noise=0.05,
        seasonal_period=7,
        seasonal_magnitude=np.random.uniform(0.1, 1.0),
        random_seed=random_seed
    ) for _ in range(10)
]

# Concatenation order: indices 0-9 ARIMA, 10-19 trend-shift, 20-29 level-shift.
ts_list = arima_sim_list + trend_sim_list + level_shift_list

In [9]:
# Use the first simulated series (an ARIMA one, given how ts_list is concatenated) as the example.
ts = ts_list[0]
# Step 1. instantiate TsFeatures with its default settings
model = TsFeatures()

# Step 2. call .transform() on the target time series to compute its features
output_features = model.transform(ts)
# Display how many features were returned.
len(output_features)



40

In [10]:
import collections
import logging
from unittest import TestCase

import numpy as np
import pandas as pd
from ax.modelbridge.registry import Models, SearchSpace
from ax.service.utils.instantiation import parameter_from_json
from kats.consts import TimeSeriesData
from kats.models.arima import ARIMAModel
from kats.models.holtwinters import HoltWintersModel
from kats.models.metalearner.get_metadata import GetMetaData
from kats.models.metalearner.metalearner_hpt import MetaLearnHPT
from kats.models.metalearner.metalearner_modelselect import (
    MetaLearnModelSelect,
)
from kats.models.metalearner.metalearner_predictability import (
    MetaLearnPredictability,
)
from kats.models.prophet import ProphetModel
from kats.models.sarima import SARIMAModel
from kats.models.stlf import STLFModel
from kats.models.theta import ThetaModel
from kats.tsfeatures.tsfeatures import TsFeatures


# Baseline fixture: 60 daily observations whose value increases from 1 to 60.
DATA = pd.DataFrame(
    {
        "time": pd.date_range("2020-05-06", periods=60, freq="D"),
        "y": np.arange(1, 61),
    }
)
TSData = TimeSeriesData(DATA)

# TS which is too short (first 8 rows only)
TSData_short = TimeSeriesData(DATA.iloc[:8, :])

# TS which has constant values only
DATA_const = DATA.copy()
DATA_const["y"] = 1
TSData_const = TimeSeriesData(DATA_const)

# TS which has NAN values
# "y" is an integer column, which cannot hold NaN. Cast to float explicitly instead of
# relying on pandas' deprecated silent upcasting during item assignment; the resulting
# frame is the same float64 column the implicit upcast used to produce.
DATA_nan = DATA.astype({"y": "float64"})
DATA_nan.iloc[10, 1] = np.nan
TSData_nan = TimeSeriesData(DATA_nan)

# TS which has INF values (same explicit float cast as for the NaN fixture)
DATA_inf = DATA.astype({"y": "float64"})
DATA_inf.iloc[10, 1] = np.inf
TSData_inf = TimeSeriesData(DATA_inf)

# TS which doesn't have constant frequency (two consecutive days removed).
# DataFrame.drop already returns a new frame, so no prior copy is needed.
DATA_gap = DATA.drop([3, 4])
TSData_gap = TimeSeriesData(DATA_gap)

# TS which is not univariate (extra random-valued column "z")
DATA_multi = pd.DataFrame(
    {
        "time": pd.date_range("2020-05-06", periods=60, freq="D"),
        "y": np.arange(1, 61),
        "z": np.random.randn(60),
    }
)
TSData_multi = TimeSeriesData(DATA_multi)

# Base Models
# Maps each base-model name to its Kats model class. The helper functions below look
# models up by these lower-case names and iterate the dict in insertion order.
base_models = {
    "arima": ARIMAModel,
    "holtwinters": HoltWintersModel,
    "sarima": SARIMAModel,
    "prophet": ProphetModel,
    "stlf": STLFModel,
    "theta": ThetaModel,
}


def generate_test_ts():
    """Create two random daily series spanning 2020-05-06 through 2020-11-17.

    Returns:
        A ``(ts1, ts2)`` tuple of TimeSeriesData objects. ``ts1`` holds standard-normal
        draws (so it includes negative values, which per the original author's note
        lead to NaN entries in TsFeatures); ``ts2`` holds the absolute values of a
        second, independent set of standard-normal draws.
    """
    timestamps = pd.date_range("2020-05-06", "2020-11-17", freq="D")
    n_points = len(timestamps)

    # Series with negative data; the first RNG draw must happen before the second one.
    signed_frame = pd.DataFrame(
        {"value": np.random.randn(n_points), "time": timestamps}
    )
    ts1 = TimeSeriesData(signed_frame)

    # Non-negative ("predictable") series.
    positive_frame = pd.DataFrame(
        {"value": np.abs(np.random.randn(n_points)), "time": timestamps}
    )
    ts2 = TimeSeriesData(positive_frame)

    return (ts1, ts2)


def generate_meta_data(n, d=40):
    """Generate ``n`` random metadata samples to initialize MetaLearnModelSelect.

    Args:
        n: Number of samples to generate.
        d: Number of random features per sample. Defaults to 40, the width that was
            previously hard-coded, mirroring ``generate_meta_data_by_model``.

    Returns:
        A list of ``n`` dicts, each with the keys:
          * ``"hpt_res"``: maps every name in ``base_models`` to a tuple of
            (parameters sampled uniformly from that model's search space,
            a value drawn uniformly from [0, 1)).
          * ``"best_model"``: a model name picked at random from ``base_models``.
          * ``"features"``: maps ``"0"`` .. ``str(d - 1)`` to standard-normal draws.
    """
    model_names = list(base_models.keys())
    spaces = {
        name: base_models[name].get_parameter_search_space() for name in model_names
    }

    # Draw order (scores, then features) is kept so seeded runs reproduce earlier values.
    # uniform(0, 1) is already non-negative, so no abs() is needed. Explicit shapes
    # (instead of reshape(n, -1)) also make n == 0 return an empty list.
    scores = np.random.uniform(0, 1.0, n * len(model_names)).reshape(
        n, len(model_names)
    )
    features = np.random.randn(n * d).reshape(n, d)

    generators = {
        name: Models.UNIFORM(
            SearchSpace([parameter_from_json(item) for item in space])
        )
        for name, space in spaces.items()
    }

    ans = []
    for i in range(n):
        hpt = {
            name: (generators[name].gen(1).arms[0].parameters, scores[i, j])
            for j, name in enumerate(model_names)
        }
        ans.append(
            {
                "hpt_res": hpt,
                "best_model": np.random.choice(model_names),
                "features": {str(k): features[i, k] for k in range(features.shape[1])},
            }
        )
    return ans


def generate_meta_data_by_model(model, n, d=40):
    """Generate random (features, hyper-parameters) training data for one base model.

    Args:
        model: Name of a model in ``base_models`` (matched case-insensitively).
        n: Number of samples (rows) to generate.
        d: Number of random features (columns of ``x``). Defaults to 40.

    Returns:
        A tuple ``(x, y)`` of DataFrames: ``x`` holds ``n`` rows of ``d`` standard-normal
        draws; ``y`` holds ``n`` parameter sets sampled uniformly from the model's
        parameter search space, one per row.

    Raises:
        ValueError: If ``model`` is not a key of ``base_models``.
    """
    model_name = model.lower()
    if model_name not in base_models:
        # Previously an unknown name fell through and failed later with an opaque
        # AttributeError on the str object.
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(base_models)}."
        )
    space = base_models[model_name].get_parameter_search_space()
    generator = Models.UNIFORM(
        SearchSpace([parameter_from_json(item) for item in space])
    )
    x = pd.DataFrame(np.random.randn(n * d).reshape(n, d))
    y = pd.DataFrame([generator.gen(1).arms[0].parameters for _ in range(n)])
    return x, y


def equals(v1, v2):
    """Check whether ``v1`` and ``v2`` are equal.

    Dispatches on the type of ``v1``:
      * ``pd.DataFrame`` -> ``v1.equals(v2)``
      * ``np.ndarray``   -> ``np.array_equal(v1, v2)``
      * ``list``         -> element-wise recursive comparison, when both have the
        same length.
    Any other type of ``v1`` (including scalars), and lists whose length differs
    from ``v2``, yield ``False``.

    Raises:
        ValueError: If the comparison itself raises; the original exception is chained.
    """
    try:
        if isinstance(v1, pd.DataFrame):
            return v1.equals(v2)
        if isinstance(v1, np.ndarray):
            return np.array_equal(v1, v2)
        if isinstance(v1, list) and len(v1) == len(v2):
            # Build the full list first so every pair is compared (no short-circuit).
            return all([equals(item, v2[i]) for i, item in enumerate(v1)])
        return False
    except Exception as e:
        # Format the exception into the message; concatenating str + exception
        # would itself raise TypeError and mask the real failure.
        msg = f"fail to compare the inputs and exception message is {e}"
        raise ValueError(msg) from e


class testMetaLearner(TestCase):
    """Tests for GetMetaData: output layout and rejection of invalid inputs."""

    def test_get_meta_data(self) -> None:
        """A small GetMetaData run returns the expected top-level and per-model keys."""
        expected_keys = [
            "hpt_res",
            "features",
            "best_model",
            "search_method",
            "error_method",
        ]
        expected_models = ["arima", "holtwinters", "prophet", "theta", "stlf", "sarima"]

        res = GetMetaData(data=TSData, num_trials=2, num_arms=1).get_meta_data()

        # Top-level layout of the meta data.
        self.assertEqual(list(res.keys()), expected_keys)
        # HPT part: one entry per candidate model, in this order.
        self.assertEqual(list(res["hpt_res"].keys()), expected_models)

    def test_inputdata_errors(self) -> None:
        """Every malformed input must make GetMetaData raise ValueError."""
        invalid_inputs = (
            DATA,  # a plain DataFrame, not a TimeSeriesData
            TSData_multi,  # not univariate
            TSData_short,  # too short
            TSData_const,  # constant values only
            TSData_nan,  # contains NaN
            TSData_inf,  # contains inf
            TSData_gap,  # no constant frequency
        )
        for bad_input in invalid_inputs:
            self.assertRaises(ValueError, GetMetaData, bad_input)


class MetaLearnModelSelectTest(TestCase):
    """Tests for MetaLearnModelSelect construction."""

    def test_initialize(self) -> None:
        """Each malformed metadata list must be rejected with ValueError."""
        malformed_metadata = (
            # no samples at all
            [],
            # samples without any keys
            [{}] * 40,
            # only "hpt_res" present
            [{"hpt_res": [None]}] * 40,
            # "hpt_res" and "features" present, "best_model" absent
            [{"hpt_res": [None], "features": [None]}] * 40,
            # all three keys present; presumably rejected because of their contents
            # -- TODO confirm against MetaLearnModelSelect's validation
            [{"hpt_res": [1.0], "features": {"f": 1.0}, "best_model": "best"}] * 40,
        )
        for metadata in malformed_metadata:
            self.assertRaises(ValueError, MetaLearnModelSelect, metadata)


In [11]:
# Build a model selector from 35 randomly generated metadata samples.
samples = generate_meta_data(n=35)
mlms = MetaLearnModelSelect(samples)

# Test preprocess (with both downsampling and feature scaling enabled)
mlms.preprocess(downsample=True, scale=True)

# Test rescale: keep the preprocessed feature matrix for the checks in the next cells
mtx = mlms.metadataX.values

In [12]:
# Number of feature columns whose mean is numerically zero, next to the total column count.
(np.sum(np.abs(np.average(mtx, axis=0)) < 1e-10), mtx.shape[1])

(40, 40)

In [13]:
# Number of feature columns whose standard deviation is numerically one, next to the total column count.
(np.sum(np.abs(np.std(mtx, axis=0) - 1) < 1e-8), mtx.shape[1])

(40, 40)

In [14]:
# Train the model selector with the "RandomForest" method.
mlms.train(method="RandomForest")
# Test prediction consistency
t1, t2 = generate_test_ts()
# Copy of t2's data; not read again in the visible part of the notebook.
t2_df = t2.to_dataframe().copy()
# Three prediction entry points on the same series; compared in the next cell.
pred = mlms.pred(t2)
pred_fuzzy = mlms.pred_fuzzy(t2)
pred_all = mlms.pred(t2, n_top=2)

In [15]:
# The single prediction must match both the first fuzzy label and the first of the
# top-2 predictions; otherwise log the mismatch and abort the cell.
if pred != pred_fuzzy["label"][0] or pred != pred_all[0]:
    msg = f"Prediction is not consistent! Results are: self.pred: {pred}, self.pred_fuzzy: {pred_fuzzy}, self.pred(, n_top=2): {pred_all}"
    logging.error(msg)
    raise ValueError(msg)
# Test case for time series with nan features
_ = mlms.pred(t1)
# Test pred_by_feature and its consistency
# Three random feature rows, as wide as the training feature matrix.
feature = np.random.randn(3 * mlms.metadataX.shape[1]).reshape(3, -1)
feature2 = feature.copy()
# NOTE: `pred` and `pred_all` are rebound here, replacing the values from the previous
# cell; the two results are not compared with each other in this cell.
pred = mlms.pred_by_feature(feature)
pred_all = mlms.pred_by_feature(feature, n_top=2)



In [17]:
# Fresh test series plus random feature inputs in three container types
# (ndarray, list of arrays, DataFrame) for MetaLearnHPT.pred_by_feature.
t1, t2 = generate_test_ts()
t2_df = t2.to_dataframe().copy()
feature1 = np.random.randn(3 * 40).reshape(3, -1)
feature2 = [np.random.randn(40), np.random.randn(40)]
feature3 = pd.DataFrame(np.random.randn(3 * 40).reshape(3, -1))
# Copies of the inputs; they are not read again in the visible part of the notebook.
feature1_copy, feature2_copy, feature3_copy = (
    feature1.copy(),
    list(feature2),
    feature3.copy(),
)
# NOTE: the loop variable rebinds the notebook-global `model` (earlier bound to a
# TsFeatures instance) to the string "prophet".
for model in ["prophet"]:
    # 150 random samples with 40 features each for the chosen model.
    x, y = generate_meta_data_by_model(model, 150, 40)
    # Check default models initialization and training
    mlhpt = MetaLearnHPT(x, y, default_model=model)
    mlhpt.get_default_model()
    # self.assertRaises(ValueError, mlhpt.build_network, [20])
    mlhpt.build_network()
    mlhpt.train()
    # Test case for time series with nan features
    _ = (mlhpt.pred(t1).parameters[0],)
    mlhpt.pred(t2)
    mlhpt.pred_by_feature(feature1)
    mlhpt.pred_by_feature(feature2)
    mlhpt.pred_by_feature(feature3)
    # Check prediction consistency:
    # dict1 comes from the raw series, dict2 from features of the max-normalized series.
    # NOTE: t2 is normalized in place here, and dict1/dict2 are not compared in this cell.
    dict1 = mlhpt.pred(t2).parameters[0]
    t2.value /= t2.value.max()
    dict2 = mlhpt.pred_by_feature(pd.DataFrame([TsFeatures().transform(t2)]))[0]

Multi-task neural network structure:
MultitaskNet(
  (shared_layer): ModuleList(
    (0): Linear(in_features=40, out_features=40, bias=True)
  )
  (cat_layer_combo): ModuleList(
    (0): ModuleList(
      (0): Linear(in_features=40, out_features=5, bias=True)
      (1): Linear(in_features=5, out_features=2, bias=True)
    )
    (1): ModuleList(
      (0): Linear(in_features=40, out_features=5, bias=True)
      (1): Linear(in_features=5, out_features=2, bias=True)
    )
    (2): ModuleList(
      (0): Linear(in_features=40, out_features=2, bias=True)
      (1): Linear(in_features=2, out_features=2, bias=True)
    )
    (3): ModuleList(
      (0): Linear(in_features=40, out_features=3, bias=True)
      (1): Linear(in_features=3, out_features=2, bias=True)
    )
    (4): ModuleList(
      (0): Linear(in_features=40, out_features=5, bias=True)
      (1): Linear(in_features=5, out_features=10, bias=True)
    )
    (5): ModuleList(
      (0): Linear(in_features=40, out_features=5, bias=True)

In [18]:
# Number of entries in the prediction obtained from the normalized series' features
# (presumably one per predicted hyper-parameter -- confirm against MetaLearnHPT).
len(dict2)

7

In [21]:
from typing import cast, Dict
# Compute features for `ts` (the first simulated series) with hw_params switched off.
features = cast(Dict[str, float], TsFeatures(hw_params=False).transform(ts))
# NOTE(review): `expected` is never compared against `features` in this cell. Its
# "length" of 25.0 does not match the 90-point simulated series, so these values
# presumably belong to a different fixture -- confirm before relying on them.
expected = {
    # statistics_features
    "length": 25.0,
    "mean": 0.0,
    "var": 1.0,
    "entropy": 0.8808,
    "lumpiness": 0.2423,
    "stability": 0.0148,
    "flat_spots": 1.0,
    "hurst": -1.3972,
    "std1st_der": 0.618,
    "crossing_points": 10.0,
    "binarize_mean": 0.16,
    "unitroot_kpss": 0.1567,
    "heterogeneity": 3.1459,
    "histogram_mode": -0.4543,
    "linearity": 0.0,
    # stl_features
    "trend_strength": 0.5364,
    "seasonality_strength": 0.4646,
    "spikiness": 0.0004,
    "peak": 6.0,
    "trough": 5.0,
    # level_shift_features
    "level_shift_idx": 0.0,
    "level_shift_size": 0.0046,
    # acfpacf_features
    "y_acf1": 0.2265,
    "y_acf5": 0.1597,
    "diff1y_acf1": -0.5021,
    "diff1y_acf5": 0.3465,
    "diff2y_acf1": -0.6838,
    "diff2y_acf5": 0.6092,
    "y_pacf5": 0.2144,
    "diff1y_pacf5": 0.4361,
    "diff2y_pacf5": 4.4276,
    "seas_acf1": -0.1483,
    "seas_pacf1": -0.0064,
    # special_ac
    "firstmin_ac": 4.0,
    "firstzero_ac": 4.0,
    # holt_params
    "holt_alpha": 0.0,
    "holt_beta": 0.0
    # The feature groups listed below have no entries in this dict:
    # hw_params
    # cusum_detector
    # robust_stat_detector
    # bocp_detector
    # outlier_detector
    # trend_detector
    # nowcasting
    # seasonalities
}

In [26]:
# Display the features computed for `ts` in the previous cell.
features

{'length': 90,
 'mean': -4.973228083549793,
 'var': 50.69499812650379,
 'entropy': 0.2742447620827895,
 'lumpiness': 10.258210327109449,
 'stability': 45.07760417461487,
 'flat_spots': 1,
 'hurst': 0.4188436896564726,
 'std1st_der': 0.8773588739369633,
 'crossing_points': 5,
 'binarize_mean': 0.43333333333333335,
 'unitroot_kpss': 0.41641147078333335,
 'heterogeneity': 73.29527168434541,
 'histogram_mode': -11.841676172131818,
 'linearity': 0.8346355269096618,
 'trend_strength': 0.9853025999592567,
 'seasonality_strength': 0.3521955818150291,
 'spikiness': 0.00020455870537077636,
 'peak': 1,
 'trough': 6,
 'level_shift_idx': 23,
 'level_shift_size': 0.7134342301151566,
 'y_acf1': 0.9597578784708428,
 'y_acf5': 4.0361834721280365,
 'diff1y_acf1': 0.1830233735938267,
 'diff1y_acf5': 0.0794760417768679,
 'diff2y_acf1': -0.4816907863327952,
 'diff2y_acf5': 0.24476824866108501,
 'y_pacf5': 0.9862593061001352,
 'diff1y_pacf5': 0.07981792144706332,
 'diff2y_pacf5': 0.36145785941160113,
 'seas

In [27]:
# Rebuild the model selector from a fresh batch of 35 random metadata samples.
# NOTE: this rebinds `samples`, `mlms` and `mtx` from the earlier cells.
samples = generate_meta_data(n=35)
mlms = MetaLearnModelSelect(samples)

# Test preprocess (with both downsampling and feature scaling enabled)
mlms.preprocess(downsample=True, scale=True)

# Test rescale: preprocessed feature matrix used by the checks below
mtx = mlms.metadataX.values

# test variable-wise zero-mean: count of columns with numerically zero mean vs. column count
np.sum(np.abs(np.average(mtx, axis=0)) < 1e-10), mtx.shape[1]

(40, 40)

In [28]:
# Number of distinct per-label sample counts in metadataY; a value of 1 means every
# label occurs equally often.
len(set(collections.Counter(mlms.metadataY).values()))

1

In [29]:
# test variable-wise unit std
# A bare expression in the middle of a cell is evaluated and then discarded, so it
# checked nothing. Assert instead, so a rescaling regression stops the notebook here.
assert np.sum(np.abs(np.std(mtx, axis=0) - 1) < 1e-8) == mtx.shape[1], (
    "every rescaled feature column is expected to have unit standard deviation"
)

# Test train
mlms.train(method="RandomForest")
# Test prediction consistency
t1, t2 = generate_test_ts()
# Copy of t2's data taken before predicting.
t2_df = t2.to_dataframe().copy()
pred = mlms.pred(t2)