In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from utils.utils import load_config
from datasets.SyntheticData import SyntheticData
import numpy as np
import pandas as pd
from torch_geometric.utils import to_dense_adj
import os
from datasets.RealEpidemics import RealEpidemics

## two-step-SINDy

In [10]:
from models.baseline.TSS.NumericalDerivatives import NumericalDeriv
from models.baseline.TSS.ElementaryFunctions_Matrix import ElementaryFunctions_Matrix
from models.baseline.TSS.TwoPhaseInference import TwoPhaseInference

## Utils

In [11]:
def load_data_tss(config, snr_db=-1, real_epid=False):
    """Build a dataset and extract the arrays consumed by the two-step SINDy baseline.

    Parameters
    ----------
    config : dict or None
        Experiment configuration. It is only read when ``real_epid`` is False,
        so ``None`` is acceptable for the real-epidemics branch. Required keys:
        ``data_folder``, ``name``, ``t_span``, ``t_eval_steps``, ``num_samples``,
        ``seed``, ``n_iter``, ``input_range``, ``device``, ``horizon``,
        ``history`` and ``integration_kwargs``. Optional keys: ``stride``
        (default 5) and ``predict_deriv`` (default False).
    snr_db : number, optional
        Forwarded unchanged to ``SyntheticData``; ignored for real data.
        NOTE(review): presumably a negative value means "no noise" -- confirm
        against ``SyntheticData``.
    real_epid : bool, optional
        When True, load ``RealEpidemics`` from its fixed root folder instead of
        generating synthetic data.

    Returns
    -------
    tuple
        ``(raw_data, A, time)`` where ``raw_data`` is ``dataset.raw_data_sampled``
        as a NumPy array, ``A`` is the dense adjacency matrix built from the
        first sample's ``edge_index`` and ``time`` is ``dataset.t_sampled``.
    """
    if real_epid:
        dataset = RealEpidemics(
            root='./data_real_epid_covid_orig',
            name='RealEpid',
            predict_deriv=True,
            scale=False,
        )
    else:
        synthetic_kwargs = dict(
            root=config['data_folder'],
            dynamics=config['name'],
            t_span=config['t_span'],
            t_max=config['t_eval_steps'],
            num_samples=config['num_samples'],
            seed=config['seed'],
            n_ics=config['n_iter'],
            input_range=config['input_range'],
            device=config['device'],
            horizon=config['horizon'],
            history=config['history'],
            stride=config.get('stride', 5),
            predict_deriv=config.get("predict_deriv", False),
            snr_db=snr_db,
        )
        # Integrator options live under their own key and are passed through as-is.
        dataset = SyntheticData(**synthetic_kwargs, **config['integration_kwargs'])

    # Original author's note: shape is (ics, time_steps, n_nodes, 1).
    sampled = dataset.raw_data_sampled
    raw_data = sampled.cpu().detach().numpy()
    time = dataset.t_sampled

    # The graph is taken from the first sample of the dataset.
    first_sample = dataset[0]
    dense_adjacency = to_dense_adj(first_sample.edge_index)[0]
    A = dense_adjacency.cpu().detach().numpy()

    return raw_data, A, time


def get_matrix_tss(raw_data, time, A, Dim=1, selfPolyOrder=3, act_index=False):
    """Assemble the elementary-function library and numerical derivatives for TSS.

    For every initial condition in ``raw_data`` the numerical derivative and
    the library of candidate functions are computed, then everything is
    stacked row-wise across initial conditions.

    Parameters
    ----------
    raw_data : array
        Trajectories indexed by initial condition along axis 0; the trailing
        singleton axis of each trajectory is squeezed away.
    time : array-like
        Sampling times; the step is taken as ``time[0, 1] - time[0, 0]``.
    A : array
        Adjacency matrix; ``A.shape[0]`` is used as the number of nodes.
    Dim : int, optional
        State dimension forwarded to ``ElementaryFunctions_Matrix``.
    selfPolyOrder : int, optional
        Polynomial order forwarded to ``ElementaryFunctions_Matrix``.
    act_index : bool, optional
        Switches both ``ActivationIndex`` and ``CoupledActivationIndex``.

    Returns
    -------
    tuple
        ``(Matrix, num_deriv, data)``: the stacked library with infinities
        turned into NaN and every NaN-containing column dropped, the stacked
        derivatives, and the stacked trimmed time series.
    """
    dt = (time[0, 1] - time[0, 0]).item()
    Nnodes = A.shape[0]

    # Library configuration shared by every initial condition.
    library_options = dict(
        coupledPolyOrder=1,
        PolynomialIndex=True,
        TrigonometricIndex=True,
        ExponentialIndex=True,
        FractionalIndex=False,
        ActivationIndex=act_index,
        RescalingIndex=False,
        CoupledPolynomialIndex=True,
        CoupledTrigonometricIndex=True,
        CoupledExponentialIndex=True,
        CoupledFractionalIndex=False,
        CoupledActivationIndex=act_index,
        CoupledRescalingIndex=False,
    )

    series_parts = []
    deriv_parts = []
    library_parts = []

    for ic in range(raw_data.shape[0]):
        series = raw_data[ic].squeeze(-1)  # original note: (time_steps, n_nodes)

        # Returns a pandas DataFrame according to the original author's note.
        # NOTE(review): ``dim`` is fixed to 1 here regardless of ``Dim`` -- confirm.
        deriv_parts.append(
            NumericalDeriv(
                TimeSeries=series,
                dim=1,
                Nnodes=series.shape[1],
                deltT=dt,
            )
        )

        # Two samples are dropped at each end; presumably this aligns the series
        # with the rows NumericalDeriv produces -- TODO confirm.
        interior = series[2:-2, :]
        series_parts.append(interior)

        library_parts.append(
            ElementaryFunctions_Matrix(
                interior,
                Dim,
                Nnodes,
                A,
                selfPolyOrder,
                **library_options,
            )
        )

    data = np.concatenate(series_parts, axis=0)
    num_deriv = pd.concat(deriv_parts, ignore_index=True)

    stacked_library = pd.concat(library_parts, ignore_index=True)
    # Candidate functions that blew up anywhere are discarded column-wise.
    Matrix = stacked_library.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

    return Matrix, num_deriv, data


def two_step_sindy(Matrix, num_deriv, Nnodes, out_path, Dim=1, plotstart=0.5, plotend=0.9, Keep=10, SampleTimes=20, Batchsize=1,
                   snr_db=-1):
    """Run ``TwoPhaseInference`` for each state dimension and save the results as CSV.

    Parameters
    ----------
    Matrix, num_deriv :
        Library matrix and numerical derivatives, passed straight to
        ``TwoPhaseInference``.
    Nnodes : int
        Number of nodes, passed straight to ``TwoPhaseInference``.
    out_path : str
        Output directory; created if it does not exist.
    Dim : int, optional
        Number of state dimensions; one inference run and one CSV per dimension.
    plotstart, plotend, Keep, SampleTimes, Batchsize :
        Passed straight to ``TwoPhaseInference``.
    snr_db : number, optional
        A negative value yields ``results_dim=<dim>.csv``; otherwise the level
        is embedded as ``results_dim=<dim>_<snr_db>_db.csv``.

    Returns
    -------
    None
    """
    os.makedirs(out_path, exist_ok=True)
    Lambda = pd.DataFrame([[0.01, 0.5, 1]])

    for dim in range(Dim):
        # Only the first of the four returned values is persisted.
        inferred, _, _, _ = TwoPhaseInference(
            Matrix,
            num_deriv,
            Nnodes,
            dim,
            Dim,
            Keep,
            SampleTimes,
            Batchsize,
            Lambda,
            plotstart,
            plotend,
        )

        if snr_db < 0:
            csv_name = f"results_dim={dim}.csv"
        else:
            csv_name = f"results_dim={dim}_{snr_db}_db.csv"
        inferred.to_csv(f"{out_path}/{csv_name}")
    
    
        

## Two Phase Inference

In [None]:
# config_path = 'configs/config_pred_deriv/config_ic1/config_biochemical.yml'
# conf = load_config(config_path)
# conf['n_iter'] = 1
# conf['t_eval_steps'] = 20000
# conf['data_folder'] = './data_tmp'

# raw_data, A, time = load_data_tss(conf)

# Matrix, num_deriv, _ = get_matrix_tss(
#     raw_data=raw_data,
#     time = time,
#     A=A,
#     Dim=1,
#     selfPolyOrder=1
# )

# two_step_sindy(
#     Matrix=Matrix,
#     num_deriv=num_deriv,
#     Nnodes=A.shape[0],
#     out_path=f'./saved_models_optuna/tss/{conf['name']}-{conf['n_iter']}'
# )

0.9999875395443762
Best threshold: 0.001
Elementary functions discovered by Phase 1 without constant.
x1ix1j           -4.992071e-01
x1               -4.629238e-01
tanx1            -1.882994e-02
sinx1ix1j        -3.248205e-04
x1isinx1j        -2.287443e-04
expx1j            1.000000e-10
x1iexpx1j         1.000000e-10
expx1ix1j         1.000000e-10
sinx1jMinusx1i    1.018141e-04
x1jMinusx1i       5.803586e-04
dtype: float64
Elementary functions discovered by Phase 1 with constant.
x1ix1j           -4.992071e-01
x1               -4.629238e-01
tanx1            -1.882994e-02
sinx1ix1j        -3.248205e-04
x1isinx1j        -2.287443e-04
x1iexpx1j         1.000000e-10
expx1ix1j         1.000000e-10
sinx1jMinusx1i    1.018141e-04
x1jMinusx1i       5.803586e-04
constant          9.914368e-01
dtype: float64
-1417952.2121795572 -708839.2301855906 -1624295.0632193047
1.0 0.499903469310875 1.145521724404643
This equation may contain a constant term.
Index(['x1ix1j', 'x1', 'tanx1', 'sinx1ix1j', 'x1

### Clean data

In [12]:
# Configuration files of the synthetic dynamics evaluated on noise-free data.
configs = [
    'configs/config_pred_deriv/config_ic1/config_kuramoto.yml',
    'configs/config_pred_deriv/config_ic1/config_biochemical.yml',
    'configs/config_pred_deriv/config_ic1/config_epidemics.yml',
    'configs/config_pred_deriv/config_ic1/config_population.yml'
]

for conf_path in configs:
    conf = load_config(config_path=conf_path)
    # snr_db is left at its default of -1 for this run.
    raw_data, A, time = load_data_tss(conf)

    Matrix, num_deriv, _ = get_matrix_tss(
        raw_data=raw_data,
        time=time,
        A=A,
        Dim=1,
        selfPolyOrder=3
    )

    two_step_sindy(
        Matrix=Matrix,
        num_deriv=num_deriv,
        Nnodes=A.shape[0],
        # Double-quoted outer string: reusing the same quote character inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        out_path=f"./saved_models_optuna/tss/{conf['name']}-{conf['n_iter']}_nofrac"
    )

0.999998146963679
Best threshold: 0.001
Elementary functions discovered by Phase 1 without constant.
expx1jMinusx1i   -2.287488e-07
x1                1.000000e-10
x1x1x1            1.000000e-10
x1x1              1.000000e-10
cosx1             1.000000e-10
expx1j            1.000000e-10
expx1ix1j         1.000000e-10
x1j               1.000000e-10
x1iexpx1j         1.000000e-10
sinx1jMinusx1i    4.993155e-01
dtype: float64
Elementary functions discovered by Phase 1 with constant.
expx1jMinusx1i   -2.287488e-07
x1                1.000000e-10
x1x1x1            1.000000e-10
x1x1              1.000000e-10
cosx1             1.000000e-10
expx1ix1j         1.000000e-10
x1j               1.000000e-10
x1iexpx1j         1.000000e-10
sinx1jMinusx1i    4.993155e-01
constant          2.000024e+00
dtype: float64
-1932.2213277133046 25150.91604441822 -10688.710415686202
1.0 -13.01658132206996 5.531825087727295
Index(['expx1jMinusx1i', 'x1', 'x1x1x1', 'x1x1', 'cosx1', 'expx1ix1j', 'x1j',
       'x1iexp

### Noise

In [13]:
# Configuration files of the synthetic dynamics evaluated under measurement noise.
configs = [
    'configs/config_pred_deriv/config_ic1/config_kuramoto.yml',
    'configs/config_pred_deriv/config_ic1/config_biochemical.yml',
    'configs/config_pred_deriv/config_ic1/config_epidemics.yml',
    'configs/config_pred_deriv/config_ic1/config_population.yml'
]

# Signal-to-noise ratios (dB) forwarded to the data loader, one run per level.
snr_db_levels = [70, 50, 20]

for conf_path in configs:
    for snr_db in snr_db_levels:

        # The config is re-read for every level, as in the original cell.
        conf = load_config(config_path=conf_path)
        raw_data, A, time = load_data_tss(conf, snr_db=snr_db)

        Matrix, num_deriv, _ = get_matrix_tss(
            raw_data=raw_data,
            time=time,
            A=A,
            Dim=1,
            selfPolyOrder=3
        )

        two_step_sindy(
            Matrix=Matrix,
            num_deriv=num_deriv,
            Nnodes=A.shape[0],
            # Double-quoted outer string: reusing the same quote character inside
            # an f-string replacement field is a SyntaxError before Python 3.12.
            out_path=f"./saved_models_optuna/tss/{conf['name']}-{conf['n_iter']}_nofrac",
            # A non-negative level makes two_step_sindy tag the CSV name with it.
            snr_db=snr_db
        )

0.07138707760374308
Best threshold: 0.001
Elementary functions discovered by Phase 1 without constant.
expx1jMinusx1i   -7.006677e-07
expx1ix1j         1.000000e-10
x1x1x1            1.000000e-10
x1                1.000000e-10
cosx1             1.000000e-10
x1isinx1j         1.000000e-10
x1iexpx1j         1.000000e-10
sinx1             1.000000e-10
tanx1             5.763843e-06
sinx1jMinusx1i    4.991261e-01
dtype: float64
Elementary functions discovered by Phase 1 with constant.
expx1jMinusx1i   -7.006677e-07
expx1ix1j         1.000000e-10
x1x1x1            1.000000e-10
x1                1.000000e-10
cosx1             1.000000e-10
x1iexpx1j         1.000000e-10
sinx1             1.000000e-10
tanx1             5.763843e-06
sinx1jMinusx1i    4.991261e-01
constant          2.000208e+00
dtype: float64
42992.520849586916 48287.17509309217 42990.520849586916
1.0 1.123152914480848 0.9999534802807447
This equation may contain a constant term.
Index(['expx1jMinusx1i', 'expx1ix1j', 'x1x1x1', '

## Real Epid

In [7]:
# No config is needed here: load_data_tss only reads `config` when building
# synthetic data, so None is passed for the real-epidemics dataset.
raw_data, A, time = load_data_tss(None, real_epid=True)

Matrix, num_deriv, _ = get_matrix_tss(
    raw_data=raw_data,
    time=time,
    A=A,
    Dim=1,
    selfPolyOrder=1,
    # Enables both the self and the coupled activation terms of the library.
    act_index=True
)

two_step_sindy(
    Matrix=Matrix,
    num_deriv=num_deriv,
    Nnodes=A.shape[0],
    # Plain string: the original f-prefix had no placeholders to interpolate.
    out_path='./saved_models_optuna/tss/real_epid_covid_scratch'
)

0.7752239971558382
Best threshold: 1.219
Elementary functions discovered by Phase 1 without constant.
sinx1                    1.000000e-10
cosx1                    1.000000e-10
tanx1                    1.000000e-10
sig_x1_10                1.000000e-10
sig_x1_15                1.000000e-10
x1iregx1jgamma5          1.000000e-10
regx1jMinusx1igamma2     1.000000e-10
x1isigx1jalpha10beta1    1.098885e-02
x1isinx1j                4.671793e-02
x1                       2.055820e+00
dtype: float64
Elementary functions discovered by Phase 1 with constant.
sinx1                    1.000000e-10
cosx1                    1.000000e-10
tanx1                    1.000000e-10
sig_x1_10                1.000000e-10
sig_x1_15                1.000000e-10
regx1jMinusx1igamma2     1.000000e-10
x1isigx1jalpha10beta1    1.098885e-02
x1isinx1j                4.671793e-02
x1                       2.055820e+00
constant                 2.215953e+03
dtype: float64
5989.475278120497 6041.8583434810735 5987.47527812

## Re-fitting coefficients

In [9]:
from sklearn.linear_model import LinearRegression

# Per-dataset CSV files holding the right-hand-side terms (regressors) ...
righ_sides = [
    "./saved_models_optuna/tss/real_epid_covid/orig_csv/right_side_components_covid.csv",
    "./saved_models_optuna/tss/real_epid_covid/orig_csv/right_side_components_H1N1.csv",
    "./saved_models_optuna/tss/real_epid_covid/orig_csv/right_side_components_Sars.csv"
]

# ... and the matching left-hand-side values (regression targets).
lef_sides = [
    "./saved_models_optuna/tss/real_epid_covid/orig_csv/left_side_components_covid.csv",
    "./saved_models_optuna/tss/real_epid_covid/orig_csv/left_side_components_H1N1.csv",
    "./saved_models_optuna/tss/real_epid_covid/orig_csv/left_side_components_Sars.csv"
]

# Number of nodes and output tag for each dataset, in the same order as the files.
n_nodes = [82, 21, 4]
names = ['covid', 'h1n1', 'sars']


for j, (rs, ls) in enumerate(zip(righ_sides, lef_sides)):
    X_all = pd.read_csv(rs)
    y_all = pd.read_csv(ls)
    N = n_nodes[j]

    # The rows are treated as N consecutive blocks of L rows, one block per node.
    # Integer division means any trailing remainder rows are left unused.
    L = len(X_all) // N

    # Only the first 90% of each block enters the fit.
    # NOTE(review): presumably the remaining 10% is held out for evaluation -- confirm.
    cutoff = int(0.9 * L)

    # Row 0: coefficient of 'x'; row 1: coefficient of 'sigxjminusxi'; one column per node.
    Coef = np.zeros(shape=(2, N))
    for i in range(N):
        X = X_all.iloc[i * L:(i + 1) * L, :].iloc[:cutoff, :]
        y = y_all.iloc[i * L:(i + 1) * L, :].iloc[:cutoff, :]

        Xin = X[['x', 'sigxjminusxi']]
        y1 = y['X']

        model = LinearRegression(fit_intercept=False)
        model.fit(Xin, y1)

        # With a 1-D target, coef_ has one entry per feature, so it fills the
        # column directly. The original assigned length-1 arrays to scalar
        # slots, a conversion that NumPy has deprecated.
        Coef[:, i] = model.coef_

    Coef = pd.DataFrame(Coef)
    Coef.to_csv(f"./saved_models_optuna/tss/real_epid_covid/orig_csv/inf_coeffs_test_{names[j]}.csv", index=False)

