# ANM_NCPOP_GenerateData
* Generate the time series data from LDS with parameters: g, f_dash, proc_noise_std, obs_noise_std, T
* Visualize the generated time series

#__Step 1: Getting started__

* mount drive
* set environment
* install packages

In [None]:
# Colab-only setup: make Google Drive available under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive.
# Presumably this is what lets later cells resolve project-local imports and
# relative output paths -- verify against the Drive layout.
import os
os.chdir("/content/drive/MyDrive/Colab Notebooks/NCPOP/")
# os.chdir("/content/drive/MyDrive/Colab Notebooks/NCPOP-Colab Notebooks/Test_Causality_Datasets/Real_data/Krebs_Cycle/Details_Krebs_Cycle/MetricsDAG/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#__Step 2: Generate IID Time sets__

In [27]:
from BuiltinDataSet import DAG
import numpy as np
import networkx as nx
import logging

class ANMNCPOP_GenerateData(object):
    '''
    Simulate datasets from a structural equation model (SEM) for causal
    structure learning.

    One [T, d] sample matrix is simulated along a topological order of the
    graph and then copied ``n`` times into ``XX``.

    Parameters
    ----------
    W: np.ndarray
        [d, d] weighted adjacency matrix for the target causal graph.
    n: int
        Size of the second axis of ``XX`` (number of copies of the
        simulated matrix).
    T: int
        Number of rows (time steps) simulated for every variable.
    method: str, (linear or nonlinear), default='linear'
        Family of structural equations used for the simulation.
    sem_type: str
        gauss, exp, gumbel, uniform, logistic (linear);
        mlp, mim, gp, gp-add, quadratic (nonlinear).
    noise_scale: float
        Scale parameter of the noise distribution; ``None``, a scalar, or
        a length-d sequence (one scale per variable).

    Attributes
    ----------
    B: np.ndarray
        [d, d] binary adjacency matrix, ``(W != 0)`` cast to int.
    XX: np.ndarray
        Simulated data returned by the selected simulator.

    Raises
    ------
    ValueError
        If ``method`` is neither 'linear' nor 'nonlinear', or if the chosen
        simulator rejects its arguments.
    '''

    def __init__(self, W, n=1000, T=20, method='linear',
                 sem_type='gauss', noise_scale=1.0):

        self.B = (W != 0).astype(int)
        if method == 'linear':
            self.XX = ANMNCPOP_GenerateData._simulate_linear_sem(
                    W, n, T, sem_type, noise_scale)
        elif method == 'nonlinear':
            self.XX = ANMNCPOP_GenerateData._simulate_nonlinear_sem(
                    W, n, T, sem_type, noise_scale)
        else:
            # Previously an unknown method left ``self.XX`` unset and the
            # failure only surfaced later as an AttributeError.
            raise ValueError(
                "Unknown method {!r}; expected 'linear' or 'nonlinear'.".format(method))
        logging.info('Finished synthetic dataset')

    @staticmethod
    def _sigmoid(x):
        """Logistic function 1 / (1 + exp(-x)), written with tanh so that
        large |x| cannot overflow."""
        return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(x, dtype=float)))

    @staticmethod
    def _noise_scale_vector(noise_scale, d):
        """Expand ``noise_scale`` (None, scalar or length-d sequence) into
        one scale per variable."""
        if noise_scale is None:
            return np.ones(d)
        if np.isscalar(noise_scale):
            return noise_scale * np.ones(d)
        if len(noise_scale) != d:
            raise ValueError('noise scale must be a scalar or has length d')
        return noise_scale

    @staticmethod
    def _replicate(X, n):
        """Stack ``n`` copies of the [T, d] matrix ``X`` into a [T, n, d] array."""
        # NOTE(review): every slice XX[:, k] is the *same* draw, exactly as in
        # the original per-index copy loop -- confirm that identical copies
        # (rather than n independent simulations) are what is wanted.
        return np.repeat(X[:, np.newaxis, :], n, axis=1)

    @staticmethod
    def _simulate_linear_sem(W, n, T, sem_type, noise_scale):
        """
        Simulate samples from linear SEM with specified type of noise.
        For uniform, noise z ~ uniform(-a, a), where a = noise_scale.

        Parameters
        ----------
        W: np.ndarray
            [d, d] weighted adj matrix of DAG.
        n: int
            Number of copies of the simulated matrix in the output.
        T: int
            Number of rows (time steps) to simulate, T=inf mimics
            population risk.
        sem_type: str
            gauss, exp, gumbel, uniform, logistic.
        noise_scale: float
            Scale parameter of noise distribution in linear SEM.

        Return
        ------
        XX: np.ndarray
            [T, n, d] sample matrix, [d, d] if T=inf

        Raises
        ------
        ValueError
            If ``W`` is not a DAG, ``noise_scale`` has the wrong length,
            ``sem_type`` is unknown, or T=inf is requested for a
            non-gauss ``sem_type``.
        """
        def _simulate_single_equation(X, w, scale):
            """X: [T, num of parents], w: [num of parents], x: [T]"""
            if sem_type == 'gauss':
                z = np.random.normal(scale=scale, size=T)
            elif sem_type == 'exp':
                z = np.random.exponential(scale=scale, size=T)
            elif sem_type == 'gumbel':
                z = np.random.gumbel(scale=scale, size=T)
            elif sem_type == 'uniform':
                z = np.random.uniform(low=-scale, high=scale, size=T)
            elif sem_type == 'logistic':
                # Binary variable: Bernoulli draw with success probability
                # sigmoid(parents @ weights); no additive noise term.
                return np.random.binomial(
                    1, ANMNCPOP_GenerateData._sigmoid(X @ w)) * 1.0
            else:
                raise ValueError('Unknown sem type. In a linear model, \
                                 the options are as follows: gauss, exp, \
                                 gumbel, uniform, logistic.')
            return X @ w + z

        d = W.shape[0]
        scale_vec = ANMNCPOP_GenerateData._noise_scale_vector(noise_scale, d)
        G_nx = nx.from_numpy_array(W, create_using=nx.DiGraph)
        if not nx.is_directed_acyclic_graph(G_nx):
            raise ValueError('W must be a DAG')
        if np.isinf(T):  # population risk for linear gauss SEM
            if sem_type == 'gauss':
                # make 1/d X'X = true cov
                return np.sqrt(d) * np.diag(scale_vec) @ np.linalg.inv(np.eye(d) - W)
            raise ValueError('population risk not available')
        # empirical risk
        ordered_vertices = list(nx.topological_sort(G_nx))
        assert len(ordered_vertices) == d
        X = np.zeros([T, d])
        # Parents precede children in topological order, so each column is
        # computed from parent columns that are already filled in.
        for j in ordered_vertices:
            parents = list(G_nx.predecessors(j))
            X[:, j] = _simulate_single_equation(X[:, parents], W[parents, j], scale_vec[j])
        return ANMNCPOP_GenerateData._replicate(X, n)

    @staticmethod
    def _simulate_nonlinear_sem(W, n, T, sem_type, noise_scale):
        """
        Simulate samples from nonlinear SEM.

        Parameters
        ----------
        W: np.ndarray
            [d, d] weighted adj matrix of DAG; only its sparsity pattern
            (W != 0) is used here.
        n: int
            Number of copies of the simulated matrix in the output.
        T: int
            Number of rows (time steps) to simulate.
        sem_type: str
            mlp, mim, gp, gp-add, or quadratic.
        noise_scale: float
            Scale parameter of the additive Gaussian noise.

        Return
        ------
        XX: np.ndarray
            [T, n, d] sample matrix

        Raises
        ------
        ValueError
            If ``noise_scale`` has the wrong length or ``sem_type`` is
            unknown.
        """
        if sem_type == 'quadratic':
            # NOTE(review): neither `GenerateData` nor `_simulate_quad_sem` is
            # defined in this class or cell; unless that name is supplied
            # elsewhere this branch raises NameError. `n` is also not
            # forwarded -- confirm the intended target and output shape.
            return GenerateData._simulate_quad_sem(W, T, noise_scale)

        def _simulate_single_equation(X, scale):
            """X: [T, num of parents], x: [T]"""
            # One noise value per row of X. The noise used to be drawn with
            # size=n, which broke whenever n != T.
            z = np.random.normal(scale=scale, size=X.shape[0])
            pa_size = X.shape[1]
            if pa_size == 0:
                return z
            if sem_type == 'mlp':
                hidden = 100
                W1 = np.random.uniform(low=0.5, high=2.0, size=[pa_size, hidden])
                W1[np.random.rand(*W1.shape) < 0.5] *= -1
                W2 = np.random.uniform(low=0.5, high=2.0, size=hidden)
                W2[np.random.rand(hidden) < 0.5] *= -1
                x = ANMNCPOP_GenerateData._sigmoid(X @ W1) @ W2 + z
            elif sem_type == 'mim':
                w1 = np.random.uniform(low=0.5, high=2.0, size=pa_size)
                w1[np.random.rand(pa_size) < 0.5] *= -1
                w2 = np.random.uniform(low=0.5, high=2.0, size=pa_size)
                w2[np.random.rand(pa_size) < 0.5] *= -1
                w3 = np.random.uniform(low=0.5, high=2.0, size=pa_size)
                w3[np.random.rand(pa_size) < 0.5] *= -1
                x = np.tanh(X @ w1) + np.cos(X @ w2) + np.sin(X @ w3) + z
            elif sem_type == 'gp':
                from sklearn.gaussian_process import GaussianProcessRegressor
                gp = GaussianProcessRegressor()
                x = gp.sample_y(X, random_state=None).flatten() + z
            elif sem_type == 'gp-add':
                from sklearn.gaussian_process import GaussianProcessRegressor
                gp = GaussianProcessRegressor()
                x = sum([gp.sample_y(X[:, i, None], random_state=None).flatten()
                        for i in range(X.shape[1])]) + z
            else:
                raise ValueError('Unknown sem type. In a nonlinear model, \
                                 the options are as follows: mlp, mim, \
                                 gp, gp-add, or quadratic.')
            return x

        B = (W != 0).astype(int)
        d = B.shape[0]
        scale_vec = ANMNCPOP_GenerateData._noise_scale_vector(noise_scale, d)
        X = np.zeros([T, d])
        G_nx = nx.from_numpy_array(B, create_using=nx.DiGraph)
        ordered_vertices = list(nx.topological_sort(G_nx))
        assert len(ordered_vertices) == d
        # Parents precede children in topological order, so each column is
        # computed from parent columns that are already filled in.
        for j in ordered_vertices:
            parents = list(G_nx.predecessors(j))
            X[:, j] = _simulate_single_equation(X[:, parents], scale_vec[j])

        return ANMNCPOP_GenerateData._replicate(X, n)



#__Step 3: Test_GenerateIID__

In [28]:
# Configuration of the synthetic experiment.
method = 'linear'
sem_type = 'gauss'
num_nodes = 6
num_edges = 15
num_datasets = 10
T = 20
# Weighted adjacency matrix for the target causal graph
weighted_random_dag = DAG.erdos_renyi(n_nodes=num_nodes, n_edges=num_edges, seed=1)
# Pass the configured T instead of repeating the literal, so changing the
# setting above actually changes the generated data.
dataset = ANMNCPOP_GenerateData(W=weighted_random_dag, n=num_datasets, T=T,
                                method=method, sem_type=sem_type)
true_dag, data = dataset.B, dataset.XX
print(true_dag)
print(data.shape)
# Persist the generated array for later cells / notebooks.
np.save('ANMNCPOP_GenerateTimeIID.npy', data)

[[0 0 1 0 1 0]
 [1 0 1 1 1 1]
 [0 0 0 0 1 0]
 [0 0 1 0 1 0]
 [0 0 0 0 0 0]
 [1 0 1 0 1 0]]
(20, 10, 6)


#__Generate LDS Timesets*__

In [None]:
from inputlds import*
import numpy as np

class DataGenerate(object):
    """Generate time series from a linear dynamical system (LDS).

    The simulation is delegated to ``dynamical_system`` (brought into scope
    by ``from inputlds import *``).

    References
    ----------
    Kozdoba, Mark and Marecek, Jakub and Tchrakian, Tigran and Mannor, Shie,
    "On-line learning of linear dynamical systems: Exponential forgetting in kalman filters",
    In Proceedings of the AAAI Conference on Artificial Intelligence, 2019

    Zhou, Quan and Marecek, Jakub,
    "Proper Learning of Linear Dynamical Systems as a Non-Commutative Polynomial Optimisation Problem",
    arXiv, 2020

    Examples
    --------
    >>> from inputlds import*
    >>> import numpy as np
    >>> g = np.matrix([[0.8,0,0],[0,0.9,0],[0,0,0.1]])
    >>> f_dash = np.matrix([[1.0,0.5,0.3],[0.1,0.1,0.1]])
    >>> DataGenerate().data_generation(g, f_dash, 0.01, 0.01, 10)  # doctest: +SKIP
    """

    def __init__(self, **kwargs):
        # Keyword arguments are accepted but not used.
        super(DataGenerate, self).__init__()

    def data_generation(self, g, f_dash, proc_noise_std, obs_noise_std, T):
        '''
        Generate the T*len(f_dash) time series data from Linear dynamical system with proc_noise and obs_noise

        Parameters
        ----------
        g: Hidden state parameter
        f_dash: Observation state parameter
        proc_noise_std: Hidden state noise
        obs_noise_std: Observation state noise
        T: Time

        Returns
        -------
        list: T*len(f_dash) list

        Examples
        --------
        >>> from inputlds import*
        >>> import numpy as np
        >>> T=10
        >>> g = np.matrix([[0.8,0,0],[0,0.9,0],[0,0,0.1]])
        >>> f_dash = np.matrix([[1.0,0.5,0.3],[0.1,0.1,0.1]])
        >>> proc_noise_std=0.01
        >>> obs_noise_std=0.01
        >>> DataGenerate().data_generation(g,f_dash,proc_noise_std,obs_noise_std,T)  # doctest: +SKIP

        '''

        n = len(g)       # number of rows of g
        m = len(f_dash)  # number of rows of f_dash
        # The second and fourth arguments are all-zero matrices and the inputs
        # fed to the solver are all zero as well.
        # NOTE(review): presumably these are the input-coupling matrices of
        # the LDS -- confirm against inputlds.dynamical_system.
        ds1 = dynamical_system(g, np.zeros((n, m)), f_dash, np.zeros((m, m)),
                process_noise='gaussian',
                observation_noise='gaussian',
                process_noise_std=proc_noise_std,
                observation_noise_std=obs_noise_std)
        inputs = np.zeros((m, T))
        h0 = np.ones(ds1.d)  # initial state
        ds1.solve(h0=h0, inputs=inputs, T=T)
        # Flatten the solver outputs into T rows of m values each.
        return np.asarray(ds1.outputs).reshape(T, m).tolist()


    def data_generation_dim(self, m, n, proc_noise_std,obs_noise_std,T):
        '''
        Generate the T*m time series data from Linear dynamical system with proc_noise and obs_noise

        The system matrices are drawn at random with entries in {0, 1}
        (``g`` is n*n, ``f_dash`` is m*n) and handed to ``data_generation``.

        Parameters
        ----------
        n: Hidden state dimension
        m: Observation state dimension
        proc_noise_std: Hidden state noise
        obs_noise_std: Observation state noise
        T: Time

        Returns
        -------
        list:  T*m list

        Examples
        --------
        >>> from inputlds import*
        >>> import numpy as np
        >>> n=3
        >>> m=2
        >>> T=20
        >>> proc_noise_std=0.01
        >>> obs_noise_std=0.01
        >>> DataGenerate().data_generation_dim(m, n, proc_noise_std, obs_noise_std, T)  # doctest: +SKIP
        '''

        # np.random.randint(0, 2, ...) excludes the upper bound, so entries
        # are 0 or 1.
        g = np.random.randint(0, 2, (n,n))
        f_dash = np.random.randint(0, 2, (m,n))
        # len(g) == n and len(f_dash) == m, so this runs the same simulation
        # that used to be duplicated here.
        return self.data_generation(g, f_dash, proc_noise_std, obs_noise_std, T)



# Test Data Generation class

In [None]:
from inputlds import*
import numpy as np
import pandas as pd  # was missing: pd.DataFrame below raised NameError

T=10
g = np.matrix([[0.8,0,0],[0,0.9,0],[0,0,0.1]])
f_dash = np.matrix([[1.0,0.5,0.3],[0.1,0.1,0.1]])
proc_noise_std=0.01
obs_noise_std=0.01
my_array = DataGenerate().data_generation(g,f_dash,proc_noise_std,obs_noise_std,T)
# f_dash has two rows, so each generated row has two values -> two CSV columns.
df = pd.DataFrame(my_array)
df.to_csv('lds_data.csv', index=False, header=['col1','col2'])
# A notebook only echoes the last expression of a cell, so the series is
# displayed last (matching the recorded output).
my_array

[[1.2747847483078838, 0.19594359308281661],
 [1.0454801175607462, 0.14571383137678165],
 [0.859624443447321, 0.11136022132381274],
 [0.7522937590854241, 0.10262556381907625],
 [0.6384014976018458, 0.10143255711088069],
 [0.4983139644085479, 0.0856117047216572],
 [0.4550613076727763, 0.0647366171990469],
 [0.3835754692208594, 0.04253721891070902],
 [0.33822278060577826, 0.044561053378408044],
 [0.2729466435786535, 0.048692423439118775]]

In [None]:
from inputlds import*
import numpy as np
# Dimensions: n hidden states, m observed outputs, T time steps.
n, m, T = 3, 2, 10
# The same noise level is used for the hidden state and the observations.
proc_noise_std = obs_noise_std = 0.01
DataGenerate().data_generation_dim(
    m=m,
    n=n,
    proc_noise_std=proc_noise_std,
    obs_noise_std=obs_noise_std,
    T=T,
)

[[0.0014143954478208506, 3.00304475804575],
 [-0.00860099022621434, 3.966124614146327],
 [-0.007635469446289098, 4.934599419454813],
 [0.01215965486252258, 5.9442183250672675],
 [0.010714011287707595, 6.904107192953694],
 [-0.007857632146403469, 7.87991343574115],
 [0.018804761747808166, 8.92670669775614],
 [-0.0023952497348262093, 9.929117447988316],
 [0.009015414863869095, 10.938080588526418],
 [0.009820536471639578, 11.921019820155285]]