# ANMNCPOP_Synthetic_Data_Generation
* In our example, weighted random DAG is produced according to num_nodes and num_edges. Test raw datasets is the time series generating from weighted random DAG and SEM type. For further analysis, all useful infomation is extracted and saved as **npz file** storing causality Data as NumPy array x and y.

* In our paper, we generated LinearGauss_6_15, LinearGauss_6_15_TS and Krebs_Cycle data in respect to testing orignal two dimensions data and multi-features time series.

# LinearGauss_6_15

LinearGauss_6_15.npz

* __x__: is an array with two dimensions(Features_Samples) generated from weighted random DAG and SEM type.
* __y__: represents the weighted random DAG generating the artificial data.

# Krebs_Cycle

Krebs_Cycle_16_43_TS.npz

Multiple features time series are saved under file Krebs_Cycle_TS. The causal_matrix is saved as true_graph.csv.

* __x__(Features_Samples_Timesets): is an array in shape(F, S, T), where the number of row F is features_num, the number of column S is smples_num and the number of deep T is timesets.
* __y__(Features_Features): is a nonsymmetric square matrix.

# LinearGauss_6_15_TS
LinearGauss_6_15_TS.npz

* __x__(Features_Samples_Timesets): is an unitary three dimensions time seires array generated from weighted random DAG and SEM type with high dimension hidden state.
* __y__(Features_Features): represents the weighted random DAG generating the artificial data.


# __Get start__

* mount drive
* set envirment

In [99]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir("/content/drive/MyDrive/Colab Notebooks/NCPOP/Causal_Models_Learning/Test/")
# os.chdir("/content/drive/MyDrive/Colab Notebooks/NCPOP/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [148]:
# from Generate_SyntheticData import*
from networkx.algorithms import bipartite
from scipy.special import expit as sigmoid
from itertools import combinations
from pickle import TRUE
from random import sample
from copy import deepcopy
from tqdm import tqdm
import numpy as np
import pandas as pd
import networkx as nx
import logging
import tarfile
import os
import re
# import time
import random


def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)

class Generate_Synthetic_Data(object):
    '''
    Simulate IID datasets for causal structure learning.

    Parameters
    ------------------------------------------------------------------------------------------------
    File_PATH
            Read data path
    n: int
            Number of samples for standard trainning dataset.
    T: int
            Number of timeseries for standard trainning dataset.
    method: str, (linear or nonlinear), default='linear'
            Distribution for standard trainning dataset.
    sem_type: str
            gauss, exp, gumbel, uniform, logistic (linear);
            mlp, mim, gp, gp-add, quadratic (nonlinear).
    nodes: series
            Notes of samples for standard trainning dataset.
    edges: series
            Edges of samples for standard training dataset.
    noise_scale: float
            Scale parameter of noise distribution in linear SEM.

    Returns
    ------------------------------------------------------------------------------------------------
    Raw_data: npz
            x：[d, n, T] sample time series
            y: true_dag
    File_PATH_Datasets:
            Route of saving test data

    Examples 1
    -------------------------------------------------------------------------------------------------
    >>> method = 'linear'
    >>> sem_type = 'gauss'
    >>> nodes = range(6,12,3)
    >>> edges = range(10,20,5)
    >>> T=200
    >>> num_datasets = 120
    >>> File_PATH = '../Test/Examples/Test_data/'
    >>> noise_scale = 1.0
    >>> _ts = Generate_Synthetic_Data(File_PATH, num_datasets, T, method, sem_type, nodes, edges, noise_scale)
    >>> _ts.genarate_data()

    Examples 2
    -------------------------------------------------------------------------------------------------
    >>> noise_type = {
    >>>     'nonlinear': ['gp-add', 'mlp', 'mim', 'gp', 'quadratic'],
    >>>     'linear':  ['gauss', 'exp', 'gumbel', 'uniform', 'logistic']
    >>> }
    >>> sem_type = ['linear', 'nonlinear']
    >>> nodes = range(6,12,3)
    >>> edges = range(10,20,5)
    >>> T=200
    >>> num_datasets = 120
    >>> File_PATH = '../Test/Examples/Test_data/'
    >>> noise_scale = 1.0

    >>> for m in sem_type :
    >>>   for s in noise_type[m]:
    >>>     _ts = Generate_Synthetic_Data(File_PATH, num_datasets, T, m, s, nodes, edges, noise_scale)
    >>>     print(File_PATH, num_datasets, T, m, s, nodes, edges, noise_scale)
    >>>     _ts.genarate_data()
    '''

    def __init__(self, File_PATH, n, T, method, sem_type, nodes, edges, noise_scale):
        self.File_PATH = File_PATH
        self.n = n
        self.T = T
        self.method = method
        self.sem_type =sem_type
        self.filename = self.method.capitalize()+'SEM_' + self.sem_type.capitalize() +'Noise'
        self.nodes = nodes
        self.edges = edges
        self.noise_scale = noise_scale

    def genarate_data(self):
        ################################################  Create Ground Tier Folders #############################################
        self.File_PATH_Base = self.File_PATH +'Result_'+ self.filename +'/'

        ################################################  Create First Tier Folders #############################################
        self.File_PATH_Datasets = self.File_PATH_Base + 'Datasets_'+ self.filename +'/'
        if not os.path.exists(self.File_PATH_Datasets):
            os.makedirs(self.File_PATH_Datasets)
        print('ANM-NCPOP INFO: Created Datasets' + ' File!')

        nodes_num = len(self.nodes)
        edges_num = len(self.edges)
        count = 0
        tqdm_csv=os.listdir(self.File_PATH_Datasets)
        while len(tqdm_csv) < nodes_num* edges_num:
            print('ANM-NCPOP INFO: Generating '+ self.filename + ' Dataset!')
            if self.method == 'linear':
                for nn in nodes:
                    for ne in edges:
                        count += 1
                        w = DAG.erdos_renyi(n_nodes=nn, n_edges=ne, seed=1)
                        self.B = (w != 0).astype(int)
                        self.XX = Generate_Synthetic_Data._simulate_linear_sem(self.B, self.n, self.T, self.sem_type, self.noise_scale)
                        data_name = self.filename+'_'+str(nn)+'Nodes_'+str(ne)+'Edges_TS'
                        np.savez(self.File_PATH_Datasets +data_name+'.npz', x=self.XX , y=self.B)
                        logging.info('ANM-NCPOP INFO: Finished synthetic dataset')
                        print('ANM-NCPOP INFO: '+ data_name + ' IS DONE!')
                print('ANM-NCPOP INFO: '+ str(count) + ' datasets are generated!')
                break
            elif self.method == 'nonlinear':
                for nn in nodes:
                    for ne in edges:
                        count += 1
                        w = DAG.erdos_renyi(n_nodes=nn, n_edges=ne, seed=1)
                        self.B = (w != 0).astype(int)
                        self.XX = Generate_Synthetic_Data._simulate_nonlinear_sem(self.B, self.n, self.T, self.sem_type, self.noise_scale)
                        data_name = self.filename+'_'+str(nn)+'Nodes_'+str(ne)+'Edges_TS'
                        np.savez(self.File_PATH_Datasets +data_name+'.npz', x=self.XX , y=self.B)
                        logging.info('ANM-NCPOP INFO: Finished synthetic dataset')
                        print('ANM-NCPOP INFO: '+ data_name + ' IS DONE!')
                print('ANM-NCPOP INFO: '+ str(count) + ' datasets are generated!')
                break
            else:
                raise ValueError('Unknown distribution type. Only linear and nonlinear types are accepted.')

            # time.sleep(30)
        print('ANM-NCPOP INFO: Finished '+ self.filename +' dataset generation, which can be found under route: '+ self.File_PATH_Datasets)

    @staticmethod
    def _simulate_linear_sem(W, n, T, sem_type, noise_scale=1.0):
        """
        Simulate samples from linear SEM with specified type of noise.
        For uniform, noise z ~ uniform(-a, a), where a = noise_scale.

        Parameters
        ----------
        W: np.ndarray
            [d, d] weighted adj matrix of DAG.
        n: int
            Number of samples, n=inf mimics population risk.
        T: int
        Number of timeseries for standard trainning dataset.
        sem_type: str
            gauss, exp, gumbel, uniform, logistic.
        noise_scale: float
            Scale parameter of noise distribution in linear SEM.

        Return
        ------
        XX: np.ndarray
            [T, n, d] sample matrix, [d, d] if n and T=inf
        """
        def _simulate_single_equation(X, w, scale):
            """X: [n, num of parents], w: [num of parents], x: [n]"""
            if sem_type == 'gauss':
                z = np.random.normal(scale=scale, size=T)
                x = X @ w + z
            elif sem_type == 'exp':
                z = np.random.exponential(scale=scale, size=T)
                x = X @ w + z
            elif sem_type == 'gumbel':
                z = np.random.gumbel(scale=scale, size=T)
                x = X @ w + z
            elif sem_type == 'uniform':
                z = np.random.uniform(low=-scale, high=scale, size=T)
                x = X @ w + z
            elif sem_type == 'logistic':
                x = np.random.binomial(1, sigmoid(X @ w)) * 1.0
            else:
                raise ValueError('Unknown sem type. In a linear model, \
                                 the options are as follows: gauss, exp, \
                                 gumbel, uniform, logistic.')
            return x

        d = W.shape[0]
        if noise_scale is None:
            scale_vec = np.ones(d)
        elif np.isscalar(noise_scale):
            scale_vec = noise_scale * np.ones(d)
        else:
            if len(noise_scale) != d:
                raise ValueError('noise scale must be a scalar or has length d')
            scale_vec = noise_scale
        G_nx =  nx.from_numpy_array(W, create_using=nx.DiGraph)
        if not nx.is_directed_acyclic_graph(G_nx):
            raise ValueError('W must be a DAG')
        if np.isinf(T):  # population risk for linear gauss SEM
            if sem_type == 'gauss':
                # make 1/d X'X = true cov
                X = np.sqrt(d) * np.diag(scale_vec) @ np.linalg.inv(np.eye(d) - W)
                return X
            else:
                raise ValueError('population risk not available')
        # empirical risk
        ordered_vertices = list(nx.topological_sort(G_nx))
        assert len(ordered_vertices) == d
        X = np.zeros([T, d])
        XX = np.zeros((d, n, T))
        for j in ordered_vertices:
            parents = list(G_nx.predecessors(j))
            X[:, j] = _simulate_single_equation(X[:, parents], W[parents, j], scale_vec[j])
        for ns in range(n):
            XX[:, ns] = np.transpose(X)
        return XX

    @staticmethod
    def _simulate_nonlinear_sem(W, n, T, sem_type, noise_scale=1.0):
        """
        Simulate samples from nonlinear SEM.

        Parameters
        ----------
        B: np.ndarray
            [d, d] binary adj matrix of DAG.
        n: int
            Number of samples.
        T: int
            Number of times.
        sem_type: str
            mlp, mim, gp, gp-add, or quadratic.
        noise_scale: float
            Scale parameter of noise distribution in linear SEM.

        Return
        ------
        XX: np.ndarray
            [d, n, T] sample matrix
        """
        if sem_type == 'quadratic':
            return Generate_SyntheticData._simulate_quad_sem(W, T, noise_scale)

        def _simulate_single_equation(X, scale):
            """X: [n, num of parents], x: [n]"""
            z = np.random.normal(scale=scale, size=n)
            pa_size = X.shape[1]
            if pa_size == 0:
                return z
            if sem_type == 'mlp':
                hidden = 100
                W1 = np.random.uniform(low=0.5, high=2.0, size=[pa_size, hidden])
                W1[np.random.rand(*W1.shape) < 0.5] *= -1
                W2 = np.random.uniform(low=0.5, high=2.0, size=hidden)
                W2[np.random.rand(hidden) < 0.5] *= -1
                x = sigmoid(X @ W1) @ W2 + z
            elif sem_type == 'mim':
                w1 = np.random.uniform(low=0.5, high=2.0, size=pa_size)
                w1[np.random.rand(pa_size) < 0.5] *= -1
                w2 = np.random.uniform(low=0.5, high=2.0, size=pa_size)
                w2[np.random.rand(pa_size) < 0.5] *= -1
                w3 = np.random.uniform(low=0.5, high=2.0, size=pa_size)
                w3[np.random.rand(pa_size) < 0.5] *= -1
                x = np.tanh(X @ w1) + np.cos(X @ w2) + np.sin(X @ w3) + z
            elif sem_type == 'gp':
                from sklearn.gaussian_process import GaussianProcessRegressor
                gp = GaussianProcessRegressor()
                x = gp.sample_y(X, random_state=None).flatten() + z
            elif sem_type == 'gp-add':
                from sklearn.gaussian_process import GaussianProcessRegressor
                gp = GaussianProcessRegressor()
                x = sum([gp.sample_y(X[:, i, None], random_state=None).flatten()
                        for i in range(X.shape[1])]) + z
            else:
                raise ValueError('Unknown sem type. In a nonlinear model, \
                                 the options are as follows: mlp, mim, \
                                 gp, gp-add, or quadratic.')
            return x

        B = (W != 0).astype(int)
        d = B.shape[0]
        if noise_scale is None:
            scale_vec = np.ones(d)
        elif np.isscalar(noise_scale):
            scale_vec = noise_scale * np.ones(d)
        else:
            if len(noise_scale) != d:
                raise ValueError('noise scale must be a scalar or has length d')
            scale_vec = noise_scale

        X = np.zeros([n, d])
        G_nx =  nx.from_numpy_array(B, create_using=nx.DiGraph)
        ordered_vertices = list(nx.topological_sort(G_nx))
        assert len(ordered_vertices) == d
        for j in ordered_vertices:
            parents = list(G_nx.predecessors(j))
            X[:, j] = _simulate_single_equation(X[:, parents], scale_vec[j])
        return X
        XX = np.zeros((d, n, T))
        for ns in range(n):
            XX[:, ns] = np.transpose(X)
        return XX



class DAG(object):
    '''
    A class for simulating random (causal) DAG, where any DAG generator
    method would return the weighed/binary adjacency matrix of a DAG.
    Besides, we recommend using the python package "NetworkX"
    to create more structures types.
    '''

    @staticmethod
    def _random_permutation(M):
        # np.random.permutation permutes first axis only
        P = np.random.permutation(np.eye(M.shape[0]))
        return P.T @ M @ P

    @staticmethod
    def _random_acyclic_orientation(B_und):
        B = np.tril(DAG._random_permutation(B_und), k=-1)
        B_perm = DAG._random_permutation(B)
        return B_perm

    @staticmethod
    def _graph_to_adjmat(G):
        return nx.to_numpy_array(G)

    @staticmethod
    def _BtoW(B, d, w_range):
        U = np.random.uniform(low=w_range[0], high=w_range[1], size=[d, d])
        U[np.random.rand(d, d) < 0.5] *= -1
        W = (B != 0).astype(float) * U
        return W

    @staticmethod
    def _low_rank_dag(d, degree, rank):
        """
        Simulate random low rank DAG with some expected degree.

        Parameters
        ----------
        d: int
            Number of nodes.
        degree: int
            Expected node degree, in + out.
        rank: int
            Maximum rank (rank < d-1).

        Return
        ------
        B: np.nparray
            Initialize DAG.
        """
        prob = float(degree) / (d - 1)
        B = np.triu((np.random.rand(d, d) < prob).astype(float), k=1)
        total_edge_num = np.sum(B == 1)
        sampled_pa = sample(range(d - 1), rank)
        sampled_pa.sort(reverse=True)
        sampled_ch = []
        for i in sampled_pa:
            candidate = set(range(i + 1, d))
            candidate = candidate - set(sampled_ch)
            sampled_ch.append(sample(candidate, 1)[0])
            B[i, sampled_ch[-1]] = 1
        remaining_pa = list(set(range(d)) - set(sampled_pa))
        remaining_ch = list(set(range(d)) - set(sampled_ch))
        B[np.ix_(remaining_pa, remaining_ch)] = 0
        after_matching_edge_num = np.sum(B == 1)

        # delta = total_edge_num - after_matching_edge_num
        # mask B
        maskedB = B + np.tril(np.ones((d, d)))
        maskedB[np.ix_(remaining_pa, remaining_ch)] = 1
        B[maskedB == 0] = 1

        remaining_ch_set = set([i + d for i in remaining_ch])
        sampled_ch_set = set([i + d for i in sampled_ch])
        remaining_pa_set = set(remaining_pa)
        sampled_pa_set = set(sampled_pa)

        edges = np.transpose(np.nonzero(B))
        edges[:, 1] += d
        bigraph = nx.Graph()
        bigraph.add_nodes_from(range(2 * d))
        bigraph.add_edges_from(edges)
        M = nx.bipartite.maximum_matching(bigraph, top_nodes=range(d))
        while len(M) > 2 * rank:
            keys = set(M.keys())
            rmv_cand = keys & (remaining_pa_set | remaining_ch_set)
            p = sample(rmv_cand, 1)[0]
            c = M[p]
            # destroy p-c
            bigraph.remove_edge(p, c)
            M = nx.bipartite.maximum_matching(bigraph, top_nodes=range(d))

        new_edges = np.array(bigraph.edges)
        for i in range(len(new_edges)):
            new_edges[i,].sort()
        new_edges[:, 1] -= d

        BB = np.zeros((d, d))
        B = np.zeros((d, d))
        BB[new_edges[:, 0], new_edges[:, 1]] = 1

        if np.sum(BB == 1) > total_edge_num:
            delta = total_edge_num - rank
            BB[sampled_pa, sampled_ch] = 0
            rmv_cand_edges = np.transpose(np.nonzero(BB))
            if delta <= 0:
                raise RuntimeError(r'Number of edges is below the rank, please \
                                   set a larger edge or degree \
                                   (you can change seed or increase degree).')
            selected = np.array(sample(rmv_cand_edges.tolist(), delta))
            B[selected[:, 0], selected[:, 1]] = 1
            B[sampled_pa, sampled_ch] = 1
        else:
            B = deepcopy(BB)

        B = B.transpose()
        return B

    @staticmethod
    def erdos_renyi(n_nodes, n_edges, weight_range=None, seed=None):

        assert n_nodes > 0
        set_random_seed(seed)
        # Erdos-Renyi
        creation_prob = (2 * n_edges) / (n_nodes ** 2)
        G_und = nx.erdos_renyi_graph(n=n_nodes, p=creation_prob, seed=seed)
        B_und = DAG._graph_to_adjmat(G_und)
        B = DAG._random_acyclic_orientation(B_und)
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W

    @staticmethod
    def scale_free(n_nodes, n_edges, weight_range=None, seed=None):

        assert (n_nodes > 0 and n_edges >= n_nodes and n_edges < n_nodes * n_nodes)
        set_random_seed(seed)
        # Scale-free, Barabasi-Albert
        m = int(round(n_edges / n_nodes))
        G_und = nx.barabasi_albert_graph(n=n_nodes, m=m)
        B_und = DAG._graph_to_adjmat(G_und)
        B = DAG._random_acyclic_orientation(B_und)
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W

    @staticmethod
    def bipartite(n_nodes, n_edges, split_ratio = 0.2, weight_range=None, seed=None):

        assert n_nodes > 0
        set_random_seed(seed)
        # Bipartite, Sec 4.1 of (Gu, Fu, Zhou, 2018)
        n_top = int(split_ratio * n_nodes)
        n_bottom = n_nodes -  n_top
        creation_prob = n_edges/(n_top*n_bottom)
        G_und = bipartite.random_graph(n_top, n_bottom, p=creation_prob, directed=True)
        B_und = DAG._graph_to_adjmat(G_und)
        B = DAG._random_acyclic_orientation(B_und)
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W

    @staticmethod
    def hierarchical(n_nodes, degree=5, graph_level=5, weight_range=None, seed=None):

        assert n_nodes > 1
        set_random_seed(seed)
        prob = float(degree) / (n_nodes - 1)
        B = np.tril((np.random.rand(n_nodes, n_nodes) < prob).astype(float), k=-1)
        point = sample(range(n_nodes - 1), graph_level - 1)
        point.sort()
        point = [0] + [x + 1 for x in point] + [n_nodes]
        for i in range(graph_level):
            B[point[i]:point[i + 1], point[i]:point[i + 1]] = 0
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W

    @staticmethod
    def low_rank(n_nodes, degree=1, rank=5, weight_range=None, seed=None):

        assert n_nodes > 0
        set_random_seed(seed)
        B = DAG._low_rank_dag(n_nodes, degree, rank)
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W


##__Test 1__

In [149]:
method = 'linear'
sem_type = 'gauss'
nodes = range(6,12,3)
edges = range(10,20,5)
T=200
num_datasets = 120
File_PATH = '../Test/Examples/Test_data/'
noise_scale = 1.0
_ts = Generate_Synthetic_Data(File_PATH, num_datasets, T, method, sem_type, nodes, edges, noise_scale)

In [150]:
_ts.genarate_data()

ANM-NCPOP INFO: Created Datasets File!
ANM-NCPOP INFO: Generating LinearSEM_GaussNoise Dataset!
ANM-NCPOP INFO: LinearSEM_GaussNoise_6Nodes_10Edges_TS IS DONE!
ANM-NCPOP INFO: LinearSEM_GaussNoise_6Nodes_15Edges_TS IS DONE!
ANM-NCPOP INFO: LinearSEM_GaussNoise_9Nodes_10Edges_TS IS DONE!
ANM-NCPOP INFO: LinearSEM_GaussNoise_9Nodes_15Edges_TS IS DONE!
ANM-NCPOP INFO: 4 datasets are generated!
ANM-NCPOP INFO: Finished LinearSEM_GaussNoise dataset generation, which can be found under route: ../Test/Examples/Test_data/Result_LinearSEM_GaussNoise/Datasets_LinearSEM_GaussNoise/


##__Test 2__

In [140]:
noise_type = {
    'nonlinear': ['gp-add', 'mlp', 'mim', 'gp', 'quadratic'],
    'linear':  ['gauss', 'exp', 'gumbel', 'uniform', 'logistic']
}
sem_type = ['linear', 'nonlinear']
nodes = range(6,12,3)
edges = range(10,20,5)
T=200
num_datasets = 120
File_PATH = '../Test/Examples/Test_data/'
noise_scale = 1.0

for m in sem_type :
  for s in noise_type[m]:
    _ts = Generate_Synthetic_Data(File_PATH, num_datasets, T, m, s, nodes, edges, noise_scale)
    print(File_PATH, num_datasets, T, m, s, nodes, edges, noise_scale)
    _ts.genarate_data()

../Test/Examples/Test_data/ 120 200 linear gauss range(6, 12, 3) range(10, 20, 5) 1.0
ANM-NCPOP INFO: Created Datasets File!
ANM-NCPOP INFO: Finished LinearSEM_GaussNoise dataset generation, which can be found under route: ../Test/Examples/Test_data/Result_LinearSEM_GaussNoise/Datasets_LinearSEM_GaussNoise/
../Test/Examples/Test_data/ 120 200 linear exp range(6, 12, 3) range(10, 20, 5) 1.0
ANM-NCPOP INFO: Created Datasets File!
ANM-NCPOP INFO: Finished LinearSEM_ExpNoise dataset generation, which can be found under route: ../Test/Examples/Test_data/Result_LinearSEM_ExpNoise/Datasets_LinearSEM_ExpNoise/
../Test/Examples/Test_data/ 120 200 linear gumbel range(6, 12, 3) range(10, 20, 5) 1.0
ANM-NCPOP INFO: Created Datasets File!
ANM-NCPOP INFO: Finished LinearSEM_GumbelNoise dataset generation, which can be found under route: ../Test/Examples/Test_data/Result_LinearSEM_GumbelNoise/Datasets_LinearSEM_GumbelNoise/
../Test/Examples/Test_data/ 120 200 linear uniform range(6, 12, 3) range(10, 

# __Step-by-Step__

In [None]:
!pip install networkx



##__Step 1: Generate IID Timesets Class__

In [133]:
# from Generate_SyntheticData import*
from itertools import combinations
# from BuiltinDataSet import DAG
from pickle import TRUE
import numpy as np
import pandas as pd
import networkx as nx
import logging
import tarfile
import os
import re
# from castle.datasets.simulator import DAG


class Generate_SyntheticData(object):
    '''
    Simulate IID datasets for causal structure learning.

    Parameters
    ----------
    W: np.ndarray
        Weighted adjacency matrix for the target causal graph.
    n: int
        Number of samples for standard trainning dataset.
    T: int
        Number of timeseries for standard trainning dataset.
    method: str, (linear or nonlinear), default='linear'
        Distribution for standard trainning dataset.
    sem_type: str
        gauss, exp, gumbel, uniform, logistic (linear);
        mlp, mim, gp, gp-add, quadratic (nonlinear).
    noise_scale: float
        Scale parameter of noise distribution in linear SEM.
    '''

    def __init__(self, W, n=1000, T=500, method='linear',
                 sem_type='gauss', noise_scale=1.0):

        self.B = (W != 0).astype(int)
        if method == 'linear':
            self.XX = Generate_SyntheticData._simulate_linear_sem(
                    W, n, T, sem_type, noise_scale)
        elif method == 'nonlinear':
            self.XX = Generate_SyntheticData._simulate_nonlinear_sem(
                    W, n, T, sem_type, noise_scale)
        logging.info('Finished synthetic dataset')

    @staticmethod
    def _simulate_linear_sem(W, n, T, sem_type, noise_scale):
        """
        Simulate samples from linear SEM with specified type of noise.
        For uniform, noise z ~ uniform(-a, a), where a = noise_scale.

        Parameters
        ----------
        W: np.ndarray
            [d, d] weighted adj matrix of DAG.
        n: int
            Number of samples, n=inf mimics population risk.
        T: int
        Number of timeseries for standard trainning dataset.
        sem_type: str
            gauss, exp, gumbel, uniform, logistic.
        noise_scale: float
            Scale parameter of noise distribution in linear SEM.

        Return
        ------
        XX: np.ndarray
            [T, n, d] sample matrix, [d, d] if n and T=inf
        """
        def _simulate_single_equation(X, w, scale):
            """X: [n, num of parents], w: [num of parents], x: [n]"""
            if sem_type == 'gauss':
                z = np.random.normal(scale=scale, size=T)
                x = X @ w + z
            elif sem_type == 'exp':
                z = np.random.exponential(scale=scale, size=T)
                x = X @ w + z
            elif sem_type == 'gumbel':
                z = np.random.gumbel(scale=scale, size=T)
                x = X @ w + z
            elif sem_type == 'uniform':
                z = np.random.uniform(low=-scale, high=scale, size=T)
                x = X @ w + z
            elif sem_type == 'logistic':
                x = np.random.binomial(1, sigmoid(X @ w)) * 1.0
            else:
                raise ValueError('Unknown sem type. In a linear model, \
                                 the options are as follows: gauss, exp, \
                                 gumbel, uniform, logistic.')
            return x

        d = W.shape[0]
        if noise_scale is None:
            scale_vec = np.ones(d)
        elif np.isscalar(noise_scale):
            scale_vec = noise_scale * np.ones(d)
        else:
            if len(noise_scale) != d:
                raise ValueError('noise scale must be a scalar or has length d')
            scale_vec = noise_scale
        G_nx =  nx.from_numpy_array(W, create_using=nx.DiGraph)
        if not nx.is_directed_acyclic_graph(G_nx):
            raise ValueError('W must be a DAG')
        if np.isinf(T):  # population risk for linear gauss SEM
            if sem_type == 'gauss':
                # make 1/d X'X = true cov
                X = np.sqrt(d) * np.diag(scale_vec) @ np.linalg.inv(np.eye(d) - W)
                return X
            else:
                raise ValueError('population risk not available')
        # empirical risk
        ordered_vertices = list(nx.topological_sort(G_nx))
        assert len(ordered_vertices) == d
        X = np.zeros([T, d])
        for j in ordered_vertices:
            parents = list(G_nx.predecessors(j))
            X[:, j] = _simulate_single_equation(X[:, parents], W[parents, j], scale_vec[j])
        XX = np.zeros((d, n, T))
        for nn in range(n):
            X_trans = np.transpose(X)
            for dd in range(d):
                XX[dd, nn, :] = list(X_trans[dd])
        return XX

    @staticmethod
    def _simulate_nonlinear_sem(W, n, T, sem_type, noise_scale):
        """
        Simulate samples from nonlinear SEM.

        Parameters
        ----------
        B: np.ndarray
            [d, d] binary adj matrix of DAG.
        n: int
            Number of samples.
        T: int
            Number of time.
        sem_type: str
            mlp, mim, gp, gp-add, or quadratic.
        noise_scale: float
            Scale parameter of noise distribution in linear SEM.

        Return
        ------
        XX: np.ndarray
            [n, d] sample matrix
        """
        if sem_type == 'quadratic':
            return _simulate_quad_sem(W, n, noise_scale)

        def _simulate_single_equation(X, scale):
            """X: [n, num of parents], x: [n]"""
            z = np.random.normal(scale=scale, size=n)
            pa_size = X.shape[1]
            if pa_size == 0:
                return z
            if sem_type == 'mlp':
                hidden = 100
                W1 = np.random.uniform(low=0.5, high=2.0, size=[pa_size, hidden])
                W1[np.random.rand(*W1.shape) < 0.5] *= -1
                W2 = np.random.uniform(low=0.5, high=2.0, size=hidden)
                W2[np.random.rand(hidden) < 0.5] *= -1
                x = sigmoid(X @ W1) @ W2 + z
            elif sem_type == 'mim':
                w1 = np.random.uniform(low=0.5, high=2.0, size=pa_size)
                w1[np.random.rand(pa_size) < 0.5] *= -1
                w2 = np.random.uniform(low=0.5, high=2.0, size=pa_size)
                w2[np.random.rand(pa_size) < 0.5] *= -1
                w3 = np.random.uniform(low=0.5, high=2.0, size=pa_size)
                w3[np.random.rand(pa_size) < 0.5] *= -1
                x = np.tanh(X @ w1) + np.cos(X @ w2) + np.sin(X @ w3) + z
            elif sem_type == 'gp':
                from sklearn.gaussian_process import GaussianProcessRegressor
                gp = GaussianProcessRegressor()
                x = gp.sample_y(X, random_state=None).flatten() + z
            elif sem_type == 'gp-add':
                from sklearn.gaussian_process import GaussianProcessRegressor
                gp = GaussianProcessRegressor()
                x = sum([gp.sample_y(X[:, i, None], random_state=None).flatten()
                        for i in range(X.shape[1])]) + z
            else:
                raise ValueError('Unknown sem type. In a nonlinear model, \
                                 the options are as follows: mlp, mim, \
                                 gp, gp-add, or quadratic.')
            return x

        B = (W != 0).astype(int)
        d = B.shape[0]
        if noise_scale is None:
            scale_vec = np.ones(d)
        elif np.isscalar(noise_scale):
            scale_vec = noise_scale * np.ones(d)
        else:
            if len(noise_scale) != d:
                raise ValueError('noise scale must be a scalar or has length d')
            scale_vec = noise_scale

        X = np.zeros([n, d])
        G_nx = nx.DiGraph(B)
        ordered_vertices = list(nx.topological_sort(G_nx))
        assert len(ordered_vertices) == d
        for j in ordered_vertices:
            parents = list(G_nx.predecessors(j))
            X[:, j] = _simulate_single_equation(X[:, parents], scale_vec[j])
        return X

        XX = np.zeros((d, n, T))
        for nn in range(n):
            X_trans = np.transpose(X)
            for dd in range(d):
                XX[dd, :, T] = list(X_trans[dd])
        return XX

    @staticmethod
    def _simulate_quad_sem(W, n, noise_scale):
        """
        Simulate samples from SEM with specified type of noise.
        Coefficient is randomly drawn but specifically designed
        to avoid overflow issues.

        Parameters
        ----------
        W: np.ndarray
            weigthed DAG.
        n: int
            Number of samples.
        noise_scale: float
            Scale parameter of noise distribution in linear SEM.

        Return
        ------
        X: np.ndarray
            [n,d] sample matrix
        """
        def generate_quadratic_coef(random_zero=True):
            if random_zero and np.random.randint(low=0, high=2):
                return 0
            else:
                coef = np.random.uniform(low=0.5, high=1)
                if np.random.randint(low=0, high=2):
                    coef *= -1
                return coef

        G = nx.DiGraph(W)
        d = W.shape[0]
        X = np.zeros([n, d])
        ordered_vertices = list(nx.topological_sort(G))
        assert len(ordered_vertices) == d
        for j in ordered_vertices:
            parents = list(G.predecessors(j))

            if len(parents) == 0:
                eta = np.zeros([n])
            elif len(parents) == 1:
                # We don't generate random zero coefficient if there is only one parent
                eta = np.zeros([n])
                used_parents = set()
                p = parents[0]
                num_terms = 0

                # Linear term
                coef = generate_quadratic_coef(random_zero=False)
                if coef != 0:
                    eta += coef * X[:, p]
                    used_parents.add(p)
                    num_terms += 1

                # Squared term
                coef = generate_quadratic_coef(random_zero=False)
                if coef != 0:
                    eta += coef * np.square(X[:, p])
                    used_parents.add(p)
                    num_terms += 1

                if num_terms > 0:
                    eta /= num_terms    # Compute average

                # Remove parent if both coef is zero
                if p not in used_parents:
                    W[p, j] = 0
            else:    # More than 1 parent
                eta = np.zeros([n])
                used_parents = set()
                num_terms = 0

                for p in parents:
                    # Linear terms
                    coef = generate_quadratic_coef(random_zero=True)
                    if coef > 0:
                        eta += coef * X[:, p]
                        used_parents.add(p)
                        num_terms += 1

                    # Squared terms
                    coef = generate_quadratic_coef(random_zero=True)
                    if coef > 0:
                        eta += coef * np.square(X[:, p])
                        used_parents.add(p)
                        num_terms += 1

                # Cross terms
                for p1, p2 in combinations(parents, 2):
                    coef = generate_quadratic_coef(random_zero=True)
                    if coef > 0:
                        eta += coef * X[:, p1] * X[:, p2]
                        used_parents.add(p1)
                        used_parents.add(p2)
                        num_terms += 1

                if num_terms > 0:
                    eta /= num_terms    # Compute average

                # Remove parent if both coef is zero
                unused_parents = set(parents) - used_parents
                if p in unused_parents:
                    W[p, j] = 0

            X[:, j] = eta + np.random.normal(scale=noise_scale, size=n)

        return X



##__Step 2: Generate DAG Class__

In [126]:
# coding=utf-8

# Huawei Technologies Co., Ltd.
#
# Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
#
# Copyright (c) Xun Zheng (https://github.com/xunzheng/notears)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import random
from random import sample
import numpy as np
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite
from tqdm import tqdm
from copy import deepcopy
from itertools import combinations
from scipy.special import expit as sigmoid


def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)


class DAG(object):
    '''
    A class for simulating random (causal) DAG, where any DAG generator
    method would return the weighed/binary adjacency matrix of a DAG.
    Besides, we recommend using the python package "NetworkX"
    to create more structures types.
    '''

    @staticmethod
    def _random_permutation(M):
        # np.random.permutation permutes first axis only
        P = np.random.permutation(np.eye(M.shape[0]))
        return P.T @ M @ P

    @staticmethod
    def _random_acyclic_orientation(B_und):
        B = np.tril(DAG._random_permutation(B_und), k=-1)
        B_perm = DAG._random_permutation(B)
        return B_perm

    @staticmethod
    def _graph_to_adjmat(G):
        return nx.to_numpy_array(G)

    @staticmethod
    def _BtoW(B, d, w_range):
        U = np.random.uniform(low=w_range[0], high=w_range[1], size=[d, d])
        U[np.random.rand(d, d) < 0.5] *= -1
        W = (B != 0).astype(float) * U
        return W

    @staticmethod
    def _low_rank_dag(d, degree, rank):
        """
        Simulate random low rank DAG with some expected degree.

        Parameters
        ----------
        d: int
            Number of nodes.
        degree: int
            Expected node degree, in + out.
        rank: int
            Maximum rank (rank < d-1).

        Return
        ------
        B: np.nparray
            Initialize DAG.
        """
        prob = float(degree) / (d - 1)
        B = np.triu((np.random.rand(d, d) < prob).astype(float), k=1)
        total_edge_num = np.sum(B == 1)
        sampled_pa = sample(range(d - 1), rank)
        sampled_pa.sort(reverse=True)
        sampled_ch = []
        for i in sampled_pa:
            candidate = set(range(i + 1, d))
            candidate = candidate - set(sampled_ch)
            sampled_ch.append(sample(candidate, 1)[0])
            B[i, sampled_ch[-1]] = 1
        remaining_pa = list(set(range(d)) - set(sampled_pa))
        remaining_ch = list(set(range(d)) - set(sampled_ch))
        B[np.ix_(remaining_pa, remaining_ch)] = 0
        after_matching_edge_num = np.sum(B == 1)

        # delta = total_edge_num - after_matching_edge_num
        # mask B
        maskedB = B + np.tril(np.ones((d, d)))
        maskedB[np.ix_(remaining_pa, remaining_ch)] = 1
        B[maskedB == 0] = 1

        remaining_ch_set = set([i + d for i in remaining_ch])
        sampled_ch_set = set([i + d for i in sampled_ch])
        remaining_pa_set = set(remaining_pa)
        sampled_pa_set = set(sampled_pa)

        edges = np.transpose(np.nonzero(B))
        edges[:, 1] += d
        bigraph = nx.Graph()
        bigraph.add_nodes_from(range(2 * d))
        bigraph.add_edges_from(edges)
        M = nx.bipartite.maximum_matching(bigraph, top_nodes=range(d))
        while len(M) > 2 * rank:
            keys = set(M.keys())
            rmv_cand = keys & (remaining_pa_set | remaining_ch_set)
            p = sample(rmv_cand, 1)[0]
            c = M[p]
            # destroy p-c
            bigraph.remove_edge(p, c)
            M = nx.bipartite.maximum_matching(bigraph, top_nodes=range(d))

        new_edges = np.array(bigraph.edges)
        for i in range(len(new_edges)):
            new_edges[i,].sort()
        new_edges[:, 1] -= d

        BB = np.zeros((d, d))
        B = np.zeros((d, d))
        BB[new_edges[:, 0], new_edges[:, 1]] = 1

        if np.sum(BB == 1) > total_edge_num:
            delta = total_edge_num - rank
            BB[sampled_pa, sampled_ch] = 0
            rmv_cand_edges = np.transpose(np.nonzero(BB))
            if delta <= 0:
                raise RuntimeError(r'Number of edges is below the rank, please \
                                   set a larger edge or degree \
                                   (you can change seed or increase degree).')
            selected = np.array(sample(rmv_cand_edges.tolist(), delta))
            B[selected[:, 0], selected[:, 1]] = 1
            B[sampled_pa, sampled_ch] = 1
        else:
            B = deepcopy(BB)

        B = B.transpose()
        return B

    @staticmethod
    def erdos_renyi(n_nodes, n_edges, weight_range=None, seed=None):

        assert n_nodes > 0
        set_random_seed(seed)
        # Erdos-Renyi
        creation_prob = (2 * n_edges) / (n_nodes ** 2)
        G_und = nx.erdos_renyi_graph(n=n_nodes, p=creation_prob, seed=seed)
        B_und = DAG._graph_to_adjmat(G_und)
        B = DAG._random_acyclic_orientation(B_und)
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W

    @staticmethod
    def scale_free(n_nodes, n_edges, weight_range=None, seed=None):

        assert (n_nodes > 0 and n_edges >= n_nodes and n_edges < n_nodes * n_nodes)
        set_random_seed(seed)
        # Scale-free, Barabasi-Albert
        m = int(round(n_edges / n_nodes))
        G_und = nx.barabasi_albert_graph(n=n_nodes, m=m)
        B_und = DAG._graph_to_adjmat(G_und)
        B = DAG._random_acyclic_orientation(B_und)
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W

    @staticmethod
    def bipartite(n_nodes, n_edges, split_ratio = 0.2, weight_range=None, seed=None):

        assert n_nodes > 0
        set_random_seed(seed)
        # Bipartite, Sec 4.1 of (Gu, Fu, Zhou, 2018)
        n_top = int(split_ratio * n_nodes)
        n_bottom = n_nodes -  n_top
        creation_prob = n_edges/(n_top*n_bottom)
        G_und = bipartite.random_graph(n_top, n_bottom, p=creation_prob, directed=True)
        B_und = DAG._graph_to_adjmat(G_und)
        B = DAG._random_acyclic_orientation(B_und)
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W

    @staticmethod
    def hierarchical(n_nodes, degree=5, graph_level=5, weight_range=None, seed=None):

        assert n_nodes > 1
        set_random_seed(seed)
        prob = float(degree) / (n_nodes - 1)
        B = np.tril((np.random.rand(n_nodes, n_nodes) < prob).astype(float), k=-1)
        point = sample(range(n_nodes - 1), graph_level - 1)
        point.sort()
        point = [0] + [x + 1 for x in point] + [n_nodes]
        for i in range(graph_level):
            B[point[i]:point[i + 1], point[i]:point[i + 1]] = 0
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W

    @staticmethod
    def low_rank(n_nodes, degree=1, rank=5, weight_range=None, seed=None):

        assert n_nodes > 0
        set_random_seed(seed)
        B = DAG._low_rank_dag(n_nodes, degree, rank)
        if weight_range is None:
            return B
        else:
            W = DAG._BtoW(B, n_nodes, weight_range)
        return W




class Topology(object):
    """
    A class for generating some classical (undirected) network structures,
    in which any graph generator method would return the adjacency matrix of
    a network structure.
    In fact, we recommend to directly use the python package "NetworkX"
    to create various structures you need.
    """

    @staticmethod
    def erdos_renyi(n_nodes, n_edges, seed=None):
        """
        Generate topology matrix

        Parameters
        ----------
        n_nodes : int, greater than 0
            The number of nodes.
        n_edges : int, greater than 0
            Use to calculate probability for edge creation.
        seed : integer, random_state, or None (default)
            Indicator of random number generation state.

        Returns
        -------
        B: np.matrix
        """
        assert n_nodes > 0, 'The number of nodes must be greater than 0.'
        creation_prob = (2*n_edges)/(n_nodes**2)
        G = nx.erdos_renyi_graph(n=n_nodes, p=creation_prob, seed=seed)
        B = nx.to_numpy_array(G)
        return B


class THPSimulation(object):
    """
    A class for simulating event sequences with
    THP (Topological Hawkes Process) setting.

    Parameters
    ----------
    causal_matrix: np.matrix
        The casual matrix.
    topology_matrix: np.matrix
        Interpreted as an adjacency matrix to generate graph.
        Has two dimension, should be square.
    mu_range: tuple, default=(0.00005, 0.0001)
    alpha_range: tuple, default=(0.005, 0.007)
    """

    def __init__(self, causal_matrix, topology_matrix,
                 mu_range=(0.00005, 0.0001), alpha_range=(0.005, 0.007)):

        assert (isinstance(causal_matrix, np.ndarray) and
                causal_matrix.ndim == 2 and
                causal_matrix.shape[0] == causal_matrix.shape[1]),\
            'casual_matrix should be np.matrix object, two dimension, square.'
        assert (isinstance(topology_matrix, np.ndarray) and
                topology_matrix.ndim == 2 and
                topology_matrix.shape[0] == topology_matrix.shape[1]),\
            'topology_matrix should be np.matrix object, two dimension, square.'

        self._causal_matrix = (causal_matrix != 0).astype(int)

        self._topo = nx.from_numpy_array(topology_matrix,
                                          create_using=nx.Graph)

        self._mu_range = mu_range
        self._alpha_range = alpha_range

    def simulate(self, T, max_hop=1, beta=10):
        """
        Generate simulation data.
        """
        N = self._causal_matrix.shape[0]

        mu = np.random.uniform(*self._mu_range, N)

        alpha = np.random.uniform(*self._alpha_range, [N, N])
        alpha = alpha * self._causal_matrix
        alpha = np.ones([max_hop+1, N, N]) * alpha

        immigrant_events = dict()
        for node in self._topo.nodes:
            immigrant_events[node] = self._trigger_events(mu, 0, T, beta)

        base_events = immigrant_events.copy()
        events = immigrant_events.copy()
        while sum(map(len, base_events.values())) != 0:
            offspring_events = dict()
            for node in tqdm(self._topo.nodes):
                offspring_events[node] = []
                for k in range(max_hop+1):
                    k_base_events = []
                    for neighbor in self._get_k_hop_neighbors(
                            self._topo, node, k):
                        k_base_events += base_events[neighbor]
                    k_new_events = [self._trigger_events(
                        alpha[k, i], start_time, duration, beta)
                        for (i, start_time, duration) in k_base_events]
                    for event_group in k_new_events:
                        offspring_events[node] += event_group
                events[node] += offspring_events[node]
            base_events = offspring_events

        Xn_list = []
        for node, event_group in events.items():
            Xn = pd.DataFrame(event_group,
                              columns=['event', 'timestamp', 'duration'])
            Xn.insert(0, 'node', node)
            Xn_list.append(Xn.reindex(columns=['event', 'timestamp', 'node']))
        X = pd.concat(Xn_list, sort=False, ignore_index=True)
        return X

    @staticmethod
    def _trigger_events(intensity_vec, start_time, duration, beta):

        events = []
        for i, intensity in enumerate(intensity_vec):
            if intensity:
                trigger_time = start_time
                while True:
                    trigger_time = round(trigger_time + np.random.exponential(
                        1 / intensity))
                    if trigger_time > start_time + duration:
                        break
                    sub_duration = (np.max((0, np.random.exponential(beta)))).round()
                    events.append((i, trigger_time, sub_duration))
        return events

    @staticmethod
    def _get_k_hop_neighbors(G, node, k):
        if k == 0:
            return {node}
        else:
            return (set(nx.single_source_dijkstra_path_length(G, node, k).keys())
                    - set(nx.single_source_dijkstra_path_length(
                        G, node, k - 1).keys()))


##__Step 3: Test Generated Time Series__

In [None]:
if __name__ == "__main__":
    ############################################################################################################
    #########################################  Test ###########################
    ############################################################################################################

    method = 'linear'
    sem_type = 'gauss'
    num_nodes = 9
    num_edges = 20
    num_datasets = 120
    T=500
    # Weighted adjacency matrix for the target causal graph
    '''
    true_graph_matrix = DAG.erdos_renyi(n_nodes=10, n_edges=10)
    topology_matrix = Topology.erdos_renyi(n_nodes=20, n_edges=20)
    simulator = THPSimulation(true_graph_matrix, topology_matrix,
                                  mu_range=(0.00005, 0.0001),
                                  alpha_range=(0.005, 0.007))
    _data = simulator.simulate(T=25000, max_hop=2)'''

    # replaced 'to_numpy_matrix' with 'to_numpy_array'
    weighted_random_dag = DAG.erdos_renyi(n_nodes=num_nodes, n_edges=num_edges, seed=1)
    # _simulate_linear_sem(W =weighted_random_dag, n = num_datasets, sem_type = 'gauss', noise_scale=1.0)
    dataset = Generate_SyntheticData(W=weighted_random_dag, n=num_datasets, T=T, method=method, sem_type=sem_type)
    true_dag, data = dataset.B, dataset.XX
    file_name = method.capitalize()+sem_type.capitalize()+'_'+str(num_nodes)+'_'+str(num_edges)+'_TS'
    np.savez(file_name+'.npz', x=dataset.XX, y=dataset.B)
    print('INFO: Check for '+file_name + '!')

INFO: Check for LinearGauss_9_20_TS!


In [None]:
if __name__ == "__main__":
    ############################################################################################################
    ############### Method， SEM Type, Numbers of nodes, Numbers of edges, Numbers of datasets and T ###########
    ############################################################################################################

    sem_type = {
        'nonlinear': ['gp-add','mlp', 'mim', 'gp', 'quadratic'],
        'linear':  ['gauss', 'exp', 'gumbel', 'uniform', 'logistic']
    }

    # Creat DataFrame
    sem_type = pd.DataFrame(sem_type)

    T=500
    count = 0
    nodenum = range(6,10,3)
    edgenum = range(10,16,5)
    for m in ['nonlinear', 'linear']:
      for s in sem_type[m]:
        for n in nodenum:
          for e in edgenum:
            count = count +1
            weighted_random_dag = DAG.erdos_renyi(n_nodes=n, n_edges=e, weight_range=(0.5, 2.0), seed=1)
            dataset = Generate_SyntheticData(W=weighted_random_dag, n=120, T=T, method=m, sem_type=s)
            true_dag, XX = dataset.B, dataset.XX
            file_name = m.capitalize()+s.capitalize()+'_'+str(n)+'Nodes_'+str(e)+'Edges_TS'
            Save_path = "/content/drive/MyDrive/Colab Notebooks/NCPOP/Causal_Models_Learning/Test/Datasets/Synthetic datasets/Generate_SyntheticData/"
            np.savez(Save_path+file_name+'.npz', x=XX , y=true_dag)
            print('INFO: Check for '+file_name + '!')
    print('In total, '+str(count) + ' datasets are generated!')

INFO: Check for NonlinearGp-add_6Nodes_10Edges_TS!
INFO: Check for NonlinearGp-add_6Nodes_15Edges_TS!
INFO: Check for NonlinearGp-add_9Nodes_10Edges_TS!
INFO: Check for NonlinearGp-add_9Nodes_15Edges_TS!
INFO: Check for NonlinearMlp_6Nodes_10Edges_TS!
INFO: Check for NonlinearMlp_6Nodes_15Edges_TS!
INFO: Check for NonlinearMlp_9Nodes_10Edges_TS!
INFO: Check for NonlinearMlp_9Nodes_15Edges_TS!
INFO: Check for NonlinearMim_6Nodes_10Edges_TS!
INFO: Check for NonlinearMim_6Nodes_15Edges_TS!
INFO: Check for NonlinearMim_9Nodes_10Edges_TS!
INFO: Check for NonlinearMim_9Nodes_15Edges_TS!
INFO: Check for NonlinearGp_6Nodes_10Edges_TS!
INFO: Check for NonlinearGp_6Nodes_15Edges_TS!
INFO: Check for NonlinearGp_9Nodes_10Edges_TS!
INFO: Check for NonlinearGp_9Nodes_15Edges_TS!
INFO: Check for NonlinearQuadratic_6Nodes_10Edges_TS!
INFO: Check for NonlinearQuadratic_6Nodes_15Edges_TS!
INFO: Check for NonlinearQuadratic_9Nodes_10Edges_TS!
INFO: Check for NonlinearQuadratic_9Nodes_15Edges_TS!
INFO: Ch

# Backup

In [135]:
File_PATH = "Test/Datasets/Real_data/Telephone/"
file_name = 'Telephone'

File_PATH = "Test/Datasets/Real_data/Microwave/"
file_name = '25V_474N_Microwave'

Krebs_Cycle
File_PATH = "Test/Datasets/Synthetic_data/Kreb_Cycles/"
file_name = 'Krebs_Cycle'

dt = Real_Data_Standardization(File_PATH, file_name)
dt.Produce_Rawdata()

NameError: name 'Krebs_Cycle' is not defined

In [None]:
weighted_random_dag = DAG.erdos_renyi(n_nodes=6, n_edges=15, weight_range=(0.5, 2.0), seed=1)
print(weighted_random_dag)
pd.DataFrame(weighted_random_dag).to_csv('weighted_random_dag_6_15.csv',index=False)

[[-0.         -0.         -1.50461906 -0.         -1.76946638 -0.        ]
 [ 1.28682224 -0.          0.84436582 -1.30162086  1.87094304  1.18580721]
 [-0.          0.          0.          0.          1.70413626 -0.        ]
 [-0.          0.          1.74372036 -0.          0.90957496  0.        ]
 [-0.         -0.          0.         -0.          0.          0.        ]
 [-0.7131802   0.         -1.11880826  0.          1.43604498 -0.        ]]


## __Generate IID Datasets__

In [None]:
from castle.datasets import DAG, IIDSimulation

sem_type = {
    'nonlinear': ['gp-add','mlp', 'mim', 'gp', 'quadratic'],
    'linear':  ['gauss', 'exp', 'gumbel', 'uniform', 'logistic']
}

# creat DataFrame
sem_type = pd.DataFrame(sem_type)

DataSize = range(5,45,5)
nodenum = range(3,18,3)
edgenum = range(5,20,5)
for m in ['nonlinear','linear']:
  for s in sem_type[m]:
    for n in nodenum:
      for e in edgenum:
        weighted_random_dag = DAG.erdos_renyi(n_nodes=n, n_edges=e, weight_range=(0.5, 2.0), seed=1)
        dataset = IIDSimulation(W=weighted_random_dag, n=100, method=m, sem_type=s)
        true_dag, X = dataset.B, dataset.X
        # save numpy to npz file
        Save_path = "/content/drive/MyDrive/Colab Notebooks/NCPOP/Causal_Models_Learning/Test/Datasets/Synthetic datasets/Generate_SyntheticData/"
        sname = Save_path+ m+s + '_'+str(n)+'_'+str(e)
        np.savez(sname+'.npz', x=X , y=true_dag)
        print('INFO: Check for '+sname + '!')

(16, 120, 500) (16, 16)


##__Test_GenerateIID__

In [None]:
from BuiltinDataSet import*
from pickle import TRUE
import numpy as np
import pandas as pd
import networkx as nx
import logging
import tarfile
import os
import re
weighted_random_dag = DAG.erdos_renyi(n_nodes=6, n_edges=15, seed=1)
dataset = IIDSimulation(W=weighted_random_dag, n=500, method='linear', sem_type='gauss')
true_dag, data = dataset.B, dataset.X
np.savez('./Test_Causality_Datasets/Synthetic datasets/linearGauss_6_15.npz', x=true_dag, y=data)


##__Generate LDS Timesets*__

In [None]:
from inputlds import*
import numpy as np

class DataGenerate(object):
    """Generator based on NCPOP Regressor

    References
    ----------
    Kozdoba, Mark and Marecek, Jakub and Tchrakian, Tigran and Mannor, Shie,
    "On-line learning of linear dynamical systems: Exponential forgetting in kalman filters",
    In Proceedings of the AAAI Conference on Artificial Intelligence, 2019

    Zhou, Quan and Marecek, Jakub,
    "Proper Learning of Linear Dynamical Systems as a Non-Commutative Polynomial Optimisation Problem",
    arXiv, 2020

    Examples
    --------
    """

    def __init__(self, **kwargs):
        super(DataGenerate, self).__init__()

    def data_generation(self, g, f_dash, proc_noise_std, obs_noise_std, T):
        '''
        Generate the T*len(f_dash) time series data from Linear dynamical system with proc_noise and obs_noise

        Parameters
        ----------
        g: Hidden state parameter
        f_dash: Observation state parameter
        proc_noise_std: Hidden state noise
        obs_noise_std: Observation state noise
        T: Time

        Returns
        -------
        list: T*len(f_dash) list

        Examples
        --------
        >>> from inputlds import*
        >>> import numpy as np
        >>> T=10
        >>> g = np.matrix([[0.8,0,0],[0,0.9,0],[0,0,0.1]])
        >>> f_dash = np.matrix([[1.0,0.5,0.3],[0.1,0.1,0.1]])
        >>> proc_noise_std=0.01
        >>> obs_noise_std=0.01
        >>> ANM_NCPOP_DataGenerate().data_generation(g,f_dash,proc_noise_std,obs_noise_std,T)

        '''

        n=len(g)
        m=len(f_dash)
        ds1 = dynamical_system(g,np.zeros((n,m)),f_dash,np.zeros((m,m)),
                process_noise='gaussian',
                observation_noise='gaussian',
                process_noise_std=proc_noise_std,
                observation_noise_std=obs_noise_std)
        inputs = np.zeros((m,T))
        h0=np.ones(ds1.d) # initial state
        ds1.solve(h0=h0, inputs=inputs, T=T)
        return np.asarray(ds1.outputs).reshape(T,m).tolist()


    def data_generation_dim(self, m, n, proc_noise_std,obs_noise_std,T):
        '''
        Generate the T*m time series data from Linear dynamical system with proc_noise and obs_noise

        Parameters
        ----------
        n: Hidden state dimension
        m: Observation state dimension
        proc_noise_std: Hidden state noise
        obs_noise_std: Observation state noise
        T: Time

        Returns
        -------
        list:  T*m list

        Examples
        --------
        >>> from inputlds import*
        >>> import numpy as np
        >>> n=3
        >>> m=2
        >>> T=20
        >>> proc_noise_std=0.01
        >>> obs_noise_std=0.01
        >>> ANM_NCPOP_DataGenerate().data_generation(m, n, proc_noise_std, obs_noise_std, T)
        '''

        g = np.random.randint(0, 2, (n,n))
        f_dash = np.random.randint(0, 2, (m,n))
        ds1 = dynamical_system(g,np.zeros((n,m)),f_dash,np.zeros((m,m)),
                process_noise='gaussian',
                observation_noise='gaussian',
                process_noise_std=proc_noise_std,
                observation_noise_std=obs_noise_std)
        inputs = np.zeros((m,T))
        h0=np.ones(ds1.d) # initial state
        ds1.solve(h0=h0, inputs=inputs, T=T)
        return np.asarray(ds1.outputs).reshape(T,m).tolist()



## Test Data Generation class

In [None]:
from inputlds import*
import numpy as np
T=10
g = np.matrix([[0.8,0,0],[0,0.9,0],[0,0,0.1]])
f_dash = np.matrix([[1.0,0.5,0.3],[0.1,0.1,0.1]])
proc_noise_std=0.01
obs_noise_std=0.01
my_array = DataGenerate().data_generation(g,f_dash,proc_noise_std,obs_noise_std,T)
my_array
df = pd.DataFrame(my_array)
df.to_csv('lds_data.csv', index=False, header=['col1','col2'])

[[1.2747847483078838, 0.19594359308281661],
 [1.0454801175607462, 0.14571383137678165],
 [0.859624443447321, 0.11136022132381274],
 [0.7522937590854241, 0.10262556381907625],
 [0.6384014976018458, 0.10143255711088069],
 [0.4983139644085479, 0.0856117047216572],
 [0.4550613076727763, 0.0647366171990469],
 [0.3835754692208594, 0.04253721891070902],
 [0.33822278060577826, 0.044561053378408044],
 [0.2729466435786535, 0.048692423439118775]]

In [None]:
from inputlds import*
import numpy as np
n=3
m=2
T=10
proc_noise_std=0.01
obs_noise_std=0.01
DataGenerate().data_generation_dim(m,n,proc_noise_std,obs_noise_std,T)

[[0.0014143954478208506, 3.00304475804575],
 [-0.00860099022621434, 3.966124614146327],
 [-0.007635469446289098, 4.934599419454813],
 [0.01215965486252258, 5.9442183250672675],
 [0.010714011287707595, 6.904107192953694],
 [-0.007857632146403469, 7.87991343574115],
 [0.018804761747808166, 8.92670669775614],
 [-0.0023952497348262093, 9.929117447988316],
 [0.009015414863869095, 10.938080588526418],
 [0.009820536471639578, 11.921019820155285]]