#ANMNCPOP_Real_Data_Standardization
Welcome to the example for real data standardization in causal discovery!

For further analysis, all useful information is extracted and saved as an .npz file.

# Input Data Form:
The data can be in any format supported by the Real_Data_Standardization class, currently npz, tar.gz, csv and tsv.

**Two Dimensions Causality Data:**
* **npz file**

The causality data are stored as NumPy arrays x and y in an npz file.

* **tar.gz file**

Archiving and compressing causality files and folders as a tar.gz file.

* **csv files**

The raw data and the causal matrix are saved as separate csv files.

**Multiple Features Time Series:**
* **tsv files**

Each .tsv file holds a single sample trajectory of a multi-feature time series with shape (F features, T timesets); S samples are stored as S separate .tsv files.

# Standardization process:
* Causality data are stored as NumPy arrays x (raw data) and y (causal matrix) in an npz file.
* For multi-feature time series data, all time series are saved as a three-dimensional (Feature_num, Sample_num, Time) array so that ANM-NCPOP can be applied.

# Output Data Form
* **Raw_data:**

An array that stores the time series for F features, S samples and T timesets.

* **True_dag:**

A causal matrix is saved as shape of (F features, F features)

The underlying causal relationships between observations, taken from expert knowledge or from the ground-truth causality.

# Example
In our example, the multi-feature time series are saved in Krebs_Cycle.npz as a causal matrix and an array, separately.

* __x__: an array of shape (F, S, T), where the number of rows F is features_num, the number of columns S is samples_num and the depth T is timesets.
* __y__: is a nonsymmetric square matrix.


#__Step 1: Get started__


* mount drive
* set environment
* install packages

In [None]:
# Mount Google Drive so the dataset folder is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os
# Make the Krebs_Cycle dataset folder the working directory, so that relative
# file names used later in the notebook resolve inside it.
os.chdir("/content/drive/MyDrive/Colab Notebooks/NCPOP-Colab Notebooks/Test_Causality_Datasets/Real_data/Krebs_Cycle/")
# os.chdir("/content/drive/MyDrive/Colab Notebooks/NCPOP-Colab Notebooks/Test_Causality_Datasets/Real_data/Krebs_Cycle/Details_Krebs_Cycle/MetricsDAG/")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#__Step 2: Standardized Data class__

In [None]:
# from Data_Standardization import*
from pickle import TRUE
import os
import re
import numpy as np
import pandas as pd
import tarfile
from itertools import combinations

class Real_Data_Standardization(object):
    '''
    Standardize a real causal-discovery dataset into ``<Data_NAME>.npz``.

    The archive stores the raw observations under key ``x`` and the causal
    (adjacency) matrix under key ``y``.

    Parameters
    ----------
    File_PATH: str
        Directory holding the dataset; the ``.npz`` file is written there.
    Data_NAME: str
        Base name of the dataset, i.e. the file name without its extension.

    Attributes
    ----------
    Read_File: list of str
        Non-directory entries of ``File_PATH`` (set by ``Produce_Rawdata``).
    TS_path: str
        ``<File_PATH>/<Data_NAME>_TS/``, the folder searched for time-series
        samples (set by ``Produce_Rawdata``).
    Raw_data, true_dag: numpy.ndarray or None
        Arrays loaded by the last ``Produce_Rawdata`` call; ``None`` until a
        supported dataset has been found.
    '''

    def __init__(self, File_PATH, Data_NAME):
        self.File_PATH = File_PATH
        self.Data_NAME = Data_NAME
        self.Raw_data = None
        self.true_dag = None

    def _path(self, name):
        '''Return ``name`` joined onto ``File_PATH``.'''
        # os.path.join gives the same result as plain concatenation when
        # File_PATH ends with a separator, and still works when it does not.
        return os.path.join(self.File_PATH, name)

    def _save_npz(self, Raw_data, true_dag):
        '''Write the pair to ``<Data_NAME>.npz`` under keys ``x`` and ``y``.'''
        np.savez(self._path(self.Data_NAME + '.npz'),
                 x=np.asarray(Raw_data), y=np.asarray(true_dag))

    def _load_npz(self):
        '''Read ``x`` and ``y`` from an existing ``<Data_NAME>.npz``.'''
        # NOTE(review): allow_pickle=True can execute arbitrary code when the
        # archive holds pickled objects; only load files from a trusted source.
        with np.load(self._path(self.Data_NAME + '.npz'), allow_pickle=True) as Tests_data:
            return Tests_data['x'], Tests_data['y']

    def _load_tar(self):
        '''Extract ``<Data_NAME>.tar.gz`` and read its data and graph members.'''
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(self._path(self.Data_NAME + '.tar.gz')) as archive:
            file_names = archive.getnames()
            print(file_names)
            # NOTE(review): extractall() trusts member paths; only unpack
            # archives from a trusted source.
            archive.extractall(self.File_PATH)

        # Assumes member 1 is the csv with the raw data and member 2 the
        # saved numpy causal matrix (member 0 presumably being the top-level
        # folder) -- TODO confirm against the archives actually used.
        Raw_data = pd.read_csv(self._path(file_names[1]))
        true_dag = np.load(self._path(file_names[2]))
        return Raw_data, true_dag

    def _load_csv(self):
        '''Read ``<Data_NAME>.csv`` and the accompanying ``true_graph.csv``.'''
        Raw_data = pd.read_csv(self._path(self.Data_NAME + '.csv'), header=0, index_col=0)
        true_dag = pd.read_csv(self._path('true_graph.csv'), header=0, index_col=0)
        return Raw_data, true_dag

    def _load_time_series(self):
        '''
        Stack the ``.tsv`` samples under ``TS_path`` into one 3-D array.

        Returns
        -------
        tuple or None
            ``(Raw_data, matrix)`` where ``Raw_data`` has shape
            (features, samples, time) and ``matrix`` is a
            (features, features) array with ones on the first superdiagonal;
            ``None`` when there is no sample to read.

        Raises
        ------
        ValueError
            If a sample has fewer features or a different number of time
            points than the first sample.
        '''
        # Sorted so that the sample order does not depend on the file system.
        read_Dir_TS = sorted(os.listdir(self.TS_path))
        Read_Timeseries = pd.read_csv(self._path('series_list.csv'))
        if len(Read_Timeseries) >= len(read_Dir_TS):
            print('INFO: Start Analyzing '+ self.Data_NAME + ' Time Series File!')
            TS_List = read_Dir_TS
        else:
            print('INFO: Start Analyzing '+ self.Data_NAME + ' Time Series List!')
            TS_List = Read_Timeseries['Series_num'].tolist()

        if not TS_List:
            print('INFO: No Time Series Found for ' + self.Data_NAME + '!')
            return None

        # Each file is one sample: rows are features (first column is the
        # feature name), remaining columns are the time points.
        samples = [
            pd.read_csv(os.path.join(self.TS_path, name),
                        delimiter='\t', index_col=0, header=None).to_numpy()
            for name in TS_List
        ]
        d, T = samples[0].shape
        Raw_data = np.zeros((d, len(samples), T))
        for ns, (name, values) in enumerate(zip(TS_List, samples)):
            if values.shape[0] < d or values.shape[1] != T:
                raise ValueError(
                    'Time series %s has shape %s, expected at least %d features and %d time points'
                    % (name, values.shape, d, T))
            # Only the first d features are kept, as for the first sample.
            Raw_data[:, ns, :] = values[:d]

        # Ones on the first superdiagonal: entry (i, i+1) is set for every i.
        matrix = np.eye(d, k=1)
        return Raw_data, matrix

    def Produce_Rawdata(self):
        '''
        Locate the dataset in ``File_PATH`` and standardize it.

        The first matching layout is used, in this order:
        ``<Data_NAME>.npz``, ``<Data_NAME>.tar.gz``, ``<Data_NAME>.csv``
        (with ``true_graph.csv``), then the folder ``<Data_NAME>_TS/``
        (with ``series_list.csv``). Every layout except an already existing
        ``.npz`` is written to ``<Data_NAME>.npz``. The loaded arrays are
        also kept in ``self.Raw_data`` and ``self.true_dag``.

        Raises
        ------
        FileNotFoundError
            If ``File_PATH`` (or a required companion file) does not exist.
        '''
        entries = os.listdir(self.File_PATH)
        # Sub-directories are not candidate data files.
        self.Read_File = [
            name for name in entries
            if not os.path.isdir(self._path(name))
        ]
        self.TS_path = self._path(self.Data_NAME + '_TS/')

        # Check the directory content (not the path string) for emptiness.
        if not entries:
            print('INFO: No Data Under the Current Route!')
            return

        ###################################### Deal with Two Dimensions Causality Data ###################################
        if self.Data_NAME + '.npz' in self.Read_File:
            Raw_data, true_dag = self._load_npz()

        elif self.Data_NAME + '.tar.gz' in self.Read_File:
            Raw_data, true_dag = self._load_tar()
            self._save_npz(Raw_data, true_dag)

        elif self.Data_NAME + '.csv' in self.Read_File:
            Raw_data, true_dag = self._load_csv()
            self._save_npz(Raw_data, true_dag)

        ################################ Deal with Multi-dimensions Causality Data ###################################
        elif os.path.exists(self.TS_path):
            loaded = self._load_time_series()
            if loaded is None:
                return
            Raw_data, true_dag = loaded
            self._save_npz(Raw_data, true_dag)

        else:
            print('INFO: Wrong DataType!')
            return

        self.Raw_data = np.asarray(Raw_data)
        self.true_dag = np.asarray(true_dag)
        print('INFO: Check for '+self.Data_NAME +'.npz'+ '!')


#__Step 3: Test__

In [None]:
# Driver cell: standardize the Krebs_Cycle dataset stored on Google Drive.
if __name__ == "__main__":
    ############################################################################################################
    ############################################ SETTING File_PATH and file_name ###############################
    ############################################################################################################

    # Folder that holds the dataset; the trailing slash is part of the path.
    File_PATH = "/content/drive/MyDrive/Colab Notebooks/NCPOP-Colab Notebooks/Test_Causality_Datasets/Real_data/Krebs_Cycle/"
    # Base name of the dataset, without extension.
    file_name = 'Krebs_Cycle'
    dt = Real_Data_Standardization(File_PATH, file_name)
    dt.Produce_Rawdata()

TypeError: ANMPOP_ReadData.Produce_Rawdata() missing 1 required positional argument: 'self'

In [None]:
# Test  Two Dimensions Causality Data
if __name__ == "__main__":
    ############################################################################################################
    ############################################ SETTING File_PATH and file_name ###############################
    ############################################################################################################

    # Relative to the current working directory.
    File_PATH = "./Test_Datasets/Real_data/"
    file_name = 'linearGauss_6_15'
    # The class defined in this notebook is Real_Data_Standardization; the
    # former name ANMPOP_ReadData is not defined here and raised NameError.
    dt = Real_Data_Standardization(File_PATH, file_name)
    dt.Produce_Rawdata()

# Test Three Dimensions Causality Time Series Data
if __name__ == "__main__":
    ############################################################################################################
    ############################################ SETTING File_PATH and file_name ###############################
    ############################################################################################################

    # Relative to the current working directory.
    File_PATH = "./Test_Datasets/Real_data/"
    file_name = 'Krebs_Cycle'
    # The class defined in this notebook is Real_Data_Standardization; the
    # former name ANMPOP_ReadData is not defined here and raised NameError.
    dt = Real_Data_Standardization(File_PATH, file_name)
    dt.Produce_Rawdata()

# Backup

##Debug class

In [None]:
import pandas as pd
import numpy as np
import tarfile
import os
import re
import sys
from itertools import combinations
from pickle import TRUE

def readable_File(path):
    """Split the entries of ``path`` into sub-directories and plain files.

    Parameters
    ----------
    path: str
        Directory to scan (not recursive).

    Returns
    -------
    tuple
        ``(count, files)``: the number of sub-directories and the list of
        names of all other entries, in ``os.listdir`` order.
    """
    count = 0
    files = []
    for f in os.listdir(path):
        if os.path.isdir(os.path.join(path, f)):
            count += 1
        else:
            files.append(f)
    return count, files


# The helper is defined before its first use so the cell also runs on a fresh
# kernel (it was previously called above its own definition).
File_PATH = "/content/drive/MyDrive/Colab Notebooks/NCPOP-Colab Notebooks/Test_Causality_Datasets/Real_data/Krebs_Cycle/"
os.chdir(File_PATH)
read_Dir = os.listdir(File_PATH)
Data_NAME = 'Krebs_Cycle'
# Scan once and reuse both results instead of listing the directory twice.
dir_count, Readable_File = readable_File(File_PATH)
num_readable_File = len(read_Dir) - dir_count
TS_path = File_PATH+Data_NAME+'_TS/'

read_Dir_TS = os.listdir(TS_path)
Timeseries_List_path = File_PATH+'series_list.csv'
Read_Timeseries = pd.read_csv(Timeseries_List_path)
# print(len(Read_Timeseries), len(read_Dir_TS))
# Use the files found in the folder when series_list.csv has at least as many
# rows as the folder has entries; otherwise use the names from the list.
if len(Read_Timeseries) >= len(read_Dir_TS):
    print('INFO: Start Analyzing '+ Data_NAME + ' Time Series File!')
    TS_List = read_Dir_TS
else:
    print('INFO: Start Analyzing '+ Data_NAME + ' Time Series List!')
    TS_List = Read_Timeseries['Series_num']


INFO: Start Analyzing Krebs_Cycle Time Series List!


##Tested

In [None]:
from pickle import TRUE
import os
import re
import numpy as np
import pandas as pd
import tarfile
from itertools import combinations

# Dataset folder on Google Drive; also made the working directory so that
# relative file names resolve inside it.
File_PATH = "/content/drive/MyDrive/Colab Notebooks/NCPOP-Colab Notebooks/Test_Causality_Datasets/Real_data/Krebs_Cycle/"
os.chdir(File_PATH)
# os.chdir("/content/drive/MyDrive/Colab Notebooks/NCPOP-Colab Notebooks/Test_Causality_Datasets/Real_data/Krebs_Cycle/")

# self.File_PATH
# All entries (files and folders) of the dataset folder.
read_Dir=os.listdir(File_PATH)
# Base name of the dataset examined by the following cells.
Data_NAME = 'real_dataset_processed'
print(File_PATH)

def readable_File(path):
    """Separate the entries of ``path`` into sub-directories and other files.

    Parameters
    ----------
    path: str
        Directory to scan (not recursive).

    Returns
    -------
    tuple
        ``(count, files)`` where ``count`` is the number of sub-directories
        and ``files`` lists the names of all remaining entries, in the order
        returned by ``os.listdir``.
    """
    entries = os.listdir(path)
    # One flag per entry: True when the entry is a directory.
    dir_flags = [os.path.isdir(os.path.join(path, name)) for name in entries]
    plain_files = [name for name, is_dir in zip(entries, dir_flags) if not is_dir]
    return sum(dir_flags), plain_files

# Last expression of the cell: the notebook echoes the list of non-directory
# entries found in File_PATH.
readable_File(File_PATH)[1]


/content/drive/MyDrive/Colab Notebooks/NCPOP-Colab Notebooks/Test_Causality_Datasets/Real_data/Krebs_Cycle/


['series_list.csv',
 'real_dataset_processed.csv',
 'true_graph.csv',
 'stock-market.txt',
 'linearGauss_6_15.npz',
 '18V_55N_Wireless.tar.gz']

In [None]:
# File_NAME = data\\
# Scratch version of the loader: pick the layout that matches Data_NAME inside
# File_PATH and load the raw data and the causal matrix from it.
# Scan the folder once and reuse both results.
dir_count, Readable_File = readable_File(File_PATH)
num_readable_File = len(read_Dir) - dir_count
TS_path = File_PATH+Data_NAME+'_TS/'
if len(read_Dir) == 0:
    print('INFO: No Data Under the Current Route!')
else:
    File_NAME = []
    File_TYPE = []
    for i in Readable_File:
        # Same split as re.split on a literal dot, but a name without any
        # extension yields an empty type instead of raising IndexError.
        parts = i.split('.')
        File_NAME.append(parts[0])
        File_TYPE.append(parts[1] if len(parts) > 1 else '')

    # Deal with Two Dimensions Causality Data
    if Data_NAME+'.npz' in Readable_File:
        # Built from File_PATH like every other branch, so the load does not
        # depend on the current working directory.
        # NOTE(review): allow_pickle=True is only safe for trusted files.
        Tests_data = np.load(File_PATH+Data_NAME+'.npz', allow_pickle=True)
        Raw_data = Tests_data['x']
        # print(Raw_data)
        true_dag = Tests_data['y']
        # print(true_dag)

    elif Data_NAME+'.tar.gz' in Readable_File:
        # open file; the context manager closes it even if extraction fails
        with tarfile.open(File_PATH+Data_NAME+'.tar.gz') as file:
            # print file names
            file_names = file.getnames()
            # extract files
            # NOTE(review): extractall() trusts member paths; only unpack
            # archives from a trusted source.
            file.extractall(File_PATH)

        # x = np.load(File_PATH+file_names[1])
        # Assumes member 1 is the csv data and member 2 the saved causal
        # matrix -- TODO confirm against the archives actually used.
        Raw_data = pd.read_csv(File_PATH+file_names[1])
        # print(Raw_data)
        true_dag = np.load(File_PATH+file_names[2])
        # print(true_dag)

        # Save the two numpy arrays into an npz file
        # np.savez(Data_NAME+'.npz', x=Raw_data , y=true_dag)
        #
        # Load the data back from the npz file
        # data = np.load(Data_NAME+'.npz')
        # Raw_data = Tests_data['x']
        # true_dag = Tests_data['y']

    elif Data_NAME+'.csv' in Readable_File:
        Raw_data = pd.read_csv(File_PATH+ Data_NAME+'.csv', header=0, index_col=0)
        print(Raw_data)
        true_dag = pd.read_csv(File_PATH+'true_graph.csv', header=0, index_col=0)
        print(true_dag)
############################################################
    # Deal with Multi-dimensions Data
    elif os.path.exists(TS_path):
        read_Dir_TS = os.listdir(TS_path)
        Timeseries_List_path = File_PATH+'series_list.csv'
        Read_Timeseries = pd.read_csv(Timeseries_List_path)
        # print(len(Read_Timeseries), len(read_Dir_TS))
        if len(Read_Timeseries) >= len(read_Dir_TS):
            print('INFO: Start Analyzing '+ Data_NAME + ' Time Series File!')
            TS_List = read_Dir_TS
        else:
            print('INFO: Start Analyzing '+ Data_NAME + ' Time Series List!')
            TS_List = Read_Timeseries['Series_num']

        # for i, j in combinations(range(len(TS_List)), 2):
        # Debug shortcut: only the pair (1, 2) is read instead of every pair.
        # NOTE(review): the files are read as comma-separated with a header
        # row here -- confirm this matches the .tsv samples.
        [i, j] = [1,2]
        x = pd.read_csv(TS_path+TS_List[i], header=0, index_col=0)
        y = pd.read_csv(TS_path+TS_List[j], header=0, index_col=0)
        print(TS_List[i], TS_List[j])
    else:
        print('INFO: Wrong DataType!')

     A_1  A_2  A_3  A_4  A_5  A_6  A_7  A_8  A_9  A_10  ...  A_47  A_48  A_49  \
A_0                                                     ...                     
0      1    0    0    0    0    0    4    0    1     0  ...     0     0     0   
0      0    0    0    0    0    0    4    0    0     0  ...     3     0     0   
0      0    0    0    0    0    3    2    0    1     0  ...     3     2     0   
0      2    0    0    0    0    0    4    0    3     0  ...     0     2     0   
0      4    0    0    0    0    0   11    0    4     0  ...     0     2     0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...   ...   ...   ...   
0      0    0    0    0    0    0    0    0    0     0  ...     3     0     0   
0      1    0    0    0    0    0    5    0    4     0  ...     0     3     0   
0      3    0    0    0    0    0    4    0    1     0  ...     0     2     0   
0      1    0    0    0    0    0    1    0    1     0  ...     0     1     0   
0      0    0    0    0    0