# Create looti emulators for observables dependent on z & k

## Example 1: linear power spectrum $P_{lin}$

## 1) Create a pandas dataframe from data files

Load looti module read_files

In [None]:
from looti import read_files as rf

The path to the data files and other specifications are passed to looti via a yaml file.
The yaml file should contain the following information:
- main_dir: the directory in which all relevant folders and files are stored
- config_file: this file should include a section in which the varying parameters are specified

- folders_path: path to the directory containing all the data folders, one for each CLASS/CAMB run
- params_file: each data folder should contain a file specifying the values of each parameter for this run
- reference folder: name of the folder containing a run with fiducial values used as reference

- z_file_name: file containing the redshift grid
- k_file_name: file containing the k grid
- data_file_name: file containing the data to be emulated
- data_type: name of observable to be emulated

Create an instance of "FrameConstructor" - the looti class used for data management.
FrameConstructor takes the path to this yaml file as input.

In [None]:
# Path of the yaml file describing where the P_lin data files live.
plin_config_yaml = '../readfile_configs/input4cast_lhs_Plin.yaml'
# NOTE: the name FrameConstructor is rebound to the instance here; later cells
# rely on this name, so it is kept as is.
FrameConstructor = rf.FrameConstructor(path_config_file=plin_config_yaml)

Create a dataframe containing all training and test data

In [None]:
# Build the dataframe from the files referenced in the yaml configuration
# (per the notebook text it holds all training and test data).
dataframe_ext = FrameConstructor.create_k_dataframe()
# Bare expression: shown as the cell output when run in a notebook.
dataframe_ext

Save the dataframe

In [None]:
# dataframe_ext.to_csv("../data/rwth_output/mg_camb_lhs_log10fR0/Plin.csv")

Create a dataframe containing reference at fiducial values for the varying parameters at each redshift.

In [None]:
# Build the reference dataframe (per the notebook text: the run at fiducial
# parameter values, at each redshift).
dataframe_ref = FrameConstructor.create_k_reference_dataframe()
# Bare expression: shown as the cell output when run in a notebook.
dataframe_ref

Save the reference dataframe

In [None]:
# dataframe_ref.to_csv("../data/rwth_output/mg_camb_lhs_log10fR0/Plin_ref.csv")

In [None]:
import pandas as pd
# Re-load the saved P_lin dataframe; the first 14 columns form the MultiIndex.
plin_csv = '../data/rwth_output/mg_camb_lhs_log10fR0/Plin.csv'
df_all = pd.read_csv(plin_csv, index_col=list(range(14)))

# Boolean masks built from two levels of the MultiIndex.
is_redshift_zero = df_all.index.get_level_values('redshift') == 0
is_k_grid = df_all.index.get_level_values('data_type') == 'k_grid'

# Keep the redshift-0 rows and the rows labelled as k grid, in that order.
df_z0 = df_all.loc[is_redshift_zero]
df_k = df_all.loc[is_k_grid]
df_fin = pd.concat([df_z0, df_k])
df_fin

In [None]:
# df_fin.to_csv('../data/rwth_output/mg_camb_lhs_log10fR0/Plin_z0_ref.csv')

## 2) Normalize the data and divide it into training, validation and test sets

Load looti module datahandle

In [None]:
from looti import datahandle as dhl

Specify path and names of the pandas dataframes we just created

In [None]:
# Folder holding the csv files of both the full and the reference dataframe.
data_folder = '../data/rwth_output/mg_camb_lhs_log10fR0/'
# File names are given without the '.csv' extension.
datafile_ext, datafile_ref = 'Plin', 'Plin_ref'

Create a DataHandle object and read the csv files containing the data

In [None]:
# Keyword settings handed to DataHandle.
handle_options = dict(
    num_parameters=6,               # number of parameters to be interpolated
    data_type='Plin',               # type of observable to be emulated
    features_name='k_grid',         # name of the grid in the data frames
    features_to_Log=True,           # presumably log-scales the k grid -- TODO confirm
    normalize_by_reference=True,
    normalize_by_mean_std=True,
)
emulation_data = dhl.DataHandle(
    datafile_ext,    # file name of csv file containing input data
    data_folder,     # path to folder containing both external and reference dataframes
    datafile_ref,    # file name of csv file containing reference data
    **handle_options,
)
# Read the csv files named above into the handle.
emulation_data.read_csv_pandas()

Normalize the data (by dividing by the reference and then normalizing by mean and standard deviation)

In [None]:
# Compute the ratios for every redshift stored in emulation_data.z_vals
# (per the notebook text: ratio to the reference, then mean/std normalization).
emulation_data.calculate_ratio_by_redshifts(emulation_data.z_vals)

Specify the desired number of training and test data.

In [None]:
import random
import numpy as np 

# Number of training vectors, not taking the extrema into account.
n_train = 310
# Number of test vectors, not taking the extrema into account.
n_test = 10
# List of lists of test indices: n_test distinct integers drawn from 1..999.
# NOTE(review): the draw is unseeded, so it changes from run to run.
candidate_indices = range(1, 1000)
test_indices = [random.sample(candidate_indices, n_test)]

Split the data into training, validation and test data sets

In [None]:
# Settings for splitting the data into training, validation and test sets.
split_options = {
    'n_train': n_train,                            # Number of training vectors per redshift
    'n_test': n_test,                              # Number of test vectors per redshift
    'verbosity': 3,
    'manual_split': True,
    # NOTE(review): the randomly drawn test_indices are not forwarded here;
    # confirm that None is intended together with manual_split=True.
    'test_indices': None,
    'train_redshift_indices': [0, 1, 2, 3, 4],     # Indices of the redshifts used for the train vectors
    'test_redshift_indices': [0, 1, 2, 3, 4],      # Indices of the redshifts used for the test vectors
}
emulation_data.calculate_data_split(**split_options)

## 3) PCA and GP interpolation

Import the looti module dictlearn, which is used for the PCA transformation and the training of interpolators

In [None]:
from looti import dictlearn as dcl

Choose the number of PCA components and specify the number of varying parameters

In [None]:
# npca: number of PCA components; nparam: length of the GP length-scale vector.
# NOTE(review): nparam is one larger than num_parameters=6 given to DataHandle;
# presumably redshift counts as an extra input dimension -- confirm.
npca, nparam = 9, 7

Now the data is transformed into PCA space and a Gaussian Process (GP) is trained on the PCA components. 
<br>
<br>
Required input:
- emulation_data


Optional input:
- Operator: type of interpolation (default = 'PCA')
    - LIN: interpolation according to a simple spline
    - PCA: interpolation over PCA components
    - DL: interpolation over dictionary components of sparse DL representation
    - GP: directly train a Gaussian Process on the data without further pre-processing
- interp_type: type of interpolator
    - GP: Gaussian Process
    - int1d:

In [10]:
# Keyword settings for the PCA + Gaussian Process emulator.
predict_options = dict(
    Operator="PCA",
    train_noise=1e-10,                         # Noise for the GP's kernel
    gp_n_rsts=200,                             # max times to restart the optimiser
    ncomp=npca,                                # Number of components
    gp_const=1,                                # Constant for the RBF kernel
    gp_length=np.ones(nparam),                 # Length for GP (one entry per input dimension)
    interp_type='GP',                          # Kind of interpolator
    test_indices=test_indices,                 # Indices of test vectors
    interp_dim=1,
    return_interpolator=True,                  # also hand back the trained interpolator object
    pca_norm=True,
    train_redshift_indices=[0, 1, 2, 3, 4],
    test_redshift_indices=[0, 1, 2, 3, 4],
)
# Returns the predicted ratios, the data handle and the interpolator object.
ratios_predicted, emulation_data, intobj = dcl.Predict_ratio(emulation_data, **predict_options)

## 4) Save the trained interpolator

In [None]:
import pickle

In [None]:
# Output folder and file names for the pickled interpolator and data handle.
save_path = '../interpolators/redshift_0/'
file_stem = 'pca9_nrsts200_aniso'
intobj_name = file_stem + '.sav'
data_name = file_stem + '_data.sav'

In [None]:
# Pickle the trained interpolator. The with-block closes the file handle, so
# the data is flushed to disk even if pickling raises part-way through.
with open(save_path+intobj_name, 'wb') as interp_file:
    pickle.dump(intobj, interp_file)

In [None]:
# Pickle the data handle next to the interpolator. The with-block closes the
# file handle, so the data is flushed to disk even if pickling raises.
with open(save_path+data_name, 'wb') as data_file:
    pickle.dump(emulation_data, data_file)

# Now repeat the same procedure for the other observables

## Example 2: Non-linear power spectrum $P_{nonlin}$

In [None]:
# Path of the yaml file describing where the P_nonlin data files live.
pnonlin_config_yaml = '../readfile_configs/input4cast_lhs_Pnonlin.yaml'
FrameConstructor = rf.FrameConstructor(path_config_file=pnonlin_config_yaml)

In [None]:
# Build the P_nonlin dataframe and store it as csv.
pnonlin_csv = "../data/rwth_output/class_Asw0wa_DP/Pnonlin.csv"
dataframe_Pnonlin_ext = FrameConstructor.create_k_dataframe()
dataframe_Pnonlin_ext.to_csv(pnonlin_csv)

In [None]:
# Build the P_nonlin reference dataframe and store it as csv.
pnonlin_ref_csv = "../data/rwth_output/class_Asw0wa_DP/Pnonlin_ref.csv"
dataframe_Pnonlin_ref = FrameConstructor.create_k_reference_dataframe()
dataframe_Pnonlin_ref.to_csv(pnonlin_ref_csv)

In [None]:
# Folder holding the csv files of both the full and the reference dataframe.
data_folder = '../data/rwth_output/class_Asw0wa_DP/'
# File names are given without the '.csv' extension.
datafile_ext, datafile_ref = 'Pnonlin', 'Pnonlin_ref'

In [None]:
# Keyword settings handed to DataHandle.
handle_options = dict(
    num_parameters=7,               # number of parameters to be interpolated
    data_type='Pnonlin',            # type of observable to be emulated
    features_name='k_grid',         # name of the grid in the data frames
    features_to_Log=True,           # presumably log-scales the k grid -- TODO confirm
    normalize_by_reference=True,
    normalize_by_mean_std=True,
)
emulation_data = dhl.DataHandle(
    datafile_ext,    # file name of csv file containing input data
    data_folder,     # path to folder containing both external and reference dataframes
    datafile_ref,    # file name of csv file containing reference data
    **handle_options,
)

# Read the csv files named above into the handle.
emulation_data.read_csv_pandas()

In [None]:
# Compute the ratios for every redshift stored in emulation_data.z_vals.
emulation_data.calculate_ratio_by_redshifts(emulation_data.z_vals)

In [None]:
# Number of training vectors, not taking the extrema into account.
n_train = 100
# Number of test vectors, not taking the extrema into account.
n_test = 2
# List of lists of test indices, one list per split: n_test distinct integers
# drawn from 1..999 (unseeded, so the draw changes from run to run).
candidate_indices = range(1, 1000)
test_indices = [random.sample(candidate_indices, n_test)]

In [None]:
# Settings for splitting the data into training, validation and test sets.
split_options = {
    'n_train': n_train,                            # Number of training vectors per redshift
    'n_test': n_test,                              # Number of test vectors per redshift
    'verbosity': 3,
    'manual_split': True,
    # NOTE(review): the randomly drawn test_indices are not forwarded here;
    # confirm that None is intended together with manual_split=True.
    'test_indices': None,
    'train_redshift_indices': [0, 1, 2, 3, 4],     # Indices of the redshifts used for the train vectors
    'test_redshift_indices': [0, 1, 2, 3, 4],      # Indices of the redshifts used for the test vectors
}
emulation_data.calculate_data_split(**split_options)

In [None]:
# npca: number of PCA components; nparam: number of GP input dimensions.
# NOTE(review): nparam is not used by the Predict_ratio call of this example,
# which passes a scalar gp_length.
npca, nparam = 7, 8

In [None]:
# Keyword settings for the PCA + Gaussian Process emulator.
predict_options = dict(
    Operator="PCA",
    train_noise=1e-10,                         # Noise for the GP's kernel
    gp_n_rsts=100,                             # max times to restart the optimiser
    ncomp=npca,                                # Number of components
    gp_const=1,                                # Constant for the RBF kernel
    # NOTE(review): scalar length here, whereas the other examples pass
    # np.ones(nparam) -- confirm this is intended.
    gp_length=1,                               # Length for GP
    interp_type='GP',                          # Kind of interpolator
    test_indices=test_indices,                 # Indices of test vectors
    interp_dim=1,
    return_interpolator=True,                  # also hand back the trained interpolator object
    pca_norm=True,
    train_redshift_indices=[0, 1, 2, 3, 4],
    test_redshift_indices=[0, 1, 2, 3, 4],
    # disabled in the original cell: min_k =1e-2,ma_k=10e1
)
# Returns the predicted ratios, the data handle and the interpolator object.
ratios_predicted, emulation_data, intobj = dcl.Predict_ratio(emulation_data, **predict_options)

In [None]:
import pickle
# Output folder and file names for the pickled interpolator and data handle.
save_path = '../interpolators/class_Asw0wa_DP/'
save_name = 'Pnonlin.sav'
data_name = 'Pnonlin_data.sav'
# The with-blocks close the file handles, so the pickles are flushed to disk
# even if pickling raises part-way through.
with open(save_path+save_name, 'wb') as interp_file:
    pickle.dump(intobj, interp_file)
# Also store the data handle under data_name, as the other examples do;
# previously data_name was defined here but never used.
with open(save_path+data_name, 'wb') as data_file:
    pickle.dump(emulation_data, data_file)

## Example 3: Growth factor $D$

In [None]:
from looti import datahandle as dhl
from looti import dictlearn as dcl
import numpy as np
import random

In [None]:
# Path of the yaml file describing where the growth-factor data files live.
d_growth_config_yaml = '../readfile_configs/input4cast_lhs_D_Growth.yaml'
FrameConstructor = rf.FrameConstructor(path_config_file=d_growth_config_yaml)

In [None]:
# Build the growth-factor dataframe and store it as csv.
d_growth_csv = "../data/rwth_output/class_Asw0wa_DP/D_Growth.csv"
dataframe_D_Growth_ext = FrameConstructor.create_k_dataframe()
dataframe_D_Growth_ext.to_csv(d_growth_csv)

In [None]:
# Build the growth-factor reference dataframe and store it as csv.
d_growth_ref_csv = "../data/rwth_output/class_Asw0wa_DP/D_Growth_ref.csv"
dataframe_D_Growth_ref = FrameConstructor.create_k_reference_dataframe()
dataframe_D_Growth_ref.to_csv(d_growth_ref_csv)

In [None]:
# Folder holding the csv files of both the full and the reference dataframe.
data_folder = '../data/rwth_output/class_Asw0wa_DP/'
# File names are given without the '.csv' extension.
datafile_ext, datafile_ref = 'D_Growth', 'D_Growth_ref'

In [None]:
# Keyword settings handed to DataHandle.
handle_options = dict(
    num_parameters=7,               # number of parameters to be interpolated
    data_type='D_Growth',           # type of observable to be emulated
    features_name='k_grid',         # name of the grid in the data frames
    features_to_Log=True,           # presumably log-scales the k grid -- TODO confirm
    normalize_by_reference=True,
    normalize_by_mean_std=True,
)
emulation_data = dhl.DataHandle(
    datafile_ext,    # file name of csv file containing input data
    data_folder,     # path to folder containing both external and reference dataframes
    datafile_ref,    # file name of csv file containing reference data
    **handle_options,
)
# Read the csv files named above into the handle.
emulation_data.read_csv_pandas()

In [None]:
# Compute the ratios for every redshift stored in emulation_data.z_vals.
emulation_data.calculate_ratio_by_redshifts(emulation_data.z_vals)

In [None]:
# Number of training vectors, not taking the extrema into account.
n_train = 100
# Number of test vectors, not taking the extrema into account.
n_test = 2
# List of lists of test indices, one list per split: n_test distinct integers
# drawn from 1..999 (unseeded, so the draw changes from run to run).
candidate_indices = range(1, 1000)
test_indices = [random.sample(candidate_indices, n_test)]

In [None]:
# Settings for splitting the data into training, validation and test sets.
split_options = {
    'n_train': n_train,                            # Number of training vectors per redshift
    'n_test': n_test,                              # Number of test vectors per redshift
    'verbosity': 3,
    'manual_split': True,
    # NOTE(review): the randomly drawn test_indices are not forwarded here;
    # confirm that None is intended together with manual_split=True.
    'test_indices': None,
    'train_redshift_indices': [0, 1, 2, 3, 4],     # Indices of the redshifts used for the train vectors
    'test_redshift_indices': [0, 1, 2, 3, 4],      # Indices of the redshifts used for the test vectors
}
emulation_data.calculate_data_split(**split_options)

In [None]:
# npca: number of PCA components; nparam: length of the GP length-scale vector.
npca, nparam = 7, 8

In [None]:
# Keyword settings for the PCA + Gaussian Process emulator.
predict_options = dict(
    Operator="PCA",
    train_noise=1e-10,                         # Noise for the GP's kernel
    gp_n_rsts=40,                              # max times to restart the optimiser
    ncomp=npca,                                # Number of components
    gp_const=1,                                # Constant for the RBF kernel
    gp_length=np.ones(nparam),                 # Length for GP (one entry per input dimension)
    interp_type='GP',                          # Kind of interpolator
    test_indices=test_indices,                 # Indices of test vectors
    interp_dim=1,
    return_interpolator=True,                  # also hand back the trained interpolator object
    pca_norm=True,
    train_redshift_indices=[0, 1, 2, 3, 4],
    test_redshift_indices=[0, 1, 2, 3, 4],
    # disabled in the original cell: min_k =1e-2,ma_k=10e1
)
# Returns the predicted ratios, the data handle and the interpolator object.
ratios_predicted, emulation_data, intobj = dcl.Predict_ratio(emulation_data, **predict_options)

In [None]:
import pickle
# Output folder and file names for the pickled interpolator and data handle.
save_path = '../interpolators/class_Asw0wa_DP/'
save_name = 'D_Growth.sav'
data_name = 'D_Growth_data.sav'
# The with-blocks close the file handles, so the pickles are flushed to disk
# even if pickling raises part-way through.
with open(save_path+save_name, 'wb') as interp_file:
    pickle.dump(intobj, interp_file)
with open(save_path+data_name, 'wb') as data_file:
    pickle.dump(emulation_data, data_file)

## Example 4: Growth rate $f$

In [None]:
from looti import datahandle as dhl
from looti import dictlearn as dcl
import numpy as np
import random

In [None]:
# Path of the yaml file describing where the growth-rate data files live.
f_growthrate_config_yaml = '../readfile_configs/input4cast_lhs_f_GrowthRate.yaml'
FrameConstructor = rf.FrameConstructor(path_config_file=f_growthrate_config_yaml)

In [None]:
# Build the growth-rate dataframe and store it as csv.
f_growthrate_csv = "../data/rwth_output/class_Asw0wa_DP/f_GrowthRate.csv"
dataframe_f_GrowthRate_ext = FrameConstructor.create_k_dataframe()
dataframe_f_GrowthRate_ext.to_csv(f_growthrate_csv)

In [None]:
# Build the growth-rate reference dataframe and store it as csv.
f_growthrate_ref_csv = "../data/rwth_output/class_Asw0wa_DP/f_GrowthRate_ref.csv"
dataframe_f_GrowthRate_ref = FrameConstructor.create_k_reference_dataframe()
dataframe_f_GrowthRate_ref.to_csv(f_growthrate_ref_csv)

In [None]:
# Folder holding the csv files of both the full and the reference dataframe.
data_folder = '../data/rwth_output/class_Asw0wa_DP/'
# File names are given without the '.csv' extension.
datafile_ext, datafile_ref = 'f_GrowthRate', 'f_GrowthRate_ref'

In [None]:
# Keyword settings handed to DataHandle.
handle_options = dict(
    num_parameters=7,               # number of parameters to be interpolated
    data_type='f_GrowthRate',       # type of observable to be emulated
    features_name='k_grid',         # name of the grid in the data frames
    features_to_Log=True,           # presumably log-scales the k grid -- TODO confirm
    normalize_by_reference=True,
    normalize_by_mean_std=True,
)
emulation_data = dhl.DataHandle(
    datafile_ext,    # file name of csv file containing input data
    data_folder,     # path to folder containing both external and reference dataframes
    datafile_ref,    # file name of csv file containing reference data
    **handle_options,
)
# Read the csv files named above into the handle.
emulation_data.read_csv_pandas()

In [None]:
# Compute the ratios for every redshift stored in emulation_data.z_vals.
emulation_data.calculate_ratio_by_redshifts(emulation_data.z_vals)

In [None]:
# Number of training vectors, not taking the extrema into account.
n_train = 100
# Number of test vectors, not taking the extrema into account.
n_test = 2
# List of lists of test indices, one list per split: n_test distinct integers
# drawn from 1..999 (unseeded, so the draw changes from run to run).
candidate_indices = range(1, 1000)
test_indices = [random.sample(candidate_indices, n_test)]

In [None]:
# Settings for splitting the data into training, validation and test sets.
split_options = {
    'n_train': n_train,                            # Number of training vectors per redshift
    'n_test': n_test,                              # Number of test vectors per redshift
    'verbosity': 3,
    'manual_split': True,
    # NOTE(review): the randomly drawn test_indices are not forwarded here;
    # confirm that None is intended together with manual_split=True.
    'test_indices': None,
    'train_redshift_indices': [0, 1, 2, 3, 4],     # Indices of the redshifts used for the train vectors
    'test_redshift_indices': [0, 1, 2, 3, 4],      # Indices of the redshifts used for the test vectors
}
emulation_data.calculate_data_split(**split_options)

In [None]:
# npca: number of PCA components; nparam: length of the GP length-scale vector.
npca, nparam = 7, 8

In [None]:
# Keyword settings for the PCA + Gaussian Process emulator.
predict_options = dict(
    Operator="PCA",
    train_noise=1e-10,                         # Noise for the GP's kernel
    gp_n_rsts=40,                              # max times to restart the optimiser
    ncomp=npca,                                # Number of components
    gp_const=1,                                # Constant for the RBF kernel
    gp_length=np.ones(nparam),                 # Length for GP (one entry per input dimension)
    interp_type='GP',                          # Kind of interpolator
    test_indices=test_indices,                 # Indices of test vectors
    interp_dim=1,
    return_interpolator=True,                  # also hand back the trained interpolator object
    pca_norm=True,
    train_redshift_indices=[0, 1, 2, 3, 4],
    test_redshift_indices=[0, 1, 2, 3, 4],
    # disabled in the original cell: min_k =1e-2,ma_k=10e1
)
# Returns the predicted ratios, the data handle and the interpolator object.
ratios_predicted, emulation_data, intobj = dcl.Predict_ratio(emulation_data, **predict_options)

In [None]:
import pickle
# Output folder and file names for the pickled interpolator and data handle.
save_path = '../interpolators/class_Asw0wa_DP/'
save_name = 'f_GrowthRate.sav'
data_name = 'f_GrowthRate_data.sav'
# The with-blocks close the file handles, so the pickles are flushed to disk
# even if pickling raises part-way through.
with open(save_path+save_name, 'wb') as interp_file:
    pickle.dump(intobj, interp_file)
with open(save_path+data_name, 'wb') as data_file:
    pickle.dump(emulation_data, data_file)

## Example 5: $\Sigma_{WL}$

In [None]:
from looti import datahandle as dhl
from looti import dictlearn as dcl
import numpy as np
import random

In [None]:
# Path of the yaml file describing where the Sigma_WL data files live.
sigmawl_config_yaml = '../readfile_configs/input4cast_lhs_sigmaWL.yaml'
FrameConstructor = rf.FrameConstructor(path_config_file=sigmawl_config_yaml)

In [None]:
# Build the Sigma_WL dataframe and store it as csv.
sigmawl_csv = "../data/rwth_output/class_Asw0wa_DP/sigmaWL.csv"
dataframe_sigmaWL_ext = FrameConstructor.create_k_dataframe()
dataframe_sigmaWL_ext.to_csv(sigmawl_csv)

In [None]:
# Build the Sigma_WL reference dataframe and store it as csv.
sigmawl_ref_csv = "../data/rwth_output/class_Asw0wa_DP/sigmaWL_ref.csv"
dataframe_sigmaWL_ref = FrameConstructor.create_k_reference_dataframe()
dataframe_sigmaWL_ref.to_csv(sigmawl_ref_csv)

In [None]:
# Folder holding the csv files of both the full and the reference dataframe.
data_folder = '../data/rwth_output/class_Asw0wa_DP/'
# File names are given without the '.csv' extension.
datafile_ext, datafile_ref = 'sigmaWL', 'sigmaWL_ref'

In [None]:
# Keyword settings handed to DataHandle.
handle_options = dict(
    num_parameters=7,               # number of parameters to be interpolated
    data_type='sigmaWL',            # type of observable to be emulated
    features_name='k_grid',         # name of the grid in the data frames
    features_to_Log=True,           # presumably log-scales the k grid -- TODO confirm
    normalize_by_reference=True,
    normalize_by_mean_std=True,
)
emulation_data = dhl.DataHandle(
    datafile_ext,    # file name of csv file containing input data
    data_folder,     # path to folder containing both external and reference dataframes
    datafile_ref,    # file name of csv file containing reference data
    **handle_options,
)
# Read the csv files named above into the handle.
emulation_data.read_csv_pandas()

In [None]:
# Compute the ratios for every redshift stored in emulation_data.z_vals.
emulation_data.calculate_ratio_by_redshifts(emulation_data.z_vals)

In [None]:
# Number of training vectors, not taking the extrema into account.
n_train = 100
# Number of test vectors, not taking the extrema into account.
n_test = 2
# List of lists of test indices, one list per split: n_test distinct integers
# drawn from 1..999 (unseeded, so the draw changes from run to run).
candidate_indices = range(1, 1000)
test_indices = [random.sample(candidate_indices, n_test)]

In [None]:
# Settings for splitting the data into training, validation and test sets.
split_options = {
    'n_train': n_train,                            # Number of training vectors per redshift
    'n_test': n_test,                              # Number of test vectors per redshift
    'verbosity': 3,
    'manual_split': True,
    # NOTE(review): the randomly drawn test_indices are not forwarded here;
    # confirm that None is intended together with manual_split=True.
    'test_indices': None,
    'train_redshift_indices': [0, 1, 2, 3, 4],     # Indices of the redshifts used for the train vectors
    'test_redshift_indices': [0, 1, 2, 3, 4],      # Indices of the redshifts used for the test vectors
}
emulation_data.calculate_data_split(**split_options)

In [None]:
# npca: number of PCA components; nparam: length of the GP length-scale vector.
npca, nparam = 7, 8

In [None]:
# Inspection only: a bare expression, shown as the cell output in a notebook.
emulation_data.matrix_ratios_dict

In [None]:
# Keyword settings for the PCA + Gaussian Process emulator.
predict_options = dict(
    Operator="PCA",
    train_noise=1e-10,                         # Noise for the GP's kernel
    gp_n_rsts=40,                              # max times to restart the optimiser
    ncomp=npca,                                # Number of components
    gp_const=1,                                # Constant for the RBF kernel
    gp_length=np.ones(nparam),                 # Length for GP (one entry per input dimension)
    interp_type='GP',                          # Kind of interpolator
    test_indices=test_indices,                 # Indices of test vectors
    interp_dim=1,
    return_interpolator=True,                  # also hand back the trained interpolator object
    pca_norm=True,
    train_redshift_indices=[0, 1, 2, 3, 4],
    test_redshift_indices=[0, 1, 2, 3, 4],
    # disabled in the original cell: min_k =1e-2,ma_k=10e1
)
# Returns the predicted ratios, the data handle and the interpolator object.
ratios_predicted, emulation_data, intobj = dcl.Predict_ratio(emulation_data, **predict_options)

In [None]:
import pickle
# Output folder and file names for the pickled interpolator and data handle.
save_path = '../interpolators/class_Asw0wa_DP/'
save_name = 'sigmaWL.sav'
data_name = 'sigmaWL_data.sav'
# The with-blocks close the file handles, so the pickles are flushed to disk
# even if pickling raises part-way through.
with open(save_path+save_name, 'wb') as interp_file:
    pickle.dump(intobj, interp_file)
with open(save_path+data_name, 'wb') as data_file:
    pickle.dump(emulation_data, data_file)