In [9]:
import argparse
import glob
import multiprocessing as mp
import os
import time
import yaml
import pickle as pkl
import hashlib
import json
import pandas as pd
import numpy as np
from itertools import product
from datetime import datetime

def get_parser():
    """Build the command-line parser for a test run.

    Returns:
        argparse.ArgumentParser: parser exposing ``--config-file`` (path to
        the config file, default ``configs/bomp_default.yaml``) and
        ``--output`` (output path, ``None`` when omitted).
    """
    cli = argparse.ArgumentParser(description='Testing')
    cli.add_argument(
        '--config-file',
        type=str,
        default='configs/bomp_default.yaml',
        metavar="FILE",
        help='path to config file',
    )
    cli.add_argument("--output", type=str, help="Output path")
    return cli


def get_cfg(config_file):
    """Load a YAML configuration file.

    Args:
        config_file (str): path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's contents.
    """
    with open(config_file, 'r') as stream:
        return yaml.safe_load(stream)


def merge_cfg(default_dict, input_dict):
    """Overlay user-supplied config values on top of the defaults.

    Only the 'MODEL', 'TEST' and 'UTILS' sections are merged, and within a
    section only keys that already exist in the defaults are taken from the
    user input. Missing sections/parameters and ignored keys are reported on
    stdout. Sections of the user input outside those three are silently
    skipped.

    Args:
        default_dict (dict): default configuration, ``section -> {key: value}``.
        input_dict (dict | None): user configuration with the same layout.
            ``None`` (e.g. an empty YAML file) is treated as an empty dict.

    Returns:
        dict: a new configuration that shares no mutable state with either
        argument; neither argument is modified.
    """
    import copy  # local import so this definition does not depend on the import cell

    if input_dict is None:
        input_dict = {}

    # Deep copy: a shallow copy would share the per-section dicts, so the
    # assignments below would overwrite the caller's defaults in place.
    merged_dict = copy.deepcopy(default_dict)
    sections = ['MODEL', 'TEST', 'UTILS']  # Sections eligible for overrides.

    for section in sections:
        if section not in input_dict:
            print(f"Missing section '{section}' in the user input, default values will be used.")
            continue
        if section not in default_dict:
            # Nothing to merge into; report it instead of failing later.
            print(f"Section '{section}' has no default values. This section will be ignored.")
            continue

        defaults = default_dict[section]
        overrides = input_dict[section]
        for key in defaults:
            if key in overrides:
                # Copy so later edits to the merged config cannot leak back
                # into the user's input dictionary.
                merged_dict[section][key] = copy.deepcopy(overrides[key])
            else:
                print(f"Missing parameter '{key}' in section '{section}', default value '{defaults[key]}' will be used.")

    # Report user keys that have no counterpart in the defaults.
    for section in input_dict:
        if section not in sections or section not in default_dict:
            continue
        for key in input_dict[section]:
            if key not in default_dict[section]:
                print(f"Invalid key '{key}' in section '{section}'. This key will be ignored.")

    return merged_dict
    
def get_output_path(output_path, config_filename):
    """Derive the pickle file path used to store results for a config.

    The file name is the last '/'-separated component of ``config_filename``
    cut at its first '.', with '.pkl' appended.

    Args:
        output_path (str | None): target folder; ``None`` selects "./memory".
        config_filename (str): path of the config file the run was built from.

    Returns:
        str: ``<folder>/<config stem>.pkl`` joined with ``os.path.join``.
    """
    base_name = config_filename.rsplit("/", 1)[-1]
    stem = base_name.partition(".")[0]
    target_dir = "./memory" if output_path is None else output_path
    return os.path.join(target_dir, stem + ".pkl")

In [10]:
# Load the default BOMP configuration and display it for inspection.
cfg = get_cfg("configs/bomp_default.yaml")
cfg

{'TEST': {'n': 600,
  'p': 1000,
  'm': 20,
  'noise_level': [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5],
  'model': 'BOMP',
  'cv_num': 5,
  'trial_num': 10},
 'MODEL': {'signal_bag_percent': 0,
  'atom_bag_percent': 0.7,
  'select_atom_percent': 0.3,
  'replace_flag': False,
  'agg_func': 'weight',
  'K_start': 1,
  'K_end': 40,
  'K_step': 1,
  'random_seed': 0}}

In [11]:
def get_model_params(cfg):
    """Split the MODEL section of a config into fixed and grid-searched parameters.

    ``K_start``, ``K_end`` and ``K_step`` are consumed to build the grid of
    ``K`` values via ``np.arange`` (so ``K_end`` itself is excluded). Every
    other MODEL entry goes to the search grid when its value is a list and
    to the fixed parameters otherwise.

    Args:
        cfg (dict): configuration containing a 'MODEL' mapping with at least
            'K_start', 'K_end' and 'K_step'.

    Returns:
        tuple[dict, dict]: ``(fixed_params, param_grid)``; ``param_grid['K']``
        is an integer numpy array and is added after the list-valued entries.

    Raises:
        ValueError: if any K bound is not an integer, if ``K_start >= K_end``
            or if ``K_step <= 0``.
        KeyError: if 'MODEL' or one of the K bounds is missing.
    """
    all_params = cfg['MODEL']
    K_start, K_end, K_step = all_params['K_start'], all_params['K_end'], all_params['K_step']

    # Type check first: comparing None or a string (e.g. a blank or quoted
    # YAML value) with an int would raise TypeError before reaching the
    # intended ValueError.
    if not all(isinstance(bound, int) for bound in (K_start, K_end, K_step)):
        raise ValueError("K_start, K_end, K_step must be integers")
    if K_start >= K_end:
        raise ValueError("K_start must be smaller than K_end")
    if K_step <= 0:
        raise ValueError("K_step must be positive")

    param_grid = {}
    fixed_params = {}
    # A list value means "search over these"; anything else is held fixed.
    for param, value in all_params.items():
        if param in ('K_start', 'K_end', 'K_step'):
            continue
        if isinstance(value, list):
            param_grid[param] = value
        else:
            fixed_params[param] = value

    # Half-open range: K_end is not part of the grid.
    param_grid['K'] = np.arange(K_start, K_end, K_step, dtype=int)
    return fixed_params, param_grid

In [12]:
# Overlay the test configuration on the defaults; merge_cfg reports missing
# sections/parameters and ignored keys on stdout.
default_config = get_cfg("configs/bomp_default.yaml")
input_config = get_cfg("configs/bomp_test.yaml")
config = merge_cfg(default_config, input_config)
config

Missing section 'UTILS' in the user input, default values will be used.
Invalid key 'N_bag' in section 'MODEL'. This key will be ignored.


{'TEST': {'n': 600,
  'p': 1000,
  'm': 20,
  'noise_level': 0,
  'model': 'BOMP',
  'cv_num': 5,
  'trial_num': 10},
 'MODEL': {'signal_bag_percent': [0, 0.9, 1],
  'atom_bag_percent': [0.7, 0.8],
  'select_atom_percent': 0,
  'replace_flag': False,
  'agg_func': ['weight', 'avg'],
  'K_start': 1,
  'K_end': 41,
  'K_step': 1,
  'random_seed': 1}}

In [13]:
# Split the merged MODEL settings: non-list values become fixed estimator
# arguments, list values (plus the generated K range) become the search grid.
fixed_params, param_grid = get_model_params(config)

print(fixed_params)
print(param_grid)

{'signal_bag_percent': [0, 0.9, 1], 'atom_bag_percent': [0.7, 0.8], 'agg_func': ['weight', 'avg'], 'K': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40])}


In [14]:
from algorithms import BOMP

# Instantiate the estimator with the fixed (non-searched) parameters only;
# the list-valued parameters are supplied later through the search grid.
my_bomp = BOMP(**fixed_params)

my_bomp

In [15]:
from data_generation import GaussianDataGenerator

# Settings for one synthetic problem instance.
# NOTE(review): judging by the shapes displayed by this cell, N looks like the
# number of dictionary columns and d the signal length; m is presumably the
# sparsity level — confirm against GaussianDataGenerator.
N = 1000
d = 400
m = 20
noise_level = 0.1
seed = 0

Data_Geneartor = GaussianDataGenerator(N, d, m, noise_level, seed)

# shuffle() returns the generated signal, dictionary, support and coefficients
# together with the perturbed signal used for fitting below.
true_signal, dictionary, true_indices, true_coefficients, perturbed_signal = Data_Geneartor.shuffle()

# Display the shapes as a quick sanity check.
perturbed_signal.shape, dictionary.shape

((400, 1), (400, 1000))

In [16]:
# NOTE(review): mean_squared_error is imported but not used in this cell.
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import GridSearchCV


# 5-fold cross-validated grid search over param_grid, scored by negative MSE
# (GridSearchCV maximises the score, so a lower error ranks higher).
# n_jobs=-1 runs the fits on all available processors; verbose=1 prints the
# fit summary.
gs = GridSearchCV(my_bomp, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

# The dictionary is passed as X and the perturbed signal as y.
gs.fit(dictionary, perturbed_signal)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [17]:
# Parameter combination with the best mean cross-validated score.
gs.best_params_

{'K': 14,
 'agg_func': 'weight',
 'atom_bag_percent': 0.8,
 'signal_bag_percent': 0}

In [4]:
# Run the experiment script with the test configuration.
# NOTE(review): the parser defined in this notebook names the option
# --config-file; `--config` only works if BOMP_testing.py accepts it directly
# or through argparse prefix matching — confirm.
!python BOMP_testing.py --config configs/bomp_test.yaml

Running trials for n = 600, p = 1000, m = 20
Cross validating alpha under noise level:  0
Trial:  0  Best params:  {'K': 22, 'signal_bag_percent': 0.9}  Lowest Error:  9.448603507012114e-33
Trial:  1  Best params:  {'K': 27, 'signal_bag_percent': 1}  Lowest Error:  5.755253353425606e-33
Trial:  2  Best params:  {'K': 21, 'signal_bag_percent': 0.9}  Lowest Error:  6.965004572856197e-33
Trial:  3  Best params:  {'K': 26, 'signal_bag_percent': 0.9}  Lowest Error:  5.794289306968719e-33
Trial:  4  Best params:  {'K': 24, 'signal_bag_percent': 0.9}  Lowest Error:  4.082316320936746e-33
Trial:  5  Best params:  {'K': 23, 'signal_bag_percent': 0.9}  Lowest Error:  6.851902239839113e-33
Trial:  6  Best params:  {'K': 25, 'signal_bag_percent': 0.9}  Lowest Error:  5.0822509669892585e-33
Trial:  7  Best params:  {'K': 22, 'signal_bag_percent': 0.9}  Lowest Error:  1.9754038587102136e-32
Trial:  8  Best params:  {'K': 23, 'signal_bag_percent': 0.9}  Lowest Error:  6.851304759325583e-33
Trial:  9 

In [7]:
import pickle as pkl

# Load the saved result log of an earlier run.
# NOTE: unpickling can execute arbitrary code — only load files produced by
# trusted runs of this project.
# The context manager closes the file handle as soon as loading finishes
# (the bare open() call previously left it open).
with open("./memory/bomp_test20230702-151713.pkl", "rb") as results_file:
    reslog = pkl.load(results_file)

reslog

[{'parameters': {'n': 600,
   'p': 1000,
   'm': 20,
   'noise_level_lst': [0, 0.05],
   'model_name': 'BOMP',
   'cv_num': 5,
   'trial_num': 10,
   'param_grid': {'signal_bag_percent': [0, 0.9, 1],
    'K': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
           18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
           35, 36, 37, 38, 39, 40])},
   'fixed_params': {'N_bag': 10,
    'atom_bag_percent': 0.7,
    'select_atom_percent': 0,
    'replace_flag': False,
    'agg_func': 'weight',
    'random_seed': 1}},
  'noise_level_lowest_MSE': [9.00266385938955e-33, 0.0026551059334248275],
  'log': [{'noise_level': 0,
    'trial': 0,
    'cv_error_lst': array([3.33628423e-02, 3.33751187e-02, 3.34180133e-02, 2.69850204e-02,
           2.49586673e-02, 2.48926947e-02, 1.93375360e-02, 1.99587998e-02,
           1.99528207e-02, 1.66956627e-02, 1.67641473e-02, 1.73465491e-02,
           1.30540342e-02, 1.26866698e-02, 1.30382753e-02, 1.11735020