## Task 4: Incorporate the CV and OMP classes better, and make the workflow more general so we can reuse it later for ensemble methods

In [1]:
import os
import pickle as pkl
import warnings
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from omegaconf import DictConfig, OmegaConf

from algorithms import OMP
from crossvalidation import CrossValidator
from data_generation import GaussianDataGenerator
warnings.filterwarnings('ignore')

In [2]:
# Problem dimensions and the seed handed to the synthetic data generator.
N = 1000
d = 600
m = 20  # presumably the sparsity of the true signal -- TODO confirm against GaussianDataGenerator
seed = 10

# 0 is passed as the fourth positional argument (presumably the noise level -- TODO confirm).
Data_Geneartor = GaussianDataGenerator(N, d, m, 0, seed)

(
    true_signal,
    dictionary,
    true_indices,
    true_coefficients,
    perturbed_signal,
) = Data_Geneartor.shuffle()

# Candidate K values to cross-validate: 1, 2, ..., m + 20.
K_lst = list(range(1, m + 20 + 1))

dictionary.shape

(600, 1000)

In [3]:
# 5-fold cross-validation over every candidate K.  The split seed reuses the
# `seed` constant defined with the problem dimensions instead of repeating the
# literal 10, so data generation and CV splitting cannot drift apart.
my_cv = CrossValidator(OMP, K_lst, 5, shuffle_split=True, seed=seed)
# fit returns (lowest error, K with the lowest error, list of CV errors per K).
my_cv.fit(perturbed_signal, dictionary)

(2.1130680102711943e-32,
 20,
 [0.0076198510201475975,
  0.004069316612261802,
  0.002137968529945831,
  0.0013949372826701585,
  0.0008829274714864061,
  0.0005220518071904683,
  0.00030171448497390947,
  0.00018800966708506196,
  0.00012747155016087187,
  7.169540216554605e-05,
  5.0032965372596205e-05,
  3.581876478177004e-05,
  2.3305478296258957e-05,
  1.8830332705877132e-05,
  1.3597016923790235e-05,
  9.710030324504057e-06,
  5.685545388237914e-06,
  2.492047845805853e-06,
  1.4087190706205076e-06,
  2.1130680102711943e-32,
  2.242547803965305e-32,
  2.681008826840851e-32,
  2.748039840543949e-32,
  2.863487930578132e-32,
  2.7912098862511556e-32,
  3.446862392390281e-32,
  3.2695832054632975e-32,
  4.100907449187801e-32,
  4.859295653867513e-32,
  4.201705180721848e-32,
  5.297158083673105e-32,
  5.020403329911364e-32,
  5.434567198020574e-32,
  6.024224097227322e-32,
  9.502357280293451e-32,
  7.422947521071139e-32,
  7.747823816210911e-32,
  6.268524172932728e-32,
  1.2237920

In [4]:
def Algorithm_Manager(method_name):

    """
    Resolve a method name to the algorithm class that implements it.

    Args:
        method_name: the name of the algorithm; "OMP" is the only name handled
    Returns:
        algorithm: the algorithm class itself (not an instance)
    Raises:
        ValueError: if method_name is not one of the handled names
    """

    if method_name == "OMP":
        return OMP

    # Every other name is unsupported.
    raise ValueError("Invalid method name")

In [45]:
class CV_Best_K():
    """

    Cross-validate the best K of a selected algorithm over several noise
    levels and several trials.

    For every trial a GaussianDataGenerator and a CrossValidator are built
    (both seeded with the trial index); for every noise level the generator's
    update_noise_level() output is passed to CrossValidator.fit().  The
    per-noise-level means over trials are kept on the instance and the complete
    log is pickled into ./memory.

    """

    def __init__(self, configs):

        """
        Read the experiment settings and make sure the output folder exists

        Args:
            configs: the configuration; any object exposing the attributes
                N, d, m, noise_level_lst, cv_num, K_lst, trial_num,
                method_name and shuffle_split (e.g. an OmegaConf DictConfig)
        Raises:
            ValueError: propagated from Algorithm_Manager for an unknown method_name
        """

        ## FIXME:: No default yaml file is provided, nor a yaml file generator. We could build a base class so that we can inherit from it and overwrite the function

        self.N = configs.N
        self.d = configs.d
        self.m = configs.m
        # Copy the sequences into plain lists so the instance (and the pickled
        # log) neither depends on the config container type nor shares state with it.
        self.noise_level_lst = list(configs.noise_level_lst)
        self.cv_num = configs.cv_num
        self.K_lst = list(configs.K_lst)
        self.trial_num = configs.trial_num
        self.method_name = configs.method_name
        self.shuffle_split = configs.shuffle_split
        self.algorithm = Algorithm_Manager(self.method_name)

        # Per-noise-level means over trials; rebuilt by every call to run().
        self.noise_level_best_K = []
        self.noise_level_lowest_error = []

        # NOTE(review): the file name only encodes N, d, m, trial_num and cv_num, so runs
        # differing in method, K_lst or noise levels overwrite each other.
        self.output_filename = str(self.N) + '_' + str(self.d) + '_' + str(self.m) + '_' + str(self.trial_num) + '_' + str(self.cv_num) + '.pkl'
        os.makedirs('./memory', exist_ok=True)

    def run(self):

        """
        Cross validate the best K for selected algorithm under different noise level and different trials

        Each call starts from empty summaries, so calling run() repeatedly does
        not accumulate results from earlier calls.

        Returns:
            cv_res_log: the log of cross validation results, a dict with the keys
                'parameters', 'log' (one entry per trial and noise level),
                'noise_level_best_K' and 'noise_level_lowest_error'
                (means over trials, ordered like noise_level_lst)
        """

        res_log = {
            'parameters': {
                'N': self.N,
                'd': self.d,
                'm': self.m,
                'noise_level_lst': self.noise_level_lst,
                'cv_num': self.cv_num,
                'trial_num': self.trial_num,
                'K_lst': self.K_lst
            },
            'log': []
        }

        # Per-trial results collected separately for every noise level.
        best_K_per_noise = {noise_level: [] for noise_level in self.noise_level_lst}
        lowest_error_per_noise = {noise_level: [] for noise_level in self.noise_level_lst}

        for trial in range(self.trial_num):
            print("Trial {}:".format(trial))
            data_generator = GaussianDataGenerator(self.N, self.d, self.m, 0, trial)
            true_signal, dictionary, true_indices, true_coefficients, _ = data_generator.shuffle()
            cross_validator = CrossValidator(self.algorithm, self.K_lst, self.cv_num, shuffle_split=self.shuffle_split, seed=trial)
            # In the same trial we have the same true signal and dictionary, so that it is comparable between different noise level
            for noise_level in self.noise_level_lst:
                perturbed_signal = data_generator.update_noise_level(noise_level)
                lowest_error, lowest_error_K, K_cv_error = cross_validator.fit(perturbed_signal, dictionary)
                # NOTE(review): every noise level of a trial stores the SAME generator object;
                # if update_noise_level mutates it, all entries of the trial reflect the last
                # noise level -- confirm, and store a copy if that matters.
                res_log['log'].append({
                    'noise_level': noise_level,
                    'trial': trial,
                    'data': data_generator,
                    'cv_error_lst': K_cv_error,
                    'lowest_error': lowest_error,
                    'lowest_error_K': lowest_error_K
                })
                best_K_per_noise[noise_level].append(lowest_error_K)
                lowest_error_per_noise[noise_level].append(lowest_error)
                print("Noise level {}:  Best K: {},  Lowest Error: {}".format(noise_level, lowest_error_K, lowest_error))

        # Rebind fresh lists instead of appending to the ones from a previous
        # run: otherwise a second run() would extend the old summaries and also
        # mutate the log returned earlier (it referenced the same list objects).
        self.noise_level_best_K = [
            np.mean(best_K_per_noise[noise_level]) for noise_level in self.noise_level_lst
        ]
        self.noise_level_lowest_error = [
            np.mean(lowest_error_per_noise[noise_level]) for noise_level in self.noise_level_lst
        ]

        res_log['noise_level_best_K'] = self.noise_level_best_K
        res_log['noise_level_lowest_error'] = self.noise_level_lowest_error

        # Save the log file (re-create the folder in case it was removed after __init__)
        os.makedirs('./memory', exist_ok=True)
        with open('./memory/' + self.output_filename, 'wb') as f:
            pkl.dump(res_log, f)
        print("Log file saved to: ", './memory/' + self.output_filename)
        print("Finished!")
        return res_log


In [46]:
# Small smoke-test configuration: one trial, two noise levels, K from 1 to 40.
test_config = dict(
    N=1000,
    d=400,
    m=20,
    noise_level_lst=[0, 0.05],
    cv_num=5,
    trial_num=1,
    K_lst=list(range(1, 40 + 1)),
    method_name='OMP',
    shuffle_split=True,
)

In [47]:
# Wrap the plain dict in an OmegaConf config so that its entries can be read as
# attributes (configs.N, configs.K_lst, ...), which is how CV_Best_K reads them.
# OmegaConf is imported in the notebook's import cell, not here.
configDict = OmegaConf.create(test_config)


In [48]:
# Build the experiment from the config, then run it; run() prints progress,
# pickles the log under ./memory and returns the same log.
my_cv_best_K = CV_Best_K(configs=configDict)
res_log = my_cv_best_K.run()

Trial 0:
Noise level 0:  Best K: 20,  Lowest Error: 2.2213862614739193e-31
Noise level 0.05:  Best K: 8,  Lowest Error: 0.09158458496105422
Log file saved to:  ./memory/1000_400_20_1_5.pkl
Finished!
