In [None]:
# Task 4: Integrate the CV and OMP classes more cleanly, and generalize them so they can be reused later for ensemble methods

In [4]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import pickle as pkl
from itertools import product
from algorithms import OMP
from data_generation import GaussianDataGenerator
import yaml
from omegaconf import DictConfig
import warnings
warnings.filterwarnings('ignore')

In [None]:
def cv_split(true_signal, dictionary, cv_num):
    """
    Split the signal and dictionary row-wise into cv_num cross-validation folds.

    Fold i uses the i-th contiguous chunk of rows as the test set and all the
    remaining rows (in their original order) as the training set. Rows are not
    shuffled. Chunks are produced with np.array_split, so the number of rows
    does not need to be divisible by cv_num: the first (rows % cv_num) folds
    simply receive one extra row.

    Args:
        true_signal: the signal; it is flattened to 1-D before splitting
        dictionary: the dictionary, one row per signal entry
        cv_num: the number of folds (integer, 2 <= cv_num <= number of rows)
    Returns:
        cv_res: a list of cv_num tuples, each tuple is
            (train signal, train dictionary, test signal, test dictionary)
    Raises:
        ValueError: if cv_num is out of range, or if the flattened signal and
            the dictionary do not have the same number of rows
    """
    true_signal = np.ravel(true_signal)
    dictionary = np.asarray(dictionary)
    n_rows = true_signal.shape[0]

    if dictionary.shape[0] != n_rows:
        raise ValueError(
            "signal has {} entries but dictionary has {} rows".format(n_rows, dictionary.shape[0])
        )
    # Fewer than 2 folds leaves nothing to train on; more folds than rows
    # would create empty test sets.
    if not 2 <= cv_num <= n_rows:
        raise ValueError(
            "cv_num must be between 2 and the number of rows ({}), got {}".format(n_rows, cv_num)
        )

    # array_split (unlike split) tolerates a row count that is not a multiple of cv_num
    cv_signal = np.array_split(true_signal, cv_num)
    cv_dictionary = np.array_split(dictionary, cv_num)

    cv_res = []
    for i in range(cv_num):
        # Training data is every chunk except the i-th one
        train_signal = np.concatenate(cv_signal[:i] + cv_signal[i + 1:], axis=0)
        train_dictionary = np.concatenate(cv_dictionary[:i] + cv_dictionary[i + 1:], axis=0)
        cv_res.append((train_signal, train_dictionary, cv_signal[i], cv_dictionary[i]))
    return cv_res

def cal_cv_error(algorithm, cv_num, signal, dictionary):
    """
    Estimate the error of an algorithm by cross validation.

    The data is partitioned with cv_split; on every fold the algorithm is
    re-fitted on the training part and then scored on the held-out part.

    Args:
        algorithm: object exposing fit(signal, dictionary) and score(signal, dictionary)
        cv_num: the number of folds
        signal: the signal to split
        dictionary: the dictionary to split
    Returns:
        error: the mean of the per-fold scores
    """
    def _fit_and_score(fold):
        # Fit on the training rows first, then evaluate on the held-out rows
        fit_signal, fit_dictionary, held_out_signal, held_out_dictionary = fold
        algorithm.fit(fit_signal, fit_dictionary)
        return algorithm.score(held_out_signal, held_out_dictionary)

    fold_scores = [_fit_and_score(fold) for fold in cv_split(signal, dictionary, cv_num)]
    return np.mean(fold_scores)

def cv_best_K(signal, dictionary, cv_num, K_lst):
    """
    Calculate the best K for the OMP algorithm using cross validation

    A fresh OMP(K, ignore_warning=True) is cross-validated (via cal_cv_error)
    for every candidate K, and the K with the smallest mean error wins. Ties
    go to the earliest candidate in K_lst.

    Args:
        signal: the signal to cross-validate on
        dictionary: the dictionary
        cv_num: the number of folds
        K_lst: non-empty sequence of candidate K values to try
    Returns:
        lowest_error: the lowest cross validation error
        lowest_error_K: the K that gives the lowest error
        K_cv_error: the list of cross validation errors, one per K in K_lst
    Raises:
        ValueError: if K_lst is empty
    """
    # Fail early with a clear message instead of NumPy's "zero-size array" error
    if len(K_lst) == 0:
        raise ValueError("K_lst must contain at least one candidate K")

    K_cv_error = [
        cal_cv_error(OMP(K, ignore_warning=True), cv_num, signal, dictionary)
        for K in K_lst
    ]

    # np.argmin returns a NumPy integer; convert it to a built-in int so K_lst
    # may be any sequence type, including ones that only accept plain ints.
    best_idx = int(np.argmin(K_cv_error))
    lowest_error = np.min(K_cv_error)
    lowest_error_K = K_lst[best_idx]
    return lowest_error, lowest_error_K, K_cv_error


# Improvement: Save the result to a file

# Make sure the results directory exists before anything tries to write to it.
# exist_ok=True replaces the check-then-create pattern (no race between the
# check and the mkdir) and makes re-running this cell safe.
os.makedirs('./memory', exist_ok=True)

def cv_best_K_noise_level_multi_trial(N, d, m, noise_level_lst, cv_num, K_lst, trial_num, output_filename = None):
    """
    Cross-validate K over several noise levels, with several trials per level.

    For each noise level, trial_num datasets are created with
    GaussianDataGenerator(N, d, m, noise_level, trial) and cv_best_K is run on
    the perturbed signal of each one. The per-trial best K and lowest error are
    averaged per noise level, and a full log is pickled under ./memory/.

    Args:
        N, d, m: forwarded unchanged to GaussianDataGenerator
        noise_level_lst: the noise levels to test
        cv_num: the number of folds
        K_lst: the list of K to try
        trial_num: number of trials per noise level; the trial index is passed
            as the last GaussianDataGenerator argument (presumably a seed - TODO confirm)
        output_filename: name of the pickle file inside ./memory/; defaults to
            "<N>_<d>_<m>_<trial_num>_<cv_num>.pkl"
    Returns:
        noise_level_best_K: mean best K for each noise level
        noise_level_lowest_MSE: mean lowest error for each noise level
        res_log: dict holding the parameters, both lists above and one log
            entry per (noise level, trial)
    """
    if output_filename is None:
        name_parts = (N, d, m, trial_num, cv_num)
        output_filename = '_'.join(str(part) for part in name_parts) + '.pkl'

    # The two summary lists are shared with res_log, so filling them fills the log too
    avg_best_K_per_level = []
    avg_lowest_mse_per_level = []
    res_log = {
        'parameters': {'N': N, 'd': d, 'm': m, 'noise_level_lst': noise_level_lst, 'cv_num': cv_num, 'trial_num': trial_num, 'K_lst': K_lst},
        'noise_level_best_K': avg_best_K_per_level,
        'noise_level_lowest_MSE': avg_lowest_mse_per_level,
        'log': []
    }

    for noise_level in noise_level_lst:
        print("Cross validating K under noise level: ", noise_level)
        best_K_per_trial = []
        lowest_mse_per_trial = []
        for trial in range(trial_num):
            generator = GaussianDataGenerator(N, d, m, noise_level, trial)
            # Only the dictionary and the perturbed signal are needed for cross validation
            _, dictionary, _, _, perturbed_signal = generator.shuffle()
            lowest_error, lowest_error_K, cv_err_lst = cv_best_K(perturbed_signal, dictionary, cv_num, K_lst)
            best_K_per_trial.append(lowest_error_K)
            lowest_mse_per_trial.append(lowest_error)
            print("Trial: ", trial, " Best K: ", lowest_error_K, " Lowest Error: ", lowest_error)
            res_log['log'].append({
                'noise_level': noise_level,
                'trial': trial,
                'data': generator,
                'cv_error_lst': cv_err_lst,
                'lowest_error': lowest_error,
                'lowest_error_K': lowest_error_K,
            })
        mean_best_K = np.mean(best_K_per_trial)
        mean_lowest_mse = np.mean(lowest_mse_per_trial)
        avg_best_K_per_level.append(mean_best_K)
        avg_lowest_mse_per_level.append(mean_lowest_mse)
        print("Average best K for noise level: ", noise_level, " is: ", mean_best_K, " with MSE: ", mean_lowest_mse)

    save_path = './memory/' + output_filename
    with open(save_path, 'wb') as log_file:
        pkl.dump(res_log, log_file)
    print("Finished!")
    print("Log file saved to: ", save_path)
    return avg_best_K_per_level, avg_lowest_mse_per_level, res_log


# NOTE(review): module-level N. The functions above take N as a parameter and
# forward it as the first GaussianDataGenerator argument, so this is presumably
# meant as that value; nothing in the visible cells reads this global - TODO confirm.
N = 1000

In [None]:
class CrossValidator:
    """
    Row-wise K-fold cross validation for choosing the parameter K of an algorithm.

    The validator holds one signal/dictionary pair, splits it into contiguous
    folds and caches the outcome of the last cv_best_K search until the signal,
    the algorithm, the fold count or the candidate list changes.

    self.algorithm plays two roles, depending on the method used:
      * cv_best_K treats it as a factory and calls algorithm(K, ignore_warning=True);
      * cal_cv_error() without arguments treats it as a ready instance with
        fit(signal, dictionary) and score(signal, dictionary).
    """

    def __init__(self, signal, dictionary,cv_num,algorithm):
        """
        Initialize the CrossValidator class

        Args:
            signal: the signal (flattened to 1-D)
            dictionary: the dictionary, one row per signal entry
            cv_num: the number of folds
            algorithm: algorithm factory/class (for cv_best_K) or instance
                (for a bare cal_cv_error() call)
        """
        self.signal = np.ravel(signal)
        self.dictionary = dictionary
        self.cv_num = cv_num
        self.algorithm = algorithm
        self.updated = False        # True while the cached cv_best_K result is valid
        self.cv_res = None          # cached folds; None means "needs (re)splitting"
        self.lowest_error = None
        self.lowest_error_K = None
        self.K_cv_error = None
        self._cached_K_lst = None   # candidate list the cached result belongs to

    def _use_cv_num(self, cv_num):
        """Adopt a different fold count (if given) and drop everything derived from the old one."""
        if cv_num is not None and cv_num != self.cv_num:
            self.cv_num = cv_num
            self.cv_res = None
            self.updated = False

    def cv_split(self):
        """
        Split the signal and dictionary into cv_num folds and cache them in self.cv_res

        np.array_split is used, so the number of rows does not have to be a
        multiple of cv_num (the first folds get one extra row).

        Returns:
            cv_res: a list of tuples, each tuple is a fold of train signal, train dictionary, test signal, test dictionary
        Raises:
            ValueError: if cv_num is smaller than 2 or larger than the number of rows
        """
        true_signal = np.ravel(self.signal)
        n_rows = true_signal.shape[0]
        if not 2 <= self.cv_num <= n_rows:
            raise ValueError(
                "cv_num must be between 2 and the number of rows ({}), got {}".format(n_rows, self.cv_num)
            )
        cv_signal = np.array_split(true_signal, self.cv_num)
        cv_dictionary = np.array_split(self.dictionary, self.cv_num)
        self.cv_res = []
        for i in range(self.cv_num):
            # Train on every chunk except the i-th, test on the i-th
            train_signal = np.concatenate(cv_signal[:i] + cv_signal[i + 1:], axis=0)
            train_dictionary = np.concatenate(cv_dictionary[:i] + cv_dictionary[i + 1:], axis=0)
            self.cv_res.append((train_signal, train_dictionary, cv_signal[i], cv_dictionary[i]))
        return self.cv_res

    def cal_cv_error(self, algorithm=None, cv_num=None):
        """
        Calculate the cross validation error of an algorithm

        Args:
            algorithm: instance with fit/score to evaluate; defaults to
                self.algorithm, which must then be an instance (not a factory)
            cv_num: optional fold count; if it differs from the current one the
                data is re-split with it
        Returns:
            error: the mean of the per-fold scores
        """
        if algorithm is None:
            algorithm = self.algorithm
        self._use_cv_num(cv_num)
        # Split lazily so the method also works when called on its own
        if self.cv_res is None:
            self.cv_split()
        error_lst = []
        for train_signal, train_dictionary, test_signal, test_dictionary in self.cv_res:
            algorithm.fit(train_signal, train_dictionary)
            error_lst.append(algorithm.score(test_signal, test_dictionary))
        return np.mean(error_lst)

    def cv_best_K(self, cv_num, K_lst):
        """
        Calculate the best K for the stored algorithm using cross validation

        The result is cached: calling again with the same cv_num and K_lst
        returns the stored values without refitting. Ties go to the earliest
        candidate in K_lst.

        Args:
            cv_num: the number of folds; replaces the fold count given at
                construction when different (None keeps the current one)
            K_lst: non-empty sequence of K values to try
        Returns:
            lowest_error: the lowest error
            lowest_error_K: the K that gives the lowest error
            K_cv_error: the list of cross validation error for each K
        Raises:
            ValueError: if K_lst is empty
        """
        K_lst = list(K_lst)
        if not K_lst:
            raise ValueError("K_lst must contain at least one candidate K")
        self._use_cv_num(cv_num)

        cache_is_valid = self.updated and K_lst == self._cached_K_lst
        if not cache_is_valid:
            if self.cv_res is None:
                self.cv_split()
            errors = []
            for K in K_lst:
                current_K_algorithm = self.algorithm(K, ignore_warning=True)
                errors.append(self.cal_cv_error(current_K_algorithm))
            # Built-in int so the index works on any sequence type
            best_idx = int(np.argmin(errors))
            self.K_cv_error = errors
            self.lowest_error = errors[best_idx]
            self.lowest_error_K = K_lst[best_idx]
            self._cached_K_lst = K_lst
            self.updated = True
        return self.lowest_error, self.lowest_error_K, self.K_cv_error

    def update_signal(self, new_signal):
        """
        Update the signal and invalidate the cached folds and results

        Args:
            new_signal: the new signal (flattened to 1-D, as in __init__)
        """
        self.signal = np.ravel(new_signal)
        self.cv_res = None      # folds were built from the old signal
        self.updated = False

    def update_algorithm(self, new_algorithm):
        """
        Update the algorithm and reset the updated flag

        The folds do not depend on the algorithm, so they are kept.

        Args:
            new_algorithm: the new algorithm
        """
        self.algorithm = new_algorithm
        self.updated = False

In [None]:
class EnsembleTesting:
    """
    Config-driven runner for the "best K per noise level" cross-validation experiment.

    All experiment settings are read once from a config object; the experiment
    itself is run by cv_best_K_noise_level_multi_trial, which pickles its log
    under ./memory/.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, configs: "DictConfig"):
        """
        Initialize the EnsembleTesting class

        Args:
            configs: the configuration; must provide N, d, m, noise_level_lst,
                cv_num, K_lst, trial_num and output_filename (a filename prefix,
                None is treated as an empty prefix)
        """
        self.N = configs.N
        self.d = configs.d
        self.m = configs.m
        # NOTE(review): with an omegaconf config these fields are presumably
        # ListConfig containers; copying them to plain lists keeps the pickled
        # log free of config container types and lets downstream code index
        # them with NumPy integers - confirm against the installed omegaconf.
        self.noise_level_lst = list(configs.noise_level_lst)
        self.cv_num = configs.cv_num
        self.K_lst = list(configs.K_lst)
        self.trial_num = configs.trial_num

        prefix = configs.output_filename
        if prefix is None:
            prefix = ''
        run_tag = '_'.join(str(part) for part in (self.N, self.d, self.m, self.trial_num, self.cv_num))
        self.output_filename = prefix + run_tag + '.pkl'

        # exist_ok avoids the check-then-create race and is safe on re-runs
        os.makedirs('./memory', exist_ok=True)

    def cv_best_K_noise_level_multi_trial(self):
        """
        Cross-validate K for every configured noise level over trial_num trials.

        Each trial builds GaussianDataGenerator(N, d, m, noise_level, trial) and
        runs the module-level cv_best_K on its perturbed signal. Results are
        averaged per noise level and the full log is pickled to
        ./memory/<output_filename>.

        Returns:
            noise_level_best_K: mean best K for each noise level
            noise_level_lowest_MSE: mean lowest error for each noise level
            res_log: dict with the parameters, both lists above and one log
                entry per (noise level, trial)
        """
        noise_level_best_K = []
        noise_level_lowest_MSE = []
        # The summary lists are shared with res_log, so filling them fills the log too
        res_log = {
            'parameters': {
                'N': self.N,
                'd': self.d,
                'm': self.m,
                'noise_level_lst': self.noise_level_lst,
                'cv_num': self.cv_num,
                'trial_num': self.trial_num,
                'K_lst': self.K_lst
            },
            'noise_level_best_K': noise_level_best_K,
            'noise_level_lowest_MSE': noise_level_lowest_MSE,
            'log': []
        }
        for noise_level in self.noise_level_lst:
            print("Cross validating K under noise level: ", noise_level)
            trials_best_K = []
            trials_lowest_MSE = []
            for trial in range(self.trial_num):
                data_generator = GaussianDataGenerator(self.N, self.d, self.m, noise_level, trial)
                # Only the dictionary and the perturbed signal are needed here
                _, dictionary, _, _, perturbed_signal = data_generator.shuffle()
                lowest_error, lowest_error_K, cv_err_lst = cv_best_K(perturbed_signal, dictionary, self.cv_num, self.K_lst)
                trials_best_K.append(lowest_error_K)
                trials_lowest_MSE.append(lowest_error)
                print("Trial: ", trial, " Best K: ", lowest_error_K, " Lowest Error: ", lowest_error)
                res_log['log'].append({
                    'noise_level': noise_level,
                    'trial': trial,
                    'data': data_generator,
                    'cv_error_lst': cv_err_lst,
                    'lowest_error': lowest_error,
                    'lowest_error_K': lowest_error_K
                })
            mean_best_K = np.mean(trials_best_K)
            mean_lowest_MSE = np.mean(trials_lowest_MSE)
            noise_level_best_K.append(mean_best_K)
            noise_level_lowest_MSE.append(mean_lowest_MSE)
            print("Average best K for noise level: ", noise_level, " is: ", mean_best_K, " with MSE: ", mean_lowest_MSE)

        # Re-check the directory so a long run is not lost if it vanished meanwhile
        os.makedirs('./memory', exist_ok=True)
        with open('./memory/' + self.output_filename, 'wb') as f:
            pkl.dump(res_log, f)
        print("Finished!")
        print("Log file saved to: ", './memory/' + self.output_filename)
        return noise_level_best_K, noise_level_lowest_MSE, res_log
