In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import pickle as pkl
import re
import ast

In [2]:
def data_loader(filename):
    """Parse a cross-validation log file into a DataFrame of per-trial results.

    The file is scanned line by line and three kinds of lines are recognised:

    * lines containing ``"Running trials for"`` set the current ``n``, ``p``
      and ``m`` (read from ``n = <int>``, ``p = <int>``, ``m = <int>``);
    * lines containing ``"Cross validating alpha under noise level:"`` set the
      current noise level;
    * lines containing ``"Trial:"`` produce one output row built from the
      ``Best params:  {...}`` dict literal, the trial number and the
      ``Lowest Error`` value, tagged with the current ``n``, ``p``, ``m`` and
      noise level.

    Lines that cannot be parsed are reported on stdout and skipped.

    Parameters
    ----------
    filename : str
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed trial line. Columns are the keys of
        the parsed params dict plus ``trial_num``, ``lowest_error``, ``n``,
        ``p``, ``m`` and ``noise_level``. Empty if no trial line was parsed.
    """
    # One capturing group that accepts plain decimals, bare integers and
    # scientific notation. The previous r"(\d+\.\d+)" turned "1.5e-05" into
    # 1.5 and rejected values printed as "1e-05" or "2".
    number = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"

    def _capture(pattern, text):
        """Return group 1 of ``pattern`` in ``text`` or raise a descriptive ValueError."""
        match = re.search(pattern, text)
        if match is None:
            raise ValueError(f"pattern {pattern!r} not found in {text.strip()!r}")
        return match.group(1)

    # initializing variables
    n = p = m = noise_level = 0
    data = []
    with open(filename, "r") as file:
        for line in file:
            try:
                if "Running trials for" in line:
                    # Parse all three values before assigning so a malformed
                    # header cannot leave n/p/m only partly updated.
                    n, p, m = (
                        int(_capture(r"n = (\d+)", line)),
                        int(_capture(r"p = (\d+)", line)),
                        int(_capture(r"m = (\d+)", line)),
                    )
                elif "Cross validating alpha under noise level:" in line:
                    noise_level = float(_capture(r"noise level:  " + number, line))
                elif "Trial:" in line:
                    trial_num = int(_capture(r"Trial:  (\d+)", line))
                    # literal_eval only evaluates Python literals, so the
                    # params dict from the log is never executed as code.
                    params = ast.literal_eval(
                        _capture(r"Best params:  ({.*?})", line)
                    )
                    error = float(_capture(r"Lowest Error:  " + number, line))
                    params["trial_num"] = trial_num
                    params["lowest_error"] = error
                    params["n"] = n
                    params["p"] = p
                    params["m"] = m
                    params["noise_level"] = noise_level
                    data.append(params)
            except Exception as e:
                # Best effort: report the offending line and keep going.
                print(f"Error parsing line: {e}")

    return pd.DataFrame(data)

In [10]:
# Directory that holds the job output logs to analyse.
directory = "outputs/"

# Collect every '*.out' file in that directory. glob returns its matches in
# arbitrary, filesystem-dependent order, so sort them to keep the order in
# which logs are processed the same from run to run.
files = sorted(glob.glob(os.path.join(directory, "*.out")))

In [6]:
# Load every log file and stack the per-trial rows into one DataFrame.
# NOTE: this hard-coded list overrides any `files` value computed earlier.
files = ["outputs/slurm-7484898.out", "outputs/slurm-7484899.out"]

# Collect the per-file frames first and concatenate once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and it copied the accumulated
# frame on every iteration.
loaded_frames = []
for filename in files:
    df = data_loader(filename)
    loaded_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame. The
# original per-file row indices are kept, as they were with append.
final_results = pd.concat(loaded_frames) if loaded_frames else pd.DataFrame()

  final_results = final_results.append(df)
  final_results = final_results.append(df)


In [7]:
# Display the combined per-trial results gathered from the loaded log files.
final_results

Unnamed: 0,K,N_bag,atom_bag_percent,replace_flag,signal_bag_percent,trial_num,lowest_error,n,p,m,noise_level
0,13,1,0.6,False,1.0,0,0.016888,600,1000,20,0.12
1,15,1,0.7,False,1.0,1,0.013901,600,1000,20,0.12
2,19,75,0.7,False,1.0,2,0.016281,600,1000,20,0.12
3,17,75,0.5,False,1.0,3,0.016096,600,1000,20,0.12
4,21,50,0.6,False,0.8,4,0.015993,600,1000,20,0.12
...,...,...,...,...,...,...,...,...,...,...,...
45,9,10,0.7,True,1.0,5,0.042052,600,1000,20,0.20
46,9,10,0.5,True,1.4,6,0.049680,600,1000,20,0.20
47,25,75,0.8,True,0.6,7,0.055430,600,1000,20,0.20
48,13,100,0.8,True,1.4,8,0.046491,600,1000,20,0.20


In [25]:
# Log file holding the OMP summary lines.
OMP_filename = "outputs/slurm-7506859.out"

# One capturing group that accepts plain decimals, bare integers and
# scientific notation. The previous r"(\d+\.\d+)" would read "4.7e-05" as 4.7
# and skip values printed as "1e-05".
_FLOAT_RE = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"

with open(OMP_filename, "r") as f:
    text = f.read()

# Pull the noise level and the MSE out of every summary line, as floats.
noise_levels = [
    float(value)
    for value in re.findall(r"Average best K for noise level:  " + _FLOAT_RE, text)
]
average_mse = [float(value) for value in re.findall(r"with MSE:  " + _FLOAT_RE, text)]

# The two lists are paired by position, so a mismatch means the log is
# truncated or in an unexpected format.
if len(noise_levels) != len(average_mse):
    raise ValueError(
        f"{OMP_filename}: found {len(noise_levels)} noise levels "
        f"but {len(average_mse)} MSE values"
    )

OMP = pd.DataFrame({"noise_level": noise_levels, "average_mse": average_mse})

In [49]:
# Display the OMP table: one average MSE per noise level.
OMP

Unnamed: 0,noise_level,average_mse
0,0.02,0.000473
1,0.04,0.001892
2,0.06,0.00421
3,0.08,0.007445
4,0.1,0.011655
5,0.12,0.016733
6,0.14,0.022747
7,0.16,0.029592
8,0.18,0.037294
9,0.2,0.045774


In [34]:
# Log files holding the BOMP summary lines.
BOMP_filenames = ["outputs/slurm-7484898.out", "outputs/slurm-7484899.out"]

# One capturing group that accepts plain decimals, bare integers and
# scientific notation. The previous r"(\d+\.\d+)" would read "4.7e-05" as 4.7
# and skip values printed as "1e-05".
_FLOAT_RE = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"

# Build one frame per file and concatenate once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0.
bomp_frames = []
for BOMP_filename in BOMP_filenames:
    with open(BOMP_filename, "r") as f:
        text = f.read()

    # Pull the noise level and the MSE out of every summary line, as floats.
    noise_levels = [
        float(value) for value in re.findall(r"Noise level:  " + _FLOAT_RE, text)
    ]
    average_mse = [
        float(value) for value in re.findall(r"Avg Lowest MSE:  " + _FLOAT_RE, text)
    ]

    # The two lists are paired by position, so a mismatch means the log is
    # truncated or in an unexpected format.
    if len(noise_levels) != len(average_mse):
        raise ValueError(
            f"{BOMP_filename}: found {len(noise_levels)} noise levels "
            f"but {len(average_mse)} MSE values"
        )

    BOMP = pd.DataFrame({"noise_level": noise_levels, "average_mse": average_mse})
    bomp_frames.append(BOMP)

# Where a noise level appears in more than one file, keep its lowest average MSE.
Total_BOMP = pd.concat(bomp_frames).groupby("noise_level").min().reset_index()

  Total_BOMP = Total_BOMP.append(BOMP)
  Total_BOMP = Total_BOMP.append(BOMP)


In [50]:
# NOTE(review): BOMP is the frame built from the last file processed in the
# loop over BOMP_filenames, not the aggregated Total_BOMP that the merge uses.
# Confirm which of the two is meant to be shown here.
BOMP

Unnamed: 0,noise_level,average_mse
0,0.12,0.016454
1,0.14,0.022688
2,0.16,0.029814
3,0.18,0.037742
4,0.2,0.04656


In [48]:
# Join the OMP and BOMP tables on the noise levels they share (inner join),
# then give the two MSE columns short, method-specific names.
column_names = {"average_mse_OMP": "OMP_mse", "average_mse_Total_BOMP": "BOMP_mse"}
joined = OMP.merge(Total_BOMP, on="noise_level", suffixes=("_OMP", "_Total_BOMP"))
Merged_Results = joined.rename(columns=column_names)

# Relative reduction in MSE of BOMP versus OMP; a negative value means BOMP
# had the higher MSE at that noise level.
relative_mse = Merged_Results["BOMP_mse"] / Merged_Results["OMP_mse"]
Merged_Results["Improvements"] = 1 - relative_mse
Merged_Results

Unnamed: 0,noise_level,OMP_mse,BOMP_mse,Improvements
0,0.12,0.016733,0.016007,0.043392
1,0.14,0.022747,0.022219,0.023196
2,0.16,0.029592,0.029484,0.003638
3,0.18,0.037294,0.037354,-0.001612
4,0.2,0.045774,0.046252,-0.010448
