In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import pickle as pkl
import re
import ast

In [86]:
def parse_omp_logs_to_dataframe(input_text):
    """Parse the text of an OMP experiment log into one DataFrame row per trial.

    The parser walks the log line by line and reacts to three kinds of lines:

    * a line containing ``"Running trials for n ="`` -- every integer on the
      line is collected; the first three are used as ``n``, ``p`` and ``m``;
    * a line containing ``"Cross validating alpha under noise level:"`` -- the
      last whitespace-separated token is the current noise level;
    * a line containing ``"Trial:"`` -- the first ``{...}`` literal on the line
      is read as a dict of parameters, the second whitespace-separated token
      is the trial number, and the ``Lowest CV Error`` / ``Training Error`` /
      ``Testing Error`` values are read from their ``label: value`` fields.

    Trial lines seen before both a trial-group line and a noise-level line
    are ignored.

    Parameters
    ----------
    input_text : str
        Full contents of the log file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed trial. Columns are the keys of the parameter dict
        followed by ``Trial``, ``Lowest CV Error``, ``Training Error``,
        ``Testing Error``, ``n``, ``p``, ``m`` and ``Noise Level``. Empty (no
        columns) when no trial line was parsed.

    Raises
    ------
    ValueError
        If a trial line lacks a parameter dict or one of the three error
        fields, or if the parameter dict is not a plain Python literal.
    """
    # Plain decimals, integers and scientific notation ("9.5e-05", "4.07e+07"),
    # plus nan/inf.  The previous pattern (\d+\.\d+) stopped at the mantissa, so
    # "9.5e-05" was silently read as 9.5.
    number = r'[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan|inf)'

    def read_metric(label, line):
        # Return the float that follows "<label>:" on the line.
        found = re.search(re.escape(label) + r':\s*(' + number + r')', line)
        if found is None:
            raise ValueError(f"Could not find '{label}' in log line: {line!r}")
        return float(found.group(1))

    current_trial_group = None
    current_noise_level = None
    data = []

    for line in input_text.splitlines():
        if "Running trials for n =" in line:
            current_trial_group = tuple(map(int, re.findall(r'\d+', line)))
        elif "Cross validating alpha under noise level:" in line:
            current_noise_level = float(line.split()[-1])
        elif "Trial:" in line and current_trial_group and current_noise_level is not None:
            dict_match = re.search(r'\{.*?\}', line)
            if dict_match is None:
                raise ValueError(f"Could not find a parameter dict in log line: {line!r}")
            # SECURITY: the log is external text, so it is parsed with
            # ast.literal_eval (literals only) instead of eval, which would
            # execute arbitrary expressions embedded in the file.
            try:
                trial_info = ast.literal_eval(dict_match.group(0))
            except (ValueError, SyntaxError) as exc:
                raise ValueError(
                    f"Parameter dict is not a plain literal in log line: {line!r}"
                ) from exc
            if not isinstance(trial_info, dict):
                raise ValueError(f"Expected a dict of parameters in log line: {line!r}")

            trial_info["Trial"] = int(line.split()[1])
            trial_info["Lowest CV Error"] = read_metric("Lowest CV Error", line)
            trial_info["Training Error"] = read_metric("Training Error", line)
            trial_info["Testing Error"] = read_metric("Testing Error", line)
            trial_info["n"] = current_trial_group[0]
            trial_info["p"] = current_trial_group[1]
            trial_info["m"] = current_trial_group[2]
            trial_info["Noise Level"] = current_noise_level
            data.append(trial_info)

    return pd.DataFrame(data)


file_path = 'outputs/0718/slurm-8042307.out'  # replace with your .out file path

# Read the log inside a context manager so the file handle is closed right
# away instead of lingering in the kernel.
with open(file_path) as log_file:
    omp_out_summary = parse_omp_logs_to_dataframe(log_file.read())

# Average the training/testing errors over all trials that share the same
# (n, p, m, Noise Level) setting.
omp_error_dataframe = (
    omp_out_summary[['Noise Level', 'Testing Error', 'Training Error', 'n', 'p', 'm']]
    .groupby(['n', 'p', 'm', 'Noise Level'])
    .mean()
    .reset_index()
)


In [87]:
def parse_bomp_logs_to_dataframe(input_text):
    """Parse the text of a BOMP experiment log into one DataFrame row per trial.

    The parser walks the log line by line and reacts to three kinds of lines:

    * a line containing ``"Running trials for n ="`` -- every integer on the
      line is collected; the first three are used as ``n``, ``p`` and ``m``;
    * a line containing ``"Cross validating alpha under noise level:"`` -- the
      last whitespace-separated token is the current noise level;
    * a line containing ``"Trial:"`` -- the first ``{...}`` literal on the line
      is read as a dict of parameters, the second whitespace-separated token
      is the trial number, and the ``Lowest CV Error`` / ``Training Error`` /
      ``Testing Error`` values are read from their ``label: value`` fields.

    Trial lines seen before both a trial-group line and a noise-level line
    are ignored.

    Parameters
    ----------
    input_text : str
        Full contents of the log file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed trial. Columns are the keys of the parameter dict
        followed by ``Trial``, ``Lowest CV Error``, ``Training Error``,
        ``Testing Error``, ``n``, ``p``, ``m`` and ``Noise Level``. Empty (no
        columns) when no trial line was parsed.

    Raises
    ------
    ValueError
        If a trial line lacks a parameter dict or one of the three error
        fields, or if the parameter dict is not a plain Python literal.
    """
    # Accept decimals, integers and scientific notation ("9.5e-05",
    # "4.07e+07"), plus nan/inf.  Matching only \d+\.\d+ would cut "9.5e-05"
    # down to 9.5 without any error.
    float_pattern = r'[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan|inf)'

    def metric_value(label, line):
        # Return the float that follows "<label>:" on the line.
        hit = re.search(re.escape(label) + r':\s*(' + float_pattern + r')', line)
        if hit is None:
            raise ValueError(f"Could not find '{label}' in log line: {line!r}")
        return float(hit.group(1))

    current_trial_group = None
    current_noise_level = None
    data = []

    for line in input_text.splitlines():
        if "Running trials for n =" in line:
            current_trial_group = tuple(map(int, re.findall(r'\d+', line)))
        elif "Cross validating alpha under noise level:" in line:
            current_noise_level = float(line.split()[-1])
        elif "Trial:" in line and current_trial_group and current_noise_level is not None:
            params_match = re.search(r'\{.*?\}', line)
            if params_match is None:
                raise ValueError(f"Could not find a parameter dict in log line: {line!r}")
            # SECURITY: log content is external text; ast.literal_eval only
            # accepts literals, whereas eval would run any expression found
            # in the file.
            try:
                trial_info = ast.literal_eval(params_match.group(0))
            except (ValueError, SyntaxError) as exc:
                raise ValueError(
                    f"Parameter dict is not a plain literal in log line: {line!r}"
                ) from exc
            if not isinstance(trial_info, dict):
                raise ValueError(f"Expected a dict of parameters in log line: {line!r}")

            trial_info["Trial"] = int(line.split()[1])
            trial_info["Lowest CV Error"] = metric_value("Lowest CV Error", line)
            trial_info["Training Error"] = metric_value("Training Error", line)
            trial_info["Testing Error"] = metric_value("Testing Error", line)
            trial_info["n"] = current_trial_group[0]
            trial_info["p"] = current_trial_group[1]
            trial_info["m"] = current_trial_group[2]
            trial_info["Noise Level"] = current_noise_level
            data.append(trial_info)

    return pd.DataFrame(data)


file_path = 'outputs/0718/slurm-8021496.out'  # replace with your .out file path

# Read the log inside a context manager so the file handle is closed right
# away instead of lingering in the kernel.
with open(file_path) as log_file:
    bomp_out_summary = parse_bomp_logs_to_dataframe(log_file.read())

# Average the training/testing errors over all trials that share the same
# (n, p, m, Noise Level) setting.
bomp_error_dataframe = (
    bomp_out_summary[['Noise Level', 'Testing Error', 'Training Error', 'n', 'p', 'm']]
    .groupby(['n', 'p', 'm', 'Noise Level'])
    .mean()
    .reset_index()
)


In [90]:
# Count how many parsed BOMP trial rows have each `best_k` value.
bomp_out_summary['best_k'].value_counts()

100    73
200    12
10      8
40      2
20      2
90      1
60      1
30      1
Name: best_k, dtype: int64

In [50]:
# Tag every column with the algorithm it came from, then give the four join
# keys their plain names back so both tables can be merged on them.
temp_omp_error_dataframe = omp_error_dataframe.copy().add_prefix('omp_').rename(
    columns={'omp_n':'n', 'omp_p':'p', 'omp_m':'m', 'omp_Noise Level':'Noise Level'}
)
temp_bomp_error_dataframe = bomp_error_dataframe.copy().add_prefix('bomp_').rename(
    columns={'bomp_n':'n', 'bomp_p':'p', 'bomp_m':'m', 'bomp_Noise Level':'Noise Level'}
)

# Join the OMP and BOMP averages on the shared setting, then append the
# relative improvement of BOMP over OMP, (omp - bomp) / omp, for both errors.
merged_df = temp_omp_error_dataframe.merge(
    temp_bomp_error_dataframe,
    on=['n', 'p', 'm', 'Noise Level'],
    suffixes=('_omp', '_bomp'),
).assign(
    testing_error_improvement=lambda frame: frame['omp_Testing Error']
    .sub(frame['bomp_Testing Error'])
    .div(frame['omp_Testing Error']),
    training_error_improvement=lambda frame: frame['omp_Training Error']
    .sub(frame['bomp_Training Error'])
    .div(frame['omp_Training Error']),
)

# One sub-frame per (n, p, m) combination, in the order of `groups.groups`.
groups = merged_df.groupby(['n', 'p', 'm'])
list_of_groups = list(map(groups.get_group, groups.groups))

merged_df


Unnamed: 0,n,p,m,Noise Level,omp_Testing Error,omp_Training Error,bomp_Testing Error,bomp_Training Error,testing_error_improvement,training_error_improvement
0,300,500,10,0.02,0.015201,0.015418,0.001472,0.000432,0.903184,0.972011
1,300,500,10,0.04,0.016448,0.016805,0.003675,0.001159,0.776552,0.931046
2,300,500,10,0.06,0.018522,0.018987,0.006592,0.0019,0.644081,0.899957
3,300,500,10,0.08,0.021421,0.021965,0.010767,0.003482,0.497372,0.841477
4,300,500,10,0.1,0.026213,0.025723,0.017413,0.005383,0.335717,0.790722
5,300,500,20,0.02,0.050262,0.044469,0.006332,0.001103,0.874028,0.975206
6,300,500,20,0.04,0.052411,0.045978,0.007968,0.00137,0.847977,0.970205
7,300,500,20,0.06,0.054397,0.048302,0.011909,0.001728,0.781075,0.96422
8,300,500,20,0.08,0.058109,0.051249,0.016361,0.002726,0.718449,0.946814
9,300,500,20,0.1,0.061757,0.05495,0.021508,0.004331,0.651737,0.921192


In [52]:
# Print one LaTeX table per (n, p, m) group, with shortened column headers.
for group in list_of_groups:
    # Rename on a fresh frame (drop() returns a new object) so the frames held
    # in `list_of_groups` keep their original column names for other cells.
    table = group.drop(columns=['n', 'p', 'm'])
    table.columns = (
        table.columns
        # regex=False: plain substring removal, independent of the pandas
        # version's default for `regex`.
        .str.replace('Error', '', regex=False)
        .str.replace('_error', '', regex=False)
        # 'omp_Testing Error' -> 'omp_Testing ' would leave a trailing space.
        .str.strip()
    )
    print(f"m = {group['m'].iloc[0]}")
    print(table.to_latex(index=False))

m = 10
\begin{tabular}{rrrrrrr}
\toprule
 Noise Level &  omp\_Testing  &  omp\_Training  &  bomp\_Testing  &  bomp\_Training  &  testing\_improvement &  training\_improvement \\
\midrule
        0.02 &      0.015201 &       0.015418 &       0.001472 &        0.000432 &             0.903184 &              0.972011 \\
        0.04 &      0.016448 &       0.016805 &       0.003675 &        0.001159 &             0.776552 &              0.931046 \\
        0.06 &      0.018522 &       0.018987 &       0.006592 &        0.001900 &             0.644081 &              0.899957 \\
        0.08 &      0.021421 &       0.021965 &       0.010767 &        0.003482 &             0.497372 &              0.841477 \\
        0.10 &      0.026213 &       0.025723 &       0.017413 &        0.005383 &             0.335717 &              0.790722 \\
\bottomrule
\end{tabular}

m = 20
\begin{tabular}{rrrrrrr}
\toprule
 Noise Level &  omp\_Testing  &  omp\_Training  &  bomp\_Testing  &  bomp\_Training  &  te

  print(group.drop(columns=['n', 'p', 'm']).to_latex(index=False))
  print(group.drop(columns=['n', 'p', 'm']).to_latex(index=False))


In [92]:
# Pickled BOMP run to inspect. Alternative run from the 0718 batch:
#   'outputs/0718/BOMP_300_500_1020_nr_0718_0718-205341.pkl'
BOMP_log_name = "memory/BOMP_30_50_10_nr_0719_0719-181603.pkl"

# NOTE(review): unpickling executes code stored in the file -- only load
# pickles produced by your own runs.
npm_lists = None
with open(BOMP_log_name, mode='rb') as f:
    npm_lists = pkl.load(f)

# Keep only the first entry of the loaded sequence for inspection.
single_npm = npm_lists[0]

In [93]:
# Pull the entry stored under 'log' out of the first loaded result.
single_npm_logs = single_npm['log']


In [94]:
# Show the fixed parameters recorded with this result.
single_npm['parameters']['fixed_params']

{'agg_func': 'weight',
 'random_seed': 1,
 'replace_flag': False,
 'select_atom_percent': 0,
 'Bag_lst': [1, 20],
 'K_lst': [1, 5, 10, 15, 20, 25, 30]}

In [95]:
# Grab the 'Bag_lst' and 'K_lst' entries of the fixed parameters in one step.
single_npm_Bag_lst, single_npm_K_lst = (
    single_npm['parameters']['fixed_params'][name] for name in ('Bag_lst', 'K_lst')
)

In [82]:
# Show the K values taken from the fixed parameters.
# NOTE(review): the saved output of this cell comes from an earlier execution
# (In [82], before the In [92]-[95] reload) and lists different K values than
# the 'K_lst' shown in the fixed_params output -- re-run to refresh it.
single_npm_K_lst

[1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 250, 300]

In [71]:
# List the fields available in the first log entry.
single_npm_logs[0].keys()

dict_keys(['noise_level', 'trial', 'cv_error_lst', 'lowest_cv_error', 'training_error', 'best_params', 'param_lst', 'testing_error', 'best_bag_k_error_matrix'])

In [96]:
# Show the 'best_bag_k_error_matrix' stored in the first log entry.
# NOTE(review): rows look like (bag, K, error) triples -- the first two columns
# match the Bag_lst / K_lst values -- TODO confirm against the code that writes it.
single_npm_logs[0]['best_bag_k_error_matrix']

array([[1.00000000e+00, 1.00000000e+00, 2.36773828e-01],
       [1.00000000e+00, 5.00000000e+00, 2.02679618e-01],
       [1.00000000e+00, 1.00000000e+01, 2.62788738e-01],
       [1.00000000e+00, 1.50000000e+01, 9.61296683e-01],
       [1.00000000e+00, 2.00000000e+01, 1.72704300e+01],
       [1.00000000e+00, 2.50000000e+01, 4.57723977e+00],
       [1.00000000e+00, 3.00000000e+01, 8.88470908e+01],
       [2.00000000e+01, 1.00000000e+00, 6.25748513e-02],
       [2.00000000e+01, 5.00000000e+00, 1.32439462e-01],
       [2.00000000e+01, 1.00000000e+01, 1.97924039e-01],
       [2.00000000e+01, 1.50000000e+01, 2.87392801e-01],
       [2.00000000e+01, 2.00000000e+01, 4.72106712e+04],
       [2.00000000e+01, 2.50000000e+01, 1.18169224e+02],
       [2.00000000e+01, 3.00000000e+01, 4.07873357e+07]])