In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import pickle as pkl
import re
import ast

In [20]:
def parse_omp_logs_to_dataframe(input_text):
    """Parse the text of an OMP experiment log into one DataFrame row per trial.

    The parser walks the log line by line and keeps two pieces of state:

    * a line containing ``"Running trials for n ="`` sets the current
      ``(n, p, m)`` group from the integers found on that line;
    * a line containing ``"Cross validating alpha under noise level:"`` sets
      the current noise level from the last whitespace-separated token;
    * a line containing ``"Trial:"`` is turned into a row, but only once both
      a group and a noise level have been seen.

    Parameters
    ----------
    input_text : str
        Full contents of the log file.

    Returns
    -------
    pandas.DataFrame
        One row per recorded trial line. Columns are the keys of the dict
        literal found on the trial line plus ``Trial``, ``Lowest CV Error``,
        ``Training Error``, ``Testing Error``, ``n``, ``p``, ``m`` and
        ``Noise Level``. The frame has no rows and no columns when no trial
        line was recorded.

    Raises
    ------
    IndexError
        If a trial line has no ``{...}`` dict or lacks one of the three error
        fields, or if the group header held fewer than three integers.
    ValueError, SyntaxError
        If the ``{...}`` text is not a plain Python literal, or a numeric
        token cannot be converted.
    """
    # Decimal or scientific notation (0.0123, 8.5e-05, 1e-05), plus the
    # nan/inf spellings that float() accepts. Matching only ``\d+\.\d+`` would
    # read "8.5e-05" as 8.5 and fail outright on "1e-05".
    number = r"[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan|inf)"

    def read_metric(label, line):
        """Return the float that follows ``label`` on ``line``."""
        found = re.search(re.escape(label) + r"\s*(" + number + r")", line)
        if found is None:
            # IndexError is what an empty ``findall(...)[0]`` raises, so any
            # caller catching that keeps working; the message is the addition.
            raise IndexError(f"no numeric value for {label!r} in log line: {line!r}")
        return float(found.group(1))

    current_trial_group = None
    current_noise_level = None
    data = []

    for line in input_text.splitlines():
        if "Running trials for n =" in line:
            # Every integer on the header line, in order; the first three are
            # used as n, p and m below.
            current_trial_group = tuple(map(int, re.findall(r'\d+', line)))
        elif "Cross validating alpha under noise level:" in line:
            current_noise_level = float(line.split()[-1])
        elif "Trial:" in line and current_trial_group and current_noise_level is not None:
            params = re.search(r'{.*?}', line)
            if params is None:
                raise IndexError(f"no {{...}} dict found in log line: {line!r}")
            # SECURITY: this text comes from a log file, so it is parsed with
            # ast.literal_eval (literals only) rather than eval, which would
            # execute arbitrary expressions embedded in the log.
            trial_info = ast.literal_eval(params.group(0))
            # The trial number is the second whitespace-separated token.
            trial_info["Trial"] = int(line.split()[1])
            trial_info["Lowest CV Error"] = read_metric("Lowest CV Error:", line)
            trial_info["Training Error"] = read_metric("Training Error:", line)
            trial_info["Testing Error"] = read_metric("Testing Error:", line)
            trial_info["n"] = current_trial_group[0]
            trial_info["p"] = current_trial_group[1]
            trial_info["m"] = current_trial_group[2]
            trial_info["Noise Level"] = current_noise_level
            data.append(trial_info)

    return pd.DataFrame(data)


# Parse one OMP log and average the per-trial errors within each
# (n, p, m, Noise Level) group.
file_path = 'outputs/0718/slurm-8042307.out'  # replace with your .out file path
# Read through a context manager so the file handle is closed as soon as the
# text has been loaded instead of being left to the garbage collector.
with open(file_path) as log_file:
    logs_from_one_out = parse_omp_logs_to_dataframe(log_file.read())
omp_error_dataframe = (
    logs_from_one_out[['Noise Level', 'Testing Error', 'Training Error', 'n', 'p', 'm']]
    .groupby(['n', 'p', 'm', 'Noise Level'])
    .mean()
    .reset_index()
)


In [21]:
def parse_bomp_logs_to_dataframe(input_text):
    """Parse the text of a BOMP experiment log into one DataFrame row per trial.

    The log is read line by line while tracking two pieces of state:

    * a line containing ``"Running trials for n ="`` sets the current
      ``(n, p, m)`` group from the integers found on that line;
    * a line containing ``"Cross validating alpha under noise level:"`` sets
      the current noise level from the last whitespace-separated token;
    * a line containing ``"Trial:"`` becomes a row, but only once both a
      group and a noise level have been seen.

    Parameters
    ----------
    input_text : str
        Full contents of the log file.

    Returns
    -------
    pandas.DataFrame
        One row per recorded trial line. Columns are the keys of the dict
        literal found on the trial line plus ``Trial``, ``Lowest CV Error``,
        ``Training Error``, ``Testing Error``, ``n``, ``p``, ``m`` and
        ``Noise Level``. The frame has no rows and no columns when no trial
        line was recorded.

    Raises
    ------
    IndexError
        If a trial line has no ``{...}`` dict or lacks one of the three error
        fields, or if the group header held fewer than three integers.
    ValueError, SyntaxError
        If the ``{...}`` text is not a plain Python literal, or a numeric
        token cannot be converted.
    """
    # Accept decimal and scientific notation (0.0123, 8.5e-05, 1e-05) as well
    # as nan/inf. A plain ``\d+\.\d+`` pattern would turn "8.5e-05" into 8.5
    # and find nothing at all in "1e-05".
    float_pattern = r"[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan|inf)"

    def value_after(label, line):
        """Return the float that follows ``label`` on ``line``."""
        hit = re.search(re.escape(label) + r"\s*(" + float_pattern + r")", line)
        if hit is None:
            # Same exception type an empty ``findall(...)[0]`` produces, but
            # with a message that points at the offending line.
            raise IndexError(f"no numeric value for {label!r} in log line: {line!r}")
        return float(hit.group(1))

    current_trial_group = None
    current_noise_level = None
    data = []

    for line in input_text.splitlines():
        if "Running trials for n =" in line:
            # All integers on the header line, in order; the first three are
            # taken as n, p and m below.
            current_trial_group = tuple(map(int, re.findall(r'\d+', line)))
        elif "Cross validating alpha under noise level:" in line:
            current_noise_level = float(line.split()[-1])
        elif "Trial:" in line and current_trial_group and current_noise_level is not None:
            dict_text = re.search(r'{.*?}', line)
            if dict_text is None:
                raise IndexError(f"no {{...}} dict found in log line: {line!r}")
            # SECURITY: the dict text originates from a log file, so it is
            # parsed with ast.literal_eval (literals only) instead of eval,
            # which would run any expression embedded in the log.
            trial_info = ast.literal_eval(dict_text.group(0))
            # The trial number is the second whitespace-separated token.
            trial_info["Trial"] = int(line.split()[1])
            trial_info["Lowest CV Error"] = value_after("Lowest CV Error:", line)
            trial_info["Training Error"] = value_after("Training Error:", line)
            trial_info["Testing Error"] = value_after("Testing Error:", line)
            trial_info["n"] = current_trial_group[0]
            trial_info["p"] = current_trial_group[1]
            trial_info["m"] = current_trial_group[2]
            trial_info["Noise Level"] = current_noise_level
            data.append(trial_info)

    return pd.DataFrame(data)


# Parse one BOMP log and average the per-trial errors within each
# (n, p, m, Noise Level) group.
# NOTE: this cell rebinds `file_path` and `logs_from_one_out`, which the OMP
# cell also assigns, so their values depend on which cell ran last.
file_path = 'outputs/0718/slurm-8021496.out'  # replace with your .out file path
# Read through a context manager so the file handle is closed as soon as the
# text has been loaded instead of being left to the garbage collector.
with open(file_path) as log_file:
    logs_from_one_out = parse_bomp_logs_to_dataframe(log_file.read())
bomp_error_dataframe = (
    logs_from_one_out[['Noise Level', 'Testing Error', 'Training Error', 'n', 'p', 'm']]
    .groupby(['n', 'p', 'm', 'Noise Level'])
    .mean()
    .reset_index()
)


In [6]:
# Display the BOMP summary frame: mean Testing/Training Error for each
# (n, p, m, Noise Level) group.
bomp_error_dataframe 

Unnamed: 0,n,p,m,Noise Level,Testing Error,Training Error
0,300,500,10,0.02,0.001472,0.000432
1,300,500,10,0.04,0.003675,0.001159
2,300,500,10,0.06,0.006592,0.0019
3,300,500,10,0.08,0.010767,0.003482
4,300,500,10,0.1,0.017413,0.005383
5,300,500,20,0.02,0.006332,0.001103
6,300,500,20,0.04,0.007968,0.00137
7,300,500,20,0.06,0.011909,0.001728
8,300,500,20,0.08,0.016361,0.002726
9,300,500,20,0.1,0.021508,0.004331


In [10]:
# Display the OMP summary frame: mean Testing/Training Error for each
# (n, p, m, Noise Level) group.
# NOTE(review): the output saved under this cell shows columns prefixed with
# 'omp_omp_omp_', which the cell that builds omp_error_dataframe does not
# produce. The saved output looks stale (left over from an earlier kernel
# state) - confirm by re-running the notebook top to bottom.
omp_error_dataframe

Unnamed: 0,n,p,m,Noise Level,omp_omp_omp_Testing Error,omp_omp_omp_Training Error
0,300,500,10,0.02,0.015201,0.015418
1,300,500,10,0.04,0.016448,0.016805
2,300,500,10,0.06,0.018522,0.018987
3,300,500,10,0.08,0.021421,0.021965
4,300,500,10,0.1,0.026213,0.025723
5,300,500,20,0.02,0.050262,0.044469
6,300,500,20,0.04,0.052411,0.045978
7,300,500,20,0.06,0.054397,0.048302
8,300,500,20,0.08,0.058109,0.051249
9,300,500,20,0.1,0.061757,0.05495


In [22]:
# Columns that identify one experiment configuration; both summaries are
# joined on these, so they must keep their plain (unprefixed) names.
merge_keys = ['n', 'p', 'm', 'Noise Level']

# Prefix every column with the method name, then strip the prefix again from
# the join keys so that only the error columns end up method-specific.
temp_omp_error_dataframe = omp_error_dataframe.add_prefix('omp_').rename(
    columns={'omp_' + key: key for key in merge_keys}
)
temp_bomp_error_dataframe = bomp_error_dataframe.add_prefix('bomp_').rename(
    columns={'bomp_' + key: key for key in merge_keys}
)

# Inner join: only configurations present in both summaries are kept.
merged_df = pd.merge(
    temp_omp_error_dataframe,
    temp_bomp_error_dataframe,
    on=merge_keys,
    suffixes=('_omp', '_bomp'),
)

# Relative improvement of BOMP over OMP: (omp - bomp) / omp, so a positive
# value means BOMP has the lower error.
omp_testing = merged_df['omp_Testing Error']
omp_training = merged_df['omp_Training Error']
merged_df['testing_error_improvement'] = (omp_testing - merged_df['bomp_Testing Error']) / omp_testing
merged_df['training_error_improvement'] = (omp_training - merged_df['bomp_Training Error']) / omp_training

merged_df

Unnamed: 0,n,p,m,Noise Level,omp_Testing Error,omp_Training Error,bomp_Testing Error,bomp_Training Error,testing_error_improvement,training_error_improvement
0,300,500,10,0.02,0.015201,0.015418,0.001472,0.000432,0.903184,0.972011
1,300,500,10,0.04,0.016448,0.016805,0.003675,0.001159,0.776552,0.931046
2,300,500,10,0.06,0.018522,0.018987,0.006592,0.0019,0.644081,0.899957
3,300,500,10,0.08,0.021421,0.021965,0.010767,0.003482,0.497372,0.841477
4,300,500,10,0.1,0.026213,0.025723,0.017413,0.005383,0.335717,0.790722
5,300,500,20,0.02,0.050262,0.044469,0.006332,0.001103,0.874028,0.975206
6,300,500,20,0.04,0.052411,0.045978,0.007968,0.00137,0.847977,0.970205
7,300,500,20,0.06,0.054397,0.048302,0.011909,0.001728,0.781075,0.96422
8,300,500,20,0.08,0.058109,0.051249,0.016361,0.002726,0.718449,0.946814
9,300,500,20,0.1,0.061757,0.05495,0.021508,0.004331,0.651737,0.921192
