In [5]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load the Excel workbook at ``file_name`` into a DataFrame.

    Thin wrapper around ``pd.read_excel`` called with its default options.

    Args:
        file_name: Path (or any object ``pd.read_excel`` accepts) of the
            workbook to read.

    Returns:
        Whatever ``pd.read_excel(file_name)`` returns.
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of a text file.

    The file is opened with the platform default encoding and is closed
    before returning, even if reading fails.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The first line, including its trailing newline if the file has one.
        An empty string is returned when the file is empty.
    """
    # `with` guarantees the handle is released; `readline()` avoids loading
    # the whole file just to keep its first line and yields "" on empty files.
    with open(file_name) as handle:
        return handle.readline()

In [6]:
import os
import glob

def get_files(subfolder, extension):
    """List the files with a given extension under ``content/<subfolder>``.

    The search is rooted at the current working directory and is not
    recursive.

    Args:
        subfolder: Name of the folder inside ``<cwd>/content`` to search.
        extension: File extension to match, without the leading dot.

    Returns:
        The list of matching paths as produced by ``glob.glob`` (empty when
        nothing matches).
    """
    pattern = f"{os.getcwd()}/content/{subfolder}/*.{extension}"
    return glob.glob(pattern)

In [7]:
class Analizer:
    """Gathers per-run metric tables and keeps those worth comparing.

    Each ``.xlsx`` file found by ``get_files("results", "xlsx")`` is read as
    one table; a sibling ``.txt`` file with the same stem supplies the text
    stored in the ``Architecture`` column. Tables whose best ``r2`` is below
    ``boundary`` are ignored.

    Attributes:
        results: Paths of the ``.xlsx`` result tables found at construction.
        results_df: Accumulated rows of every accepted table.
        boundary: Minimum value the best row of a table must reach.
    """

    def __init__(self, boundary):
        """Locate the result tables and remember the acceptance threshold.

        Args:
            boundary: Threshold compared against the best value of the metric
                column in ``has_minimum_requirements``.
        """
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary

    def has_minimum_requirements(self, df, sort_by="r2"):
        """Tell whether the best value of ``sort_by`` reaches the boundary.

        Args:
            df: Table holding a ``sort_by`` column.
            sort_by: Name of the metric column to inspect.

        Returns:
            True when the largest non-NaN value of the column is at least
            ``self.boundary``; False otherwise, including for a table with
            no rows or a column holding only NaN.

        Raises:
            KeyError: If a non-empty ``df`` has no ``sort_by`` column.
        """
        if df.empty:
            return False
        # max() skips NaN; an all-NaN column yields NaN, which compares False.
        best = df[sort_by].max()
        return bool(best >= self.boundary)

    def concatenate_df(self, df, architecture):
        """Append ``df`` to ``results_df`` if it meets the minimum requirement.

        The appended copy has its ``'Unnamed: 0'`` column renamed to
        ``'model'`` and an ``'Architecture'`` column set to ``architecture``.
        The caller's ``df`` is left untouched.

        Args:
            df: Metric table of one run.
            architecture: Value stored in the ``Architecture`` column of
                every appended row.
        """
        if not self.has_minimum_requirements(df):
            return
        labelled = (
            df.rename(columns={'Unnamed: 0': 'model'})
            .assign(Architecture=architecture)
        )
        if self.results_df.empty:
            # Nothing accumulated yet: adopt the frame directly instead of
            # concatenating onto an empty DataFrame.
            self.results_df = labelled.reset_index(drop=True)
        else:
            self.results_df = pd.concat([self.results_df, labelled], ignore_index=True)

    def create_results_df(self):
        """Rebuild ``results_df`` from every table listed in ``results``.

        The accumulator is reset first, so calling this method repeatedly
        does not duplicate rows. The result is ordered by ``r2``, best first.
        When no table is accepted, ``results_df`` stays an empty frame.
        """
        self.results_df = pd.DataFrame()
        for file in self.results:
            df = read_excel(file)
            # Swap only the extension; str.replace would also rewrite any
            # ".xlsx" appearing elsewhere in the path.
            stem, _ = os.path.splitext(file)
            architecture = read_txt(stem + ".txt")
            self.concatenate_df(df, architecture)
        if not self.results_df.empty:
            self.results_df = self.results_df.sort_values(by="r2", ascending=False, ignore_index=True)

    def discard_below_average(self, sort_by):
        """Keep only rows whose ``sort_by`` value is at least the column mean.

        Args:
            sort_by: Name of the column of ``results_df`` to filter on.
        """
        column_mean = self.results_df[sort_by].mean()
        self.results_df = self.results_df[self.results_df[sort_by] >= column_mean]

    def discard_high_standard_deviation(self):
        """Drop rows whose ``r2_val``/``r2_test`` gap exceeds the mean gap.

        The quantity compared is the absolute difference between the two
        columns (not a statistical standard deviation, despite the name).
        Rows exactly at the mean are kept, so a single row, or rows that all
        share the same gap, are not wiped out.
        """
        gaps = np.abs(self.results_df['r2_val'] - self.results_df['r2_test'])
        self.results_df = self.results_df[gaps <= gaps.mean()]

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files that do not correspond to a model in ``results_df``.

        A file name (without extension) is split on ``'_'``; its first part
        is replaced by ``model`` and the next two parts (three when
        ``remove_last`` is False) are compared with the ``model`` column.
        Files whose name has too few parts to build that key are left alone.

        WARNING: matching files are removed from disk permanently.

        Args:
            subfolder: Folder under ``content`` passed to ``get_files``.
            extension: File extension passed to ``get_files``.
            remove_last: When True, the last ``'_'``-separated part of each
                model name is dropped before comparing.
        """
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if remove_last:
            kept_names = {'_'.join(name.rsplit('_', 1)[:-1]) for name in models}
        else:
            kept_names = set(models)
        # Number of '_'-separated parts a file name needs to build its key.
        parts_needed = 3 if remove_last else 4
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')
            if len(file_parts) < parts_needed:
                # Not named like a model artefact; never delete it.
                continue
            dataset_model = "model_" + "_".join(file_parts[1:parts_needed])
            if dataset_model not in kept_names:
                os.remove(file)

    def Analize(self, output_path="./results/better_results.xlsx"):
        """Build the results table, rank it by ``mse``, export and show it.

        Args:
            output_path: Destination of the exported workbook. Its parent
                directory is created when missing.
        """
        self.create_results_df()
        # Optional filters, currently disabled:
        # self.discard_below_average(sort_by="r2")
        # self.discard_below_average(sort_by="r2_vt")
        # self.discard_high_standard_deviation()
        if not self.results_df.empty:
            # sort_values returns a new frame; the result must be kept.
            self.results_df = self.results_df.sort_values(by="mse", ignore_index=True)
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        self.results_df.to_excel(output_path, index=True)
        display(self.results_df)


In [9]:
# Accept only result tables whose best r2 reaches 0.8, then rank/export them.
analize = Analizer(boundary=0.8)
analize.Analize()
# Optional clean-up of artefacts belonging to discarded models.
# These calls delete files from disk; enable them deliberately.
# analize.clean_folder(subfolder="dataset", extension="pkl")
# analize.clean_folder(subfolder="results", extension="xlsx")
# analize.clean_folder(subfolder="results", extension="txt")
# analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,r2_vt,mse,mse_sup,mse_test,mse_val,mse_vt,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_1_0_22,0.859132,0.52982,0.829422,0.856945,0.850702,0.056784,0.279119,0.046171,0.058563,0.052367,0.105627,0.238293,0.74845,0.240207,115.737012,271.732517,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
1,model_1_0_21,0.856652,0.528751,0.823353,0.856202,0.847927,0.057784,0.279754,0.047814,0.058867,0.053341,0.106844,0.240382,0.744021,0.242313,115.702101,271.697606,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
2,model_1_0_20,0.836362,0.51285,0.795515,0.838822,0.827043,0.065962,0.289193,0.055349,0.065982,0.060666,0.114407,0.256831,0.70779,0.258894,115.437348,271.432853,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
3,model_1_0_19,0.813434,0.49565,0.764233,0.819691,0.803809,0.075205,0.299404,0.063816,0.073814,0.068815,0.122406,0.274235,0.666846,0.276437,115.175083,271.170588,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
4,model_1_0_18,0.786728,0.477274,0.72561,0.796426,0.77533,0.08597,0.310313,0.07427,0.083338,0.078804,0.131535,0.293206,0.619157,0.295561,114.907517,270.903022,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
5,model_1_0_17,0.760482,0.458826,0.68636,0.777222,0.74898,0.09655,0.321264,0.084894,0.0912,0.088047,0.140306,0.310724,0.572289,0.31322,114.675397,270.670902,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
6,model_1_0_15,0.7289,0.423814,0.640937,0.774015,0.729581,0.10928,0.342049,0.097189,0.092513,0.094851,0.152747,0.330575,0.515894,0.333231,114.427684,270.423189,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
7,model_1_0_16,0.726717,0.429073,0.634569,0.740649,0.707653,0.11016,0.338927,0.098913,0.106172,0.102542,0.150609,0.331904,0.511995,0.334569,114.411644,270.407149,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
8,model_1_0_14,0.724559,0.422901,0.633255,0.773817,0.726502,0.11103,0.342591,0.099268,0.092594,0.095931,0.153899,0.333212,0.508141,0.335888,114.395909,270.391414,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
9,model_1_0_13,0.698887,0.428919,0.670415,0.767364,0.737074,0.121378,0.339018,0.08921,0.095235,0.092223,0.164897,0.348394,0.462299,0.351192,114.217687,270.213191,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
