In [5]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load a spreadsheet into a DataFrame.

    Args:
        file_name: Path (or any source accepted by ``pandas.read_excel``)
            of the workbook to load.

    Returns:
        The DataFrame produced by ``pandas.read_excel`` with its default
        options.
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of a text file.

    The file is opened with the platform default encoding and is always
    closed before returning, even if reading fails.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The first line of the file, including its trailing newline when the
        file has one. An empty file yields an empty string.
    """
    # The context manager guarantees the handle is released; readline() avoids
    # loading the whole file just to keep one line.
    with open(file_name) as file:
        return file.readline()

In [6]:
import os
import glob

def get_files(subfolder, extension):
    """List files with a given extension under ``<cwd>/content/<subfolder>/``.

    Args:
        subfolder: Name of the folder inside ``content`` to search.
        extension: File extension to match, without the leading dot.

    Returns:
        A sorted list of matching paths (empty when nothing matches or the
        folder does not exist).
    """
    # Escape only the working directory: it is not chosen by the caller and a
    # path such as ".../run[1]" would otherwise be read as a glob character
    # class and silently match nothing.
    base_dir = glob.escape(os.getcwd())
    pattern = f"{base_dir}/content/{subfolder}/*.{extension}"
    # glob returns entries in arbitrary filesystem order; sort so repeated
    # runs process files in the same sequence.
    return sorted(glob.glob(pattern))

In [7]:
class Analizer:
    """Collect per-run result spreadsheets, keep the best models, prune the rest.

    Attributes:
        results: Paths of the ``.xlsx`` result files found in the ``results``
            subfolder when the instance was created.
        results_df: DataFrame holding the rows currently retained.
        boundary: Minimum best score a result file must reach to be kept.
    """

    def __init__(self, boundary):
        """Locate the result files and start with an empty results table.

        Args:
            boundary: Threshold used by ``has_minimum_requirements``.
        """
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary

    def has_minimum_requirements(self, df, sort_by="r2"):
        """Tell whether the best value of a column reaches ``self.boundary``.

        Args:
            df: DataFrame of results for one file.
            sort_by: Name of the score column to inspect.

        Returns:
            True when the column maximum is at least ``self.boundary``;
            False otherwise, including for an empty frame or a column that
            holds only missing values.
        """
        if df.empty:
            return False
        # Only the maximum matters, so there is no need to sort every row.
        best_score = df[sort_by].max()
        return bool(best_score >= self.boundary)

    def _prepare_df(self, df, architecture):
        """Return a tagged copy of ``df`` if it qualifies, otherwise None."""
        if not self.has_minimum_requirements(df):
            return None
        # rename() returns a new frame, so the caller's DataFrame is untouched.
        prepared = df.rename(columns={'Unnamed: 0': 'model'})
        prepared['Architecture'] = architecture
        return prepared

    def concatenate_df(self, df, architecture):
        """Append ``df`` to ``results_df`` when it meets the minimum score.

        The appended rows get an ``Architecture`` column and the
        ``Unnamed: 0`` column is renamed to ``model``. The frame passed in is
        not modified.

        Args:
            df: DataFrame of results for one file.
            architecture: Value stored in the ``Architecture`` column.
        """
        prepared = self._prepare_df(df, architecture)
        if prepared is None:
            return
        self.results_df = pd.concat([self.results_df, prepared], ignore_index=True)

    def create_results_df(self):
        """Rebuild ``results_df`` from every file listed in ``self.results``.

        Each spreadsheet is paired with the ``.txt`` file of the same base
        name, whose first line is used as the architecture description.
        Qualifying frames are concatenated once and sorted by ``r2`` in
        descending order. The table is rebuilt from scratch on every call, so
        running it again does not duplicate rows. When no file qualifies,
        ``results_df`` becomes an empty DataFrame.
        """
        frames = []
        for file in self.results:
            df = read_excel(file)
            # splitext swaps only the real extension; str.replace would also
            # rewrite ".xlsx" appearing elsewhere in the path.
            architecture = read_txt(os.path.splitext(file)[0] + ".txt")
            prepared = self._prepare_df(df, architecture)
            if prepared is not None:
                frames.append(prepared)

        if not frames:
            self.results_df = pd.DataFrame()
            return

        # A single concat avoids re-copying the accumulated table per file.
        combined = pd.concat(frames, ignore_index=True)
        self.results_df = combined.sort_values(by="r2", ascending=False, ignore_index=True)

    def discard_below_average(self, sort_by):
        """Keep only rows whose ``sort_by`` value is at least the column mean.

        Args:
            sort_by: Name of the column to filter on.
        """
        if self.results_df.empty:
            return
        column_mean = self.results_df[sort_by].mean()
        self.results_df = self.results_df[self.results_df[sort_by] >= column_mean]

    def discard_high_standard_deviation(self):
        """Drop rows whose validation/test gap is not below the average gap.

        Despite the name, the spread measured here is the absolute difference
        between ``r2_val`` and ``r2_test`` for each row; rows are kept when
        that difference is strictly smaller than its mean over all rows.
        """
        if self.results_df.empty:
            return
        r2_val, r2_test = self.results_df['r2_val'], self.results_df['r2_test']
        gaps = np.abs(r2_val - r2_test)
        self.results_df = self.results_df[gaps < gaps.mean()]

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files in a folder that do not belong to a retained model.

        File names are split on ``_``; the second and third pieces (and the
        fourth when ``remove_last`` is false) are combined into
        ``model_<p1>_<p2>[_<p3>]`` and compared with the ``model`` column of
        ``results_df``. Files whose names have too few pieces to build that
        key are left untouched. If ``results_df`` has a ``model`` column but
        no rows, every file with a well-formed name is deleted.

        Args:
            subfolder: Folder inside ``content`` to clean.
            extension: Extension of the files to consider, without the dot.
            remove_last: When true, the text after the last ``_`` of each
                model name is ignored on both sides of the comparison.

        Raises:
            KeyError: If ``results_df`` has no ``model`` column.
        """
        files = get_files(subfolder, extension)
        model_names = self.results_df["model"]
        if remove_last:
            # Strip the final "_<suffix>"; names without "_" collapse to "".
            keep = {name.rpartition('_')[0] for name in model_names}
            required_parts = 3
        else:
            keep = set(model_names)
            required_parts = 4

        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')
            if len(file_parts) < required_parts:
                # Not a file this naming scheme describes: skip it rather than
                # crash half-way through the deletions.
                continue
            dataset_model = "model_" + "_".join(file_parts[1:required_parts])
            if dataset_model not in keep:
                os.remove(file)

    def Analize(self):
        """Run the full pipeline, save it to ``better_results.xlsx`` and show it.

        Builds the results table, drops rows below the mean of ``r2`` and then
        of ``r2_vt``, drops rows with a large validation/test gap, writes the
        outcome to an Excel file in the working directory and displays it.
        """
        self.create_results_df()
        self.discard_below_average(sort_by="r2")
        self.discard_below_average(sort_by="r2_vt")
        self.discard_high_standard_deviation()
        self.results_df.to_excel("better_results.xlsx", index=True)
        # display() is supplied by the IPython/Jupyter session.
        display(self.results_df)


In [8]:
# Keep result files whose best r2 reaches 0.9, filter them, then delete the
# artifacts that no longer belong to a retained model.
analize = Analizer(0.9)
analize.Analize()

# (subfolder, extension, remove_last) for each artifact folder to prune.
cleanup_targets = (
    ("dataset", "pkl", True),
    ("results", "xlsx", True),
    ("results", "txt", True),
    ("models", "keras", False),
)
for folder, ext, strip_suffix in cleanup_targets:
    analize.clean_folder(subfolder=folder, extension=ext, remove_last=strip_suffix)



Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,r2_vt,mse,mse_sup,mse_test,mse_val,mse_vt,mape,rmse,r2_adj,rsd,aic,bic,Architecture
10,model_40_9_4,0.996311,0.845551,0.971817,0.984129,0.981233,0.024669,1.032804,0.031499,0.056519,0.044009,0.638254,0.157064,1.000579,0.163751,361.404408,577.145429,"Hidden Size=[44], regularizer=0.05, learning_r..."
12,model_22_8_2,0.996304,0.773041,0.910237,0.969777,0.983483,0.024713,1.517677,0.051025,0.024908,0.037966,0.204214,0.157205,1.000647,0.163898,329.400819,525.639827,"Hidden Size=[40], regularizer=0.2, learning_ra..."
14,model_40_9_5,0.996292,0.845823,0.960656,0.983873,0.978379,0.024798,1.030984,0.043974,0.057433,0.050704,0.599440,0.157475,1.000582,0.164179,361.393956,577.134977,"Hidden Size=[44], regularizer=0.05, learning_r..."
17,model_40_9_3,0.996228,0.845159,0.981940,0.984346,0.983810,0.025221,1.035426,0.020186,0.055746,0.037966,0.681454,0.158811,1.000592,0.165572,361.360171,577.101192,"Hidden Size=[44], regularizer=0.05, learning_r..."
21,model_22_8_1,0.996184,0.773725,0.923181,0.977099,0.986396,0.025515,1.513104,0.043667,0.018874,0.031270,0.182506,0.159734,1.000668,0.166534,329.336979,525.575987,"Hidden Size=[40], regularizer=0.2, learning_ra..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,model_8_6_15,0.988669,0.776827,0.984015,0.976578,0.983223,0.075773,1.492356,0.175920,0.265908,0.220914,0.307327,0.275268,1.002248,0.286987,295.160037,471.897031,"Hidden Size=[36], regularizer=0.05, learning_r..."
329,model_8_7_0,0.988647,0.775878,0.990067,0.993276,0.993828,0.075918,1.498708,0.018484,0.097286,0.057885,0.280809,0.275533,1.002252,0.287263,295.156193,471.893187,"Hidden Size=[36], regularizer=0.05, learning_r..."
331,model_6_5_4,0.988606,0.835560,0.980167,0.993674,0.989076,0.076189,1.099614,0.224395,0.048410,0.136403,0.443963,0.276024,1.013021,0.287775,95.149063,149.998475,"Hidden Size=[11], regularizer=0.05, learning_r..."
332,model_12_7_1,0.988593,0.730010,0.987426,0.967181,0.988886,0.076277,1.805425,0.089377,0.076837,0.083107,0.280985,0.276183,1.010951,0.287941,103.146764,162.871679,"Hidden Size=[12], regularizer=0.03, learning_r..."
