In [25]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load an Excel workbook into a DataFrame.

    Args:
        file_name: Path of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        pandas.DataFrame: Whatever ``pd.read_excel`` returns with its default
        arguments.
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of a text file.

    The line is returned exactly as stored, including its trailing newline
    when there is one.

    Args:
        file_name: Path of the text file to read.

    Returns:
        str: The first line of the file, or an empty string when the file is
        empty.
    """
    # The context manager closes the handle even if reading raises, and
    # readline() avoids loading the whole file just to use its first line.
    with open(file_name) as file:
        return file.readline()

In [26]:
import os
import glob

def get_files(subfolder, extension):
    """List the files with a given extension in ``content/<subfolder>/``.

    The folder is resolved against the current working directory.

    Args:
        subfolder: Name of the folder inside ``content``.
        extension: File extension to match, without the leading dot.

    Returns:
        list[str]: Paths matching ``*.<extension>`` in that folder, in the
        order ``glob.glob`` yields them (not sorted).
    """
    directory = f"{os.getcwd()}/content/{subfolder}/"
    # Escape the directory part so glob metacharacters (*, ?, [) that happen
    # to appear in the working directory or subfolder name are matched
    # literally; only "*.<extension>" is meant to act as a pattern.
    return glob.glob(f"{glob.escape(directory)}*.{extension}")

In [27]:
class Analizer:
    """Gathers result tables and keeps only the best-scoring rows.

    The workflow (see ``Analize``) loads every result workbook, keeps the
    tables that reach a minimum score, filters the combined rows, and writes
    the survivors to ``better_results.xlsx``. ``clean_folder`` can then delete
    files whose model is no longer present in the filtered table.
    """

    def __init__(self):
        # Paths of the result workbooks, listed once when the object is built.
        self.results = get_files(subfolder="results", extension="xlsx")
        # Accumulated rows of every table that met the minimum requirements.
        self.results_df = pd.DataFrame()

    def has_minimum_requirements(self, df, sort_by="r2", boundary=0.75):
        """Tell whether the best value of a column reaches a boundary.

        Args:
            df: Table to inspect.
            sort_by: Name of the column holding the score.
            boundary: Minimum value the best score must reach.

        Returns:
            bool: True when the largest value in ``df[sort_by]`` is at least
            ``boundary``. A table with no rows, or with no usable (non-NaN)
            score, returns False.

        Raises:
            KeyError: If ``sort_by`` is not a column of ``df``.
        """
        scores = df[sort_by]
        if scores.empty:
            # Nothing to compare: an empty table cannot qualify.
            return False
        # max() skips NaN and avoids sorting the whole table for one value.
        return bool(scores.max() >= boundary)

    def concatenate_df(self, df, architecture):
        """Append a table to ``self.results_df`` if it qualifies.

        The table is appended only when ``has_minimum_requirements`` accepts
        it. The appended copy gets an ``Architecture`` column and has its
        ``Unnamed: 0`` column renamed to ``model``. The caller's ``df`` is left
        untouched.

        Args:
            df: Table to append.
            architecture: Value stored in the ``Architecture`` column of
                every appended row.
        """
        if not self.has_minimum_requirements(df):
            return
        # rename() and assign() both return new frames, so the caller's
        # DataFrame is never modified.
        labelled = df.rename(columns={'Unnamed: 0': 'model'}).assign(Architecture=architecture)
        self.results_df = pd.concat([self.results_df, labelled], ignore_index=True)

    def create_results_df(self):
        """Rebuild ``self.results_df`` from the files in ``self.results``.

        For each workbook the text file with the same base name and a ``.txt``
        extension supplies the architecture label. The combined table is
        sorted by ``r2`` in descending order.
        """
        # Start from scratch so that calling this method again does not
        # append the same rows a second time.
        self.results_df = pd.DataFrame()
        for file in self.results:
            df = read_excel(file)
            # Swap only the real extension; a ".xlsx" substring elsewhere in
            # the path must not be rewritten.
            architecture = read_txt(f"{os.path.splitext(file)[0]}.txt")
            self.concatenate_df(df, architecture)
        # When no table qualified there is no "r2" column to sort by.
        if not self.results_df.empty:
            self.results_df = self.results_df.sort_values(by="r2", ascending=False, ignore_index=True)

    def discard_below_average(self, sort_by):
        """Keep only the rows whose ``sort_by`` value is at least the mean.

        Args:
            sort_by: Name of the column to compare against its own mean.
        """
        if self.results_df.empty:
            return
        column_mean = self.results_df[sort_by].mean()
        self.results_df = self.results_df[self.results_df[sort_by] >= column_mean]

    def discard_high_standard_deviation(self):
        """Drop rows whose validation/test gap is above the average gap.

        The gap is ``abs(r2_val - r2_test)`` for each row; rows whose gap is
        at most the mean gap are kept.
        """
        if self.results_df.empty:
            return
        r2_val, r2_test = self.results_df['r2_val'], self.results_df['r2_test']
        gaps = np.abs(r2_val - r2_test)
        # Inclusive comparison: a strict "<" would throw away a single
        # remaining row, because its gap always equals the mean.
        self.results_df = self.results_df[gaps <= gaps.mean()]

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files whose model is not present in ``self.results_df``.

        Each file name (without extension) is split on ``_``. The key
        ``model_<part1>_<part2>`` (plus ``_<part3>`` when ``remove_last`` is
        false) is compared with the ``model`` column; when ``remove_last`` is
        true the last ``_``-separated piece of every model name is dropped
        before comparing. Files with too few parts to build a key are left
        in place.

        Args:
            subfolder: Folder inside ``content`` to clean.
            extension: Extension of the files to consider.
            remove_last: Whether to compare on the model name without its
                last ``_``-separated piece.

        Raises:
            KeyError: If ``self.results_df`` has no ``model`` column; this
                happens before any file is deleted.
        """
        models = self.results_df["model"]
        if remove_last:
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        # A set gives constant-time membership checks for every file.
        kept_models = set(models)
        # Number of "_"-separated parts a file name needs to build its key.
        parts_needed = 3 if remove_last else 4
        for file in get_files(subfolder, extension):
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')
            if len(file_parts) < parts_needed:
                # The name does not follow the expected pattern: keep the
                # file rather than failing halfway through the deletion.
                continue
            dataset_model = '_'.join(["model", *file_parts[1:parts_needed]])
            if dataset_model not in kept_models:
                os.remove(file)

    def Analize(self):
        """Run the whole selection and save the surviving rows.

        Builds the combined table, discards rows below the mean ``r2`` and
        then below the mean ``r2_vt``, discards rows with an above-average
        validation/test gap, writes the result to ``better_results.xlsx`` and
        displays it.
        """
        self.create_results_df()
        self.discard_below_average(sort_by="r2")
        self.discard_below_average(sort_by="r2_vt")
        self.discard_high_standard_deviation()
        self.results_df.to_excel("better_results.xlsx", index=True)
        display(self.results_df)


In [28]:
# Build the filtered results table, save it to better_results.xlsx and show it.
analize = Analizer()
analize.Analize()

# Delete the files of every model that did not survive the filtering.
# Datasets and result workbooks are matched on the model name without its
# last "_"-separated piece (remove_last defaults to True).
analize.clean_folder(subfolder="dataset", extension="pkl")
# NOTE(review): the original cell ran this exact call twice in a row; the
# repeat was a no-op and has been dropped. If the second call was meant to
# target another extension (e.g. "txt"), add it back explicitly - confirm.
analize.clean_folder(subfolder="results", extension="xlsx")
# Plots and saved models are matched on the full model name.
analize.clean_folder(subfolder="results", extension="png", remove_last=False)
analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,r2_vt,mse,mse_sup,mse_test,mse_val,mse_vt,Architecture
0,model_4_9_11,0.99988,0.784694,0.999123,0.999582,0.999237,0.000503,1.439753,0.002064,0.000199,0.001132,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
1,model_4_9_12,0.99988,0.784798,0.999055,0.999558,0.999179,0.000504,1.439057,0.002224,0.00021,0.001217,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
2,model_4_9_10,0.999879,0.784578,0.999195,0.999608,0.999298,0.000507,1.440528,0.001894,0.000186,0.00104,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
3,model_4_9_13,0.999879,0.784891,0.998992,0.999535,0.999125,0.000508,1.438433,0.002373,0.000221,0.001297,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
4,model_4_9_14,0.999878,0.784975,0.998933,0.999514,0.999075,0.000514,1.437874,0.002511,0.000231,0.001371,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
5,model_4_9_9,0.999877,0.784449,0.999272,0.999635,0.999363,0.000517,1.441393,0.001713,0.000174,0.000943,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
6,model_4_9_15,0.999876,0.78505,0.998879,0.999494,0.999029,0.000521,1.437371,0.002638,0.000241,0.001439,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
8,model_4_9_8,0.999873,0.784304,0.999353,0.999664,0.999432,0.000535,1.442359,0.001523,0.00016,0.000842,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
12,model_4_9_7,0.999866,0.784143,0.999437,0.999693,0.999504,0.000563,1.443437,0.001325,0.000146,0.000736,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
18,model_4_9_6,0.999856,0.783963,0.999524,0.999722,0.999577,0.000605,1.444642,0.001121,0.000132,0.000627,"Hidden Size=[30, 16], regularizer=0.2, learnin..."
