In [4]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load an Excel workbook into a DataFrame.

    Args:
        file_name: Path of the workbook, forwarded unchanged to
            ``pd.read_excel`` (all other reader options keep their
            pandas defaults).

    Returns:
        Whatever ``pd.read_excel(file_name)`` returns.
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of a text file.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The first line, including its trailing newline when the file has
        one. An empty file yields ``""``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails; only the
    # first line is read instead of loading the whole file.
    with open(file_name) as handle:
        return handle.readline()

In [5]:
import os
import glob

def get_files(subfolder, extension):
    """List the files with a given extension under ``content/<subfolder>``.

    The search is rooted at the current working directory and is not
    recursive.

    Args:
        subfolder: Name of the folder inside ``content/`` to scan.
        extension: File extension to match, without the leading dot.

    Returns:
        A list of matching paths as produced by ``glob.glob`` (empty when
        nothing matches).
    """
    pattern = f"{os.getcwd()}/content/{subfolder}/*.{extension}"
    return glob.glob(pattern)

In [8]:
class Analizer:
    """Merge per-run result spreadsheets into one ranked table.

    Attributes:
        results: Paths of the ``.xlsx`` files found by
            ``get_files(subfolder="results", extension="xlsx")``.
        results_df: Accumulated rows of every accepted result file.
        boundary: Minimum value the best score of a file must reach for the
            file to be accepted.
    """

    def __init__(self, boundary):
        """Locate the result files and start with an empty table.

        Args:
            boundary: Acceptance threshold used by
                ``has_minimum_requirements``.
        """
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary

    def has_minimum_requirements(self, df, sort_by="r2"):
        """Tell whether the best value of a column reaches the boundary.

        Args:
            df: Frame holding the scores of one result file.
            sort_by: Column whose maximum is compared with ``self.boundary``.

        Returns:
            ``True`` when the column maximum is at least ``self.boundary``.
            ``False`` otherwise, including for an empty frame or a column
            that only holds missing values.
        """
        if df.empty:
            # Nothing to score; previously this raised IndexError.
            return False
        # max() skips missing values, matching what sorting in descending
        # order and taking the first row yields, without sorting the frame.
        return bool(df[sort_by].max() >= self.boundary)

    def concatenate_df(self, df, architecture):
        """Append a result frame to ``results_df`` if it qualifies.

        The frame is tagged with an ``Architecture`` column and its
        ``'Unnamed: 0'`` column is renamed to ``'model'``. The frame passed
        in is left untouched.

        Args:
            df: Frame read from one result file.
            architecture: Value stored in the ``Architecture`` column.
        """
        if not self.has_minimum_requirements(df):
            return
        # rename/assign build a new frame, so the caller's df is not mutated.
        tagged = df.rename(columns={'Unnamed: 0': 'model'}).assign(
            Architecture=architecture
        )
        # Skip the empty accumulator on the first append so it cannot
        # influence the dtypes of the result.
        frames = [tagged] if self.results_df.empty else [self.results_df, tagged]
        self.results_df = pd.concat(frames, ignore_index=True)

    def create_results_df(self):
        """Rebuild ``results_df`` from every file in ``self.results``.

        Each spreadsheet is paired with the ``.txt`` file of the same base
        name, whose first line is used as the architecture label. The table
        is rebuilt from scratch, so calling this twice does not duplicate
        rows. Accepted rows end up sorted by ``r2`` in descending order; if
        no file is accepted the table stays empty.
        """
        self.results_df = pd.DataFrame()
        for file in self.results:
            df = read_excel(file)
            # Swap only the extension; str.replace would also rewrite any
            # ".xlsx" occurring earlier in the path.
            architecture = read_txt(os.path.splitext(file)[0] + ".txt")
            self.concatenate_df(df, architecture)
        if not self.results_df.empty:
            self.results_df = self.results_df.sort_values(
                by="r2", ascending=False, ignore_index=True
            )

    def discard_below_average(self, sort_by):
        """Keep only the rows whose value is at least the column mean.

        Args:
            sort_by: Column of ``results_df`` used for the comparison.
        """
        column_mean = self.results_df[sort_by].mean()
        self.results_df = self.results_df[self.results_df[sort_by] >= column_mean]

    def discard_high_standard_deviation(self):
        """Drop rows whose ``r2_val``/``r2_test`` gap is not below average.

        Despite the method name, the measure is the absolute difference
        ``|r2_val - r2_test|`` of each row, compared with the mean of that
        difference over all rows.
        """
        gaps = np.abs(self.results_df['r2_val'] - self.results_df['r2_test'])
        self.results_df = self.results_df[gaps < gaps.mean()]

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files whose model id is absent from ``results_df``.

        File names are split on ``'_'``; the id compared against the
        ``model`` column is ``model_<part1>_<part2>``, extended with
        ``_<part3>`` when ``remove_last`` is false. Files whose name has too
        few parts to build that id are skipped rather than deleted.

        Args:
            subfolder: Folder inside ``content/`` to clean.
            extension: Extension of the files to consider.
            remove_last: When true, the last ``_``-separated component of
                every value in the ``model`` column is dropped before
                comparing.

        Raises:
            KeyError: If ``results_df`` has no ``model`` column.
        """
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if remove_last:
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        kept_ids = set(models.values)
        # Number of name parts needed: the prefix plus two or three id fields.
        needed_parts = 3 if remove_last else 4
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')
            if len(file_parts) < needed_parts:
                # Not a model artefact; leave it alone instead of failing
                # half-way through the clean-up.
                continue
            dataset_model = "model_" + "_".join(file_parts[1:needed_parts])
            if dataset_model not in kept_ids:
                os.remove(file)

    def Analize(self):
        """Build the table, order it by ``mse``, save it and display it.

        The table is written to ``./results/better_results.xlsx`` (the
        folder is created when missing) and shown with ``display``.
        """
        self.create_results_df()
        # self.discard_below_average(sort_by="r2")
        # self.discard_below_average(sort_by="r2_vt")
        # self.discard_high_standard_deviation()
        if not self.results_df.empty:
            # sort_values returns a new frame; the result must be assigned.
            # A stable sort keeps the r2 order among rows with equal mse.
            self.results_df = self.results_df.sort_values(
                by="mse", kind="mergesort", ignore_index=True
            )
        os.makedirs("./results", exist_ok=True)
        self.results_df.to_excel("./results/better_results.xlsx", index=True)
        display(self.results_df)


In [9]:
# Accept only result files whose best r2 is at least 0.9, then build the
# merged table, write it to ./results/better_results.xlsx and display it.
analize = Analizer(0.9)
analize.Analize()
# Optional clean-up, left disabled: each call below deletes (os.remove) the
# files in the given content/ subfolder whose model id is not present in
# analize.results_df. Enable only after checking the table above.
# analize.clean_folder(subfolder="dataset", extension="pkl")
# analize.clean_folder(subfolder="results", extension="xlsx")
# analize.clean_folder(subfolder="results", extension="txt")
# analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,r2_vt,mse,mse_sup,mse_test,mse_val,mse_vt,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_1_19_24,0.976054,0.491708,0.967868,0.998579,0.984638,0.012244,0.301744,0.016035,0.000786,0.009258,0.035105,0.110654,1.026709,0.114537,118.805398,195.871254,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
1,model_1_19_23,0.975907,0.493347,0.969229,0.998635,0.985287,0.012319,0.300771,0.015356,0.000755,0.008867,0.035117,0.110993,1.026873,0.114889,118.793147,195.859003,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
2,model_1_19_22,0.975744,0.494921,0.970545,0.99869,0.985915,0.012403,0.299837,0.014699,0.000725,0.008488,0.035123,0.111368,1.027054,0.115277,118.779672,195.845528,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
3,model_1_19_21,0.975563,0.496398,0.971796,0.998743,0.986512,0.012496,0.29896,0.014075,0.000695,0.008128,0.035131,0.111784,1.027257,0.115707,118.76475,195.830606,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
4,model_1_19_20,0.975388,0.498066,0.973126,0.998797,0.987146,0.012585,0.29797,0.013411,0.000665,0.007746,0.035122,0.112183,1.027452,0.116121,118.750489,195.816345,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
5,model_1_19_19,0.975172,0.499386,0.974265,0.998849,0.987691,0.012695,0.297186,0.012843,0.000637,0.007418,0.035126,0.112673,1.027692,0.116628,118.733056,195.798912,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
6,model_1_19_18,0.974968,0.501004,0.975522,0.998903,0.988292,0.0128,0.296226,0.012215,0.000607,0.007056,0.03511,0.113136,1.02792,0.117107,118.716667,195.782523,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
7,model_1_19_17,0.974726,0.502321,0.976614,0.998954,0.988815,0.012924,0.295444,0.011671,0.000578,0.006741,0.035106,0.113683,1.028191,0.117673,118.697383,195.763239,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
8,model_1_19_16,0.974452,0.503413,0.97757,0.999003,0.989275,0.013063,0.294796,0.011193,0.000551,0.006464,0.035111,0.114295,1.028495,0.118307,118.675889,195.741744,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
9,model_1_19_15,0.974181,0.504673,0.978591,0.999053,0.989765,0.013202,0.294047,0.010684,0.000523,0.006168,0.035102,0.1149,1.028798,0.118933,118.65478,195.720636,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
