In [24]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load an Excel workbook into a DataFrame.

    Args:
        file_name: Path of the Excel file, forwarded unchanged to
            ``pd.read_excel``.

    Returns:
        Whatever ``pd.read_excel(file_name)`` returns (a DataFrame for the
        default single-sheet read).
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of a text file.

    The file is opened in a ``with`` block so the handle is always closed,
    and only the first line is read instead of loading the whole file.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The first line exactly as stored (a trailing newline, if present, is
        kept). An empty file yields ``""`` instead of raising ``IndexError``.
    """
    with open(file_name) as file:
        return file.readline()

In [23]:
import os
import glob

def get_files(subfolder, extension):
    """List files with a given extension under ``<cwd>/content/<subfolder>/``.

    Args:
        subfolder: Name of the folder inside ``content`` to search.
        extension: File extension to match, without the leading dot.

    Returns:
        The list of matching paths produced by ``glob.glob`` (empty when
        nothing matches). No ordering is imposed here.
    """
    search_root = f"{os.getcwd()}/content/{subfolder}/"
    pattern = f"{search_root}*.{extension}"
    return glob.glob(pattern)

In [22]:
class Analizer:
    """Gather per-run result tables, keep the qualifying ones and rank them.

    Attributes:
        results: Paths of the ``.xlsx`` result tables returned by
            ``get_files(subfolder="results", extension="xlsx")``.
        results_df: Accumulated DataFrame of all accepted result tables.
        boundary: Minimum best score a table must reach to be accepted.
    """

    def __init__(self, boundary):
        """Locate the result tables and start with an empty accumulator.

        Args:
            boundary: Threshold compared against the best score of each table
                in ``has_minimum_requirements``.
        """
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary

    def has_minimum_requirements(self, df, sort_by="r2"):
        """Tell whether the best value of ``sort_by`` reaches the boundary.

        Args:
            df: Result table to check.
            sort_by: Column holding the score (default ``"r2"``).

        Returns:
            ``True`` when the maximum of ``df[sort_by]`` is at least
            ``self.boundary``. A table with no rows, or whose score column
            holds no valid number, is rejected instead of raising or being
            accepted by accident.
        """
        if df.empty:
            return False
        # max() skips NaN, matching the top row of a descending sort.
        top_score = df[sort_by].max()
        # A NaN maximum (no valid score) compares False and is rejected.
        return bool(top_score >= self.boundary)

    def concatenate_df(self, df, architecture):
        """Append ``df`` to ``results_df`` if it meets the minimum requirements.

        The caller's DataFrame is left untouched: the ``Architecture`` column
        and the ``'Unnamed: 0'`` -> ``'model'`` rename are applied to a copy.

        Args:
            df: Result table of one run.
            architecture: Value stored in the ``Architecture`` column of every
                row of ``df``.
        """
        if not self.has_minimum_requirements(df):
            return
        labelled = df.rename(columns={'Unnamed: 0': 'model'}).assign(Architecture=architecture)
        self.results_df = pd.concat([self.results_df, labelled], ignore_index=True)

    def create_results_df(self):
        """Rebuild ``results_df`` from every file listed in ``self.results``.

        Each table is read with ``read_excel`` and paired with the first line
        of the ``.txt`` file that shares its base name. The accumulator is
        reset first so calling this method again does not duplicate rows.
        The result is sorted by ``r2`` in descending order; when no table
        qualifies, ``results_df`` stays empty instead of raising ``KeyError``.
        """
        self.results_df = pd.DataFrame()
        for file in self.results:
            df = read_excel(file)
            # splitext only swaps the suffix; str.replace would also rewrite
            # ".xlsx" occurring anywhere else in the path.
            architecture = read_txt(os.path.splitext(file)[0] + ".txt")
            self.concatenate_df(df, architecture)
        if not self.results_df.empty:
            self.results_df = self.results_df.sort_values(by="r2", ascending=False, ignore_index=True)

    def discard_below_average(self, sort_by):
        """Keep only the rows whose ``sort_by`` value is at least the column mean.

        Args:
            sort_by: Name of the column in ``results_df`` to filter on.
        """
        column_mean = self.results_df[sort_by].mean()
        self.results_df = self.results_df[self.results_df[sort_by] >= column_mean]

    def discard_high_standard_deviation(self):
        """Drop rows whose validation/test gap is not below the average gap.

        Despite the name, the measure used is the absolute difference between
        ``r2_val`` and ``r2_test``; rows are kept only when that difference is
        strictly smaller than its mean over all rows.
        """
        r2_val, r2_test = self.results_df['r2_val'], self.results_df['r2_test']
        std_devs = np.abs(r2_val - r2_test)
        mean_std_dev = std_devs.mean()
        self.results_df = self.results_df[std_devs < mean_std_dev]

    @staticmethod
    def _model_key(file, remove_last):
        """Build the ``model_<a>_<b>[_<c>]`` key encoded in a file name.

        The base name (up to its first dot) is split on ``_``; the first part
        is ignored and the next two (or three, when ``remove_last`` is false)
        parts are joined behind the ``model`` prefix.

        Returns:
            The key, or ``None`` when the name has too few ``_``-separated
            parts to form one.
        """
        stem = os.path.basename(file).split('.')[0]
        parts = stem.split('_')
        needed = 3 if remove_last else 4
        if len(parts) < needed:
            return None
        return '_'.join(['model', *parts[1:needed]])

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files whose model key is absent from ``results_df["model"]``.

        Args:
            subfolder: Folder passed to ``get_files``.
            extension: File extension passed to ``get_files``.
            remove_last: When true, the last ``_``-separated part of each model
                name is dropped before comparing, and file keys use two
                numeric parts; otherwise three parts are used.

        Files whose names do not follow the expected pattern are skipped
        rather than raising ``IndexError`` (and are never deleted).
        """
        models = self.results_df["model"]
        if remove_last:
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        # A set gives constant-time membership checks inside the loop.
        kept_models = set(models.values)
        for file in get_files(subfolder, extension):
            dataset_model = self._model_key(file, remove_last)
            if dataset_model is None:
                continue
            if dataset_model not in kept_models:
                os.remove(file)

    def Analize(self):
        """Build the results table, rank it by ``mse``, save it and display it.

        The table is written to ``./results/better_results.xlsx`` (the folder
        is created when missing) and shown with ``display``.
        """
        self.create_results_df()
        # Optional filters, currently disabled:
        # self.discard_below_average(sort_by="r2")
        # self.discard_below_average(sort_by="r2_vt")
        # self.discard_high_standard_deviation()
        if not self.results_df.empty:
            # sort_values returns a new frame; the result must be assigned or
            # the sort has no effect.
            self.results_df = self.results_df.sort_values(by="mse", ignore_index=True)
        os.makedirs("./results", exist_ok=True)
        self.results_df.to_excel("./results/better_results.xlsx", index=True)
        display(self.results_df)


In [25]:
# Accept only result tables whose best r2 reaches 0.9, then rank, save and show them.
analize = Analizer(boundary=0.9)
analize.Analize()
# Destructive cleanup (deletes files of models not kept above); enable deliberately:
# analize.clean_folder(subfolder="dataset", extension="pkl")
# analize.clean_folder(subfolder="results", extension="xlsx")
# analize.clean_folder(subfolder="results", extension="txt")
# analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,r2_vt,mse,mse_sup,mse_test,mse_val,mse_vt,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_1_1_19,0.971797,0.597622,0.945376,0.968928,0.961813,0.010375,0.238869,0.010935,0.012039,0.011487,0.046944,0.10186,0.949637,0.102678,119.136621,275.132126,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
1,model_1_1_20,0.97155,0.591078,0.939946,0.96912,0.960129,0.010466,0.242754,0.012022,0.011965,0.011994,0.047249,0.102305,0.949196,0.103127,119.119179,275.114683,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
2,model_1_1_21,0.971076,0.583565,0.930522,0.969426,0.957191,0.010641,0.247214,0.013909,0.011846,0.012877,0.047471,0.103153,0.948351,0.103982,119.086166,275.081671,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
3,model_1_1_18,0.969501,0.598209,0.947409,0.966499,0.960924,0.01122,0.23852,0.010528,0.012981,0.011754,0.048902,0.105924,0.945538,0.106775,118.980125,274.97563,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
4,model_1_1_17,0.969296,0.598091,0.947192,0.966303,0.960726,0.011295,0.23859,0.010571,0.013057,0.011814,0.049155,0.10628,0.945172,0.107134,118.966706,274.962211,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
5,model_1_1_16,0.968938,0.59765,0.946506,0.965923,0.960253,0.011427,0.238852,0.010709,0.013204,0.011956,0.049316,0.106898,0.944532,0.107757,118.943503,274.939008,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
6,model_1_1_15,0.96814,0.600234,0.944046,0.965469,0.959142,0.011721,0.237318,0.011201,0.01338,0.012291,0.049746,0.108262,0.943108,0.109132,118.892804,274.888309,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
7,model_1_1_14,0.964725,0.598188,0.93726,0.962401,0.954908,0.012977,0.238533,0.01256,0.014568,0.013564,0.052695,0.113917,0.937009,0.114832,118.689138,274.684643,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
8,model_1_1_13,0.963833,0.602613,0.928706,0.962173,0.951915,0.013305,0.235906,0.014272,0.014657,0.014464,0.051441,0.115348,0.935417,0.116274,118.639213,274.634718,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
9,model_1_1_12,0.960696,0.59853,0.922918,0.959447,0.948233,0.014459,0.23833,0.015431,0.015713,0.015572,0.054363,0.120247,0.929814,0.121213,118.47282,274.468325,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
