In [10]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load the spreadsheet at ``file_name`` into a DataFrame.

    Thin wrapper around ``pd.read_excel`` called with its default options.
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of the text file at ``file_name``.

    The line is returned exactly as stored, including its trailing newline
    when the file has one. An empty file yields ``""``.

    The file is opened in a ``with`` block so the handle is always closed,
    and only the first line is read instead of loading the whole file.
    """
    with open(file_name) as handle:
        return handle.readline()

In [11]:
import os
import glob

def get_files(subfolder, extension):
    """List the files with the given extension inside ``content/<subfolder>/``.

    The folder is resolved against the current working directory. The result
    is whatever ``glob.glob`` returns for that pattern: a list of path
    strings, empty when nothing matches.
    """
    pattern = f"{os.getcwd()}/content/{subfolder}/*.{extension}"
    return glob.glob(pattern)

In [12]:
class Analizer:
    """Gather per-run result tables, keep the runs that reach a score
    threshold, and prune files that belong to models no longer listed.

    Attributes:
        results: paths of the ``.xlsx`` result files found by
            ``get_files(subfolder="results", extension="xlsx")``.
        results_df: accumulated table of every accepted run.
        boundary: minimum best score a run must reach to be accepted.
    """

    def __init__(self, boundary):
        """Locate the result spreadsheets and start with an empty table."""
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary

    def has_minimum_requirements(self, df, sort_by="r2"):
        """Tell whether the best ``sort_by`` value in ``df`` reaches the boundary.

        Returns ``False`` for an empty frame and when the column holds only
        missing values (a missing score cannot satisfy the threshold).
        Raises ``KeyError`` when ``sort_by`` is not a column of a non-empty ``df``.
        """
        if df.empty:
            return False
        # max() skips NaN, like taking the head of a descending sort does.
        best_score = df[sort_by].max()
        # A NaN best score compares False here, so such a run is rejected.
        return bool(best_score >= self.boundary)

    def concatenate_df(self, df, architecture):
        """Append ``df`` to ``results_df`` when it meets the minimum requirements.

        The appended copy gets an ``Architecture`` column set to
        ``architecture`` and its ``'Unnamed: 0'`` column renamed to ``'model'``.
        The frame passed in by the caller is left untouched.
        """
        if not self.has_minimum_requirements(df):
            return
        # assign/rename both return new frames, so the caller's df is not mutated.
        prepared = df.assign(Architecture=architecture).rename(
            columns={'Unnamed: 0': 'model'}
        )
        self.results_df = pd.concat([self.results_df, prepared], ignore_index=True)

    def create_results_df(self):
        """Load every result spreadsheet and build ``results_df`` sorted by r2.

        For each spreadsheet the architecture description is read from the
        file with the same path but a ``.txt`` extension. When no run is
        accepted, ``results_df`` stays empty and nothing is sorted.
        """
        for file in self.results:
            df = read_excel(file)
            # Swap only the final extension; str.replace would also rewrite
            # any other ".xlsx" occurring earlier in the path.
            root, _ = os.path.splitext(file)
            architecture = read_txt(root + ".txt")
            self.concatenate_df(df, architecture)
        if not self.results_df.empty:
            self.results_df = self.results_df.sort_values(
                by="r2", ascending=False, ignore_index=True
            )

    def discard_below_average(self, sort_by):
        """Keep only the rows whose ``sort_by`` value is at least the column mean."""
        column_mean = self.results_df[sort_by].mean()
        self.results_df = self.results_df[self.results_df[sort_by] >= column_mean]

    def discard_high_standard_deviation(self):
        """Keep only rows whose |r2_val - r2_test| gap is below the mean gap.

        Despite the method name, the measure used is the absolute difference
        between the two columns, not a standard deviation.
        """
        gaps = np.abs(self.results_df['r2_val'] - self.results_df['r2_test'])
        self.results_df = self.results_df[gaps < gaps.mean()]

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files in ``content/<subfolder>`` that match no kept model.

        Each file's base name (text before the first ``.``) is split on ``_``
        and mapped to ``model_<part1>_<part2>``, or to
        ``model_<part1>_<part2>_<part3>`` when ``remove_last`` is false. That
        key is compared with the ``model`` column of ``results_df``; when
        ``remove_last`` is true the final ``_``-separated token of every model
        name is dropped before comparing.

        Files whose name has too few ``_``-separated parts to build a key are
        left in place rather than aborting the clean-up.

        Raises:
            KeyError: ``results_df`` has no ``model`` column (for example
                because no run was accepted); nothing is deleted in that case.
        """
        if "model" not in self.results_df.columns:
            raise KeyError("results_df has no 'model' column; nothing to compare against")
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if remove_last:
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        # Set membership instead of scanning the array once per file.
        kept_models = set(models)
        # Index 0 is an arbitrary prefix; the key uses parts 1..2 (or 1..3).
        parts_needed = 3 if remove_last else 4
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')
            if len(file_parts) < parts_needed:
                # Name does not follow the expected pattern: not ours to delete.
                continue
            dataset_model = "model_" + "_".join(file_parts[1:parts_needed])
            if dataset_model not in kept_models:
                os.remove(file)

    def Analize(self):
        """Build the results table, order it by mse, save it and display it.

        The table is written to ``./results/better_results.xlsx``; the target
        directory is created when missing. ``display`` is the notebook's rich
        display function.
        """
        self.create_results_df()
        # Optional filters, currently disabled:
        # self.discard_below_average(sort_by="r2")
        # self.discard_below_average(sort_by="r2_vt")
        # self.discard_high_standard_deviation()
        if not self.results_df.empty:
            # sort_values returns a new frame; the result must be assigned
            # back, otherwise the ordering is silently lost.
            self.results_df = self.results_df.sort_values(by="mse", ignore_index=True)
        os.makedirs("./results", exist_ok=True)
        self.results_df.to_excel("./results/better_results.xlsx", index=True)
        display(self.results_df)


In [14]:
# Build the results table with an r2 boundary of 0.7, then save and display it.
analize = Analizer(boundary=0.7)
analize.Analize()

# Delete the files in each folder that match no model left in the table.
# Only the "models" folder is matched on the full model name (remove_last=False).
for cleanup_target in (
    {"subfolder": "dataset", "extension": "pkl"},
    {"subfolder": "results", "extension": "xlsx"},
    {"subfolder": "results", "extension": "txt"},
    {"subfolder": "models", "extension": "keras", "remove_last": False},
):
    analize.clean_folder(**cleanup_target)



Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,r2_vt,mse,mse_sup,mse_test,mse_val,mse_vt,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_1_8_21,0.747632,-0.692000,0.696494,0.277498,0.581922,0.040630,1.004444,0.052438,0.092679,0.072559,0.074860,0.201570,0.549343,0.203189,116.406473,272.401978,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
1,model_1_9_0,0.745257,-0.690557,0.730845,0.800841,0.759192,0.041013,1.003588,0.050497,0.021536,0.036016,0.075512,0.202516,0.545103,0.204143,116.387743,272.383248,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
2,model_1_8_20,0.738218,-0.520450,0.691161,0.317846,0.594178,0.042146,0.902605,0.053360,0.087504,0.070432,0.077389,0.205295,0.532533,0.206944,116.333229,272.328734,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
3,model_1_8_19,0.738177,-0.554366,0.691323,0.317336,0.594070,0.042153,0.922739,0.053332,0.087569,0.070450,0.077485,0.205311,0.532459,0.206960,116.332912,272.328417,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
4,model_1_8_18,0.735794,-0.638891,0.689900,0.334816,0.599822,0.042536,0.972917,0.053577,0.085327,0.069452,0.078003,0.206244,0.528203,0.207900,116.314790,272.310295,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,model_1_8_6,-1.193558,-0.092466,0.571345,-14.221054,-4.838418,0.353156,0.648535,0.074061,1.952491,1.013276,0.118510,0.594270,-2.917068,0.599043,112.081689,268.077194,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
69,model_1_8_4,-1.598940,-0.182725,0.629960,-17.154656,-5.893375,0.418422,0.702116,0.063934,2.328800,1.196367,0.125865,0.646855,-3.640965,0.652051,111.742532,267.738037,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
70,model_1_8_3,-1.616472,-0.118904,0.599525,-17.214294,-5.930565,0.421244,0.664230,0.069192,2.336451,1.202821,0.124934,0.649033,-3.672271,0.654246,111.729086,267.724591,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
71,model_1_8_11,-1.763427,-0.633404,0.702248,-19.942786,-6.887768,0.444903,0.969659,0.051444,2.686450,1.368947,0.112986,0.667011,-3.934691,0.672368,111.619796,267.615301,"Hidden Size=[2, 12], regularizer=0.02, learnin..."
