In [1]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load the spreadsheet at ``file_name`` with ``pd.read_excel``.

    Args:
        file_name: Path handed unchanged to ``pd.read_excel``.

    Returns:
        The DataFrame produced by ``pd.read_excel`` with its default options.
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of the text file at ``file_name``.

    The line is returned exactly as read, including its trailing newline
    when the file has one.

    Args:
        file_name: Path of the text file to open for reading.

    Returns:
        The first line as a string, or ``""`` when the file is empty.
    """
    # The context manager closes the handle even if reading fails, and
    # readline() avoids loading the whole file just to keep one line.
    with open(file_name) as handle:
        return handle.readline()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
import os
import glob

def get_files(subfolder, extension):
    """List files under ``<cwd>/content/<subfolder>/`` with the given extension.

    Args:
        subfolder: Folder name inside the ``content`` directory of the
            current working directory.
        extension: File extension to match, written without the leading dot.

    Returns:
        The list of matching paths as returned by ``glob.glob`` (empty when
        nothing matches).
    """
    pattern = f"{os.getcwd()}/content/{subfolder}/*.{extension}"
    return glob.glob(pattern)

In [3]:
class Analizer:
    """Collect result spreadsheets, keep the best rows and prune other files.

    Attributes:
        results: Paths returned by
            ``get_files(subfolder="results", extension="xlsx")``.
        results_df: Rows of every accepted spreadsheet, later narrowed down
            by the ``discard_*`` methods.
        boundary: Smallest best-``r2`` value a spreadsheet must reach to be
            accepted.
    """

    def __init__(self, boundary):
        """Locate the result spreadsheets and start with an empty table.

        Args:
            boundary: Threshold used by ``has_minimum_requirements``.
        """
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary

    def has_minimum_requirements(self, df, sort_by="r2"):
        """Tell whether the largest ``sort_by`` value reaches the boundary.

        Args:
            df: DataFrame holding a ``sort_by`` column.
            sort_by: Name of the column to inspect.

        Returns:
            ``True`` when the column maximum is at least ``self.boundary``;
            ``False`` otherwise, including for an empty frame or a column
            that only holds NaN.
        """
        if df.empty:
            return False
        # max() gives the same value as sorting descending and taking the
        # first row, without sorting the whole frame.
        best = df[sort_by].max()
        return bool(best >= self.boundary)

    def _prepare_df(self, df, architecture):
        """Return a labelled copy of ``df``, or ``None`` if it is rejected."""
        if not self.has_minimum_requirements(df):
            return None
        # assign() works on a copy, so the caller's frame is left untouched.
        return (
            df.assign(Architecture=architecture)
            .rename(columns={'Unnamed: 0': 'model'})
        )

    def concatenate_df(self, df, architecture):
        """Append ``df`` to ``results_df`` when it meets the boundary.

        The appended rows get an ``Architecture`` column set to
        ``architecture`` and the ``Unnamed: 0`` column renamed to ``model``.
        ``df`` itself is not modified.

        Args:
            df: Metrics table to append.
            architecture: Value stored in the ``Architecture`` column.
        """
        prepared = self._prepare_df(df, architecture)
        if prepared is None:
            return
        if self.results_df.empty:
            self.results_df = prepared.reset_index(drop=True)
        else:
            self.results_df = pd.concat(
                [self.results_df, prepared], ignore_index=True
            )

    def create_results_df(self):
        """Rebuild ``results_df`` from every spreadsheet in ``self.results``.

        Each spreadsheet is paired with the text file that has the same path
        but a ``.txt`` extension; the first line of that file becomes the
        ``Architecture`` value. Accepted rows are sorted by ``r2`` in
        descending order. The table is rebuilt from scratch on every call,
        and is left empty when no spreadsheet is accepted.
        """
        frames = []
        for file in self.results:
            df = read_excel(file)
            # splitext swaps only the real extension, even if ".xlsx" also
            # appears elsewhere in the path.
            architecture = read_txt(os.path.splitext(file)[0] + ".txt")
            prepared = self._prepare_df(df, architecture)
            if prepared is not None:
                frames.append(prepared)

        if not frames:
            self.results_df = pd.DataFrame()
            return
        # A single concat avoids re-copying the growing table per file.
        self.results_df = pd.concat(frames, ignore_index=True).sort_values(
            by="r2", ascending=False, ignore_index=True
        )

    def discard_below_average(self, sort_by):
        """Keep only rows whose ``sort_by`` value is at least the column mean.

        Does nothing when ``results_df`` is empty.

        Args:
            sort_by: Name of the column to compare against its own mean.
        """
        if self.results_df.empty:
            return
        column_mean = self.results_df[sort_by].mean()
        keep = self.results_df[sort_by] >= column_mean
        self.results_df = self.results_df.loc[keep]

    def discard_high_standard_deviation(self):
        """Drop rows whose ``r2_val``/``r2_test`` gap is not below average.

        Despite the name, the measure is the absolute difference between the
        ``r2_val`` and ``r2_test`` columns; rows are kept when that difference
        is strictly smaller than its mean over the table. Does nothing when
        ``results_df`` is empty.
        """
        if self.results_df.empty:
            return
        gaps = np.abs(self.results_df['r2_val'] - self.results_df['r2_test'])
        self.results_df = self.results_df.loc[gaps < gaps.mean()]

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files in a content subfolder that match no kept model.

        A file's base name (text before the first ``.``) is split on ``_``
        and rebuilt as ``model_<part1>_<part2>``, plus ``_<part3>`` when
        ``remove_last`` is false. The file is removed when that key is not
        among the ``model`` values of ``results_df``; with ``remove_last``
        true the last ``_``-separated piece of each model value is dropped
        before comparing.

        Files whose names have too few ``_`` parts are left in place, and
        nothing is deleted when ``results_df`` has no ``model`` column. A
        table that has the column but no rows still removes every matching
        file.

        Args:
            subfolder: Folder name passed to ``get_files``.
            extension: File extension passed to ``get_files``.
            remove_last: Whether to compare without the final name piece.
        """
        if "model" not in self.results_df.columns:
            # No analysed models to compare against: refuse to delete.
            return

        models = self.results_df["model"]
        if remove_last:
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        kept_models = set(models)

        required_parts = 3 if remove_last else 4
        for file in get_files(subfolder, extension):
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')
            if len(file_parts) < required_parts:
                # Name does not follow the expected pattern; leave it alone.
                continue
            dataset_model = f"model_{file_parts[1]}_{file_parts[2]}"
            if not remove_last:
                dataset_model = f"{dataset_model}_{file_parts[3]}"
            if dataset_model not in kept_models:
                os.remove(file)

    def Analize(self):
        """Build, filter, save and display the results table.

        Rows below the mean of ``r2_sup`` are dropped, then rows below the
        mean of ``r2_vt`` among the remainder, then rows filtered by
        ``discard_high_standard_deviation``. The outcome is written to
        ``better_results.xlsx`` and shown with ``display``.
        """
        self.create_results_df()
        self.discard_below_average(sort_by="r2_sup")
        self.discard_below_average(sort_by="r2_vt")
        self.discard_high_standard_deviation()
        self.results_df.to_excel("better_results.xlsx", index=True)
        display(self.results_df)


In [6]:
# Build the analyzer with a boundary of 0.9 and run the full analysis.
analize = Analizer(0.9)
analize.Analize()

# Prune each content folder in turn. Only the "models" folder is compared
# using the full model name (remove_last=False).
for cleanup_kwargs in (
    dict(subfolder="dataset", extension="pkl"),
    dict(subfolder="results", extension="xlsx"),
    dict(subfolder="results", extension="txt"),
    dict(subfolder="models", extension="keras", remove_last=False),
):
    analize.clean_folder(**cleanup_kwargs)



Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,r2_vt,mse,mse_sup,mse_test,mse_val,mse_vt,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_20_8_2,0.996385,0.822630,0.997405,0.991818,0.996196,0.024171,1.186073,0.027360,0.029501,0.028430,0.112986,0.155472,1.001637,0.162091,161.445165,255.298603,"Hidden Size=[19], regularizer=0.05, learning_r..."
1,model_20_9_5,0.996323,0.816195,0.990687,0.981859,0.986088,0.024588,1.229108,0.024101,0.094852,0.059476,0.093817,0.156804,1.001665,0.163480,161.411028,255.264466,"Hidden Size=[19], regularizer=0.05, learning_r..."
2,model_20_9_4,0.996313,0.816054,0.991079,0.983144,0.986992,0.024653,1.230045,0.023087,0.088136,0.055612,0.096612,0.157013,1.001669,0.163697,161.405719,255.259158,"Hidden Size=[19], regularizer=0.05, learning_r..."
3,model_20_8_3,0.996300,0.821566,0.997108,0.986751,0.994765,0.024743,1.193190,0.030489,0.047769,0.039129,0.122434,0.157301,1.001676,0.163997,161.398389,255.251828,"Hidden Size=[19], regularizer=0.05, learning_r..."
4,model_20_9_6,0.996299,0.816292,0.990286,0.980607,0.985201,0.024746,1.228457,0.025140,0.101398,0.063269,0.099275,0.157310,1.001676,0.164007,161.398148,255.251586,"Hidden Size=[19], regularizer=0.05, learning_r..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,model_34_6_2,0.978080,0.843617,0.956525,0.971898,0.968516,0.146577,1.045731,0.193378,0.434109,0.313743,0.086459,0.382854,1.007624,0.399153,189.840403,303.195855,"Hidden Size=[23], regularizer=0.2, learning_ra..."
156,model_34_6_1,0.976666,0.844301,0.953935,0.973205,0.968951,0.156036,1.041163,0.204898,0.413913,0.309406,0.087451,0.395014,1.008116,0.411831,189.715333,303.070785,"Hidden Size=[23], regularizer=0.2, learning_ra..."
173,model_34_6_0,0.976666,0.844301,0.953939,0.973206,0.968953,0.156037,1.041163,0.204879,0.413892,0.309386,0.087378,0.395015,1.008116,0.411831,189.715328,303.070780,"Hidden Size=[23], regularizer=0.2, learning_ra..."
280,model_6_5_6,0.960503,0.819204,0.949491,0.947298,0.952845,0.264116,1.208981,0.248980,0.636433,0.442706,1.887069,0.513922,1.023120,0.535801,132.662732,211.889661,"Hidden Size=[16], regularizer=0.2, learning_ra..."
