In [1]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load the spreadsheet at ``file_name`` and return it as a DataFrame.

    This is a direct pass-through to ``pandas.read_excel`` using its
    default options.
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of the text file ``file_name``.

    The line is returned exactly as stored, including its trailing newline
    if the file has one. An empty file yields an empty string.

    Args:
        file_name: Path of the text file to read.

    Returns:
        str: The first line of the file, or ``""`` when the file is empty.
    """
    # The context manager closes the handle even if reading fails, and
    # readline() avoids loading the whole file just to keep one line.
    with open(file_name) as handle:
        return handle.readline()

In [2]:
import os
import glob

def get_files(subfolder, extension):
    """List the files with a given extension under ``<cwd>/content/<subfolder>/``.

    Args:
        subfolder: Name of the directory below ``content`` to search.
        extension: File extension to match, without the leading dot.

    Returns:
        list[str]: Matching paths in sorted order; empty when nothing matches.
    """
    # Only the working directory is escaped: it is not chosen by the caller,
    # and glob metacharacters in it (e.g. "[") would otherwise break matching.
    search_dir = f"{glob.escape(os.getcwd())}/content/{subfolder}/"
    # glob returns matches in arbitrary order; sorting makes the result stable
    # across runs and machines.
    return sorted(glob.glob(f"{search_dir}*.{extension}"))

In [3]:
class Analizer:
    """Gather result spreadsheets, filter them by score and keep a sample.

    Attributes are plain instance fields:
      * ``results``    - paths returned by ``get_files("results", "xlsx")``.
      * ``results_df`` - accumulated rows of every accepted spreadsheet.
      * ``boundary``   - minimum best score a spreadsheet must reach.
    """

    def __init__(self, boundary):
        """Collect the result file paths and start with an empty result frame.

        Args:
            boundary: Threshold compared against the best score of each
                spreadsheet in ``has_minimum_requirements``.
        """
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary

    def has_minimum_requirements(self, df, sort_by="r2"):
        """Tell whether the best ``sort_by`` value of ``df`` reaches the boundary.

        Args:
            df: DataFrame holding a ``sort_by`` column.
            sort_by: Name of the score column to inspect.

        Returns:
            bool: ``False`` when ``df`` is empty or its largest ``sort_by``
            value is below ``self.boundary``; ``True`` otherwise.
        """
        if df.empty:
            # Nothing to score, so the requirement cannot be met.
            return False
        # max() gives the same value as sorting descending and taking the first
        # row, without sorting the whole frame.
        best_score = df[sort_by].max()
        return not (best_score < self.boundary)

    def concatenate_df(self, df, architecture):
        """Append ``df`` to ``self.results_df`` if it meets the minimum score.

        The appended copy has its ``'Unnamed: 0'`` column renamed to ``'model'``
        and an ``'Architecture'`` column set to ``architecture``. The frame
        passed in by the caller is left untouched.

        Args:
            df: DataFrame read from one result spreadsheet.
            architecture: Value stored in the ``'Architecture'`` column.
        """
        if not self.has_minimum_requirements(df):
            return
        # rename/assign return new frames, so the caller's df is not mutated.
        tagged = df.rename(columns={'Unnamed: 0': 'model'}).assign(Architecture=architecture)
        if self.results_df.empty:
            # Skip concatenating with an empty frame; the outcome is the same.
            self.results_df = tagged.reset_index(drop=True)
        else:
            self.results_df = pd.concat([self.results_df, tagged], ignore_index=True)

    def create_results_df(self):
        """Read every result spreadsheet and accumulate the accepted ones.

        For each path in ``self.results`` the spreadsheet is loaded with
        ``read_excel`` and the text file with the same base name and a
        ``.txt`` extension is read with ``read_txt`` to get the architecture.
        Rows are appended to whatever ``self.results_df`` already holds.
        """
        for file in self.results:
            df = read_excel(file)
            # Swap only the extension; str.replace would also rewrite any
            # ".xlsx" occurring earlier in the path.
            architecture = read_txt(os.path.splitext(file)[0] + ".txt")
            self.concatenate_df(df, architecture)

    def keep_random_models(self, n_models=3):
        """Keep at most ``n_models`` randomly chosen rows per block.

        The block of a row is made of items 1 and 2 of its ``'model'`` value
        split on ``'_'``. Sampling uses ``random_state=1``.

        Args:
            n_models: Maximum number of rows kept for each block.
        """
        if self.results_df.empty:
            # No rows (and possibly no 'model' column): nothing to sample.
            return

        # Block key kept as a separate Series rather than a temporary column,
        # so the grouping key is never part of the frames handed to apply().
        block_key = self.results_df['model'].apply(
            lambda x: '_'.join(x.split('_')[1:3])
        ).rename('block')

        def select_random_models(group):
            # Cap the sample size at the group size to avoid a ValueError.
            return group.sample(n=min(n_models, len(group)), random_state=1)

        sampled = self.results_df.groupby(block_key, group_keys=False).apply(select_random_models)
        self.results_df = sampled.reset_index(drop=True)

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files in ``subfolder`` that match no kept model.

        Each file name (text before the first ``'.'``) is split on ``'_'`` and
        turned into ``model_<part1>_<part2>`` (plus ``_<part3>`` when
        ``remove_last`` is false). The file is removed when that identifier is
        not among the ``'model'`` values of ``self.results_df``. Files with too
        few ``'_'``-separated parts are left in place.

        Args:
            subfolder: Directory below ``content`` passed to ``get_files``.
            extension: File extension passed to ``get_files``.
            remove_last: When true, the last ``'_'``-separated item of every
                model name is dropped before comparing.
        """
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if remove_last:
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        # Build the lookup once instead of scanning the Series for every file.
        kept_models = set(models)
        # Number of name parts needed to build the identifier.
        parts_needed = 3 if remove_last else 4
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')
            if len(file_parts) < parts_needed:
                # Not a name this method can interpret; do not delete it.
                continue
            dataset_model = "model_" + "_".join(file_parts[1:parts_needed])
            if dataset_model not in kept_models:
                os.remove(file)

    def Analize(self):
        """Build, sample, rank and export the results, then display them.

        Rows are ordered by ``'r2_sup'`` descending and written to
        ``./results/better_results.xlsx`` with the index included.
        """
        self.create_results_df()
        self.keep_random_models()
        if not self.results_df.empty:
            # An empty frame has no 'r2_sup' column to sort by.
            self.results_df = self.results_df.sort_values(by="r2_sup", ascending=False, ignore_index=True)
        # to_excel does not create missing directories.
        os.makedirs("./results", exist_ok=True)
        self.results_df.to_excel("./results/better_results.xlsx", index=True)
        display(self.results_df)


In [4]:
# Accept only spreadsheets whose best "r2" is at least 0.1, then sample,
# rank and export the surviving models.
analize = Analizer(boundary=0.1)
analize.Analize()

# Clean-up calls below delete files that match no kept model. They are left
# disabled on purpose; enable them one at a time.
#analize.clean_folder(subfolder="dataset", extension="pkl")
#analize.clean_folder(subfolder="results", extension="xlsx")
#analize.clean_folder(subfolder="results", extension="txt")
#analize.clean_folder(subfolder="models", extension="keras", remove_last=False)


  grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)


Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,mse,mse_sup,mse_test,mse_val,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_2_4_5,0.971362,-0.386447,0.976041,0.969524,4.044395,205.027823,9.137215,4.737569,0.385348,2.011068,0.877810,2.042740,95.205336,201.750312,"Hidden Size=[12], regularizer=0.2, learning_ra..."
1,model_2_4_3,0.958644,-0.387368,0.966573,0.953823,5.840472,205.163972,12.747846,7.178361,0.507399,2.416707,0.823546,2.454768,94.470377,201.015353,"Hidden Size=[12], regularizer=0.2, learning_ra..."
2,model_2_3_0,0.924062,-0.391863,0.950522,0.939626,10.724260,205.828698,18.869220,15.399096,0.879434,3.274792,0.675996,3.326366,93.254983,199.799959,"Hidden Size=[12], regularizer=0.2, learning_ra..."
3,model_2_6_7,0.997278,-0.401907,0.996729,0.997597,0.384458,207.314097,1.247415,0.582713,0.084245,0.620047,0.988385,0.629812,99.911839,206.456815,"Hidden Size=[12], regularizer=0.2, learning_ra..."
4,model_2_5_4,0.988494,-0.418089,0.991942,0.988391,1.624853,209.707032,3.072853,1.758859,0.267807,1.274697,0.950910,1.294773,97.029165,203.574141,"Hidden Size=[12], regularizer=0.2, learning_ra..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,model_2_48_6,0.999998,-0.868421,0.963939,0.999998,0.000569,276.302099,13.752449,0.000608,0.002659,0.023851,1.000002,0.024867,112.943651,172.668566,"Hidden Size=[12], regularizer=0.2, learning_ra..."
398,model_2_48_9,0.999998,-0.868504,0.963935,0.999998,0.000517,276.314488,13.754031,0.000561,0.002531,0.022731,1.000002,0.023698,113.136155,172.861071,"Hidden Size=[12], regularizer=0.2, learning_ra..."
399,model_2_49_2,0.999999,-0.869003,0.963956,0.999994,0.000232,276.388254,13.745716,0.000176,0.001675,0.015237,1.000001,0.015886,114.736135,174.461050,"Hidden Size=[12], regularizer=0.2, learning_ra..."
400,model_2_49_6,1.000000,-0.869180,0.963957,0.999996,0.000140,276.414447,13.745372,0.000108,0.001325,0.011835,1.000000,0.012338,115.746945,175.471860,"Hidden Size=[12], regularizer=0.2, learning_ra..."
