In [None]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    df = pd.read_excel(file_name)
    return df

def read_txt(file_name):
    file = open(file_name)
    lines = file.readlines()
    return(lines[0])

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
import os
import glob

def get_files(subfolder, extension):
    dir = f"{os.getcwd()}/content/{subfolder}/"
    tables = glob.glob(f"{dir}*.{extension}")
    return tables

In [3]:
class Analizer:
    def __init__(self, boundary):
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary
    
    def has_minimum_requirements(self, df, sort_by="r2"):
        sorted_df = df.sort_values(by=sort_by, ascending=False)
        top_r2 = sorted_df.head(1)[sort_by].values[0]
        if top_r2 < self.boundary:
            return False
        return True
    
    def concatenate_df(self, df, architecture):
        if self.has_minimum_requirements(df):
            df['Architecture'] = architecture
            df = df.rename(columns={'Unnamed: 0': 'model'})
            self.results_df = pd.concat([self.results_df, df], ignore_index=True) 

    def create_results_df(self):
        for file in self.results:
            df = read_excel(file)
            architecture = read_txt(file.replace(".xlsx", ".txt"))
            self.concatenate_df(df, architecture)


    def keep_random_models(self, n_models=3):
        # Extract block part from 'model' column
        self.results_df['block'] = self.results_df['model'].apply(lambda x: '_'.join(x.split('_')[1:3]))
        
        # Group by 'block' and keep n_models randomly from each group
        def select_random_models(group):
            return group.sample(n=min(n_models, len(group)), random_state=1)
        
        # Apply the function and ensure grouping columns are excluded from the final DataFrame
        grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)
        self.results_df = grouped.reset_index(drop=True).drop(columns=['block'])

    def clean_folder(self, subfolder, extension, remove_last=True):
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if (remove_last):
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')            
            dataset_model = f"model_{file_parts[1]}_{file_parts[2]}" 
            if (remove_last == False):
                dataset_model = (f"{dataset_model}_{file_parts[3]}")
            if dataset_model not in models.values:
                os.remove(file)   
        
    def Analize(self):
        self.create_results_df()
        self.keep_random_models()
        self.results_df = self.results_df.sort_values(by="r2_sup", ascending=False, ignore_index=True)
        self.results_df.to_excel(f"./results/better_results.xlsx", index=True)
        display(self.results_df)


In [4]:
analize = Analizer(0.1)
analize.Analize()
analize.clean_folder(subfolder="dataset", extension="pkl")
analize.clean_folder(subfolder="results", extension="xlsx")
analize.clean_folder(subfolder="results", extension="txt")
analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



  grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)


Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,mse,mse_sup,mse_test,mse_val,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_1_5_9,0.648402,0.539812,0.729004,0.507711,0.102620,0.273188,0.230818,0.143215,0.163153,0.320343,0.521229,0.325389,38.553446,75.518030,"Hidden Size=[4], regularizer=0.02, learning_ra..."
1,model_1_35_7,0.677906,0.536701,0.754502,0.301777,0.094009,0.275034,0.209101,0.061447,0.151261,0.306609,0.561403,0.311438,38.728731,75.693315,"Hidden Size=[4], regularizer=0.02, learning_ra..."
2,model_1_36_2,0.680808,0.535730,0.755526,0.331228,0.093162,0.275610,0.208228,0.078903,0.150366,0.305224,0.565356,0.310031,38.746835,75.711419,"Hidden Size=[4], regularizer=0.02, learning_ra..."
3,model_1_24_2,0.680210,0.534618,0.742078,0.748033,0.093336,0.276271,0.219683,0.127060,0.151434,0.305510,0.564541,0.310322,38.743089,75.707673,"Hidden Size=[4], regularizer=0.02, learning_ra..."
4,model_1_35_3,0.676995,0.533813,0.753774,0.325668,0.094275,0.276749,0.209721,0.059344,0.151236,0.307042,0.560163,0.311878,38.723082,75.687665,"Hidden Size=[4], regularizer=0.02, learning_ra..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,model_2_49_9,0.774845,-3327.667846,-6431.723323,0.872723,0.065715,1976.041024,5479.014453,0.073132,0.129834,0.256350,0.535165,0.260387,71.444847,143.199627,"Hidden Size=[8], regularizer=0.02, learning_ra..."
536,model_3_48_0,0.553336,-38115.006706,-126619.048916,-0.058276,0.130367,22627.308106,107847.492148,0.096677,0.183532,0.361063,-0.905765,0.366750,102.074809,208.619785,"Hidden Size=[12], regularizer=0.02, learning_r..."
537,model_3_47_5,0.348690,-38350.193263,-128398.404036,-0.918482,0.190096,22766.925006,109363.041929,0.432760,0.181947,0.436000,-1.778922,0.442867,101.320450,207.865426,"Hidden Size=[12], regularizer=0.02, learning_r..."
538,model_3_47_2,0.244764,-39215.377638,-128370.810842,-1.584248,0.220429,23280.535825,109339.539673,0.582939,0.183957,0.469499,-2.222340,0.476893,101.024360,207.569336,"Hidden Size=[12], regularizer=0.02, learning_r..."
