In [1]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load the spreadsheet at ``file_name`` into a DataFrame.

    Thin wrapper around ``pandas.read_excel`` called with its default options.

    Args:
        file_name: Path of the Excel file to read.

    Returns:
        Whatever ``pandas.read_excel`` returns for that path.
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of the text file at ``file_name``.

    The line is returned exactly as stored, including its trailing newline
    when the file has one.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The first line of the file, or an empty string if the file is empty.
    """
    # The context manager closes the handle even when reading fails.
    with open(file_name) as handle:
        # readline() stops at the first newline, so the rest of the file is
        # never loaded; it yields "" for an empty file instead of raising.
        return handle.readline()

In [2]:
import os
import glob

def get_files(subfolder, extension):
    """List the files with a given extension under ``./content/<subfolder>/``.

    The search is anchored at the current working directory.

    Args:
        subfolder: Name of the directory below ``content`` to search.
        extension: File extension to match, without the leading dot.

    Returns:
        A sorted list of matching paths (empty when nothing matches).
    """
    # Escape the working directory so characters such as "[" in its path are
    # matched literally instead of being interpreted as glob syntax.
    base_dir = glob.escape(os.getcwd())
    pattern = f"{base_dir}/content/{subfolder}/*.{extension}"
    # glob returns matches in arbitrary order; sort for reproducible runs.
    return sorted(glob.glob(pattern))

In [5]:
class Analizer:
    """Build a summary of result tables and prune files of discarded models.

    Result spreadsheets are discovered through ``get_files``; tables whose
    best score reaches ``boundary`` are merged into ``results_df``.
    """

    def __init__(self, boundary):
        """Locate the result spreadsheets and start with an empty summary.

        Args:
            boundary: Minimum value the best score of a table must reach for
                the table to be kept.
        """
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary

    def has_minimum_requirements(self, df, sort_by="r2_sup"):
        """Tell whether the best ``sort_by`` value in ``df`` reaches the boundary.

        Args:
            df: Result table to inspect.
            sort_by: Name of the score column to evaluate.

        Returns:
            ``True`` when the column maximum is at least ``self.boundary``;
            ``False`` otherwise, including for an empty table or a column
            that holds only NaN.
        """
        if df.empty:
            return False
        best_score = df[sort_by].max()
        # ">=" (rather than "not <") makes a NaN maximum fail the check.
        return bool(best_score >= self.boundary)

    @staticmethod
    def _label(df, architecture):
        """Return a copy of ``df`` tagged with its architecture and a ``model`` column."""
        return df.assign(Architecture=architecture).rename(columns={'Unnamed: 0': 'model'})

    def concatenate_df(self, df, architecture):
        """Append ``df`` to ``results_df`` if it meets the minimum requirements.

        The caller's frame is left untouched; a labelled copy is appended.

        Args:
            df: Result table read from a spreadsheet.
            architecture: Text stored in the new ``Architecture`` column.
        """
        if not self.has_minimum_requirements(df):
            return
        labelled = self._label(df, architecture)
        self.results_df = pd.concat([self.results_df, labelled], ignore_index=True)

    def create_results_df(self):
        """Rebuild ``results_df`` from every spreadsheet listed in ``self.results``.

        Each kept table is labelled with the first line of the ``.txt`` file
        that shares its base name. The summary is rebuilt from scratch, so
        running this method again does not duplicate rows.
        """
        kept_tables = []
        for file in self.results:
            df = read_excel(file)
            if not self.has_minimum_requirements(df):
                continue
            # Swap only the final extension; str.replace would also rewrite
            # any ".xlsx" occurring elsewhere in the path.
            description_file = os.path.splitext(file)[0] + ".txt"
            kept_tables.append(self._label(df, read_txt(description_file)))
        # A single concat avoids re-copying the accumulated frame per file.
        if kept_tables:
            self.results_df = pd.concat(kept_tables, ignore_index=True)
        else:
            self.results_df = pd.DataFrame()

    def keep_random_models(self, n_models=3):
        """Keep at most ``n_models`` randomly chosen rows per model block.

        The block of a row is the second and third underscore-separated
        fields of its ``model`` value (``model_9_7_3`` -> ``9_7``). Sampling
        uses a fixed ``random_state`` so the selection is repeatable.

        Args:
            n_models: Maximum number of rows kept for each block.
        """
        if self.results_df.empty:
            return

        # Group by an external key array instead of a temporary column, so
        # the grouping key is never part of the frames handed to apply().
        block_keys = self.results_df['model'].map(
            lambda name: '_'.join(name.split('_')[1:3])
        ).to_numpy()

        def select_random_models(group):
            return group.sample(n=min(n_models, len(group)), random_state=1)

        sampled = self.results_df.groupby(block_keys, group_keys=False).apply(select_random_models)
        self.results_df = sampled.reset_index(drop=True)

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files in ``subfolder`` that belong to no model in ``results_df``.

        A file name is split on ``_``; its second and third fields (plus the
        fourth when ``remove_last`` is false) are compared with the kept
        model names. Files whose name has too few fields are left in place.

        Args:
            subfolder: Directory name passed to ``get_files``.
            extension: File extension passed to ``get_files``.
            remove_last: When true, the last ``_`` field of every model name
                is dropped before comparing.
        """
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if remove_last:
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        # Set membership replaces a full array scan for every file.
        kept_names = set(models)
        required_parts = 3 if remove_last else 4
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')
            if len(file_parts) < required_parts:
                # Not a recognisable model file; never delete what we cannot parse.
                continue
            dataset_model = f"model_{file_parts[1]}_{file_parts[2]}"
            if not remove_last:
                dataset_model = f"{dataset_model}_{file_parts[3]}"
            if dataset_model not in kept_names:
                os.remove(file)

    def Analize(self):
        """Build, sample, sort, save and display the summary table.

        Raises:
            ValueError: If no result table reaches the boundary, so there is
                nothing to summarise.
        """
        self.create_results_df()
        if self.results_df.empty:
            # Fail loudly before anything is written or cleaned up.
            raise ValueError(
                f"No result table reached the r2_sup boundary of {self.boundary}."
            )
        self.keep_random_models()
        self.results_df = self.results_df.sort_values(by="r2_sup", ascending=False, ignore_index=True)
        self.results_df.to_excel("./results/better_results.xlsx", index=True)
        display(self.results_df)


In [7]:
# Summarise the result tables with a boundary of 0.7.
analize = Analizer(0.7)
analize.Analize()

# Prune every artefact folder against the summary; the model files are
# matched on their full name, so remove_last is disabled for them.
for cleanup_kwargs in (
    {"subfolder": "dataset", "extension": "pkl"},
    {"subfolder": "results", "extension": "xlsx"},
    {"subfolder": "results", "extension": "txt"},
    {"subfolder": "models", "extension": "keras", "remove_last": False},
):
    analize.clean_folder(**cleanup_kwargs)



  grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)


Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,mse,mse_sup,mse_test,mse_val,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_9_9_7,0.999075,0.836974,0.853375,0.995802,0.003852,1.090153,0.524468,0.018273,0.058807,0.062061,1.001039,0.063038,253.118575,516.219435,"Hidden Size=[12, 6], regularizer=0.2, learning..."
1,model_9_7_7,0.998440,0.836586,0.832004,0.991258,0.006495,1.092752,0.600911,0.031899,0.097966,0.080595,1.001752,0.081864,252.073299,515.174158,"Hidden Size=[12, 6], regularizer=0.2, learning..."
2,model_9_9_3,0.999116,0.836514,0.852591,0.996302,0.003678,1.093231,0.527275,0.016096,0.072172,0.060649,1.000992,0.061604,253.210637,516.311497,"Hidden Size=[12, 6], regularizer=0.2, learning..."
3,model_9_9_0,0.999118,0.836146,0.851776,0.996775,0.003674,1.095692,0.530189,0.014038,0.086844,0.060611,1.000991,0.061566,253.213097,516.313956,"Hidden Size=[12, 6], regularizer=0.2, learning..."
4,model_9_10_2,0.999573,0.836108,0.848829,0.999761,0.001779,1.095944,0.540732,0.001912,0.036591,0.042173,1.000480,0.042838,254.663869,517.764729,"Hidden Size=[12, 6], regularizer=0.2, learning..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,model_5_0_0,0.649960,0.575646,0.473184,0.612215,1.457300,2.837660,1.884389,1.465194,2.551132,1.207187,23.402545,1.226199,129.246829,270.582001,"Hidden Size=[8, 4], regularizer=0.2, learning_..."
788,model_17_0_0,0.689945,0.539897,0.376236,0.671210,1.290835,3.076713,2.231168,1.375884,1.602226,1.136149,1.027220,1.154042,1585.489422,3309.778527,"Hidden Size=[36, 18], regularizer=0.2, learnin..."
789,model_13_0_0,0.677360,0.535809,0.390555,0.702122,1.343226,3.104050,2.179949,1.993767,2.955303,1.158976,1.064327,1.177229,769.409851,1606.548950,"Hidden Size=[24, 12], regularizer=0.2, learnin..."
790,model_2_1_0,0.704131,0.531832,0.534521,0.535008,1.231774,3.130641,1.664992,0.820874,1.479854,1.109853,0.514472,1.127332,49.583090,103.942772,"Hidden Size=[4, 2], regularizer=0.2, learning_..."
