In [5]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    df = pd.read_excel(file_name)
    return df

def read_txt(file_name):
    file = open(file_name)
    lines = file.readlines()
    return(lines[0])

In [6]:
import os
import glob

def get_files(subfolder, extension):
    dir = f"{os.getcwd()}/content/{subfolder}/"
    tables = glob.glob(f"{dir}*.{extension}")
    return tables

In [7]:
class Analizer:
    def __init__(self, boundary):
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary
    
    def has_minimum_requirements(self, df, sort_by="r2"):
        sorted_df = df.sort_values(by=sort_by, ascending=False)
        top_r2 = sorted_df.head(1)[sort_by].values[0]
        if top_r2 < self.boundary:
            return False
        return True
    
    def concatenate_df(self, df, architecture):
        if self.has_minimum_requirements(df):
            df['Architecture'] = architecture
            df = df.rename(columns={'Unnamed: 0': 'model'})
            self.results_df = pd.concat([self.results_df, df], ignore_index=True) 

    def create_results_df(self):
        for file in self.results:
            df = read_excel(file)
            architecture = read_txt(file.replace(".xlsx", ".txt"))
            self.concatenate_df(df, architecture)


    def keep_random_models(self, n_models=3):
        # Extract block part from 'model' column
        self.results_df['block'] = self.results_df['model'].apply(lambda x: '_'.join(x.split('_')[1:3]))
        
        # Group by 'block' and keep n_models randomly from each group
        def select_random_models(group):
            return group.sample(n=min(n_models, len(group)), random_state=1)
        
        # Apply the function and ensure grouping columns are excluded from the final DataFrame
        grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)
        self.results_df = grouped.reset_index(drop=True).drop(columns=['block'])

    def clean_folder(self, subfolder, extension, remove_last=True):
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if (remove_last):
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')            
            dataset_model = f"model_{file_parts[1]}_{file_parts[2]}" 
            if (remove_last == False):
                dataset_model = (f"{dataset_model}_{file_parts[3]}")
            if dataset_model not in models.values:
                os.remove(file)   
        
    def Analize(self):
        self.create_results_df()
        self.keep_random_models()
        self.results_df = self.results_df.sort_values(by="r2_sup", ascending=False, ignore_index=True)
        self.results_df.to_excel(f"./results/better_results.xlsx", index=True)
        display(self.results_df)


In [8]:
analize = Analizer(0.1)
analize.Analize()
analize.clean_folder(subfolder="dataset", extension="pkl")
analize.clean_folder(subfolder="results", extension="xlsx")
analize.clean_folder(subfolder="results", extension="txt")
analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



  grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)


Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,mse,mse_sup,mse_test,mse_val,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_1_4_2,0.928940,0.683505,0.650527,0.923829,0.295790,2.116401,1.250045,0.197684,0.778177,0.543865,5.547846,0.552431,132.436214,273.771387,"Hidden Size=[12, 2], regularizer=0.02, learnin..."
1,model_1_4_5,0.951458,0.675483,0.598107,0.940115,0.202059,2.170048,1.437546,0.155419,0.667521,0.449510,4.106710,0.456589,133.198393,274.533565,"Hidden Size=[12, 2], regularizer=0.02, learnin..."
2,model_1_3_8,0.920872,0.672625,0.628323,0.933134,0.329371,2.189155,1.329467,0.262201,0.829254,0.573909,6.064171,0.582947,132.221141,273.556313,"Hidden Size=[12, 2], regularizer=0.02, learnin..."
3,model_1_4_8,0.956689,0.667323,0.584078,0.933514,0.180283,2.224614,1.487728,0.172550,0.623845,0.424598,3.771904,0.431285,133.426453,274.761625,"Hidden Size=[12, 2], regularizer=0.02, learnin..."
4,model_1_5_8,0.981054,0.661066,0.464005,0.963137,0.078864,2.266452,1.917224,0.176129,0.376437,0.280828,2.212563,0.285251,135.080049,276.415222,"Hidden Size=[12, 2], regularizer=0.02, learnin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,model_1_38_2,0.981340,0.043385,-1.984275,0.966596,0.077673,6.396887,10.674578,0.093572,0.385127,0.278699,2.194243,0.283088,135.110497,276.445670,"Hidden Size=[12, 2], regularizer=0.02, learnin..."
140,model_1_38_9,0.986486,0.034419,-1.959812,0.966244,0.056254,6.456847,10.587077,0.094559,0.357765,0.237178,1.864914,0.240914,135.755772,277.090945,"Hidden Size=[12, 2], regularizer=0.02, learnin..."
141,model_1_39_3,0.988615,0.033861,-2.152167,0.987170,0.047391,6.460574,11.275119,0.046969,0.332308,0.217694,1.728644,0.221122,136.098662,277.433835,"Hidden Size=[12, 2], regularizer=0.02, learnin..."
142,model_1_39_1,0.987823,0.021411,-2.056973,0.992570,0.050687,6.543829,10.934615,0.027199,0.332736,0.225137,1.779320,0.228682,135.964189,277.299362,"Hidden Size=[12, 2], regularizer=0.02, learnin..."
