In [1]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    df = pd.read_excel(file_name)
    return df

def read_txt(file_name):
    file = open(file_name)
    lines = file.readlines()
    return(lines[0])

In [2]:
import os
import glob

def get_files(subfolder, extension):
    dir = f"{os.getcwd()}/content/{subfolder}/"
    tables = glob.glob(f"{dir}*.{extension}")
    return tables

In [8]:
class Analizer:
    def __init__(self, boundary):
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary
    
    def has_minimum_requirements(self, df, sort_by="r2_sup"):
        sorted_df = df.sort_values(by=sort_by, ascending=False)
        top_r2 = sorted_df.head(1)[sort_by].values[0]
        if top_r2 < self.boundary:
            return False
        return True
    
    def concatenate_df(self, df, architecture):
        if self.has_minimum_requirements(df):
            df['Architecture'] = architecture
            df = df.rename(columns={'Unnamed: 0': 'model'})
            self.results_df = pd.concat([self.results_df, df], ignore_index=True) 

    def create_results_df(self):
        for file in self.results:
            df = read_excel(file)
            architecture = read_txt(file.replace(".xlsx", ".txt"))
            self.concatenate_df(df, architecture)


    def keep_random_models(self, n_models=3):
        # Extract block part from 'model' column
        self.results_df['block'] = self.results_df['model'].apply(lambda x: '_'.join(x.split('_')[1:3]))
        
        # Group by 'block' and keep n_models randomly from each group
        def select_random_models(group):
            return group.sample(n=min(n_models, len(group)), random_state=1)
        
        # Apply the function and ensure grouping columns are excluded from the final DataFrame
        grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)
        self.results_df = grouped.reset_index(drop=True).drop(columns=['block'])

    def clean_folder(self, subfolder, extension, remove_last=True):
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if (remove_last):
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')            
            dataset_model = f"model_{file_parts[1]}_{file_parts[2]}" 
            if (remove_last == False):
                dataset_model = (f"{dataset_model}_{file_parts[3]}")
            if dataset_model not in models.values:
                os.remove(file)   
        
    def Analize(self):
        self.create_results_df()
        self.keep_random_models()
        self.results_df = self.results_df.sort_values(by="r2_sup", ascending=False, ignore_index=True)
        self.results_df.to_excel(f"./results/better_results.xlsx", index=True)
        display(self.results_df)


In [9]:
analize = Analizer(0.7)
analize.Analize()
analize.clean_folder(subfolder="dataset", extension="pkl")
analize.clean_folder(subfolder="results", extension="xlsx")
analize.clean_folder(subfolder="results", extension="txt")
analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



  grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)


Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,mse,mse_sup,mse_test,mse_val,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_1_3_7,0.928161,0.806469,0.815715,0.857133,0.169958,1.294145,0.659175,0.364882,0.551535,0.412260,0.882111,0.418752,53.544408,107.904089,"Hidden Size=[4, 2], regularizer=0.2, learning_..."
1,model_1_5_3,0.931630,0.783118,0.707727,0.611177,0.161751,1.450291,1.045445,0.342861,0.403160,0.402183,0.887803,0.408517,53.643389,108.003071,"Hidden Size=[4, 2], regularizer=0.2, learning_..."
2,model_1_8_7,0.911525,0.771219,0.626641,0.629163,0.209317,1.529861,1.335483,0.578987,0.780335,0.457512,0.854810,0.464717,53.127808,107.487490,"Hidden Size=[4, 2], regularizer=0.2, learning_..."
3,model_5_1_0,0.949634,0.769439,0.790315,0.736396,0.119157,1.541766,0.750032,0.119503,0.189278,0.345192,4.223435,0.350628,134.254619,275.589792,"Hidden Size=[8, 4], regularizer=0.2, learning_..."
4,model_1_8_3,0.907017,0.767808,0.618413,0.611405,0.219981,1.552667,1.364913,0.606713,0.814774,0.469021,0.847413,0.476408,53.028429,107.388111,"Hidden Size=[4, 2], regularizer=0.2, learning_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,model_1_7_9,0.671511,0.613635,-0.085641,-2.594308,0.777147,2.583624,3.883276,3.560971,2.404584,0.881560,0.460941,0.895443,50.504251,104.863933,"Hidden Size=[4, 2], regularizer=0.2, learning_..."
65,model_13_1_0,0.907233,0.607317,0.331921,0.888672,0.219471,2.625875,2.389680,0.205731,0.268666,0.468478,1.018496,0.475856,773.033069,1610.172167,"Hidden Size=[24, 12], regularizer=0.2, learnin..."
66,model_5_1_6,0.826713,0.581590,-0.153949,-3.461625,0.409967,2.797911,4.127609,2.022650,0.227504,0.640287,12.090398,0.650371,131.783355,273.118528,"Hidden Size=[8, 4], regularizer=0.2, learning_..."
67,model_5_0_0,0.667541,0.542033,0.596092,0.639945,0.786538,3.062430,1.444756,0.444397,0.878245,0.886870,22.277349,0.900837,130.480228,271.815401,"Hidden Size=[8, 4], regularizer=0.2, learning_..."
