In [1]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    df = pd.read_excel(file_name)
    return df

def read_txt(file_name):
    file = open(file_name)
    lines = file.readlines()
    return(lines[0])

In [2]:
import os
import glob

def get_files(subfolder, extension):
    dir = f"{os.getcwd()}/content/{subfolder}/"
    tables = glob.glob(f"{dir}*.{extension}")
    return tables

In [3]:
class Analizer:
    def __init__(self, boundary):
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary
    
    def has_minimum_requirements(self, df, sort_by="r2"):
        sorted_df = df.sort_values(by=sort_by, ascending=False)
        top_r2 = sorted_df.head(1)[sort_by].values[0]
        if top_r2 < self.boundary:
            return False
        return True
    
    def concatenate_df(self, df, architecture):
        if self.has_minimum_requirements(df):
            df['Architecture'] = architecture
            df = df.rename(columns={'Unnamed: 0': 'model'})
            self.results_df = pd.concat([self.results_df, df], ignore_index=True) 

    def create_results_df(self):
        for file in self.results:
            df = read_excel(file)
            architecture = read_txt(file.replace(".xlsx", ".txt"))
            self.concatenate_df(df, architecture)


    def keep_random_models(self, n_models=3):
        # Extract block part from 'model' column
        self.results_df['block'] = self.results_df['model'].apply(lambda x: '_'.join(x.split('_')[1:3]))
        
        # Group by 'block' and keep n_models randomly from each group
        def select_random_models(group):
            return group.sample(n=min(n_models, len(group)), random_state=1)
        
        # Apply the function and ensure grouping columns are excluded from the final DataFrame
        grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)
        self.results_df = grouped.reset_index(drop=True).drop(columns=['block'])

    def clean_folder(self, subfolder, extension, remove_last=True):
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if (remove_last):
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')            
            dataset_model = f"model_{file_parts[1]}_{file_parts[2]}" 
            if (remove_last == False):
                dataset_model = (f"{dataset_model}_{file_parts[3]}")
            if dataset_model not in models.values:
                os.remove(file)   
        
    def Analize(self):
        self.create_results_df()
        self.keep_random_models()
        self.results_df = self.results_df.sort_values(by="r2_sup", ascending=False, ignore_index=True)
        self.results_df.to_excel(f"./results/better_results.xlsx", index=True)
        display(self.results_df)


In [4]:
analize = Analizer(0.1)
analize.Analize()



  grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)


Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,mse,mse_sup,mse_test,mse_val,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_1_6_7,0.367879,0.567506,0.639553,-0.740495,0.169571,0.256747,0.307008,0.22444,0.207152,0.41179,0.13924,0.418275,37.548968,74.513551,"Hidden Size=[4], regularizer=0.02, learning_ra..."
1,model_1_7_2,0.389383,0.547559,0.6251,0.513112,0.163802,0.268589,0.319317,0.147944,0.209125,0.404725,0.168522,0.411099,37.61819,74.582774,"Hidden Size=[4], regularizer=0.02, learning_ra..."
2,model_1_6_3,0.371518,0.546047,0.632614,-0.658173,0.168595,0.269486,0.312918,0.213824,0.210371,0.410603,0.144195,0.41707,37.560515,74.525098,"Hidden Size=[4], regularizer=0.02, learning_ra..."
3,model_1_7_6,0.411549,0.537839,0.602912,0.506191,0.157856,0.274358,0.338217,0.150047,0.207129,0.397311,0.198705,0.403568,37.692141,74.656725,"Hidden Size=[4], regularizer=0.02, learning_ra..."
4,model_1_7_9,0.421123,0.532016,0.590368,0.496957,0.155288,0.277816,0.3489,0.152853,0.205515,0.394066,0.211741,0.400272,37.724948,74.689531,"Hidden Size=[4], regularizer=0.02, learning_ra..."
5,model_11_4_0,0.276459,0.528176,0.606678,-3.928196,0.194095,0.280095,0.335009,0.135758,0.195608,0.440562,-2.08711,0.447501,101.278813,207.82379,"Hidden Size=[12], regularizer=0.2, learning_ra..."
6,model_1_6_0,0.366722,0.519121,0.617838,-0.617213,0.169881,0.28547,0.325503,0.208542,0.213728,0.412167,0.137664,0.418658,37.545311,74.509894,"Hidden Size=[4], regularizer=0.02, learning_ra..."
7,model_1_3_1,0.311046,0.517705,0.585319,-0.25108,0.184817,0.286311,0.353201,0.266936,0.210689,0.429903,0.06185,0.436674,37.376781,74.341364,"Hidden Size=[4], regularizer=0.02, learning_ra..."
8,model_1_3_2,0.311777,0.517588,0.585452,-0.252393,0.184621,0.286381,0.353088,0.267216,0.210779,0.429675,0.062845,0.436442,37.378903,74.343486,"Hidden Size=[4], regularizer=0.02, learning_ra..."
9,model_1_5_1,0.367118,0.512331,0.613235,-0.31867,0.169775,0.289502,0.329423,0.264963,0.21428,0.412038,0.138203,0.418527,37.54656,74.511143,"Hidden Size=[4], regularizer=0.02, learning_ra..."
