In [None]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    df = pd.read_excel(file_name)
    return df

def read_txt(file_name):
    file = open(file_name)
    lines = file.readlines()
    return(lines[0])

In [2]:
import os
import glob

def get_files(subfolder, extension):
    dir = f"{os.getcwd()}/content/{subfolder}/"
    tables = glob.glob(f"{dir}*.{extension}")
    return tables

In [3]:
class Analizer:
    def __init__(self, boundary):
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary
    
    def has_minimum_requirements(self, df, sort_by="r2"):
        sorted_df = df.sort_values(by=sort_by, ascending=False)
        top_r2 = sorted_df.head(1)[sort_by].values[0]
        if top_r2 < self.boundary:
            return False
        return True
    
    def concatenate_df(self, df, architecture):
        if self.has_minimum_requirements(df):
            df['Architecture'] = architecture
            df = df.rename(columns={'Unnamed: 0': 'model'})
            self.results_df = pd.concat([self.results_df, df], ignore_index=True) 

    def create_results_df(self):
        for file in self.results:
            df = read_excel(file)
            architecture = read_txt(file.replace(".xlsx", ".txt"))
            self.concatenate_df(df, architecture)


    def keep_random_models(self, n_models=3):
        # Extract block part from 'model' column
        self.results_df['block'] = self.results_df['model'].apply(lambda x: '_'.join(x.split('_')[1:3]))
        
        # Group by 'block' and keep n_models randomly from each group
        def select_random_models(group):
            return group.sample(n=min(n_models, len(group)), random_state=1)
        
        # Apply the function and ensure grouping columns are excluded from the final DataFrame
        grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)
        self.results_df = grouped.reset_index(drop=True).drop(columns=['block'])

    def clean_folder(self, subfolder, extension, remove_last=True):
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if (remove_last):
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')            
            dataset_model = f"model_{file_parts[1]}_{file_parts[2]}" 
            if (remove_last == False):
                dataset_model = (f"{dataset_model}_{file_parts[3]}")
            if dataset_model not in models.values:
                os.remove(file)   
        
    def Analize(self):
        self.create_results_df()
        self.keep_random_models()
        self.results_df = self.results_df.sort_values(by="r2_sup", ascending=False, ignore_index=True)
        self.results_df.to_excel(f"./results/better_results.xlsx", index=True)
        display(self.results_df)


In [4]:
analize = Analizer(0.1)
analize.Analize()
analize.clean_folder(subfolder="dataset", extension="pkl")
analize.clean_folder(subfolder="results", extension="xlsx")
analize.clean_folder(subfolder="results", extension="txt")
analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



  grouped = self.results_df.groupby('block', group_keys=False).apply(select_random_models)


Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,mse,mse_sup,mse_test,mse_val,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_7_1_3,0.360303,0.276485,0.478715,0.572173,0.075485,0.429510,0.365171,0.065335,0.109825,0.274745,-0.289390,0.276883,135.167652,321.557391,"Hidden Size=[8, 4], regularizer=0.02, learning..."
1,model_1_43_2,0.253341,0.248549,0.107513,0.047995,0.088106,0.446094,0.625206,0.051593,0.123599,0.296827,0.140008,0.299137,38.858423,87.606509,"Hidden Size=[4], regularizer=0.02, learning_ra..."
2,model_1_43_6,0.254371,0.244623,0.099650,0.033386,0.087985,0.448424,0.630714,0.052385,0.123536,0.296622,0.141196,0.298930,38.861186,87.609272,"Hidden Size=[4], regularizer=0.02, learning_ra..."
3,model_1_42_7,0.254099,0.242615,0.102143,-2.379439,0.088017,0.449616,0.628968,0.042095,0.123548,0.296676,0.140882,0.298985,38.860457,87.608543,"Hidden Size=[4], regularizer=0.02, learning_ra..."
4,model_1_43_9,0.254989,0.241727,0.093086,0.024211,0.087912,0.450143,0.635312,0.052882,0.123455,0.296499,0.141907,0.298807,38.862844,87.610930,"Hidden Size=[4], regularizer=0.02, learning_ra..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363,model_7_24_7,0.050793,-18.513548,-0.148963,-1.092597,0.112007,11.584085,0.804872,0.296234,0.118783,0.334674,-0.913245,0.337279,134.378388,320.768127,"Hidden Size=[8, 4], regularizer=0.02, learning..."
1364,model_7_22_3,-1.114632,-18.913336,-0.980274,-9.549775,0.249528,11.821416,1.387223,1.033902,0.137889,0.499528,-3.262304,0.503415,132.776370,319.166109,"Hidden Size=[8, 4], regularizer=0.02, learning..."
1365,model_7_22_2,-1.780243,-19.076946,-0.685683,-13.170942,0.328070,11.918542,1.180856,1.388784,0.152486,0.572774,-4.603928,0.577232,132.229054,318.618794,"Hidden Size=[8, 4], regularizer=0.02, learning..."
1366,model_7_24_5,-0.121535,-20.545614,-0.179838,-1.475280,0.132342,12.790407,0.826501,0.350408,0.133542,0.363788,-1.260593,0.366619,134.044735,320.434475,"Hidden Size=[8, 4], regularizer=0.02, learning..."
