In [10]:
import ast
import hashlib
import itertools
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import jaccard
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm  # Progress bar

In [11]:
# Root folder shared by both input files, kept in one place so it is edited once.
_STUDIO_ROOT = "/teamspace/studios/this_studio"

# Input CSVs with per-model metrics (one under the CNN training folder, one under the DNN one).
FILE1 = f"{_STUDIO_ROOT}/CNN_MODEL_TRAINING/CNN_models_combination_metrics.csv"
FILE2 = f"{_STUDIO_ROOT}/DNN_MODEL_TRAINING/DNN_models_combination_metrics.csv"

# CSV that receives one row per generated ensemble candidate.
OUTPUT_FILE = "ensemble_candidates.csv"

# Ensemble sizes (number of member models) to enumerate.
ENSEMBLES_LENGTHS = [3, 4]

In [12]:
# Read the two metric tables named by FILE1 and FILE2 into DataFrames (df1, df2).
df1, df2 = (pd.read_csv(csv_path) for csv_path in (FILE1, FILE2))


In [13]:
# Keep only the rows of df2 whose val2_Cohen_Kappa_Score is strictly above 0.40
# (df1 is not filtered here), then renumber the index from zero.
kappa_above_threshold = df2["val2_Cohen_Kappa_Score"] > 0.40
df2 = df2.loc[kappa_above_threshold].reset_index(drop=True)

In [14]:
# From each table keep the 10 rows with the highest val2_accuracy
# (sorted in descending order, so the best model comes first).
df1 = df1.sort_values("val2_accuracy", ascending=False).iloc[:10]
df2 = df2.sort_values("val2_accuracy", ascending=False).iloc[:10]

In [15]:
# Stack both selections row-wise into one frame with a fresh 0..n-1 index.
df = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)

In [16]:
#df = df.drop_duplicates(subset="Model", keep="first")

In [17]:
# Persist the combined selection (without the index column).
# NOTE(review): the file name says "200", but each input was cut to 10 rows
# before concatenation, so at most 20 rows are written - confirm the intended name.
df.to_csv(path_or_buf="200_best_models.csv", index=False)

In [18]:
# Count the candidate ensembles in closed form: C(n, k) for every requested size k.
# This avoids building every combination of row Series in memory just to call len().
total_combinations = sum(math.comb(len(df), size) for size in ENSEMBLES_LENGTHS)
print(f"Total Combinations: {total_combinations}")

Total Combinations: 5985


In [19]:

# Convert string representations of lists to actual lists
def safe_eval_list(value):
    """Parse a string holding a Python literal (typically a list) into its value.

    Args:
        value: A cell value. Strings are parsed with ``ast.literal_eval``;
            anything else is returned unchanged.

    Returns:
        The parsed literal for a well-formed string, the input itself for a
        non-string, or an empty list when the string cannot be parsed.
    """
    if not isinstance(value, str):
        return value
    try:
        return ast.literal_eval(value)
    except (SyntaxError, ValueError, TypeError):
        # literal_eval signals malformed input with any of these; TypeError
        # shows up e.g. for an unhashable element inside a set/dict literal.
        return []

# Columns whose cells may hold list literals serialised as text in the CSV.
list_columns = [
    "val2_y_pred",
    "val2_accuracy_vector",
    "val2_y_true",
    "val2_y_proba",
    "val2_Confusion_Matrix",
    "val2_error_indices",
]

# Run every cell of those columns through safe_eval_list.
df = df.assign(**{name: df[name].apply(safe_eval_list) for name in list_columns})

# Number of ensemble rows buffered in memory before they are flushed to disk
# (a flush every 100k combinations keeps memory use bounded).
CHUNK_SIZE = 100000

# Total number of ensembles that will be generated, used as the progress-bar total.
# Computed in closed form as C(n, k) per size instead of materialising every
# combination of rows only to count them.
total_combinations = sum(math.comb(len(df), size) for size in ENSEMBLES_LENGTHS)

def generate_combinations(df, lengths=None):
    """Lazily yield one record per candidate ensemble.

    For every size in ``lengths`` (in the given order), each combination of
    that many rows of ``df`` is visited in ``itertools.combinations`` order and
    emitted as a dict. Ensemble names are numbered consecutively across all
    sizes: ``ensemble_0000000000``, ``ensemble_0000000001``, ...

    Args:
        df: DataFrame with one row per model. The columns "Model",
            "Feature Group", "val2_model_path", "val2_accuracy", "val2_recall",
            "val2_precision", "val2_f1", "val2_Confusion_Matrix" and
            "val2_error_indices" are read.
        lengths: Iterable of ensemble sizes. When omitted, the module-level
            ``ENSEMBLES_LENGTHS`` is used.

    Yields:
        dict: ``ensemble_name``, ``models_files``, ``ensemble_length``,
        ``models_type``, ``models_features``, followed by one list per
        individual metric (one entry per member model, in row order).
    """
    if lengths is None:
        lengths = ENSEMBLES_LENGTHS

    # Build the row Series once; every size reuses the same pool instead of
    # re-running df.iterrows() for each ensemble length.
    rows = [row for _, row in df.iterrows()]

    # NOTE: pairwise agreement (Cohen's kappa), Jaccard diversity and the
    # quadrant label are not computed here; only per-model values are emitted.
    count = 0  # Running counter used to give each ensemble a unique name
    for size in lengths:
        for models in itertools.combinations(rows, size):
            # Zero-padded to 10 digits so names have a fixed width.
            ensemble_name = f"ensemble_{count:010d}"
            count += 1

            yield {
                "ensemble_name": ensemble_name,
                "models_files": [m["val2_model_path"] for m in models],
                "ensemble_length": size,
                "models_type": [m["Model"] for m in models],
                # Each feature group is wrapped in its own single-item list.
                "models_features": [[m["Feature Group"]] for m in models],
                # Individual metrics, one list entry per member model.
                "val2_accuracy": [m["val2_accuracy"] for m in models],
                "val2_recall": [m["val2_recall"] for m in models],
                "val2_precision": [m["val2_precision"] for m in models],
                "val2_f1": [m["val2_f1"] for m in models],
                "val2_model_path": [m["val2_model_path"] for m in models],
                "val2_confusion_matrix": [m["val2_Confusion_Matrix"] for m in models],
                "val2_error_indices": [m["val2_error_indices"] for m in models],
            }

# Stream the generated ensembles to OUTPUT_FILE in chunks of CHUNK_SIZE rows.
# The first write truncates the file and emits the header; later writes append
# without a header. Starting from a truncated file keeps a re-run of this cell
# from appending a second copy of every ensemble to a file left by an earlier run.
first_write = True

with tqdm(total=total_combinations, desc="Generating Ensembles", unit="combination") as pbar:
    chunk = []
    for ensemble_data in generate_combinations(df):
        chunk.append(ensemble_data)

        # Flush periodically so the buffer never holds more than CHUNK_SIZE rows
        if len(chunk) >= CHUNK_SIZE:
            pd.DataFrame(chunk).to_csv(
                OUTPUT_FILE,
                mode="w" if first_write else "a",
                header=first_write,
                index=False,
            )
            first_write = False
            chunk = []  # Release the flushed rows

        pbar.update(1)

    # Flush whatever is left after the last full chunk
    if chunk:
        pd.DataFrame(chunk).to_csv(
            OUTPUT_FILE,
            mode="w" if first_write else "a",
            header=first_write,
            index=False,
        )
        first_write = False

print(f"✅ Ensemble candidates saved to {OUTPUT_FILE}")

Generating Ensembles: 100%|██████████| 5985/5985 [00:00<00:00, 6173.22combination/s] 

✅ Ensemble candidates saved to ensemble_candidates.csv





In [20]:
import os
import pandas as pd

def split_csv_by_row(input_csv_path: str, output_dir: str, ensemble_name_col: str) -> None:
    """
    Split a CSV file into many files, writing each data row to its own CSV.

    Every output file contains the header plus exactly one row and is named
    ``<value of ensemble_name_col>.csv``. Rows that share the same name
    overwrite one another (the last one wins).

    Args:
        input_csv_path (str): Path of the CSV file to read.
        output_dir (str): Directory that receives the per-row CSV files;
            created if it does not exist.
        ensemble_name_col (str): Column whose value names each output file.

    Raises:
        KeyError: If ``ensemble_name_col`` is not a column of the input CSV.
    """
    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    # Read the whole CSV
    df = pd.read_csv(input_csv_path)

    # Slice one row at a time by position instead of using iterrows():
    # iterrows() turns each row into a single-dtype Series, so an integer
    # name/value in an all-numeric row would be written as e.g. "1.0".
    for position, name in enumerate(df[ensemble_name_col]):
        file_path = os.path.join(output_dir, f"{name}.csv")
        df.iloc[[position]].to_csv(file_path, index=False)

    print(f"Arquivo CSV dividido e salvo em: {output_dir}")


In [None]:
# Split the file written by the ensemble-generation step into one CSV per ensemble.
# OUTPUT_FILE is reused so this step keeps reading the right file if the name changes.
split_csv_by_row(OUTPUT_FILE, "individual_ensemble_candidates", "ensemble_name")

Arquivo CSV dividido e salvo em: individual_ensemble_candidates
