# D - Model Training

## DNN

In [6]:
import os
import itertools
import base64
import csv
import joblib
from dnn_models import *

def group_features(df):
    """
    Group the "DNN_"-prefixed columns of `df` by their base feature name.

    Each column name is split on "_". The base feature is every token between
    the first one ("DNN") and the last one, re-joined with "_". Columns that do
    not start with "DNN_", or that have fewer than three "_"-separated tokens,
    are ignored.

    Args:
        df: Object exposing a `columns` iterable of column-name strings
            (e.g. a pandas DataFrame).

    Returns:
        dict: Maps each base feature name to the list of column names that
        belong to it, in the order they appear in `df.columns`.
    """
    grouped = {}

    for column in df.columns:
        # Only columns carrying the "DNN_" prefix take part in the grouping
        if not column.startswith("DNN_"):
            continue

        tokens = column.split("_")
        # At least one token must sit between the prefix and the last token
        if len(tokens) <= 2:
            continue

        # Everything between the first and the last token names the group
        group_name = "_".join(tokens[1:-1])
        grouped.setdefault(group_name, []).append(column)

    return grouped


def generate_combinations_and_csv(features, max_length, output_dir, model_prefix, csv_filename):
    """
    Enumerate feature combinations and write an index of them to a CSV file.

    Every combination of `features` with size 1 up to `max_length` becomes one
    CSV row with three fields:
    - "combination": the feature names joined with " / ".
    - "model_file": `<output_dir>/models/<model_prefix>_<token>.keras`.
    - "metrics_file": `<output_dir>/metrics/<model_prefix>_<token>.csv`.

    `<token>` is the URL-safe Base64 encoding of the feature names joined
    with "/" (UTF-8).

    Args:
        features (list): List of feature names (strings).
        max_length (int): Maximum size of combinations to generate.
        output_dir (str): Directory where the output files will be saved.
        model_prefix (str): Prefix for naming model and metrics files.
        csv_filename (str): Name of the output CSV file, created inside `output_dir`.

    Output:
        Creates `output_dir`, `output_dir/models` and `output_dir/metrics` if
        needed, writes the CSV file, and prints its path. Returns None.
    """
    # Prepare the output tree: the root plus its "models" and "metrics" folders
    models_dir = os.path.join(output_dir, "models")
    metrics_dir = os.path.join(output_dir, "metrics")
    for folder in (output_dir, models_dir, metrics_dir):
        os.makedirs(folder, exist_ok=True)

    def build_row(combo):
        # The Base64 token is what makes a "/"-joined name usable as a file name
        token = base64.urlsafe_b64encode("/".join(combo).encode("utf-8")).decode("utf-8")
        stem = f"{model_prefix}_{token}"
        return {
            "combination": " / ".join(combo),
            "model_file": os.path.join(models_dir, f"{stem}.keras"),
            "metrics_file": os.path.join(metrics_dir, f"{stem}.csv"),
        }

    # Sizes are visited in increasing order: all singles, then pairs, and so on
    every_combo = itertools.chain.from_iterable(
        itertools.combinations(features, size) for size in range(1, max_length + 1)
    )
    csv_rows = [build_row(combo) for combo in every_combo]

    # Write the index file
    csv_filepath = os.path.join(output_dir, csv_filename)
    with open(csv_filepath, mode="w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=["combination", "model_file", "metrics_file"])
        writer.writeheader()
        writer.writerows(csv_rows)

    print(f"CSV file successfully generated: {csv_filepath}")

## Create all possible model combinations using feature sets ranging in length from 1 to 4.

This produces 12,590 combinations in total.

In [None]:
# Load the validation split; this cell only uses its column names.
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
DNN_val = joblib.load("DNN_val.joblib")

# Base feature names derived from the "DNN_"-prefixed columns
feature_groups = list(group_features(DNN_val).keys())
max_length = 4  # largest combination size to enumerate
output_dir = "DNN_MODEL_TRAINING"
model_prefix = "DNN"
csv_filename = "DNN_models_combination.csv"

# Create the output folders and write the combination index CSV
generate_combinations_and_csv(feature_groups, max_length, output_dir, model_prefix, csv_filename)

## Train all model combinations

In [None]:
import os
import pandas as pd
from multiprocessing import Pool
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, cohen_kappa_score


def train_and_evaluate(params, output_dir, feature_groups, train_df, val_df, test_df):
    """
    Train one model on a single feature combination, score it, and persist the results.

    The run is skipped when both the model file and the metrics file named in
    `params` already exist. Otherwise the model is fitted on `train_df` with
    early stopping against `val_df`, scored on `test_df` (reported under the
    "val2_" prefix), saved to `params["model_file"]`, and a one-row metrics CSV
    is written to `params["metrics_file"]`.

    Args:
        params (dict): Experiment settings. Keys read here: "model_func",
            "feature_group", "epochs", "batch_size", "patience",
            "learning_rate", "monitor_metric", "model_name", "model_file",
            "metrics_file".
        output_dir (str): Directory where models and metrics will be saved.
            Not referenced in the body; the paths come from `params`.
        feature_groups (dict): Maps each feature group name to its list of columns.
        train_df (pd.DataFrame): Training dataset; must contain an 'emotion' column.
        val_df (pd.DataFrame): Validation dataset; must contain an 'emotion' column.
        test_df (pd.DataFrame): Test dataset; must contain an 'emotion' column.

    Returns:
        None

    Raises:
        KeyError: If the combination names a group missing from `feature_groups`.
    """
    # Keras is imported locally; presumably so that each worker process
    # initialises its own TensorFlow runtime (TODO confirm).
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Input
    from tensorflow.keras.optimizers import Adam, RMSprop, SGD
    from tensorflow.keras.callbacks import EarlyStopping

    # Experiment settings
    build_model = params["model_func"]
    combo_label = params["feature_group"]
    n_epochs = params["epochs"]
    n_batch = params["batch_size"]
    stop_patience = params["patience"]
    lr = params["learning_rate"]
    watched_metric = params["monitor_metric"]
    display_name = params["model_name"]
    model_path = params["model_file"]
    metrics_path = params["metrics_file"]
    print(f"Working on {combo_label}")

    # Nothing to do when a previous run already produced both artefacts
    already_done = os.path.exists(model_path) and os.path.exists(metrics_path)
    if already_done:
        print(f"Skipping training for {model_path} (both model and metrics already exist)")
        return

    # Resolve the "a / b / c" label into the concrete list of input columns
    feature_keys = [part.strip() for part in combo_label.split("/")]
    missing_keys = [name for name in feature_keys if name not in feature_groups]
    if missing_keys:
        raise KeyError(f"Missing feature groups in the dictionary: {missing_keys}")
    columns = [col for name in feature_keys for col in feature_groups[name]]

    splits = (train_df, val_df, test_df)
    X_train, X_val, X_test = (frame[columns].values for frame in splits)
    # Targets stay as raw labels; the loss below is the sparse variant
    y_train, y_val, y_test = (frame['emotion'].values for frame in splits)

    # Network dimensions come from the data itself
    input_dim = X_train.shape[1]
    num_classes = len(train_df['emotion'].unique())

    # Build and compile the network
    model, optimizer = build_model(input_dim=input_dim, num_classes=num_classes, learning_rate=lr)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Fit with early stopping, keeping the best weights seen
    stopper = EarlyStopping(monitor=watched_metric, patience=stop_patience, restore_best_weights=True)
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=n_epochs,
        batch_size=n_batch,
        callbacks=[stopper],
        verbose=1
    )

    # Accuracy on the training and validation splits
    _, train_accuracy = model.evaluate(X_train, y_train, verbose=1)
    _, val_accuracy = model.evaluate(X_val, y_val, verbose=1)

    # Test-split predictions: most probable class per sample
    y_pred_proba = model.predict(X_test)
    y_pred = y_pred_proba.argmax(axis=1)
    y_true = y_test

    # Test-split ("val2") scores
    val2_accuracy = accuracy_score(y_true, y_pred)
    val2_recall = recall_score(y_true, y_pred, average='weighted')
    val2_precision = precision_score(y_true, y_pred, average='weighted')
    val2_f1 = f1_score(y_true, y_pred, average='weighted')
    val2_confusion_matrix = confusion_matrix(y_true, y_pred).tolist()
    val2_cohen_kappa = cohen_kappa_score(y_true, y_pred)

    # Per-sample bookkeeping: positions of the misses and a 1/0 hit vector
    pairs = list(zip(y_true, y_pred))
    val2_error_indices = [pos for pos, (expected, got) in enumerate(pairs) if expected != got]
    val2_accuracy_vector = [1 if expected == got else 0 for expected, got in pairs]

    # Persist the trained model
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
    print(f"Saved model to {model_path}")

    # Persist the metrics as a one-row CSV
    metrics = {
        "Model": display_name,
        "Feature Group": combo_label,
        "train_accuracy": train_accuracy,
        "val_accuracy": val_accuracy,
        "val2_accuracy": val2_accuracy,
        "gap": train_accuracy - val2_accuracy,  # train accuracy minus test accuracy
        "val2_recall": val2_recall,
        "val2_precision": val2_precision,
        "val2_f1": val2_f1,
        "val2_model_path": model_path,
        "val2_Confusion_Matrix": val2_confusion_matrix,
        "val2_Cohen_Kappa_Score": val2_cohen_kappa,
        "val2_error_indices": val2_error_indices,
        "val2_accuracy_vector": val2_accuracy_vector,
        "val2_y_pred": y_pred.tolist(),
        "val2_y_true": y_true.tolist(),
        "val2_y_proba": y_pred_proba.tolist()
    }
    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
    pd.DataFrame([metrics]).to_csv(metrics_path, index=False)
    print(f"Saved metrics to {metrics_path}")


def train_dnn_models(output_dir, train_df, val_df, test_df, combinations_csv, num_workers=10):
    """
    Train and evaluate the DNN-6 model once per feature combination, in parallel.

    The combinations are read from `combinations_csv`; every row is turned into
    one experiment with fixed hyperparameters and dispatched to
    `train_and_evaluate` through a multiprocessing pool.

    Args:
        output_dir (str): Directory where models and metrics will be saved; created if missing.
        train_df (pd.DataFrame): Training dataset; also used to derive the feature groups.
        val_df (pd.DataFrame): Validation dataset.
        test_df (pd.DataFrame): Test dataset.
        combinations_csv (str): Path to the CSV file containing feature combinations.
            Must provide the columns "combination", "model_file" and "metrics_file".
        num_workers (int): Number of parallel worker processes.

    Returns:
        None

    Raises:
        ValueError: If the CSV cannot be read or lacks a required column.
    """
    # Hyperparameters shared by every experiment
    epochs = 100
    batch_size = 64
    patience = 10
    learning_rate = 0.0005
    monitor_metric = 'val_accuracy'

    os.makedirs(output_dir, exist_ok=True)

    # Column lookup used by the workers to resolve combination labels
    feature_groups = group_features(train_df)

    # Read the combination index
    try:
        combinations_df = pd.read_csv(combinations_csv)
    except Exception as e:
        raise ValueError(f"Error reading combinations CSV file: {e}")

    # The index must carry all three expected columns
    required_columns = ["combination", "model_file", "metrics_file"]
    missing_columns = [name for name in required_columns if name not in combinations_df.columns]
    if missing_columns:
        raise ValueError(f"The combinations CSV file is missing the following columns: {missing_columns}")

    # One experiment description per CSV row
    row_values = zip(
        combinations_df['combination'],
        combinations_df['model_file'],
        combinations_df['metrics_file'],
    )
    experiments = [
        {
            "model_func": create_model_6,  # DNN-6 is the only architecture trained here
            "feature_group": combo,
            "epochs": epochs,
            "batch_size": batch_size,
            "patience": patience,
            "learning_rate": learning_rate,
            "monitor_metric": monitor_metric,
            "model_name": "DNN-6",
            "model_file": model_file,
            "metrics_file": metrics_file,
        }
        for combo, model_file, metrics_file in row_values
    ]

    # Fan the experiments out over the worker pool
    with Pool(processes=num_workers) as pool:
        jobs = [
            (experiment, output_dir, feature_groups, train_df, val_df, test_df)
            for experiment in experiments
        ]
        pool.starmap(train_and_evaluate, jobs)

    print("All experiments completed. Models and metrics saved individually.")

In [None]:
# Load the three dataset splits from disk.
# NOTE(review): joblib.load unpickles the files, so only load trusted files.
DNN_train = joblib.load("DNN_train.joblib")
DNN_val = joblib.load("DNN_val.joblib")
DNN_val2 = joblib.load("DNN_val2.joblib")  # passed to train_dnn_models as the test split

In [None]:
# The combination index written earlier lives inside the output directory
output_dir = "DNN_MODEL_TRAINING"
csv_filename = os.path.join(output_dir, "DNN_models_combination.csv")

# Train every combination; num_workers=50 overrides the function default of 10
train_dnn_models(output_dir, DNN_train, DNN_val, DNN_val2, csv_filename, num_workers=50)



In [7]:
import os
import pandas as pd

def consolidate_csv(directory: str, output_file: str):
    """
    Traverses a directory and its subdirectories to find all CSV files,
    consolidating them into a single destination file.

    Each source file contributes its rows plus a 'model_file' column holding
    the source file's base name (an existing 'model_file' column is
    overwritten). Files are visited in sorted order so the row order of the
    result is reproducible. The destination file itself is never used as an
    input, so the function can be re-run safely even when `output_file` is
    located inside `directory`. Files that cannot be read are reported with a
    message and skipped.

    Parameters:
    directory (str): Path to the root directory where CSV files are located.
    output_file (str): Path to the destination file where consolidated data will be saved.

    Returns:
    None. Prints a status message; nothing is written when no CSV is found.
    """
    # Resolved once so a previous consolidation result can be recognised below
    output_abspath = os.path.abspath(output_file)
    all_dfs = []  # One DataFrame per successfully read source file

    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(directory):
        # Sorting in place fixes the order in which os.walk descends
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".csv"):
                continue

            file_path = os.path.join(root, file)
            # Skip the destination: re-reading it would duplicate every row
            if os.path.abspath(file_path) == output_abspath:
                continue

            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                # Best effort: report the unreadable file and keep going
                print(f"Error processing {file_path}: {e}")
            else:
                df['model_file'] = file
                all_dfs.append(df)

    # Consolidate all DataFrames into a single one and save to the destination file
    if all_dfs:
        consolidated_df = pd.concat(all_dfs, ignore_index=True)
        consolidated_df.to_csv(output_file, index=False)
        print(f"Consolidation completed. File saved at: {output_file}")
    else:
        print("No CSV files found for consolidation.")

In [8]:
# Merge every per-model metrics CSV into a single summary file
consolidate_csv("DNN_MODEL_TRAINING/metrics","DNN_MODEL_TRAINING/DNN_models_combination_metrics.csv")

Consolidation completed. File saved at: DNN_MODEL_TRAINING/DNN_models_combination_metrics.csv
