# Traning - Features

In [1]:
%pip install -r ../requirements.txt

You should consider upgrading via the '/home/rcalabro/codebase/senac-ia/neural-networks/rn-musical-instruments/.venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [8]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
os.environ['ABSL_LOG_THRESHOLD'] = '0'

# Built-in
import ast
import json
from functools import partial
from pathlib import Path

# Third-party - Data Handling
import numpy as np
import pandas as pd

# Third-party - Visualization
import matplotlib.pyplot as plt

# Third-party - Machine Learning
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

# Third-party - Deep Learning
import keras
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Third-party - Audio Processing
import librosa
from librosa.feature.rhythm import tempo
import soundfile as sf

# Third-party - Utilities
from tqdm import tqdm

print("TensorFlow:", tf.__version__)
print("Keras:", keras.__version__)
print("Is TensorFlow using GPU?", tf.test.is_gpu_available())
print("GPU disponível:", tf.config.list_physical_devices('GPU'))
print("XLA ativado:", tf.config.optimizer.get_jit())
# Mostra configuração geral
tf.config.experimental.list_physical_devices()

TensorFlow: 2.19.0
Keras: 3.9.2
Is TensorFlow using GPU? True
GPU disponível: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
XLA ativado: 


I0000 00:00:1745958950.269023  129308 gpu_device.cc:2019] Created device /device:GPU:0 with 4047 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:0a:00.0, compute capability: 7.5


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
ORIGIN_DATASET_PATH = '../datasets/augmented'
ORIGIN_DATASET_VERSION = 'v2'

ORIGIN_DATASET_VERSION_PATH = Path(os.path.join(ORIGIN_DATASET_PATH, ORIGIN_DATASET_VERSION))
ORIGIN_DATASET_TRAIN_METADATA = ORIGIN_DATASET_VERSION_PATH / 'train_metadata.csv'
ORIGIN_DATASET_TRAIN_DATA = ORIGIN_DATASET_VERSION_PATH / 'train_data'

ORIGIN_DATASET_TEST_METADATA = ORIGIN_DATASET_VERSION_PATH / 'test_metadata.csv'
ORIGIN_DATASET_TEST_DATA = ORIGIN_DATASET_VERSION_PATH / 'test_data'

In [4]:
TRAINING_DATASET_PATH = '../datasets/training'
TRAINING_DATASET_VERSION = 'v7'

TRAINING_DATASET_VERSION_PATH = Path(os.path.join(TRAINING_DATASET_PATH, TRAINING_DATASET_VERSION))

TRAINING_DATASET_TRAIN_METADATA = TRAINING_DATASET_VERSION_PATH / 'train_metadata.csv'
TRAINING_DATASET_TEST_METADATA = TRAINING_DATASET_VERSION_PATH / 'test_metadata.csv'

TRAINING_DATASET_VERSION_PATH.mkdir(parents=True, exist_ok=True)
print(f"📁 TRAINING_DATASET_VERSION_PATH: {TRAINING_DATASET_VERSION_PATH}")

📁 TRAINING_DATASET_VERSION_PATH: ../datasets/training/v7


In [None]:
import librosa
import numpy as np

def extract_features(row, basepath, n_mfcc=40):
    filepath = basepath / row['filename']
    y, sr = librosa.load(filepath, sr=None, res_type='kaiser_fast')

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).mean(axis=1)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr).mean(axis=1)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr).mean(axis=1)
    features = np.concatenate([mfcc, chroma, contrast], axis=0)
    features = (features - np.mean(features)) / np.std(features)

    return features



In [None]:

def prepare_dataset(metadata_file, output_metadata, input_audio_dir=None, n_mfcc=80):
    metadata_file = Path(metadata_file)
    output_metadata = Path(output_metadata)
    input_audio_dir = Path(input_audio_dir) if input_audio_dir else metadata_file.parent

    df = pd.read_csv(metadata_file nrows=10)

    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['class'])

    tqdm.pandas(desc="Extraindo features")
    df['features'] = df.progress_apply(lambda row: extract_features(row, input_audio_dir, n_mfcc).mean(axis=0).tolist(), axis=1)

    df[['class', 'features', 'label']].to_csv(output_metadata, index=False)
    print(f"✅ Dataset com features salvo em: {output_metadata}")
    return label_encoder

In [7]:
label_encoder = prepare_dataset(
    metadata_file=ORIGIN_DATASET_TRAIN_METADATA,
    input_audio_dir=ORIGIN_DATASET_TRAIN_DATA,
    output_metadata=TRAINING_DATASET_TRAIN_METADATA,
    n_mfcc=80
)

  return pitch_tuning(
Extraindo features: 100%|██████████| 15774/15774 [25:00<00:00, 10.51it/s]  

✅ Dataset com features salvo em: ../datasets/training/v7/train_metadata.csv





In [9]:
def load_dataset_from_csv(csv_path):
    df = pd.read_csv(csv_path)

    df['features'] = df['features'].apply(ast.literal_eval)

    X = np.vstack(df['features'].values).astype(np.float32)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['class'].values)

    ds_shape = (X.shape[1],)

    return X, y, label_encoder, ds_shape

# Uso:
X, y, label_encoder, ds_shape = load_dataset_from_csv(TRAINING_DATASET_TRAIN_METADATA)

print(ds_shape)

# 🔥 Split normal
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

ValueError: malformed node or string: 0.0

In [None]:
def build_robust_dense_model(input_shape, num_classes, gamma=1.0, alpha=0.25):
    inputs = layers.Input(shape=input_shape)

    x = layers.BatchNormalization()(inputs)

    x = layers.Dense(512, activation='swish', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x)

    x1 = layers.Dense(256, activation='swish', kernel_regularizer=regularizers.l2(1e-4))(x)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.Dropout(0.3)(x1)

    x2 = layers.Dense(256, activation='swish', kernel_regularizer=regularizers.l2(1e-4))(x1)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Dropout(0.3)(x2)

    x = layers.add([x1, x2])

    x3 = layers.Dense(128, activation='swish', kernel_regularizer=regularizers.l2(1e-4))(x)
    x3 = layers.BatchNormalization()(x3)
    x3 = layers.Dropout(0.25)(x3)

    x4 = layers.Dense(128, activation='swish', kernel_regularizer=regularizers.l2(1e-4))(x3)
    x4 = layers.BatchNormalization()(x4)
    x4 = layers.Dropout(0.25)(x4)

    x = layers.add([x3, x4])

    x = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)

    x = layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)

    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inputs=inputs, outputs=outputs)


    def focal_loss(gamma=2., alpha=0.25):
        def loss(y_true, y_pred):
            y_true = tf.cast(y_true, tf.int32)
            y_true_one_hot = tf.one_hot(y_true, depth=tf.shape(y_pred)[-1])
            cross_entropy = K.categorical_crossentropy(y_true_one_hot, y_pred)
            pt = tf.reduce_sum(y_true_one_hot * y_pred, axis=-1)
            fl = alpha * tf.pow(1. - pt, gamma) * cross_entropy
            return fl
        return loss

    model.compile(
        optimizer='adam',
        loss=focal_loss(gamma=gamma, alpha=alpha),
        metrics=['accuracy']
    )

    return model



In [None]:
from tensorflow.keras.utils import plot_model
from IPython.display import Image, display

model = build_robust_dense_model(
    input_shape=(160,),
    num_classes=len(label_encoder.classes_)
)

plot_model(
    model,
    to_file='model_plot.png',
    show_shapes=True,
    show_layer_names=True,
    expand_nested=True,
    dpi=96
)

# Exibe no notebook
display(Image(filename='model_plot.png'))


In [None]:
model = build_robust_dense_model(
    input_shape=(160,),
    num_classes=len(label_encoder.classes_),
    gamma=1.5,
    alpha=0.3,
)

callbacks = [
    EarlyStopping(patience=15, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.5, patience=10, verbose=1)
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=128,
    epochs=100,
    callbacks=callbacks
)

In [None]:
history_dict = history.history

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history_dict['loss'], label='Treino')
plt.plot(history_dict['val_loss'], label='Validação')
plt.title('Loss')
plt.xlabel('Época')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history_dict['accuracy'], label='Treino')
plt.plot(history_dict['val_accuracy'], label='Validação')
plt.title('Acurácia')
plt.xlabel('Época')
plt.ylabel('Acurácia')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
y_pred_probs = model_dv.predict(X_val, verbose=0)
y_pred_dv = np.argmax(y_pred_probs, axis=1)
y_true = y_val  

print(classification_report(
    y_true, 
    y_pred, 
    target_names=label_encoder.classes_  # usa as classes corretas
))

cm = confusion_matrix(y_true, y_pred)
print('Confusion matrix:')
print(cm)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot(xticks_rotation=45, cmap='Blues')
plt.tight_layout()
plt.show()

In [None]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_dv, cmap='coolwarm', alpha=0.6)
plt.title("PCA: drum vs violin")
plt.show()

In [None]:
def load_filtered_data(csv_path):
    df = pd.read_csv(csv_path)
    df = df[df['class'].isin(['drum', 'violin'])].copy()

    df['features'] = df['features'].apply(ast.literal_eval)
    X = np.vstack(df['features'].values).astype(np.float32)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['class'].values)

    return X, y, label_encoder

X_dv, y_dv, label_encoder_dv = load_filtered_data(TRAINING_DATASET_TRAIN_METADATA)

X_train_dv, X_val_dv, y_train_dv, y_val_dv = train_test_split(
    X_dv, y_dv,
    test_size=0.2,
    random_state=42,
    stratify=y_dv
)

In [None]:
def build_binary_dense_model(input_shape):
    inputs = layers.Input(shape=input_shape)

    x = layers.BatchNormalization()(inputs)

    x = layers.Dense(512, activation='swish', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x)

    x = layers.Dense(256, activation='swish', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    x = layers.Dense(128, activation='swish', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)

    x = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)


    x = layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)

    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

In [None]:
model_dv = build_binary_dense_model(
    input_shape=(160,),
)

callbacks_dv = [
    EarlyStopping(patience=15, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.5, patience=10, verbose=1)
]

history_dv = model_dv.fit(
    X_train_dv, y_train_dv,
    validation_data=(X_val_dv, y_val_dv),
    batch_size=128,
    epochs=100,
    callbacks=callbacks_dv
)

In [None]:
history_dict_dv = history_dv.history

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history_dict_dv['loss'], label='Treino')
plt.plot(history_dict_dv['val_loss'], label='Validação')
plt.title('Loss')
plt.xlabel('Época')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history_dict_dv['accuracy'], label='Treino')
plt.plot(history_dict_dv['val_accuracy'], label='Validação')
plt.title('Acurácia')
plt.xlabel('Época')
plt.ylabel('Acurácia')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
y_pred_probs_dv = model_dv.predict(X_val_dv, verbose=0)
y_pred_dv = (y_pred_probs_dv > 0.5).astype(int).flatten()
y_true_dv = y_val_dv

print(classification_report(
    y_true_dv, 
    y_pred_dv, 
    target_names=label_encoder_dv.classes_
))

cm_dv = confusion_matrix(y_true_dv, y_pred_dv)
print('Confusion matrix (drum vs violin):')
print(cm_dv)

disp_dv = ConfusionMatrixDisplay(confusion_matrix=cm_dv, display_labels=label_encoder_dv.classes_)
disp_dv.plot(xticks_rotation=45, cmap='Blues')
plt.tight_layout()
plt.show()


In [None]:
pca_dv = PCA(n_components=2)
X_pca_dv = pca_dv.fit_transform(X_dv)

plt.scatter(X_pca_dv[:, 0], X_pca_dv[:, 1], c=y_dv, cmap='coolwarm', alpha=0.6)
plt.title("PCA: drum vs violin")
plt.show()



In [None]:
def predict_final_batch(X, model, model_dv, label_encoder, label_encoder_dv):
    """
    Predição refinada: usa model principal + model_dv para drum/violin.
    Retorna array (N, num_classes) com distribuições softmax finais.
    """
    probs_main = model.predict(X, verbose=0)  # shape (N, num_classes)
    preds_main_idx = np.argmax(probs_main, axis=1)
    preds_main_name = label_encoder.inverse_transform(preds_main_idx)

    # Índices das classes 'drum' e 'violin' no modelo principal
    idx_drum = np.where(label_encoder.classes_ == "drum")[0][0]
    idx_violin = np.where(label_encoder.classes_ == "violin")[0][0]

    # Identifica quais amostras precisam de refinamento
    mask_refine = np.isin(preds_main_name, label_encoder_dv.classes_)
    X_refine = X[mask_refine]

    if len(X_refine) > 0:
        probs_violin = model_dv.predict(X_refine, verbose=0).flatten()  # sigmoid: violin
        probs_drum = 1.0 - probs_violin

        probs_main_refined = probs_main.copy()

        for i, idx in enumerate(np.where(mask_refine)[0]):
            probs_main_refined[idx, idx_drum] = probs_drum[i]
            probs_main_refined[idx, idx_violin] = probs_violin[i]

            # Renormaliza a distribuição (softmax-style)
            probs_main_refined[idx] /= probs_main_refined[idx].sum()

        return probs_main_refined

    return probs_main


In [None]:
# 🔹 Fazer predições usando a função refinada
y_pred_probs_final = predict_final_batch(
    X_val,
    model,
    model_dv,
    label_encoder,
    label_encoder_dv
)

# 🔹 Inferir a classe com maior probabilidade
y_pred_final = np.argmax(y_pred_probs_final, axis=1)
y_true_final = y_val

# 🔹 Relatório de classificação
print(classification_report(
    y_true_final, 
    y_pred_final, 
    target_names=label_encoder.classes_
))

# 🔹 Matriz de confusão
cm_final = confusion_matrix(y_true_final, y_pred_final)
print('Confusion matrix (com refinamento drum/violin):')
print(cm_final)

disp_final = ConfusionMatrixDisplay(confusion_matrix=cm_final, display_labels=label_encoder.classes_)
disp_final.plot(xticks_rotation=45, cmap='Blues')
plt.tight_layout()
plt.show()
