# DATA PREPARATION

### Download Kaggle dataset 

In [1]:
import kagglehub
import shutil
import os

dataset_path = '../datasets/raw'
force_download = False

# Download latest version
path = kagglehub.dataset_download("soumendraprasad/musical-instruments-sound-dataset", force_download=force_download)
print("Kaggle downloaded files:", path)

# Cria a pasta de destino, se não existir
os.makedirs(dataset_path, exist_ok=True)

shutil.copytree(
    src=path,
    dst=dataset_path,
    dirs_exist_ok=True  # Permite copiar para pasta existente
)

print("Moved kaggle files to:", path)

  from .autonotebook import tqdm as notebook_tqdm


Kaggle downloaded files: /home/rcalabro/.cache/kagglehub/datasets/soumendraprasad/musical-instruments-sound-dataset/versions/3
Moved kaggle files to: /home/rcalabro/.cache/kagglehub/datasets/soumendraprasad/musical-instruments-sound-dataset/versions/3


### Organize files

In [17]:
import os
import re
import shutil
import pandas as pd
from pathlib import Path

RAW_DATASET_PATH = '../datasets/raw'
NORMALIZED_DATASET_PATH = '../datasets/normalized'

os.makedirs(NORMALIZED_DATASET_PATH, exist_ok=True)

# Função para normalizar nomes dos arquivos
def normalize_filename(filename: str) -> str:
    stem = Path(filename).stem
    stem = re.sub(r"[^a-zA-Z0-9]", "_", stem).lower()
    stem = re.sub(r"_+", "_", stem).strip("_")
    return f"{stem}.wav"

#### Normalize test files

In [33]:
# Caminhos
RAW_TEST_CSV_PATH = Path(os.path.join(RAW_DATASET_PATH, 'Metadata_Test.csv'))
print(f"RAW_TEST_CSV_PATH: {RAW_TEST_CSV_PATH}")

RAW_TEST_AUDIOS_DIR = Path(os.path.join(RAW_DATASET_PATH, 'Test_submission', 'Test_submission'))
print(f"RAW_TEST_AUDIOS_DIR: {RAW_TEST_AUDIOS_DIR}")

NORMALIZED_CSV_PATH = Path(os.path.join(NORMALIZED_DATASET_PATH, 'test_metadata.csv'))
print(f"NORMALIZED_CSV_PATH: {NORMALIZED_CSV_PATH}")

NORMALIZED_TEST_AUDIOS_DIR = Path(os.path.join(NORMALIZED_DATASET_PATH, 'test_data'))
print(f"NORMALIZED_TEST_AUDIOS_DIR: {NORMALIZED_TEST_AUDIOS_DIR}")

RAW_TEST_CSV_PATH: ../datasets/raw/Metadata_Test.csv
RAW_TEST_AUDIOS_DIR: ../datasets/raw/Test_submission/Test_submission
NORMALIZED_CSV_PATH: ../datasets/normalized/test_metadata.csv
NORMALIZED_TEST_AUDIOS_DIR: ../datasets/normalized/test_data


In [41]:
# Leitura do CSV
df = pd.read_csv(RAW_TEST_CSV_PATH)

# Correção e normalização da coluna de classe
df["Class"] = (
    df["Class"]
    .str.lower()
    .str.replace(r"^sound_", "", regex=True)
    .str.replace(r"(?i)^guiatr$", "guitar", regex=True)
)

# Guarda o nome original antes da normalização
df["OriginalFilename"] = df["FileName"]

# Normaliza o nome dos arquivos
df["FileName"] = df["FileName"].apply(normalize_filename)

df.head()

Unnamed: 0,FileName,Class,OriginalFilename
0,acoustic_guitar_logo_13084.wav,guitar,acoustic-guitar-logo-13084.wav
1,guitar_chords_70663.wav,guitar,guitar-chords-70663.wav
2,guitar_intro_110935.wav,guitar,guitar-intro-110935.wav
3,guitar_solo_27194.wav,guitar,guitar-solo-27194.wav
4,guitar_solo_5999.wav,guitar,guitar-solo-5999.wav


In [42]:
os.makedirs(NORMALIZED_DATASET_PATH, exist_ok=True)
NORMALIZED_TEST_AUDIOS_DIR.mkdir(parents=True, exist_ok=True)

In [43]:
missing_files = []

for _, row in df.iterrows():
    original_file = RAW_TEST_AUDIOS_DIR / row["OriginalFilename"]
    normalized_file = NORMALIZED_TEST_AUDIOS_DIR / row["FileName"]

    if original_file.exists():
        shutil.copy2(original_file, normalized_file)
    else:
        missing_files.append(str(original_file))

print(f"missing: {missing_files}")

missing: []


In [44]:
df = df.drop(columns=["OriginalFilename"])
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(r"[^a-z0-9]+", "_", regex=True)
    .str.strip("_")
)

df.to_csv(NORMALIZED_CSV_PATH, index=False)

In [45]:
print(f"✅ Arquivos normalizados e copiados para: {TARGET_AUDIO_DIR}")
print(f"📄 Novo CSV salvo em: {OUTPUT_CSV_PATH}")

if missing_files:
    print(f"⚠️ Arquivos ausentes ({len(missing_files)}):")
    for f in missing_files[:5]:
        print(f" - {f}")
else:
    print("🎉 Todos os arquivos foram encontrados e copiados com sucesso!")

✅ Arquivos normalizados e copiados para: ../datasets/normalized/test
📄 Novo CSV salvo em: Metadata_Test_Normalized.csv
🎉 Todos os arquivos foram encontrados e copiados com sucesso!


#### Normalize Train Files

In [46]:
# Caminhos
RAW_TRAIN_CSV_PATH = Path(os.path.join(RAW_DATASET_PATH, 'Metadata_Train.csv'))
print(f"RAW_TRAIN_CSV_PATH: {RAW_TRAIN_CSV_PATH}")

RAW_TRAIN_AUDIOS_DIR = Path(os.path.join(RAW_DATASET_PATH, 'Train_submission', 'Train_submission'))
print(f"RAW_TRAIN_AUDIOS_DIR: {RAW_TRAIN_AUDIOS_DIR}")

NORMALIZED_TRAIN_CSV_PATH = Path(os.path.join(NORMALIZED_DATASET_PATH, 'train_metadata.csv'))
print(f"NORMALIZED_TRAIN_CSV_PATH: {NORMALIZED_TRAIN_CSV_PATH}")

NORMALIZED_TRAIN_AUDIOS_DIR = Path(os.path.join(NORMALIZED_DATASET_PATH, 'train_data'))
print(f"NORMALIZED_TRAIN_AUDIOS_DIR: {NORMALIZED_TRAIN_AUDIOS_DIR}")


RAW_TRAIN_CSV_PATH: ../datasets/raw/Metadata_Train.csv
RAW_TRAIN_AUDIOS_DIR: ../datasets/raw/Train_submission/Train_submission
NORMALIZED_TRAIN_CSV_PATH: ../datasets/normalized/train_metadata.csv
NORMALIZED_TRAIN_AUDIOS_DIR: ../datasets/normalized/train_data


In [47]:
# Leitura do CSV
df_train = pd.read_csv(RAW_TRAIN_CSV_PATH)

# Correção e normalização da coluna de classe
df_train["Class"] = (
    df_train["Class"]
    .str.lower()
    .str.replace(r"^sound_", "", regex=True)
    .str.replace(r"(?i)^guiatr$", "guitar", regex=True)
)

# Guarda o nome original antes da normalização
df_train["OriginalFilename"] = df_train["FileName"]

# Normaliza o nome dos arquivos
df_train["FileName"] = df_train["FileName"].apply(normalize_filename)

df_train.head()


Unnamed: 0,FileName,Class,OriginalFilename
0,1_e1_major_00.wav,guitar,1-E1-Major 00.wav
1,1_e1_major_01.wav,guitar,1-E1-Major 01.wav
2,1_e1_major_02.wav,guitar,1-E1-Major 02.wav
3,1_e1_major_03.wav,guitar,1-E1-Major 03.wav
4,1_e1_major_04.wav,guitar,1-E1-Major 04.wav


In [48]:
os.makedirs(NORMALIZED_DATASET_PATH, exist_ok=True)
NORMALIZED_TRAIN_AUDIOS_DIR.mkdir(parents=True, exist_ok=True)

In [49]:
missing_train_files = []

for _, row in df_train.iterrows():
    original_file = RAW_TRAIN_AUDIOS_DIR / row["OriginalFilename"]
    normalized_file = NORMALIZED_TRAIN_AUDIOS_DIR / row["FileName"]

    if original_file.exists():
        shutil.copy2(original_file, normalized_file)
    else:
        missing_train_files.append(str(original_file))

print(f"missing: {missing_train_files}")


missing: []


In [50]:
df_train = df_train.drop(columns=["OriginalFilename"])
df_train.columns = (
    df_train.columns
    .str.strip()
    .str.lower()
    .str.replace(r"[^a-z0-9]+", "_", regex=True)
    .str.strip("_")
)

df_train.to_csv(NORMALIZED_TRAIN_CSV_PATH, index=False)


In [51]:
print(f"✅ Arquivos normalizados e copiados para: {NORMALIZED_TRAIN_AUDIOS_DIR}")
print(f"📄 Novo CSV salvo em: {NORMALIZED_TRAIN_CSV_PATH}")

if missing_train_files:
    print(f"⚠️ Arquivos ausentes ({len(missing_train_files)}):")
    for f in missing_train_files[:5]:
        print(f" - {f}")
else:
    print("🎉 Todos os arquivos foram encontrados e copiados com sucesso!")

✅ Arquivos normalizados e copiados para: ../datasets/normalized/train_data
📄 Novo CSV salvo em: ../datasets/normalized/train_metadata.csv
🎉 Todos os arquivos foram encontrados e copiados com sucesso!
