# Simple training tutorial

In [1]:
from esc50_dataset import ESC50
root_path = "/Users/sebasmos/Documents/VE_paper/data" 
dataset = ESC50(root=root_path, download=False)
dataset.save_all_mel_spectrograms()

Loading audio files...


2000it [00:00, 58096.88it/s]

Audio files are saved in: /Users/sebasmos/Documents/VE_paper/data/ESC-50-master/audio
Mel spectrograms will be saved in: /Users/sebasmos/Documents/VE_paper/data/ESC-50-master/Mels_folds_dataset





Converting audio files to mel spectrograms...


1128it [19:32,  1.14s/it]

: 

In [None]:
!find /Users/sebasmos/Documents/VE_paper/data/ESC-50-master/Mels_folds_dataset  -type f | wc -l 


- [`audio/*.wav`](audio/)

  2000 audio recordings in WAV format (5 seconds, 44.1 kHz, mono) with the following naming convention:

  `{FOLD}-{CLIP_ID}-{TAKE}-{TARGET}.wav`

  - `{FOLD}` - index of the cross-validation fold,
  - `{CLIP_ID}` - ID of the original Freesound clip,
  - `{TAKE}` - letter disambiguating between different fragments from the same Freesound clip,
  - `{TARGET}` - class in numeric format [0, 49].

- [`meta/esc50.csv`](meta/esc50.csv)

  | <sub>filename</sub> | <sub>fold</sub> | <sub>target</sub> | <sub>category</sub> | <sub>esc10</sub> | <sub>src_file</sub> | <sub>take</sub> |
  | :--- | :--- | :--- | :--- | :--- | :--- | :--- |

  The `esc10` column indicates if a given file belongs to the *ESC-10* subset (10 selected classes, CC BY license).

- [`meta/esc50-human.xlsx`](meta/esc50-human.xlsx)

  Additional data pertaining to the crowdsourcing experiment (human classification accuracy).



In [47]:
# A - Animals
# N - Natural soundscapes & water sounds
# H - Human, non-speech sounds
# I - Interior/domestic sounds
# E-Exterior/urban noises

# Define the categories


# Define the categories based on your specified table


categories = {
    "A": ["dog", "rooster", "pig", "Cow", "Frog", "Cat", "Hen", "Insects", "Sheep", "Crow"],
    "N": ["Rain", "Sea waves", "Crackling fire", "Chirping birds", "Water drops", "Wind", "Pouring water", "Thunderstorm", "Crickets", "Toilet flush"],
    "H": ["Crying baby", "Sneezing", "Clapping", "Breathing","Coughing", "Footsteps", "Laughing",  "Brushing teeth", "Snoring", "drinking sipping"],
    "I": ["Door knock", "Mouse click","Keyboard typing", "Washing machine", "Vacuum cleaner", "Clock alarm", "Clock tick", "Door wood creaks", "Can opening", "Glass breaking"],
    "E": ["Helicopter", "Chainsaw", "Car horn", "Engine", "Train", "Siren", "Fireworks", "Hand saw", "Airplane", "Church bells"]
}

# Initialize a dictionary to store the indices for each category
category_indices = {category: [] for category in categories}

# Populate the category_indices dictionary
for category, classes in categories.items():
    for class_name in classes:
        # Convert to lowercase to match with class_to_idx keys
        class_name_lower = class_name.lower()
        if class_name_lower in dataset.class_to_idx:
            category_indices[category].append(dataset.class_to_idx[class_name_lower])

# Count total classes across all categories
total_classes = sum(len(indices) for indices in category_indices.values())

# Print the total number of classes
print(f"Total number of classes: {total_classes}")

Total number of classes: 46


In [None]:
import pandas as pd
import os

# Load the original meta.csv file
meta_path = '/Users/sebasmos/Documents/VE_paper/data/ESC-50-master/meta/esc50.csv'
meta_df = pd.read_csv(meta_path)

# Define your new categories
categories = {
    "A": ["dog", "rooster", "pig", "cow", "frog", "cat", "hen", "insects", "sheep", "crow"],
    "N": ["rain", "sea waves", "crackling fire", "chirping birds", "water drops", "wind", "pouring water", "thunderstorm", "crickets", "toilet flush"],
    "H": ["crying baby", "sneezing", "clapping", "breathing", "coughing", "footsteps", "laughing", "brushing teeth", "snoring", "drinking sipping"],
    "I": ["door knock", "mouse click", "keyboard typing", "washing machine", "vacuum cleaner", "clock alarm", "clock tick", "door wood creaks", "can opening", "glass breaking"],
    "E": ["helicopter", "chainsaw", "car horn", "engine", "train", "siren", "fireworks", "hand saw", "airplane", "church bells"]
}

# Create a mapping from class to new category
class_to_category = {}
for category, classes in categories.items():
    for class_name in classes:
        class_to_category[class_name.lower()] = category

# Add a new column for the category
meta_df['new_category'] = meta_df['category'].str.lower().map(class_to_category)

# Generate the path for each mel spectrogram image
mel_spectrograms_dir = '/Users/sebasmos/Documents/VE_paper/data/ESC-50-master/Mels_folds_dataset'
meta_df['mel_path'] = meta_df.apply(lambda row: os.path.join(mel_spectrograms_dir, f'fold{row["fold"]}', f'{row["filename"].replace(".wav", ".png")}'), axis=1)

# Save the updated meta.csv with new columns
updated_meta_path = os.path.join(mel_spectrograms_dir, 'updated_esc50.csv')
meta_df.to_csv(updated_meta_path, index=False)

print(f"Updated meta.csv saved at: {updated_meta_path}")

In [31]:
category_indices

{'Animals': [18, 37, 34, 13, 25, 5, 29, 39, 16],
 'Natural soundscapes & water sounds': [36, 38, 14, 7, 48, 49, 35, 43, 15, 28],
 'Human, non-speech sounds': [17, 41, 9, 1, 32, 12, 42, 31, 2],
 'Interior/domestic sounds': [19, 33, 31, 47, 46, 44, 10, 26, 11, 20],
 'Exterior/urban noises': [4, 6, 22, 23, 45, 40, 27, 0, 24, 2]}

In [46]:
len(category_indices["H"])

9

In [14]:
category_indices["Animals"]

[18, 37, 34, 13, 25, 5, 29, 39, 16]

In [4]:
dataset.class_to_idx

{'airplane': 0,
 'breathing': 1,
 'brushing teeth': 2,
 'can opening': 3,
 'car horn': 4,
 'cat': 5,
 'chainsaw': 6,
 'chirping birds': 7,
 'church bells': 8,
 'clapping': 9,
 'clock alarm': 10,
 'clock tick': 11,
 'coughing': 12,
 'cow': 13,
 'crackling fire': 14,
 'crickets': 15,
 'crow': 16,
 'crying baby': 17,
 'dog': 18,
 'door wood creaks': 19,
 'door wood knock': 20,
 'drinking sipping': 21,
 'engine': 22,
 'fireworks': 23,
 'footsteps': 24,
 'frog': 25,
 'glass breaking': 26,
 'hand saw': 27,
 'helicopter': 28,
 'hen': 29,
 'insects': 30,
 'keyboard typing': 31,
 'laughing': 32,
 'mouse click': 33,
 'pig': 34,
 'pouring water': 35,
 'rain': 36,
 'rooster': 37,
 'sea waves': 38,
 'sheep': 39,
 'siren': 40,
 'sneezing': 41,
 'snoring': 42,
 'thunderstorm': 43,
 'toilet flush': 44,
 'train': 45,
 'vacuum cleaner': 46,
 'washing machine': 47,
 'water drops': 48,
 'wind': 49}

In [5]:
y = [x for x in dataset.classes]
y

['airplane',
 'breathing',
 'brushing teeth',
 'can opening',
 'car horn',
 'cat',
 'chainsaw',
 'chirping birds',
 'church bells',
 'clapping',
 'clock alarm',
 'clock tick',
 'coughing',
 'cow',
 'crackling fire',
 'crickets',
 'crow',
 'crying baby',
 'dog',
 'door wood creaks',
 'door wood knock',
 'drinking sipping',
 'engine',
 'fireworks',
 'footsteps',
 'frog',
 'glass breaking',
 'hand saw',
 'helicopter',
 'hen',
 'insects',
 'keyboard typing',
 'laughing',
 'mouse click',
 'pig',
 'pouring water',
 'rain',
 'rooster',
 'sea waves',
 'sheep',
 'siren',
 'sneezing',
 'snoring',
 'thunderstorm',
 'toilet flush',
 'train',
 'vacuum cleaner',
 'washing machine',
 'water drops',
 'wind']