# Environmental Sound Dataset - Data Exploration

<!--
Project: Environmental Sound Dataset
Website: https://rskworld.in
Founded by: Molla Samser
Designer & Tester: Rima Khatun
Email: help@rskworld.in
Phone: +91 93305 39277
-->

This notebook demonstrates how to explore and analyze the Environmental Sound Dataset.


In [None]:
import sys
import os
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from load_data import load_environmental_sounds, get_class_distribution
from analyze import get_dataset_statistics

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the same sheet under the plain 'seaborn' name.
# Pick whichever one this installation actually provides so the cell runs on both.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")


## Load Dataset


In [None]:
# Load the 'train' split; the loader returns the samples and their labels
# (presumably aligned one-to-one — confirm in load_data).
train_data, train_labels = load_environmental_sounds('train')
print(f"Loaded {len(train_data)} training samples")

# Deduplicate the labels once and reuse the result for both summary lines
unique_labels = set(train_labels)
print(f"Number of classes: {len(unique_labels)}")
print(f"Classes: {unique_labels}")


## Class Distribution


In [None]:
# Build the class -> value mapping for the training labels; the values are
# plotted below as the number of samples per class.
class_dist = get_class_distribution(train_labels)
classes, counts = list(class_dist.keys()), list(class_dist.values())

# Draw the bar chart through an explicit Figure/Axes pair
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(classes, counts)
ax.set(
    xlabel='Class',
    ylabel='Number of Samples',
    title='Class Distribution in Training Set',
)

# Slant the tick labels and right-align them so long class names stay readable
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


## Dataset Statistics


In [None]:
# Fetch the dataset-wide summary; it is indexed below like a dict with the keys
# total_files, total_duration, avg_duration, min_duration, max_duration,
# classes and formats.
stats = get_dataset_statistics()

print("Dataset Statistics:")
print(f"Total files: {stats['total_files']}")

# Report the total length in both seconds and minutes
total_seconds = stats['total_duration']
total_minutes = total_seconds / 60
print(f"Total duration: {total_seconds:.2f} seconds ({total_minutes:.2f} minutes)")

# The three per-file duration figures share one output format
for label, key in (
    ('Average', 'avg_duration'),
    ('Min', 'min_duration'),
    ('Max', 'max_duration'),
):
    print(f"{label} duration: {stats[key]:.2f} seconds")

# Only the class names are shown; the values stored under each class are not printed
class_names = list(stats['classes'].keys())
print(f"\nClasses: {class_names}")
print(f"Formats: {stats['formats']}")


## Audio Duration Analysis


In [None]:
import librosa

# Sample rate (Hz) used to turn a clip's sample count into seconds.
# NOTE(review): assumes every clip in train_data is at 22050 Hz — confirm
# against the rate load_environmental_sounds actually loads at.
SAMPLE_RATE = 22050

# Calculate durations: number of samples in each clip divided by the sample rate
durations = [len(audio) / SAMPLE_RATE for audio in train_data]

# Compute each summary statistic once and reuse it for the plot and the printout
mean_duration = np.mean(durations)
median_duration = np.median(durations)
std_duration = np.std(durations)

# Plot histogram with a dashed marker at the mean
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(durations, bins=50, edgecolor='black')
ax.set_xlabel('Duration (seconds)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Audio Durations')
ax.axvline(mean_duration, color='r', linestyle='--', label=f'Mean: {mean_duration:.2f}s')
ax.legend()
fig.tight_layout()
plt.show()

print(f"Mean duration: {mean_duration:.2f} seconds")
print(f"Median duration: {median_duration:.2f} seconds")
print(f"Std duration: {std_duration:.2f} seconds")


## Feature Extraction Example


In [None]:
from load_data import prepare_features

# Run the feature extractor on at most the first 100 training clips,
# requesting MFCC features with n_mfcc=13
features = prepare_features(train_data[:100], feature_type='mfcc', n_mfcc=13)

feature_shape = features.shape
print(f"Feature shape: {feature_shape}")
print(f"Number of features per sample: {feature_shape[1]}")

# Heatmap of the feature matrix; it is transposed so that axis 0 of `features`
# runs along x (labelled as the sample index) and axis 1 along y
fig, ax = plt.subplots(figsize=(12, 6))
heatmap = ax.imshow(features.T, aspect='auto', origin='lower', cmap='viridis')
fig.colorbar(heatmap, ax=ax, label='MFCC Value')
ax.set(
    xlabel='Sample Index',
    ylabel='MFCC Coefficient',
    title='MFCC Features Visualization',
)
fig.tight_layout()
plt.show()
