In [2]:
# Cell 1: Install necessary libraries
!pip install librosa soundfile datasets codecarbon



In [3]:
# Cell 2: Import necessary libraries
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, IterableDataset
import torch.optim.lr_scheduler as lr_scheduler
from datasets import load_dataset
from huggingface_hub import login
from pathlib import Path
from codecarbon import EmissionsTracker  # For energy monitoring

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Cell 3: Sign in to Hugging Face for datasets
# SECURITY: never store an access token in the notebook. The token is read
# from the HF_TOKEN environment variable, falling back to an interactive
# prompt whose input is not echoed and never saved with the notebook.
# The token that used to be hardcoded in this cell has been exposed and
# should be revoked from the Hugging Face account settings.
import os
from getpass import getpass

token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token)

In [5]:
# Cell 4: Load dataset in streaming mode
# `dataset` is indexed by split name below ('train' here, 'test' in later cells).
# NOTE(review): streaming=True presumably makes records load lazily while
# iterating instead of downloading the whole corpus first - confirm against
# the `datasets` documentation.
dataset = load_dataset("rfcx/frugalai", streaming=True)

# Limit the number of samples for training
# Only the 'train' split is truncated; the 'test' split is left untouched.
max_samples = 36000  # Upper bound on training records; adjust based on your needs
dataset['train'] = dataset['train'].take(max_samples)

In [6]:
# Sanity check: pull the first training record from the stream and display
# the Python type of its decoded audio payload.
type(next(iter(dataset['train']))['audio']['array'])

numpy.ndarray

In [7]:
# Cell 5: Define the SpectrogramIterableDataset class with advanced preprocessing
class SpectrogramIterableDataset(IterableDataset):
    """Stream ``(features, label)`` pairs computed from raw audio records.

    Every record of the wrapped iterable is read through
    ``sample['audio']['array']``, ``sample['audio']['sampling_rate']`` and
    ``sample['label']``. The audio is turned into a normalised log-mel
    spectrogram, stacked with its delta and delta-delta, padded/cropped to
    ``target_size`` and flattened into a 1-D float32 tensor.

    Args:
        iterable_dataset: Iterable of records with the layout described above.
        n_fft: FFT window size; records shorter than this are skipped.
        hop_length: Hop between successive STFT frames.
        n_mels: Number of mel bands.
        target_size: ``(n_mel_rows, n_frames)`` each feature channel is
            padded or cropped to.
        augment: When True (the default, matching the historical behaviour)
            random noise, time-stretch, pitch-shift and spectrogram masking
            are applied. Pass False for evaluation data so that repeated
            evaluations see identical features.
    """

    def __init__(self, iterable_dataset, n_fft=1024, hop_length=256, n_mels=64,
                 target_size=(64, 64), augment=True):
        self.dataset = iterable_dataset
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.target_size = target_size
        self.augment = augment

    def add_noise(self, audio, noise_level=0.005):
        """Return ``audio`` plus Gaussian noise scaled by ``noise_level``."""
        noise = np.random.randn(len(audio)) * noise_level
        return audio + noise

    def time_stretch(self, audio, rate=1.0):
        """Apply time-stretching to the audio signal."""
        return librosa.effects.time_stretch(audio, rate=rate)

    def pitch_shift(self, audio, sampling_rate, n_steps=2):
        """Apply pitch-shifting to the audio signal."""
        return librosa.effects.pitch_shift(audio, sr=sampling_rate, n_steps=n_steps)

    @staticmethod
    def _mask_axis(spectrogram, axis, max_width):
        """Zero one random contiguous band of ``spectrogram`` along ``axis``.

        The band is 1..``max_width`` bins wide but always leaves at least one
        bin unmasked. If the axis is too short for that (fewer than two bins)
        the spectrogram is returned untouched instead of raising. The input
        array is modified in place and returned.
        """
        size = spectrogram.shape[axis]
        widest = min(max_width, size - 1)
        if widest < 1:
            return spectrogram
        width = np.random.randint(1, widest + 1)
        # randint's upper bound is exclusive, hence the +1 so that the band
        # may also end on the very last bin.
        start = np.random.randint(0, size - width + 1)
        index = [slice(None)] * spectrogram.ndim
        index[axis] = slice(start, start + width)
        spectrogram[tuple(index)] = 0
        return spectrogram

    def frequency_masking(self, spectrogram, max_mask_freq=16):
        """Randomly zero a band of up to ``max_mask_freq`` rows (axis 0), in place."""
        return self._mask_axis(spectrogram, axis=0, max_width=max_mask_freq)

    def time_masking(self, spectrogram, max_mask_time=16):
        """Randomly zero a band of up to ``max_mask_time`` columns (axis 1), in place."""
        return self._mask_axis(spectrogram, axis=1, max_width=max_mask_time)

    def compute_delta_features(self, spectrogram, width=9):
        """Stack the spectrogram with its delta and delta-delta along a new axis 0.

        ``width`` is the number of frames used for each derivative estimate
        (9 is librosa's default, so the default behaviour is unchanged).
        """
        # librosa's default 'interp' mode rejects inputs with fewer frames
        # than `width`; fall back to edge padding for such very short clips.
        # NOTE(review): 'nearest' is a documented librosa padding mode - confirm
        # it is acceptable for the shortest clips in this corpus.
        mode = "interp" if spectrogram.shape[-1] >= width else "nearest"
        delta = librosa.feature.delta(spectrogram, width=width, mode=mode)
        delta_delta = librosa.feature.delta(spectrogram, width=width, order=2, mode=mode)
        return np.stack([spectrogram, delta, delta_delta], axis=0)

    def process_audio(self, audio_array, sampling_rate):
        """Convert one waveform into a flat float32 feature tensor.

        Returns:
            A 1-D ``torch.float32`` tensor of length
            ``3 * target_size[0] * target_size[1]``, or ``None`` when the
            waveform has fewer than ``n_fft`` samples (the caller skips it).
        """
        # Skip records that are too short
        if len(audio_array) < self.n_fft:
            return None

        # Data augmentation on the waveform: noise, time-stretch, pitch-shift
        if self.augment:
            audio_array = self.add_noise(audio_array)
            audio_array = self.time_stretch(audio_array, rate=np.random.uniform(0.9, 1.1))
            # Upper bound is exclusive: 3 gives a symmetric shift of -2..+2 semitones.
            audio_array = self.pitch_shift(audio_array, sampling_rate, n_steps=np.random.randint(-2, 3))

        # Generate Mel spectrogram and convert to log scale (dB)
        mel_spectrogram = librosa.feature.melspectrogram(
            y=audio_array, sr=sampling_rate, n_fft=self.n_fft,
            hop_length=self.hop_length, n_mels=self.n_mels
        )
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

        # Normalize to zero mean and unit variance; a constant spectrogram
        # (std == 0) becomes all zeros instead of dividing by zero.
        std = np.std(log_mel_spectrogram)
        if std > 0:
            log_mel_spectrogram = (log_mel_spectrogram - np.mean(log_mel_spectrogram)) / std
        else:
            log_mel_spectrogram = np.zeros_like(log_mel_spectrogram)

        # Data augmentation on the spectrogram: frequency and time masking
        if self.augment:
            log_mel_spectrogram = self.frequency_masking(log_mel_spectrogram)
            log_mel_spectrogram = self.time_masking(log_mel_spectrogram)

        # Add delta channels -> shape (3, n_mels, n_frames)
        features = self.compute_delta_features(log_mel_spectrogram)

        # Pad/crop the time axis, then the mel axis, to the target size
        features = librosa.util.fix_length(features, size=self.target_size[1], axis=2)
        features = librosa.util.fix_length(features, size=self.target_size[0], axis=1)

        # Flatten into a 1D feature vector, shape: (n_features,)
        return torch.tensor(features.flatten(), dtype=torch.float32)

    def __iter__(self):
        """Yield ``(features, label)`` for every record that is long enough.

        Once the wrapped dataset is exhausted, the number of skipped records
        is printed.
        """
        skipped_count = 0
        for sample in iter(self.dataset):
            audio_array = sample['audio']['array']
            sampling_rate = sample['audio']['sampling_rate']
            label = sample['label']

            # Process audio to spectrogram
            spectrogram = self.process_audio(audio_array, sampling_rate)

            # Skip records that are too short
            if spectrogram is None:
                skipped_count += 1
                continue

            yield spectrogram, label

        print(f"Skipped {skipped_count} records due to short audio length.")

    def __len__(self):
        """Count the records of the wrapped dataset.

        This makes a full pass over ``self.dataset`` on every call, and it
        counts raw records, including the too-short ones ``__iter__`` skips.
        """
        return sum(1 for _ in iter(self.dataset))

In [10]:
max_samples = 10000

# Both splits go through the same spectrogram wrapper and are served in small
# batches, in stream order, from the main process.
batch_size = 16  # Reduce from 32

# Training split
wrapped_train_dataset = SpectrogramIterableDataset(dataset['train'])
train_loader = DataLoader(wrapped_train_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

# Test split
wrapped_test_dataset = SpectrogramIterableDataset(dataset['test'])
batch_size = 16  # Reduce from 32
test_loader = DataLoader(wrapped_test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)


In [9]:
# Train a random forest on the flattened spectrogram features, score it on the
# test split, and measure the emissions of the whole run with CodeCarbon.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize CodeCarbon tracker
tracker = EmissionsTracker()
tracker.start()
try:
    # Collect flattened spectrograms and labels
    X_train = []
    y_train = []

    for spectrogram, label in train_loader:
        X_train.append(spectrogram.numpy().reshape(spectrogram.shape[0], -1))  # One row per sample
        y_train.append(label.numpy())

    # Convert the per-batch lists to NumPy arrays
    X_train = np.vstack(X_train)  # (n_samples, n_features)
    y_train = np.hstack(y_train)  # (n_samples,)

    # Initialize and train the RandomForestClassifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Collect test data
    X_test = []
    y_test = []

    for spectrogram, label in test_loader:
        X_test.append(spectrogram.numpy().reshape(spectrogram.shape[0], -1))  # One row per sample
        y_test.append(label.numpy())

    X_test = np.vstack(X_test)
    y_test = np.hstack(y_test)

    # Predict and compute accuracy
    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"RandomForestClassifier Accuracy: {accuracy * 100:.2f}%")
finally:
    # Always stop the tracker, even if data collection or training fails,
    # so that it does not keep running in the kernel.
    emissions = tracker.stop()
print(f"Training completed. Total CO2 emissions: {emissions} kg")

[codecarbon INFO @ 22:04:54] [setup] RAM Tracking...
[codecarbon INFO @ 22:04:54] [setup] CPU Tracking...
 Mac OS detected: Please install Intel Power Gadget or enable PowerMetrics sudo to measure CPU

[codecarbon INFO @ 22:05:01] CPU Model on constant consumption mode: Intel(R) Core(TM) i5-8257U CPU @ 1.40GHz
[codecarbon INFO @ 22:05:01] [setup] GPU Tracking...
[codecarbon INFO @ 22:05:01] No GPU found.
[codecarbon INFO @ 22:05:01] >>> Tracker's metadata:
[codecarbon INFO @ 22:05:01]   Platform system: macOS-10.16-x86_64-i386-64bit
[codecarbon INFO @ 22:05:01]   Python version: 3.11.7
[codecarbon INFO @ 22:05:01]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 22:05:01]   Available RAM : 8.000 GB
[codecarbon INFO @ 22:05:01]   CPU count: 8
[codecarbon INFO @ 22:05:01]   CPU model: Intel(R) Core(TM) i5-8257U CPU @ 1.40GHz
[codecarbon INFO @ 22:05:01]   GPU count: None
[codecarbon INFO @ 22:05:01]   GPU model: None
[codecarbon INFO @ 22:05:02] Saving emissions data to file /Users/sarahle

Skipped 1 records due to short audio length.


[codecarbon INFO @ 22:12:32] Energy consumed for RAM : 0.000375 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 22:12:32] Energy consumed for all CPUs : 0.000938 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 22:12:32] 0.001313 kWh of electricity used since the beginning.
[codecarbon INFO @ 22:12:47] Energy consumed for RAM : 0.000388 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 22:12:47] Energy consumed for all CPUs : 0.000969 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 22:12:47] 0.001356 kWh of electricity used since the beginning.
[codecarbon INFO @ 22:13:02] Energy consumed for RAM : 0.000400 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 22:13:02] Energy consumed for all CPUs : 0.001000 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 22:13:02] 0.001400 kWh of electricity used since the beginning.
[codecarbon INFO @ 22:13:02] 0.000163 g.CO2eq/s mean an estimation of 5.153959266083935 kg.CO2eq/year
[codecarbon INFO @ 22:13:17] Energy consumed for RAM : 0.000413 kWh. RAM Power : 3.0 W
[codeca

Skipped 2 records due to short audio length.


[codecarbon INFO @ 22:22:12] Energy consumed for RAM : 0.000858 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 22:22:12] Energy consumed for all CPUs : 0.002145 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 22:22:12] 0.003002 kWh of electricity used since the beginning.


RandomForestClassifier Accuracy: 80.86%
Training completed. Total CO2 emissions: 0.0001682501698375308 kg


In [12]:
# Re-evaluate the already trained `rf_classifier` on the test split and measure
# the emissions of the evaluation alone.
# Relies on `rf_classifier` and `accuracy_score` from the training cell.
tracker = EmissionsTracker()
tracker.start()
try:
    # Collect test data
    X_test = []
    y_test = []

    for spectrogram, label in test_loader:
        X_test.append(spectrogram.numpy().reshape(spectrogram.shape[0], -1))  # One row per sample
        y_test.append(label.numpy())

    X_test = np.vstack(X_test)
    y_test = np.hstack(y_test)

    # Predict and compute accuracy
    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"RandomForestClassifier Accuracy: {accuracy * 100:.2f}%")
finally:
    # Always stop the tracker, even if evaluation fails.
    emissions = tracker.stop()
# No training happens in this cell, so the message reports an evaluation.
print(f"Evaluation completed. Total CO2 emissions: {emissions} kg")

[codecarbon INFO @ 22:37:42] [setup] RAM Tracking...
[codecarbon INFO @ 22:37:42] [setup] CPU Tracking...
 Mac OS detected: Please install Intel Power Gadget or enable PowerMetrics sudo to measure CPU

[codecarbon INFO @ 22:37:50] CPU Model on constant consumption mode: Intel(R) Core(TM) i5-8257U CPU @ 1.40GHz
[codecarbon INFO @ 22:37:50] [setup] GPU Tracking...
[codecarbon INFO @ 22:37:50] No GPU found.
[codecarbon INFO @ 22:37:50] >>> Tracker's metadata:
[codecarbon INFO @ 22:37:50]   Platform system: macOS-10.16-x86_64-i386-64bit
[codecarbon INFO @ 22:37:50]   Python version: 3.11.7
[codecarbon INFO @ 22:37:50]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 22:37:50]   Available RAM : 8.000 GB
[codecarbon INFO @ 22:37:50]   CPU count: 8
[codecarbon INFO @ 22:37:50]   CPU model: Intel(R) Core(TM) i5-8257U CPU @ 1.40GHz
[codecarbon INFO @ 22:37:50]   GPU count: None
[codecarbon INFO @ 22:37:50]   GPU model: None
[codecarbon INFO @ 22:37:52] Saving emissions data to file /Users/sarahle

Skipped 3 records due to short audio length.


[codecarbon INFO @ 22:49:56] Energy consumed for RAM : 0.000603 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 22:49:56] Energy consumed for all CPUs : 0.001507 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 22:49:56] 0.002110 kWh of electricity used since the beginning.


RandomForestClassifier Accuracy: 80.23%
Training completed. Total CO2 emissions: 0.00011825135238403398 kg


In [None]:
# Train a random forest on the flattened spectrogram features, score it on the
# test split, and measure the emissions of the run with CodeCarbon.
# The pipeline used to be pasted twice back to back in this cell, so the model
# was trained and tracked twice; a single pass is kept.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize CodeCarbon tracker
tracker = EmissionsTracker()
tracker.start()
try:
    # Collect flattened spectrograms and labels
    X_train = []
    y_train = []

    for spectrogram, label in train_loader:
        X_train.append(spectrogram.numpy().reshape(spectrogram.shape[0], -1))  # One row per sample
        y_train.append(label.numpy())

    # Convert the per-batch lists to NumPy arrays
    X_train = np.vstack(X_train)  # (n_samples, n_features)
    y_train = np.hstack(y_train)  # (n_samples,)

    # Initialize and train the RandomForestClassifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Collect test data
    X_test = []
    y_test = []

    for spectrogram, label in test_loader:
        X_test.append(spectrogram.numpy().reshape(spectrogram.shape[0], -1))  # One row per sample
        y_test.append(label.numpy())

    X_test = np.vstack(X_test)
    y_test = np.hstack(y_test)

    # Predict and compute accuracy
    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"RandomForestClassifier Accuracy: {accuracy * 100:.2f}%")
finally:
    # Always stop the tracker, even if data collection or training fails.
    emissions = tracker.stop()
print(f"Training completed. Total CO2 emissions: {emissions} kg")

import numpy as np
import torch
import librosa
import joblib
from torch.utils.data import IterableDataset, DataLoader
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ---------------------------------------
# Spectrogram Dataset Class
# ---------------------------------------
class SpectrogramIterableDataset(IterableDataset):
    """Stream ``(spectrogram, label)`` pairs computed from raw audio records.

    Every record of the wrapped iterable is read through
    ``sample['audio']['array']``, ``sample['audio']['sampling_rate']`` and
    ``sample['label']``. Each waveform becomes a log-mel spectrogram scaled
    to [0, 1], padded/cropped to ``target_size`` and returned as a float32
    tensor with a leading channel dimension.

    Note: this file defines another class with the same name earlier on;
    whichever definition is executed last is the one in effect.

    Args:
        iterable_dataset: Iterable of records with the layout described above.
        n_fft: FFT window size.
        hop_length: Hop between successive STFT frames.
        n_mels: Number of mel bands.
        target_size: ``(n_mel_rows, n_frames)`` of the returned spectrogram.
    """

    def __init__(self, iterable_dataset, n_fft=1024, hop_length=256, n_mels=64, target_size=(64, 64)):
        self.dataset = iterable_dataset
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.target_size = target_size

    @staticmethod
    def _min_max_normalize(values):
        """Scale ``values`` linearly to [0, 1]; a constant array maps to zeros."""
        lowest = np.min(values)
        span = np.max(values) - lowest
        if span > 0:
            return (values - lowest) / span
        # A constant spectrogram has no range to scale by; dividing would
        # fill the features with NaN.
        return np.zeros_like(values)

    def process_audio(self, audio_array, sampling_rate):
        """Convert audio to a normalised log-mel spectrogram tensor.

        Returns:
            A ``torch.float32`` tensor of shape
            ``(1, target_size[0], target_size[1])``.
        """
        mel_spectrogram = librosa.feature.melspectrogram(
            y=audio_array, sr=sampling_rate, n_fft=self.n_fft,
            hop_length=self.hop_length, n_mels=self.n_mels
        )
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

        # Normalize to [0,1]
        log_mel_spectrogram = self._min_max_normalize(log_mel_spectrogram)

        # Pad/crop the time axis, then the mel axis, to the target size
        log_mel_spectrogram = librosa.util.fix_length(log_mel_spectrogram, size=self.target_size[1], axis=1)
        log_mel_spectrogram = librosa.util.fix_length(log_mel_spectrogram, size=self.target_size[0], axis=0)

        return torch.tensor(log_mel_spectrogram, dtype=torch.float32).unsqueeze(0)  # Add channel dim

    def __iter__(self):
        """Yield ``(spectrogram, label)`` for every record of the wrapped dataset."""
        for sample in iter(self.dataset):
            audio_array = sample['audio']['array']
            sampling_rate = sample['audio']['sampling_rate']
            label = sample['label']

            spectrogram = self.process_audio(audio_array, sampling_rate)
            yield spectrogram, label

    def __len__(self):
        """Count the records of the wrapped dataset (a full pass on every call)."""
        return sum(1 for _ in iter(self.dataset))

# ---------------------------------------
# Random Forest Model Class
# ---------------------------------------
class RandomForestAudioClassifier:
    """Thin wrapper around a scikit-learn ``RandomForestClassifier`` for spectrograms.

    The underlying estimator is exposed as ``self.model``.
    """

    def __init__(self, n_estimators=100, random_state=42):
        self.model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    def extract_features(self, dataset):
        """Convert an iterable of ``(spectrogram, label)`` items to NumPy arrays.

        Items may be single samples (scalar label) or batches such as those
        produced by a ``DataLoader`` (one label per sample). The number of
        labels in an item decides how many rows its spectrogram tensor is
        split into, so a batch is never collapsed into one long feature row
        and a smaller final batch is handled correctly.

        Returns:
            ``(X, y)`` where ``X`` has shape ``(n_samples, n_features)`` and
            ``y`` has shape ``(n_samples,)``. Both are empty for an empty
            dataset.
        """
        feature_rows, label_rows = [], []
        for spectrogram, label in dataset:
            labels = np.asarray(label).reshape(-1)  # scalar -> (1,), batch -> (B,)
            feature_rows.append(spectrogram.numpy().reshape(labels.shape[0], -1))
            label_rows.append(labels)
        if not feature_rows:
            return np.empty((0, 0), dtype=np.float32), np.empty((0,), dtype=np.int64)
        return np.vstack(feature_rows), np.concatenate(label_rows)

    def train(self, dataset):
        """Fit the model on ``dataset`` and print the accuracy on that same data."""
        X_train, y_train = self.extract_features(dataset)
        self.model.fit(X_train, y_train)

        # Evaluate on training set
        y_train_pred = self.model.predict(X_train)
        train_acc = accuracy_score(y_train, y_train_pred)
        print(f"Training Accuracy: {train_acc:.4f}")

    def save_model(self, filename="/models/random_forest_model.pkl"):
        """Save the trained model to ``filename``, creating its folder if needed."""
        # joblib.dump does not create missing directories.
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self.model, filename)
        print(f"Model saved to {filename}")

    def load_model(self, filename="/models/random_forest_model.pkl"):
        """Load a pre-trained model from ``filename``.

        SECURITY: joblib files are pickle-based, so loading one can execute
        arbitrary code. Only load files from a trusted source.
        """
        self.model = joblib.load(filename)
        print(f"Model loaded from {filename}")

    def predict(self, spectrogram):
        """Return the predicted label for a single spectrogram tensor."""
        X = spectrogram.numpy().flatten().reshape(1, -1)
        return self.model.predict(X)[0]

# ---------------------------------------
# Main Training Routine
# ---------------------------------------
if __name__ == "__main__":
    # Train the random forest on the training split, save it, then report its
    # accuracy on the test split together with the measured emissions.
    tracker = EmissionsTracker()
    tracker.start()
    try:
        # Wrap the train IterableDataset
        wrapped_train_dataset = SpectrogramIterableDataset(dataset['train'])

        # Create DataLoader with smaller batch size
        batch_size = 16  # Reduce from 32
        train_loader = DataLoader(
            wrapped_train_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0  # Set to 0 if CPU resources are limited
        )

        # Wrap the test IterableDataset
        wrapped_test_dataset = SpectrogramIterableDataset(dataset['test'])

        # Create DataLoader with smaller batch size
        test_loader = DataLoader(
            wrapped_test_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0  # Set to 0 if CPU resources are limited
        )

        # Initialize and train the model.
        # The classifier is fed the per-sample datasets rather than the batched
        # loaders, so every item it receives is exactly one (spectrogram, label) pair.
        rf_classifier = RandomForestAudioClassifier(n_estimators=100)
        rf_classifier.train(wrapped_train_dataset)

        # Save the trained model
        rf_classifier.save_model()

        # Score the model on the held-out split. The result is stored under its
        # own name so the `accuracy_score` function is not overwritten.
        X_test, y_test = rf_classifier.extract_features(wrapped_test_dataset)
        y_pred = rf_classifier.model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
    finally:
        # Always stop the tracker, even if training or evaluation fails.
        emissions = tracker.stop()

    print(f"Training completed. Total CO2 emissions: {emissions} kg")
    print(f"Accuracy score: {test_accuracy:.4f}")