In [2]:
# Standard libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import librosa
from google.colab import drive
import os

from librosa import load as lload
from librosa.feature import mfcc
from librosa.feature.inverse import mfcc_to_audio

# Mount Google Drive for access
drive.mount('/content/drive')

from sklearn.preprocessing import MinMaxScaler

# Base directory path
base_directory_path = '/content/drive/MyDrive/207-Project/notebooks/RG/3_species/librosa_loaded_sr16000/'

#data augmention
from librosa.effects import time_stretch, pitch_shift
from librosa.effects import time_stretch, pitch_shift
from scipy.ndimage import shift




Mounted at /content/drive


In [3]:

# Function to extract 5-second  chunks from audio
def extract_5sec_chunks(
    audio_array: np.ndarray,
    window_size_s: float = 5.0,
    hop_size_s: float = 2.5, #(with overlap)
    sample_rate=16000,
    # n_fft=2048,
    # n_mfcc=13,
) -> np.ndarray:
    frame_length = int(window_size_s * sample_rate)
    frame_step = int(hop_size_s * sample_rate)
    framed_audio = tf.signal.frame(audio_array, frame_length, frame_step, pad_end=False)

    return framed_audio



In [4]:


# Load metadata
dataset_path = '/content/drive/MyDrive/207-Project/notebooks/RG/3_species/'
metadata_path = os.path.join(dataset_path, "train_val.csv")
metadata_df = pd.read_csv(metadata_path)

# Create a dictionary to map filenames to labels
labels_dict = metadata_df.set_index('filename_npy')['primary_label'].to_dict()

# Get unique filenames from the metadata
filenames = set(base_directory_path + '/' + metadata_df['filename_npy'].unique())

# Split train and validation data
train_df = metadata_df[metadata_df['data'] == 'train']
validate_df = metadata_df[metadata_df['data'] == 'val']

In [5]:
def mfcc_chunks(dataframe: pd.DataFrame, sample_rate=16000, n_mfcc=13, n_fft=2048) -> tuple[np.ndarray, np.ndarray]:
    y = []
    X = []

    for i, row in dataframe.iterrows():
      label = row['primary_label']
      tensor_5sec = row['audio_chunks']

      for each in tensor_5sec:
        y.append(label)
        each = np.array(each)
        X_mfcc = mfcc(y=each, sr=sample_rate)
        X.append(X_mfcc)

    assert len(y) == len(X)

    y = np.array(y)
    X = np.array(X)



    return X, y

In [6]:
def mfcc_chunks_augment(dataframe, sample_rate=16000, n_mfcc=13, n_fft=2048):
    y = []
    X = []

    for i, row in dataframe.iterrows():
        label = row['primary_label']
        tensor_5sec = row['audio_chunks']
        duraion = row['duration_secs_32000']

        for each in tensor_5sec:
            y.append(label)
            each = np.array(each)

            # Data augmentation
            for _ in range(4):
                # Create augmented versions of the audio
                # time_stretched_audio = librosa.effects.time_stretch(each, rate=np.random.uniform(0.8, 1.2))
                pitch_shifted_audio = librosa.effects.pitch_shift(each, sr=sample_rate, n_steps=np.random.randint(-3, 4))
                # noisy_audio = each + 0.01 * np.random.randn(len(each))
                # shifted_audio = np.roll(each, np.random.randint(-500, 500))

                # # Compute MFCCs for each augmented audio and append to the list
                # X_mfcc_augmented = librosa.feature.mfcc(y=time_stretched_audio, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft)
                # X.append(X_mfcc_augmented)

                X_mfcc_augmented = librosa.feature.mfcc(y=pitch_shifted_audio, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft)
                X.append(X_mfcc_augmented)

                # X_mfcc_augmented = librosa.feature.mfcc(y=noisy_audio, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft)
                # X.append(X_mfcc_augmented)

                # X_mfcc_augmented = librosa.feature.mfcc(y=shifted_audio, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft)
                # X.append(X_mfcc_augmented)

                # Extend the label list for each augmented audio
                # y.extend([label] * 2)

    assert len(y) == len(X)

    y = np.array(y)
    X = np.array(X)

    return X, y









In [7]:
# Load all train audio data one time
train_audios = []
for filename in train_df['filename_npy']:
    audio = np.load('/content/drive/MyDrive/207-Project/data/train/librosa_loaded/' + filename)
    train_audios.append(audio)

In [8]:
# Load all validate audio data one time
val_audios = []
for filename in validate_df['filename_npy']:
    audio = np.load('/content/drive/MyDrive/207-Project/data/train/librosa_loaded/' + filename)
    val_audios.append(audio)

In [9]:
print(len(train_audios))
print(len(val_audios))

657
283


In [10]:
#Initialize an empty list to store the chunk data for training data
train_chunks = []

for audio in train_audios:
  train_chunks.append(extract_5sec_chunks(audio))

len(train_chunks)

657

In [11]:
#Initialize an empty list to store the chunk data for VALIDATION data
val_chunks = []

for audio in val_audios:
  val_chunks.append(extract_5sec_chunks(audio))

len(val_chunks)

283

In [12]:
#add chunks to trainig data
train_df['audio_chunks'] = train_chunks
train_df.head()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['audio_chunks'] = train_chunks


Unnamed: 0,primary_label,filename,type,filename_npy,rating,duration_secs_32000,country,continent,data,audio_chunks
0,eaywag1,eaywag1/XC718442.ogg,blank,eaywag1/XC718442.npy,good,12.538781,FR,EUROPE,train,"((tf.Tensor(8.509553e-06, shape=(), dtype=floa..."
1,eaywag1,eaywag1/XC675682.ogg,call,eaywag1/XC675682.npy,good,35.657,RU,EUROPE,train,"((tf.Tensor(-4.2310858e-06, shape=(), dtype=fl..."
2,eaywag1,eaywag1/XC722533.ogg,blank,eaywag1/XC722533.npy,good,58.104,RU,EUROPE,train,"((tf.Tensor(-6.66614e-06, shape=(), dtype=floa..."
3,eaywag1,eaywag1/XC673617.ogg,call,eaywag1/XC673617.npy,poor,18.756,GB,EUROPE,train,"((tf.Tensor(6.5648255e-06, shape=(), dtype=flo..."
4,eaywag1,eaywag1/XC675935.ogg,call,eaywag1/XC675935.npy,good,16.666,RU,EUROPE,train,"((tf.Tensor(-2.9578205e-06, shape=(), dtype=fl..."


In [13]:
#add chunks to validation data
validate_df['audio_chunks'] = val_chunks
validate_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validate_df['audio_chunks'] = val_chunks


Unnamed: 0,primary_label,filename,type,filename_npy,rating,duration_secs_32000,country,continent,data,audio_chunks
657,eaywag1,eaywag1/XC182483.ogg,call,eaywag1/XC182483.npy,good,21.072,KZ,ASIA,val,"((tf.Tensor(-7.202849e-06, shape=(), dtype=flo..."
658,eaywag1,eaywag1/XC675723.ogg,call,eaywag1/XC675723.npy,good,83.33,RU,EUROPE,val,"((tf.Tensor(-1.6016907e-05, shape=(), dtype=fl..."
659,eaywag1,eaywag1/XC589278.ogg,call,eaywag1/XC589278.npy,good,13.32,FR,EUROPE,val,"((tf.Tensor(-2.0342413e-06, shape=(), dtype=fl..."
660,eaywag1,eaywag1/XC467878.ogg,call,eaywag1/XC467878.npy,poor,6.582,BE,EUROPE,val,"((tf.Tensor(2.5765403e-06, shape=(), dtype=flo..."
661,eaywag1,eaywag1/XC634278.ogg,call,eaywag1/XC634278.npy,good,26.616,PT,EUROPE,val,"((tf.Tensor(7.290323e-06, shape=(), dtype=floa..."


In [14]:
%who


LabelEncoder	 MinMaxScaler	 StandardScaler	 audio	 base_directory_path	 dataset_path	 drive	 extract_5sec_chunks	 filename	 
filenames	 labels_dict	 layers	 librosa	 lload	 metadata_df	 metadata_path	 mfcc	 mfcc_chunks	 
mfcc_chunks_augment	 mfcc_to_audio	 models	 np	 os	 pd	 pitch_shift	 shift	 tf	 
time_stretch	 train_audios	 train_chunks	 train_df	 train_test_split	 val_audios	 val_chunks	 validate_df	 


In [15]:
del audio
del base_directory_path
del dataset_path
del drive
del extract_5sec_chunks
del filename
del filenames
# del labels_dict
del layers
del metadata_df
del metadata_path
del os
del train_audios
del train_chunks
del val_audios
del val_chunks
del train_test_split


In [16]:
import gc
gc.collect()

27631

In [17]:
%who


LabelEncoder	 MinMaxScaler	 StandardScaler	 gc	 labels_dict	 librosa	 lload	 mfcc	 mfcc_chunks	 
mfcc_chunks_augment	 mfcc_to_audio	 models	 np	 pd	 pitch_shift	 shift	 tf	 time_stretch	 
train_df	 validate_df	 


In [None]:
# call mfcc augment
X_train, y_train = mfcc_chunks_augment(train_df)

AssertionError: ignored

In [None]:
#convert to X and y with mfcc
X_val, y_val = mfcc_chunks(validate_df)

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

In [None]:
# validate_df.head()
print(train_df['primary_label'].unique())

In [None]:
del train_df
del validate_df
del mfcc
del mfcc_chunks
del mfcc_to_audio

In [None]:
import json

In [None]:
X_train_list = X_train.tolist()
y_train_list = y_train.tolist()

X_val_list = X_val.tolist()
y_val_list = y_val.tolist()

# Save data to JSON files
with open('/content/drive/MyDrive/My-207/X_train.json', 'w') as file:
    json.dump(X_train_list, file)

with open('/content/drive/MyDrive/My-207/y_train.json', 'w') as file:
    json.dump(y_train_list, file)

with open('/content/drive/MyDrive/My-207/X_val.json', 'w') as file:
    json.dump(X_val_list, file)

with open('/content/drive/MyDrive/My-207/y_val.json', 'w') as file:
    json.dump(y_val_list, file)

In [None]:
# Data normalization
# Create the scaler
scaler = MinMaxScaler()

# Flatten X_train into a 2-dimensional array
X_train = X_train.reshape(X_train.shape[0], -1)
# Fit and transform the training data
X_train = scaler.fit_transform(X_train)

# Flatten X_val into a 2-dimensional array
X_val = X_val.reshape(X_val.shape[0], -1)

# Transform the validation data using the scaler parameters from the training data
X_val = scaler.transform(X_val)

# Convert string labels to numerical labels for training and validation data
label_encoder = LabelEncoder()
all_labels = np.array(list(labels_dict.values()))
label_encoder.fit(all_labels)
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)


In [None]:
X_train_list = X_train.tolist()
y_train_list = y_train.tolist()

X_val_list = X_val.tolist()
y_val_list = y_val.tolist()

# Save data to JSON files
with open('/content/drive/MyDrive/My-207/X_train_n.json', 'w') as file:
    json.dump(X_train_list, file)

with open('/content/drive/MyDrive/My-207/y_train_n.json', 'w') as file:
    json.dump(y_train_list, file)

with open('/content/drive/MyDrive/My-207/X_val_n.json', 'w') as file:
    json.dump(X_val_list, file)

with open('/content/drive/MyDrive/My-207/y_val_n.json', 'w') as file:
    json.dump(y_val_list, file)

In [None]:
# # Data normalization
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train_normalized = scaler.fit_transform(X_train)
# X_val_normalized = scaler.transform(X_val)

# # Convert string labels to numerical labels for training and validation data
# label_encoder = LabelEncoder()
# all_labels = np.array(list(labels_dict.values()))
# label_encoder.fit(all_labels)
# y_train_encoded = label_encoder.transform(y_train)
# # y_val_encoded = label_encoder.transform(y_val)

# # # Train the model
# # history = model.fit(X_train_normalized, y_train_encoded, validation_data=(X_val_normalized, y_val_encoded), batch_size=32, epochs=10)

# # Data normalization
# from sklearn.preprocessing import StandardScaler
# # Reshape X_train to 2D array
# num_samples, num_chunks, num_mfcc_features = X_train.shape
# X_train_reshaped = X_train.reshape(-1, num_mfcc_features)

# # Now apply the StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train_reshaped)

# # Reshape X_val to 2D array
# num_samples, num_chunks, num_mfcc_features = X_val.shape
# X_val_reshaped = X_val.reshape(-1, num_mfcc_features)

# # Now apply the StandardScaler using the same scaler fitted on X_train
# X_val = scaler.transform(X_val_reshaped)

# # Convert string labels to numerical labels for training and validation data
# label_encoder = LabelEncoder()
# all_labels = np.array(list(labels_dict.values()))
# label_encoder.fit(all_labels)
# y_train = label_encoder.transform(y_train)
# y_val = label_encoder.transform(y_val)

In [None]:
# print(labels_dict)

In [None]:

# # Build network topology
# model = tf.keras.Sequential([
#     # Input layer
#     tf.keras.layers.Flatten(input_shape=X_train.shape[1:]),

#     # 1st dense layer
#     tf.keras.layers.Dense(512, activation='relu'),

#     # 2nd dense layer
#     tf.keras.layers.Dense(256, activation='relu'),

#     # 3rd dense layer
#     tf.keras.layers.Dense(64, activation='relu'),

#     # Output layer
#     tf.keras.layers.Dense(3, activation='softmax')
# ])


In [None]:

# # Compile model
# optimiser = tf.keras.optimizers.Adam(learning_rate=0.0001)
# model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# # Summary of the model
# model.summary()


In [None]:

# # # Train the model
# history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=10)

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)