In [1]:
# Standard libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import librosa
from google.colab import drive
import os

from librosa import load as lload
from librosa.feature import mfcc
from librosa.feature.inverse import mfcc_to_audio

# Mount Google Drive for access
drive.mount('/content/drive')

from sklearn.preprocessing import MinMaxScaler

# Base directory path
base_directory_path = '/content/drive/MyDrive/207-Project/notebooks/RG/3_species/librosa_loaded_sr16000/'



Mounted at /content/drive


In [2]:

# Function to extract 5-second  chunks from audio
def extract_5sec_chunks(
    audio_array: np.ndarray,
    window_size_s: float = 5.0,
    hop_size_s: float = 2.5, #(with overlap)
    sample_rate=16000,
    # n_fft=2048,
    # n_mfcc=13,
) -> np.ndarray:
    frame_length = int(window_size_s * sample_rate)
    frame_step = int(hop_size_s * sample_rate)
    framed_audio = tf.signal.frame(audio_array, frame_length, frame_step, pad_end=False)

    return framed_audio



In [3]:


# Load metadata
dataset_path = '/content/drive/MyDrive/207-Project/notebooks/RG/3_species/'
metadata_path = os.path.join(dataset_path, "train_val.csv")
metadata_df = pd.read_csv(metadata_path)
metadata_df = metadata_df[metadata_df['duration_secs_32000'] >= 8]

# Create a dictionary to map filenames to labels
labels_dict = metadata_df.set_index('filename_npy')['primary_label'].to_dict()

# Get unique filenames from the metadata
filenames = set(base_directory_path + '/' + metadata_df['filename_npy'].unique())

# Split train and validation data
train_df = metadata_df[metadata_df['data'] == 'train']
validate_df = metadata_df[metadata_df['data'] == 'val']

In [4]:
def extract_final_features_from_dataframe(df):
    mfccs = []
    target_labels = []
    for index, row in df.iterrows():
        class_label = row["primary_label"]

        # Extract MFCC and spectral centroid features for each frame
        mfcc_frames, target_label = extract_mfcc_and_spectral_centroid_from_frames(framed_audio, class_label, sample_rate=16000)

        mfccs.extend(mfcc_frames)
        target_labels.extend(target_label)
    return mfccs, target_labels

In [5]:
#scale data
def scale_data(data):
    # Reshape the data to 2D array
    num_samples, num_time_steps, num_features = data.shape
    data_reshaped = data.reshape(num_samples, num_features * num_time_steps)

    # Create the MinMaxScaler object and fit it to the data
    scaler = MinMaxScaler(feature_range=(0, 1))
    data_scaled = scaler.fit_transform(data_reshaped)

    # Reshape the scaled data back to its original shape
    data_scaled = data_scaled.reshape(num_samples, num_time_steps, num_features)

    return data_scaled

In [6]:
# def mfcc_chunks(dataframe: pd.DataFrame, sample_rate=16000, n_mfcc=13, n_fft=2048) -> tuple[np.ndarray, np.ndarray]:
    # y = []
    # X = []

    # for i, row in dataframe.iterrows():
    #   label = row['primary_label']
    #   tensor_5sec = row['audio_chunks']

    #   for each in tensor_5sec:
    #     y.append(label)
    #     each = np.array(each)
    #     X_mfcc = mfcc(y=each, sr=sample_rate)
    #     X.append(X_mfcc)

    # assert len(y) == len(X)

    # y = np.array(y)
    # X = np.array(X)



    # return X, y

def extract_mfcc_and_spectral_centroid_from_frames(framed_audio, class_label, sample_rate=16000, n_mfcc=20, n_chroma=12):
    mfcc_frames = []
    spectral_centroid_frames = []
    target_label = []

    for frame in framed_audio:
        frame = np.array(frame)

        # Extract MFCC from the main audio frame
        mfcc = librosa.feature.mfcc(y=frame, sr=sample_rate, n_mfcc=n_mfcc)
        transposed_mfcc = mfcc.T

        # Extract spectral centroid from the main audio frame
        spectral_centroid = librosa.feature.spectral_centroid(y=frame, sr=sample_rate)
        transposed_spectral_centroid = spectral_centroid.T

        # Append the features to the respective lists
        mfcc_frames.append(transposed_mfcc)
        spectral_centroid_frames.append(transposed_spectral_centroid)
        target_label.append(class_label)

    # Combine MFCC and spectral centroid features along the time axis
    combined_frames = [np.hstack((mfcc, spectral_centroid)) for mfcc, spectral_centroid in zip(mfcc_frames, spectral_centroid_frames)]
    combined_frames = np.array(combined_frames)

    # Convert the lists to numpy arrays
    target_label = np.array(target_label)

    return combined_frames, target_label

In [7]:
# Load all train audio data one time
train_audios = []
for filename in train_df['filename_npy']:
    audio = np.load('/content/drive/MyDrive/207-Project/data/train/librosa_loaded/' + filename)
    train_audios.append(audio)

In [None]:
# Load all validate audio data one time
val_audios = []
for filename in validate_df['filename_npy']:
    audio = np.load('/content/drive/MyDrive/207-Project/data/train/librosa_loaded/' + filename)
    val_audios.append(audio)

In [None]:
print(len(train_audios))
print(len(val_audios))

657
283


In [None]:
#Initialize an empty list to store the chunk data for training data
train_chunks = []

for audio in train_audios:
  train_chunks.append(extract_5sec_chunks(audio))

len(train_chunks)

657

In [None]:
#Initialize an empty list to store the chunk data for VALIDATION data
val_chunks = []

for audio in val_audios:
  val_chunks.append(extract_5sec_chunks(audio))

len(val_chunks)

283

In [None]:
# Extract the mfcc embeddings from the model
train_embeddings, labels = extract_final_features_from_dataframe(train_chunks)

In [None]:
#add chunks to trainig data
train_df['audio_chunks'] = train_chunks
train_df.head()
train_df = train_df.sample(frac=1, random_state=1234)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['audio_chunks'] = train_chunks


In [None]:
#add chunks to validation data
validate_df['audio_chunks'] = val_chunks
validate_df.head()
validate_df = validate_df.sample(frac=1, random_state=1234)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validate_df['audio_chunks'] = val_chunks


In [None]:
%who


LabelEncoder	 MinMaxScaler	 StandardScaler	 audio	 base_directory_path	 dataset_path	 drive	 extract_5sec_chunks	 filename	 
filenames	 labels_dict	 layers	 librosa	 lload	 metadata_df	 metadata_path	 mfcc	 mfcc_chunks	 
mfcc_to_audio	 models	 np	 os	 pd	 tf	 train_audios	 train_chunks	 train_df	 
train_test_split	 val_audios	 val_chunks	 validate_df	 


In [None]:
del audio
del base_directory_path
del dataset_path
del drive
del extract_5sec_chunks
del filename
del filenames
# del labels_dict
del layers
del metadata_df
del metadata_path
del os
del train_audios
del train_chunks
del val_audios
del val_chunks
del train_test_split


In [None]:
import gc
gc.collect()

28549

In [None]:
%who


LabelEncoder	 MinMaxScaler	 StandardScaler	 gc	 labels_dict	 librosa	 lload	 mfcc	 mfcc_chunks	 
mfcc_to_audio	 models	 np	 pd	 tf	 train_df	 validate_df	 


In [None]:
#convert to X and y
X_train, y_train = mfcc_chunks(train_df)

In [None]:
#convert to X and y
X_val, y_val = mfcc_chunks(validate_df)

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (7889, 20, 157)
y_train shape: (7889,)
X_val shape: (3121, 20, 157)
y_val shape: (3121,)


In [None]:
# validate_df.head()
print(train_df['primary_label'].unique())

['comsan' 'eaywag1' 'barswa']


In [None]:
del train_df
del validate_df
del mfcc
del mfcc_chunks
del mfcc_to_audio

In [None]:
import json

In [None]:
X_train_list = X_train.tolist()
y_train_list = y_train.tolist()

X_val_list = X_val.tolist()
y_val_list = y_val.tolist()

# Save data to JSON files
with open('/content/drive/MyDrive/My-207/X_train_mfcc_aug.json', 'w') as file:
    json.dump(X_train_list, file)

with open('/content/drive/MyDrive/My-207/y_train_mfcc_aug.json', 'w') as file:
    json.dump(y_train_list, file)

with open('/content/drive/MyDrive/My-207/X_val_mfcc_aug.json', 'w') as file:
    json.dump(X_val_list, file)

with open('/content/drive/MyDrive/My-207/y_val_mfcc_aug.json', 'w') as file:
    json.dump(y_val_list, file)

In [None]:
# Data normalization
# Create the scaler
scaler = MinMaxScaler()

# Flatten X_train into a 2-dimensional array
X_train = X_train.reshape(X_train.shape[0], -1)
# Fit and transform the training data
X_train = scaler.fit_transform(X_train)

# Flatten X_val into a 2-dimensional array
X_val = X_val.reshape(X_val.shape[0], -1)

# Transform the validation data using the scaler parameters from the training data
X_val = scaler.transform(X_val)

# Convert string labels to numerical labels for training and validation data
label_encoder = LabelEncoder()
all_labels = np.array(list(labels_dict.values()))
label_encoder.fit(all_labels)
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)


In [None]:
X_train_list = X_train.tolist()
y_train_list = y_train.tolist()

X_val_list = X_val.tolist()
y_val_list = y_val.tolist()

# Save data to JSON files
with open('/content/drive/MyDrive/My-207/X_train_n.json', 'w') as file:
    json.dump(X_train_list, file)

with open('/content/drive/MyDrive/My-207/y_train_n.json', 'w') as file:
    json.dump(y_train_list, file)

with open('/content/drive/MyDrive/My-207/X_val_n.json', 'w') as file:
    json.dump(X_val_list, file)

with open('/content/drive/MyDrive/My-207/y_val_n.json', 'w') as file:
    json.dump(y_val_list, file)

In [None]:
# # Data normalization
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train_normalized = scaler.fit_transform(X_train)
# X_val_normalized = scaler.transform(X_val)

# # Convert string labels to numerical labels for training and validation data
# label_encoder = LabelEncoder()
# all_labels = np.array(list(labels_dict.values()))
# label_encoder.fit(all_labels)
# y_train_encoded = label_encoder.transform(y_train)
# # y_val_encoded = label_encoder.transform(y_val)

# # # Train the model
# # history = model.fit(X_train_normalized, y_train_encoded, validation_data=(X_val_normalized, y_val_encoded), batch_size=32, epochs=10)

# # Data normalization
# from sklearn.preprocessing import StandardScaler
# # Reshape X_train to 2D array
# num_samples, num_chunks, num_mfcc_features = X_train.shape
# X_train_reshaped = X_train.reshape(-1, num_mfcc_features)

# # Now apply the StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train_reshaped)

# # Reshape X_val to 2D array
# num_samples, num_chunks, num_mfcc_features = X_val.shape
# X_val_reshaped = X_val.reshape(-1, num_mfcc_features)

# # Now apply the StandardScaler using the same scaler fitted on X_train
# X_val = scaler.transform(X_val_reshaped)

# # Convert string labels to numerical labels for training and validation data
# label_encoder = LabelEncoder()
# all_labels = np.array(list(labels_dict.values()))
# label_encoder.fit(all_labels)
# y_train = label_encoder.transform(y_train)
# y_val = label_encoder.transform(y_val)

In [None]:
# print(labels_dict)

In [None]:

# # Build network topology
# model = tf.keras.Sequential([
#     # Input layer
#     tf.keras.layers.Flatten(input_shape=X_train.shape[1:]),

#     # 1st dense layer
#     tf.keras.layers.Dense(512, activation='relu'),

#     # 2nd dense layer
#     tf.keras.layers.Dense(256, activation='relu'),

#     # 3rd dense layer
#     tf.keras.layers.Dense(64, activation='relu'),

#     # Output layer
#     tf.keras.layers.Dense(3, activation='softmax')
# ])


In [None]:

# # Compile model
# optimiser = tf.keras.optimizers.Adam(learning_rate=0.0001)
# model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# # Summary of the model
# model.summary()


In [None]:

# # # Train the model
# history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=10)

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (7889, 3140)
y_train shape: (7889,)
X_val shape: (3121, 3140)
y_val shape: (3121,)
