In [1]:
from google.colab import drive, auth
import sys
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import librosa
import matplotlib.pyplot as plt
import csv
from IPython.display import Audio

#sklearn libraries
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

#tensorflow for modles
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Conv2D, MaxPooling2D, Flatten, concatenate, Reshape, BatchNormalization
import tensorflow_hub as hub
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import Callback,EarlyStopping
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras import metrics
tf.get_logger().setLevel('INFO')

#mount drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#declare sampling rate
sampling_rate = 16000
#get the audio path of all the audios
audio_path = '/content/drive/MyDrive/UCB-MIDS/SEM-2/MACHINE-LEARNING-207/207-Project/data/train/librosa_loaded/'

In [None]:
#Read the data and get the shapes of the dataframe
bird_df = pd.read_csv('/content/drive/MyDrive/UCB-MIDS/SEM-2/MACHINE-LEARNING-207/207-Project/notebooks/RG/3_species/train_val.csv')

#lets only keep audio files > 8 sec
bird_df = bird_df[bird_df['duration_secs_32000'] >= 8]
print(bird_df.shape)

bird_df.head(2)

(794, 9)


Unnamed: 0,primary_label,filename,type,filename_npy,rating,duration_secs_32000,country,continent,data
0,eaywag1,eaywag1/XC718442.ogg,blank,eaywag1/XC718442.npy,good,12.538781,FR,EUROPE,train
1,eaywag1,eaywag1/XC675682.ogg,call,eaywag1/XC675682.npy,good,35.657,RU,EUROPE,train


In [None]:
#Gather the training data from the main dataset
bird_train_df = bird_df[bird_df['data'] == 'train']
print("train data:",bird_train_df.shape)

#Gather the separate validation set from the csv
bird_val_df =  bird_df[bird_df['data'] == 'val']
print("validation data:",bird_val_df.shape)

train data: (560, 9)
validation data: (234, 9)


In [None]:
#Function to load the audio
def load_audio(file_name):
    audio = np.load(audio_path + file_name)
    return audio

In [None]:
#chunk the audio into 5 sec frame with no overlaps
def frame_audio(
      audio_array: np.ndarray,
      window_size_s: float = 8.0,
      hop_size_s: float = 4.0,
      sample_rate = sampling_rate,
      ) -> np.ndarray:

    """Helper function for framing audio for inference."""
    """ using tf.signal """
    if window_size_s is None or window_size_s < 0:
        return audio_array[np.newaxis, :]
    frame_length = int(window_size_s * sample_rate)
    hop_length = int(hop_size_s * sample_rate)
    framed_audio = tf.signal.frame(audio_array, frame_length, hop_length, pad_end=False)
    return framed_audio

In [None]:
def extract_mfcc_and_spectral_centroid_from_frames(framed_audio, class_label, sample_rate=16000, n_mfcc=20, n_chroma=12):
    mfcc_frames = []
    spectral_centroid_frames = []
    target_label = []

    for frame in framed_audio:
        frame = np.array(frame)

        # Extract MFCC from the main audio frame
        mfcc = librosa.feature.mfcc(y=frame, sr=sample_rate, n_mfcc=n_mfcc)
        transposed_mfcc = mfcc.T

        # Extract spectral centroid from the main audio frame
        spectral_centroid = librosa.feature.spectral_centroid(y=frame, sr=sample_rate)
        transposed_spectral_centroid = spectral_centroid.T

        # Append the features to the respective lists
        mfcc_frames.append(transposed_mfcc)
        spectral_centroid_frames.append(transposed_spectral_centroid)
        target_label.append(class_label)

    # Combine MFCC and spectral centroid features along the time axis
    combined_frames = [np.hstack((mfcc, spectral_centroid)) for mfcc, spectral_centroid in zip(mfcc_frames, spectral_centroid_frames)]
    combined_frames = np.array(combined_frames)

    # Convert the lists to numpy arrays
    target_label = np.array(target_label)

    return combined_frames, target_label

In [None]:
def extract_final_features_from_dataframe(df):
    mfccs = []
    target_labels = []
    for index, row in df.iterrows():
        file_name = str(row['filename_npy'])
        class_label = row["primary_label"]
        duration = row['duration_secs_32000']

        #For audios with duration >= 8 seconds, proceed as before
        audio = load_audio(file_name)  # Load the audio using the load_audio function
        framed_audio = frame_audio(audio)  # Frame the audio into 8-second segments

        # Extract MFCC and spectral centroid features for each frame
        mfcc_frames, target_label = extract_mfcc_and_spectral_centroid_from_frames(framed_audio, class_label, sample_rate=sampling_rate)

        mfccs.extend(mfcc_frames)
        target_labels.extend(target_label)
    return mfccs, target_labels

In [None]:
#encode the target variable
def encode_labels(y_train, y_val):
    # Perform one-hot encoding on y_train
    le = LabelEncoder()
    encoded_y_train = to_categorical(le.fit_transform(y_train))

    # Perform one-hot encoding on y_val
    le = LabelEncoder()
    encoded_y_val = to_categorical(le.fit_transform(y_val))

    return encoded_y_train, encoded_y_val

In [None]:
#scale data
def scale_data(data):
    # Reshape the data to 2D array
    num_samples, num_time_steps, num_features = data.shape
    data_reshaped = data.reshape(num_samples, num_features * num_time_steps)

    # Create the MinMaxScaler object and fit it to the data
    scaler = MinMaxScaler(feature_range=(0, 1))
    data_scaled = scaler.fit_transform(data_reshaped)

    # Reshape the scaled data back to its original shape
    data_scaled = data_scaled.reshape(num_samples, num_time_steps, num_features)

    return data_scaled

In [None]:
#create a function to shuffle the data
def shuffle_data(x, y, random_state=1234):
    num_samples = x.shape[0]
    random_indices = np.random.RandomState(random_state).permutation(num_samples)
    x_shuffled = x[random_indices]
    y_shuffled = y[random_indices]
    return x_shuffled, y_shuffled

In [None]:
# Extract the mfcc embeddings from the model
train_embeddings, labels = extract_final_features_from_dataframe(bird_train_df)

In [None]:
train_embeddings[:1]

[array([[-6.40857910e+02,  6.46771851e+01, -6.29885521e+01, ...,
         -1.08033371e+00, -1.50528407e+00,  1.98683053e+03],
        [-3.67023804e+02,  6.11631927e+01, -6.40725555e+01, ...,
         -1.08003788e+01, -6.31683826e+00,  2.06714511e+03],
        [-2.92503204e+02,  5.88846893e+01, -6.74059753e+01, ...,
         -7.55926609e+00, -2.87263107e+00,  2.07534286e+03],
        ...,
        [-2.91017456e+02,  6.15616760e+01, -5.74858971e+01, ...,
         -1.14104509e+00, -2.49208689e+00,  2.13842288e+03],
        [-2.91305878e+02,  5.37731171e+01, -5.42837334e+01, ...,
         -5.22096395e+00, -1.28884435e-01,  2.33472283e+03],
        [-3.29628143e+02,  4.81204491e+01, -4.91389999e+01, ...,
         -9.29372597e+00,  2.12649322e+00,  2.52854892e+03]])]

In [None]:
#verify the shapes
x_train = np.array(train_embeddings)
y_train = np.array(labels)
print(x_train.shape)
print(y_train.shape)

(4555, 251, 21)
(4555,)


In [None]:
#extract the embeddings
val_embeddings, val_labels = extract_final_features_from_dataframe(bird_val_df)

In [None]:
#display a sample embedding from the validation dataset
val_embeddings[:1]

[array([[-3.40613800e+02,  1.27008263e+02,  4.42717743e+01, ...,
         -6.83337808e-01,  1.10607815e+01,  7.48194361e+02],
        [-2.83616699e+02,  1.24639816e+02,  3.80632324e+01, ...,
          5.67251980e-01,  6.37503433e+00,  8.75218688e+02],
        [-2.75975647e+02,  1.24306831e+02,  3.49290543e+01, ...,
          2.29616117e+00,  3.15063739e+00,  9.94964999e+02],
        ...,
        [-2.73981232e+02,  1.36268066e+02,  5.17533417e+01, ...,
          6.51623189e-01,  2.89341331e+00,  8.93063267e+02],
        [-2.67342560e+02,  1.30733444e+02,  4.79498978e+01, ...,
          1.71484911e+00,  4.02341795e+00,  9.79677857e+02],
        [-2.83436401e+02,  1.16292267e+02,  3.49498978e+01, ...,
         -4.25243855e+00,  2.17064023e-02,  1.20689067e+03]])]

In [None]:
#verify the val data shapes
x_val = np.array(val_embeddings)
y_val = np.array(val_labels)
print(x_val.shape)
print(y_val.shape)

(1785, 251, 21)
(1785,)


In [None]:
#call the function to encode y
y_train_encoded, y_val_encoded = encode_labels(y_train, y_val)
#verify the shape
print(y_train_encoded.shape)
print(y_val_encoded.shape)

(4555, 3)
(1785, 3)


In [None]:
#scale the extracted embeddings
x_train_scaled = scale_data(x_train)
x_val_scaled = scale_data(x_val)
print(x_train_scaled[:1])

[[[0.33978738 0.68793538 0.28209973 ... 0.51217256 0.38610621 0.26785829]
  [0.59280273 0.66793327 0.28650357 ... 0.43383001 0.35844648 0.280442  ]
  [0.67914246 0.64029561 0.33524839 ... 0.528568   0.42202951 0.28287074]
  ...
  [0.64161968 0.68365215 0.38953686 ... 0.58838139 0.50795896 0.28877641]
  [0.65400476 0.6328816  0.38573987 ... 0.50833723 0.53667046 0.3188964 ]
  [0.65841814 0.62336723 0.39894912 ... 0.45432782 0.58814107 0.33660049]]]


In [None]:
#shuffle both train and validation sets
x_shuffled_train, y_shuffled_train = shuffle_data(x_train_scaled, y_train_encoded)
x_shuffled_val, y_shuffled_val = shuffle_data(x_val_scaled, y_val_encoded)

print(x_shuffled_train.shape)
print(y_shuffled_train.shape)

print(x_shuffled_val.shape)
print(y_shuffled_val.shape)

(4555, 251, 21)
(4555, 3)
(1785, 251, 21)
(1785, 3)


In [None]:
# Reshape x_shuffled_train and x_shuffled_val to 2D tabular format
x_train_reshaped = x_shuffled_train.reshape((x_shuffled_train.shape[0], -1))
x_val_reshaped = x_shuffled_val.reshape((x_shuffled_val.shape[0], -1))

In [None]:
# Convert the training and validation labels to integers (required for XGBoost)
y_train_int = np.argmax(y_shuffled_train, axis=1)
y_val_int = np.argmax(y_shuffled_val, axis=1)

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
# Convert the training and validation data to DMatrix format for XGBoost
d_train = xgb.DMatrix(x_train_reshaped, label=y_train_int)
d_val = xgb.DMatrix(x_val_reshaped, label=y_val_int)

#set params for xgboost
params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'nthread': 4,
    'seed': 42
}

# Train the XGBoost model
num_boost_round = 100
evallist = [(d_train, 'train'), (d_val, 'validation')]
model = xgb.train(params=params, dtrain=d_train, num_boost_round=num_boost_round, evals=evallist)

# Predict on the validation set
y_pred_int = model.predict(d_val)

# Calculate accuracy on the validation set
accuracy = accuracy_score(y_val_int, y_pred_int)
print(f"Validation Accuracy: {accuracy:.4f}")

[0]	train-mlogloss:1.03747	validation-mlogloss:1.09154
[1]	train-mlogloss:0.98148	validation-mlogloss:1.08813
[2]	train-mlogloss:0.93340	validation-mlogloss:1.08462
[3]	train-mlogloss:0.88641	validation-mlogloss:1.08250
[4]	train-mlogloss:0.84429	validation-mlogloss:1.07424
[5]	train-mlogloss:0.80713	validation-mlogloss:1.06910
[6]	train-mlogloss:0.77116	validation-mlogloss:1.06058
[7]	train-mlogloss:0.73748	validation-mlogloss:1.05533
[8]	train-mlogloss:0.70595	validation-mlogloss:1.05190
[9]	train-mlogloss:0.67674	validation-mlogloss:1.04584
[10]	train-mlogloss:0.65001	validation-mlogloss:1.04187
[11]	train-mlogloss:0.62158	validation-mlogloss:1.03905
[12]	train-mlogloss:0.59839	validation-mlogloss:1.03281
[13]	train-mlogloss:0.57585	validation-mlogloss:1.03116
[14]	train-mlogloss:0.55247	validation-mlogloss:1.03123
[15]	train-mlogloss:0.53139	validation-mlogloss:1.02755
[16]	train-mlogloss:0.51246	validation-mlogloss:1.02482
[17]	train-mlogloss:0.49171	validation-mlogloss:1.02019
[1

In [None]:
# Predict on the training set
y_train_pred_int = model.predict(d_train)

# Calculate accuracy on the training set
training_accuracy = accuracy_score(y_train_int, y_train_pred_int)
print(f"Training Accuracy: {training_accuracy:.4f}")

Training Accuracy: 1.0000


XGBoost is a boosting technique that sequentially creates decision trees, each tree improving upon the mistakes of the previous one. The final result is a sum of outputs from all the trees. The above model is overfitting

In [36]:
import xgboost as xgb
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Create and train the XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train_reshaped, y_train_int)

# Make predictions on training and validation datasets
y_pred_train = xgb_model.predict(x_train_reshaped)
y_pred_val = xgb_model.predict(x_val_reshaped)

# Define class labels
class_labels = ['barswa', 'comsan', 'eaywag1']

# Calculate and plot the confusion matrix for training data
confusion_matrix_train = confusion_matrix(y_shuffled_train, y_pred_train)
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_train, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Training Data')
plt.show()

# Print precision, recall, and F1-score for training data
print("Training Classification Report:")
print(classification_report(y_train_int, y_pred_train, digits=3, zero_division='warn'))

# Calculate and plot the confusion matrix for validation data
confusion_matrix_val = confusion_matrix(y_val_int, y_pred_val)
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_val, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Validation Data')
plt.show()

# Print precision, recall, and F1-score for validation data
print("Validation Classification Report:")
print(classification_report(y_shuffled_val, y_pred_val, digits=3, zero_division='warn'))


ValueError: ignored