# Import libraries

In [None]:
# Standard libraries
import numpy as np
import pandas as pd
import time

# for audio
from IPython.display import Audio
import librosa

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# for modeling
import tensorflow as tf
from sklearn.metrics import classification_report
!pip install transformers
from transformers import AutoFeatureExtractor, ASTModel
import torch

# For visualization
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# For drive access
from google.colab import drive
import os
drive.mount('/content/drive')

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.1 MB/s[0m eta [36m0:00:0

# Load train data csv file

In [None]:
df = pd.read_csv('/content/drive/MyDrive/207/207-Project/notebooks/RG/3_species/train_val.csv')

df.head()

Unnamed: 0,primary_label,filename,type,filename_npy,rating,duration_secs_32000,country,continent,data
0,eaywag1,eaywag1/XC718442.ogg,blank,eaywag1/XC718442.npy,good,12.538781,FR,EUROPE,train
1,eaywag1,eaywag1/XC675682.ogg,call,eaywag1/XC675682.npy,good,35.657,RU,EUROPE,train
2,eaywag1,eaywag1/XC722533.ogg,blank,eaywag1/XC722533.npy,good,58.104,RU,EUROPE,train
3,eaywag1,eaywag1/XC673617.ogg,call,eaywag1/XC673617.npy,poor,18.756,GB,EUROPE,train
4,eaywag1,eaywag1/XC675935.ogg,call,eaywag1/XC675935.npy,good,16.666,RU,EUROPE,train


In [None]:
len(df)

940

In [None]:
val_df = df[df['data'] == 'val']
val_df = val_df.sample(frac=1, random_state=1234)
val_df.head

<bound method NDFrame.head of     primary_label              filename  type          filename_npy rating  \
822        comsan   comsan/XC669042.ogg  call   comsan/XC669042.npy   good   
887        barswa    barswa/XC57672.ogg  song    barswa/XC57672.npy   good   
798        comsan   comsan/XC665016.ogg  call   comsan/XC665016.npy   good   
839        comsan   comsan/XC648403.ogg  call   comsan/XC648403.npy   good   
825        comsan   comsan/XC636689.ogg  call   comsan/XC636689.npy   good   
..            ...                   ...   ...                   ...    ...   
800        comsan   comsan/XC638592.ogg  call   comsan/XC638592.npy   good   
809        comsan   comsan/XC493565.ogg  call   comsan/XC493565.npy   good   
861        barswa   barswa/XC182025.ogg  call   barswa/XC182025.npy   good   
710       eaywag1  eaywag1/XC597931.ogg  call  eaywag1/XC597931.npy   good   
868        barswa   barswa/XC184418.ogg  both   barswa/XC184418.npy   good   

     duration_secs_32000  country

# Extract the last hidden state from AST pre-trained model as feature

https://huggingface.co/docs/transformers/main/en/model_doc/audio-spectrogram-transformer#transformers.ASTModel

In [None]:
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

Downloading (…)rocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:
val_features = []

for filename in val_df['filename_npy']:
  audio = np.load('/content/drive/MyDrive/207/207-Project/data/train/librosa_loaded/' + filename)
  inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="pt")
  with torch.no_grad():
    outputs = model(**inputs)
  last_hidden_states = outputs.last_hidden_state
  last_hidden_states = last_hidden_states.squeeze(0) # sqeeuze last_hidden_states of shape [1, 1214, 768] to [1214, 768]
  val_features.append(last_hidden_states)

val_features = np.array(val_features)

# val_features is a list containing n_samples of last_hidden_states, each of shape [1214,768]
# stack each last_hidden_states so the val_features becomes shape (n_samples, 1214,768)
val_features = np.stack(val_features, axis=0)

display(val_features.shape)
display(val_features[0])

  val_features = np.array(val_features)
  val_features = np.array(val_features)


(283, 1214, 768)

array([[-0.8480662 ,  0.64514935, -1.4136099 , ...,  0.33828968,
         0.01694099, -0.05107679],
       [-0.8849972 ,  0.30583918, -1.6923755 , ...,  0.82020116,
        -1.1473043 , -0.24052975],
       [ 0.98675936,  0.05624479,  0.69751906, ..., -0.6233058 ,
        -0.20992553,  0.88590056],
       ...,
       [-0.22977757,  1.8763975 , -0.69093287, ..., -1.5139933 ,
         0.46572894, -0.41817388],
       [-0.44668093,  1.4240711 , -0.44707435, ...,  0.51184773,
        -1.6833861 ,  1.3018007 ],
       [-1.8168279 ,  1.1495466 , -2.638916  , ..., -1.9477934 ,
        -2.7128537 ,  0.5862413 ]], dtype=float32)

In [None]:
del feature_extractor
del model

# Extract classes

In [None]:
val_y = val_df['primary_label']
val_y.shape

(283,)

In [None]:
del df
del val_df
del AutoFeatureExtractor
del ASTModel
del inputs
del outputs
del filename
del audio
del last_hidden_states
del torch

# Encode classes

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder().fit(val_y)
val_y = label_encoder.transform(val_y)
classes = list(label_encoder.inverse_transform([0,1,2]))
classes

['barswa', 'comsan', 'eaywag1']

In [None]:
print(len(val_y))
print(val_y[:5])
print(classes)

283
[1 0 1 1 1]
['barswa', 'comsan', 'eaywag1']


In [None]:
del LabelEncoder
del label_encoder

In [None]:
%who

Audio	 MinMaxScaler	 classes	 classification_report	 drive	 librosa	 np	 os	 pd	 
plt	 sns	 tf	 time	 val_features	 val_y	 


# shuffle the data before feeding into the model

In [None]:
val_len = len(val_y)
np.random.seed(1234)
val_indices = np.random.permutation(val_len)
val_features = np.array([val_features[i] for i in val_indices])
val_y = np.array([val_y[i] for i in val_indices])

display(val_y.shape)
display(val_y[:5])

(283,)

array([0, 2, 0, 1, 1])

# evaluate the results on validation data

In [None]:
val_results = {}

In [None]:
val_features.shape

(283, 1214, 768)

In [17]:
model = tf.keras.models.load_model('/content/drive/MyDrive/207/207-Project/notebooks/RG/3_species/class_methods/models_h5/9b.model.h5')

In [18]:
model.evaluate(val_features, val_y)



[1.1891164779663086, 0.9081271886825562]