# Forewords

1. This notebook covers the data preprocessing step of the preprocessing process for the project. Please follow the steps outlined in the dataset folder to download the dataset before attempting this notebook.
2. If you followed the previous steps, you would have had the csv file and audio files saved on your Google Drive and ready for use.
3. Please update the directory path below to your Google Drive path for the respective files as needed.

<a href="https://colab.research.google.com/github/rachlllg/Project_Bird-Song-Classifier-with-Machine-Learning/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
# drive access
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# standard libraries
import numpy as np
import pandas as pd

In [3]:
# for audio
from IPython.display import Audio
import librosa

In [4]:
# for train_test split
from sklearn.model_selection import train_test_split

# Load train_metadata.csv

In [5]:
# change the file path to your local path
df = pd.read_csv('/content/drive/MyDrive/project/train_metadata.csv')

df.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg


In [6]:
df['primary_label'].value_counts()

barswa     500
wlwwar     500
thrnig1    500
eaywag1    500
comsan     500
          ... 
lotcor1      1
whctur2      1
whhsaw1      1
afpkin1      1
crefra2      1
Name: primary_label, Length: 264, dtype: int64

# Preprocessing

## 1. include only the 3 species selected (barswa, comsan, and eaywag1)

In [7]:
interested_species = ['barswa', 'comsan', 'eaywag1']

In [8]:
filtered_df = df[df['primary_label'].isin(interested_species)]

filtered_df.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
895,barswa,[],"['call', 'flight call']",46.4605,6.3914,Hirundo rustica,Barn Swallow,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/113914,barswa/XC113914.ogg
896,barswa,[],['song'],35.0307,-120.6205,Hirundo rustica,Barn Swallow,Thomas G. Graves,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/129647,barswa/XC129647.ogg
897,barswa,[],['song'],45.3675,-73.8566,Hirundo rustica,Barn Swallow,Patrick Turgeon,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/132406,barswa/XC132406.ogg
898,barswa,[],"['call', 'female', 'male', 'song']",56.1559,47.4939,Hirundo rustica,Barn Swallow,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133096,barswa/XC133096.ogg
899,barswa,[],['song'],55.9937,-3.5605,Hirundo rustica,Barn Swallow,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133802,barswa/XC133802.ogg


In [9]:
filtered_df['primary_label'].value_counts()

barswa     500
comsan     500
eaywag1    500
Name: primary_label, dtype: int64

## 2. remove duplicates (same 'duration', 'location', 'type', 'primary_label', 'author')

### Step 1. Extract duration using librosa

In [10]:
# change the file path to your local path
path = '/content/drive/MyDrive/project/train_audio/'

In [11]:
sr = 16000

In [12]:
example = filtered_df['filename'].iloc[0]
example

'barswa/XC113914.ogg'

In [13]:
audio, _ = librosa.load(path + example, sr=sr)
Audio(audio, rate=sr)

In [14]:
def get_audio_duration(file_path, sr=sr):
  audio, _ = librosa.load(file_path, sr=sr)
  duration = librosa.get_duration(y=audio, sr=sr)
  return duration

In [15]:
filtered_df['duration'] = filtered_df['filename'].apply(lambda x: get_audio_duration(path + x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['duration'] = filtered_df['filename'].apply(lambda x: get_audio_duration(path + x))


In [16]:
filtered_df['duration'].dtypes

dtype('float64')

In [17]:
filtered_df.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration
895,barswa,[],"['call', 'flight call']",46.4605,6.3914,Hirundo rustica,Barn Swallow,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/113914,barswa/XC113914.ogg,57.077562
896,barswa,[],['song'],35.0307,-120.6205,Hirundo rustica,Barn Swallow,Thomas G. Graves,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/129647,barswa/XC129647.ogg,106.728
897,barswa,[],['song'],45.3675,-73.8566,Hirundo rustica,Barn Swallow,Patrick Turgeon,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/132406,barswa/XC132406.ogg,10.788562
898,barswa,[],"['call', 'female', 'male', 'song']",56.1559,47.4939,Hirundo rustica,Barn Swallow,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133096,barswa/XC133096.ogg,11.57225
899,barswa,[],['song'],55.9937,-3.5605,Hirundo rustica,Barn Swallow,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133802,barswa/XC133802.ogg,15.490625


### Step 2. Find duplicates

same 'duration', 'location', 'type', 'primary_label', 'author'

In [19]:
duplicates = filtered_df.duplicated(subset=['duration', 'latitude', 'type', 'primary_label', 'author'], keep=False)
duplicated_rows = filtered_df[duplicates]

display(len(duplicated_rows))
display(duplicated_rows)

15

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration
1203,barswa,[],"['call', 'song', 'various calls']",58.7542,23.8439,Hirundo rustica,Barn Swallow,Stanislas Wroza,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/575747,barswa/XC575747.ogg,36.12
1204,barswa,[],"['call', 'song', 'various calls']",58.7542,23.8439,Hirundo rustica,Barn Swallow,Stanislas Wroza,Creative Commons Attribution-NonCommercial-Sha...,0.0,https://www.xeno-canto.org/575749,barswa/XC575749.ogg,36.12
1289,barswa,[],"['adult', 'song']",52.8858,23.8293,Hirundo rustica,Barn Swallow,Ireneusz Oleksik,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/664976,barswa/XC664976.ogg,64.704
1290,barswa,[],"['adult', 'song']",52.8858,23.8293,Hirundo rustica,Barn Swallow,Ireneusz Oleksik,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/664977,barswa/XC664977.ogg,64.704
1316,barswa,[],['call'],51.3672,5.8406,Hirundo rustica,Barn Swallow,Ad Hilders,Creative Commons Attribution-NonCommercial-Sha...,3.0,http://xeno-canto.org/671669,barswa/XC671669.ogg,10.057
1318,barswa,[],['call'],51.3672,5.8406,Hirundo rustica,Barn Swallow,Ad Hilders,Creative Commons Attribution-NonCommercial-Sha...,3.0,http://xeno-canto.org/671721,barswa/XC671721.ogg,10.057
1389,barswa,[],[''],56.6346,9.7837,Hirundo rustica,Barn Swallow,Ad Hilders,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://xeno-canto.org/747069,barswa/XC747069.ogg,10.109
1390,barswa,[],[''],56.6346,9.7837,Hirundo rustica,Barn Swallow,Ad Hilders,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/747232,barswa/XC747232.ogg,10.109
5287,comsan,[],['nocturnal flight call'],51.4309,-2.8518,Actitis hypoleucos,Common Sandpiper,Paul Williams,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/586329,comsan/XC586329.ogg,7.026
5288,comsan,[],['nocturnal flight call'],51.4309,-2.8518,Actitis hypoleucos,Common Sandpiper,Paul Williams,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/586330,comsan/XC586330.ogg,7.026


In [20]:
dup1, _ = librosa.load(path + duplicated_rows['filename'].iloc[0], sr=sr)
Audio(dup1, rate=sr)

In [21]:
dup2, _ = librosa.load(path + duplicated_rows['filename'].iloc[1], sr=sr)
Audio(dup2, rate=sr)

### Step 3. Remove duplicates

In [22]:
display(len(filtered_df))

1500

In [23]:
filtered_df = filtered_df.drop_duplicates(subset=['duration', 'latitude', 'type', 'primary_label', 'author'], keep='first')

display(len(filtered_df))

1492

## 3. Split filtered_df to train and test

In [24]:
train_df, test_df = train_test_split(filtered_df, test_size=0.3, random_state=1234)

In [25]:
display(len(train_df))
display(train_df.head())

1044

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration
5295,comsan,[],"['flight call', 'nocturnal flight call']",50.7542,4.5672,Actitis hypoleucos,Common Sandpiper,Frederik Fluyt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/587730,comsan/XC587730.ogg,5.746937
6169,eaywag1,[],"['call', 'sex uncertain']",69.3585,88.2378,Motacilla flava,Western Yellow Wagtail,Alexander Hellquist,Creative Commons Attribution-NonCommercial-Sha...,3.0,http://xeno-canto.org/675944,eaywag1/XC675944.ogg,5.355
5388,comsan,[],"['life stage uncertain', 'nocturnal flight cal...",41.1698,0.9761,Actitis hypoleucos,Common Sandpiper,Xavier Riera,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/664012,comsan/XC664012.ogg,10.488
971,barswa,[],"['alarm call', 'flight call']",,,Hirundo rustica,Barn Swallow,Daniel Parker,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/268804,barswa/XC268804.ogg,76.538813
1344,barswa,[],['song'],53.9299,-2.9833,Hirundo rustica,Barn Swallow,Chris Batty,Creative Commons Attribution-NonCommercial-Sha...,2.5,http://xeno-canto.org/690498,barswa/XC690498.ogg,83.0955


In [26]:
display(len(test_df))
display(test_df.head())

448

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration
1364,barswa,[],[''],53.2509,5.598,Hirundo rustica,Barn Swallow,Gosse Hoekstra,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/721711,barswa/XC721711.ogg,19.069375
5164,comsan,[],"['flight call', 'nocturnal flight call']",48.8306,2.1999,Actitis hypoleucos,Common Sandpiper,Stanislas Wroza,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/496602,comsan/XC496602.ogg,28.995938
6211,eaywag1,[],[''],43.3298,4.8364,Motacilla flava,Western Yellow Wagtail,Stanislas Wroza,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/718445,eaywag1/XC718445.ogg,7.340438
5939,eaywag1,"['cohmar1', 'hoopoe']","['call', 'flight call']",37.1357,-7.6138,Motacilla flava,Western Yellow Wagtail,Nelson Conceição,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/481360,eaywag1/XC481360.ogg,151.944
1356,barswa,[],[''],19.3551,-99.0467,Hirundo rustica,Barn Swallow,Manuel Grosselet,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/698512,barswa/XC698512.ogg,18.703688


# Save the train_df and test_df for future use

In [27]:
train_df.to_csv('/content/drive/MyDrive/project/train_df.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/project/test_df.csv', index=False)