In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import json
import torch
import matplotlib.pyplot as plt
import torchaudio

# Root folder of the competition dataset; later cells build file paths from it
# with f-strings of the form f'{BASE_DIR}/<name>'.
BASE_DIR = '../input/birdclef-2022/'

In [None]:
# Metadata table read from train_metadata.csv.
train_metadata = pd.read_csv(f'{BASE_DIR}/train_metadata.csv')

# Contents of scored_birds.json, parsed from the file's full text.
with open(f'{BASE_DIR}/scored_birds.json') as json_file:
    scored_birds = json.loads(json_file.read())

In [None]:
# Per-file audio statistics, each keyed by the `filename` column of train_metadata.
samples_n_channels = dict()   # number of channels
samples_sample_rate = dict()  # sample rate reported by torchaudio
samples_seconds = dict()      # duration in seconds (frames / sample rate)
samples_max = dict()          # peak absolute amplitude

# Build the audio folder from BASE_DIR (like the metadata paths) instead of
# repeating the dataset location; rstrip avoids a doubled '/'.
train_audio_dir = f"{BASE_DIR.rstrip('/')}/train_audio"

# NOTE: this loop decodes every file listed in train_metadata, so it is slow.
for filename in tqdm(train_metadata['filename']):
    # torchaudio.load returns (waveform, sample_rate); the waveform is indexed
    # as (channel, frame).
    frames, sample_rate = torchaudio.load(f'{train_audio_dir}/{filename}')
    n_channels, n_frames = frames.shape
    samples_n_channels[filename] = n_channels
    samples_sample_rate[filename] = sample_rate
    samples_seconds[filename] = n_frames / sample_rate
    # An empty waveform has no maximum (torch.max would raise), so record 0.0.
    samples_max[filename] = frames.abs().max().item() if frames.numel() else 0.0

# Analyze the audio files

First, for every bird in the scored birds list, let's look at the first 60 seconds of one of its files.

In [None]:
from glob import glob

PREVIEW_SECONDS = 60  # length of the excerpt plotted for each bird

# Build the audio folder from BASE_DIR instead of repeating the dataset
# location; rstrip avoids a doubled '/' in the paths (and in the plot titles).
train_audio_dir = f"{BASE_DIR.rstrip('/')}/train_audio"

for bird in scored_birds:
    # glob returns paths in arbitrary order; sort so the same file is picked
    # on every run.
    files = sorted(glob(f'{train_audio_dir}/{bird}/*.ogg'))
    if not files:
        # Nothing to plot for this bird; report it instead of failing on files[0].
        print(f'No audio files found for {bird}, skipping')
        continue
    filename = files[0]
    frames, sample_rate = torchaudio.load(filename)
    # Slicing past the end is harmless, so shorter recordings are kept whole.
    frames = frames[:, :PREVIEW_SECONDS * sample_rate].cpu().detach().numpy()
    n_channels = frames.shape[0]

    # One panel per channel (works for any channel count); squeeze=False keeps
    # `axes` two-dimensional even for single-channel files.
    fig, axes = plt.subplots(1, n_channels, figsize=(20, 3), squeeze=False)
    for c in range(n_channels):
        axes[0, c].plot(frames[c, :])
        axes[0, c].set_title(f'{filename} channel {c}')
        axes[0, c].set_xlabel('sample index')

    # Show and release each figure inside the loop so open figures do not
    # accumulate, one per bird, until the end of the cell.
    plt.show()
    plt.close(fig)


#### Audio length distribution

In [None]:
# Durations (in seconds) of all files, materialised once as an array so the
# statistics below do not each rebuild a list.
seconds = np.array(list(samples_seconds.values()), dtype=float)
print(f'minimum length = {seconds.min()}')
print(f'maximum length = {seconds.max()}')
print(f'average length = {seconds.mean()}')
# Roughly one bin per 5 seconds. Clamp to at least one bin: if every file is
# shorter than 5 s the division truncates to 0, which plt.hist rejects.
n_bins = max(1, int(seconds.max() / 5))
plt.figure(figsize=(90, 20))
plt.hist(seconds, bins=n_bins)
plt.xlabel('length (seconds)')
plt.ylabel('number of files')
plt.show()

#### Number of channels in each audio file

In [None]:
# Collect the channel count of every file once, then report how many files
# have one channel and how many have two.
channel_counts = list(samples_n_channels.values())
print(f'The number of files with only one channel is {channel_counts.count(1)}')
# The second message used to repeat "only one channel" while counting 2-channel files.
print(f'The number of files with two channels is {channel_counts.count(2)}')


#### sample rate

In [None]:
# Distinct sample rates found across the files, sorted ascending by np.unique.
rates_seen = list(samples_sample_rate.values())
sample_rates = np.unique(rates_seen)
print(sample_rates)

#### Maximum value in the audio files

In [None]:
# Histogram (20 bins) of the per-file peak absolute amplitude.
max_values = [peak for peak in samples_max.values()]
plt.figure()
plt.hist(max_values, bins=20)
plt.show()

# Analyze the metadata

First, let's look at the metadata files (train_metadata.csv and eBird_Taxonomy_v2021.csv). This part is partially copied from https://www.kaggle.com/hasanbasriakcay/birdclef22-eda-noise-reduction.

In [None]:
# Preview the first rows of the metadata table.
train_metadata.head()

In [None]:
# Count the missing (NaN) values in each metadata column; a column of zeros
# means there is nothing missing.
train_metadata.isna().sum()

Now I want to check how often, and for how much total recording time, every bird appears as the primary bird and as a secondary bird.

In [None]:
# Re-read the metadata so this cell always starts from the table as stored on disk.
train_metadata = pd.read_csv(f'{BASE_DIR}/train_metadata.csv')


def _parse_secondary_labels(raw):
    """Split a stringified label list such as "['a', 'b']" into ['a', 'b'].

    An empty list ("[]") yields [] rather than [''].
    """
    return [label for label in raw[1:-1].replace("'", "").split(", ") if label]


# Length in seconds of every recording, aligned with the rows of train_metadata.
durations = train_metadata['filename'].map(samples_seconds)
assert durations.notna().all(), 'some files in train_metadata have no entry in samples_seconds'

# The totals are aggregated with groupby rather than with
# `birds_count.loc[label].column += value`: that form assigns to a temporary
# row object, so the update is not guaranteed to reach the frame.

# Per bird: number of recordings, and their total length, where it is the primary label.
primary_stats = (
    pd.DataFrame({'Bird': train_metadata['primary_label'], 'seconds': durations})
    .groupby('Bird')['seconds']
    .agg(n_primary='size', total_time_primary='sum')
)

# explode yields one row per (recording, secondary label) pair; recordings
# without secondary labels become NaN rows and are dropped before counting.
secondary_stats = (
    pd.DataFrame({
        'Bird': train_metadata['secondary_labels'].map(_parse_secondary_labels),
        'seconds': durations,
    })
    .explode('Bird')
    .dropna(subset=['Bird'])
    .groupby('Bird')['seconds']
    .agg(n_secondary='size', total_time_secondary='sum')
)

# Keep exactly the birds that occur as a primary label (left join). Birds that
# are only ever secondary are ignored instead of raising a KeyError, and birds
# that are never secondary get zeros.
birds_count = primary_stats.join(secondary_stats, how='left')
birds_count['n_secondary'] = birds_count['n_secondary'].fillna(0).astype(int)
birds_count['total_time_secondary'] = birds_count['total_time_secondary'].fillna(0.0)
birds_count = birds_count[['n_primary', 'n_secondary', 'total_time_primary', 'total_time_secondary']]

birds_count = birds_count.sort_values(by=['total_time_primary', 'total_time_secondary', 'n_primary', 'n_secondary'], ascending=False)
print(birds_count)
birds_count.to_csv('birds_count.csv')

The full output is in birds_count.csv. As you can easily see, the data is extremely imbalanced, and we have to think about ways to balance it.

Let's check the data only on the scored birds

In [None]:
# For each statistic column of birds_count, the values of the scored birds,
# in the order of `scored_birds`.
stat_columns = ('n_primary', 'n_secondary', 'total_time_primary', 'total_time_secondary')
scored_stats = {
    column: [birds_count[column][bird] for bird in scored_birds]
    for column in stat_columns
}
n_primary, n_secondary, total_time_primary, total_time_secondary = scored_stats.values()

print(scored_birds)
for values in scored_stats.values():
    print(values)

# One bar chart per statistic, titled with the column name.
for column, values in scored_stats.items():
    plt.figure(figsize=(20, 3))
    plt.bar(scored_birds, values)
    plt.title(column)
plt.show()

Let's check the rating distribution.

In [None]:
# Rows whose primary label is one of the scored birds.
is_scored = train_metadata.primary_label.isin(scored_birds)
relevant_metadata = train_metadata[is_scored]

# The same 10-bin rating histogram, first for all files, then for the scored subset.
for ratings, title in (
    (train_metadata.rating, 'rating of all audio files'),
    (relevant_metadata.rating, 'rating of the scored audio files'),
):
    plt.figure()
    plt.hist(ratings, bins=10)
    plt.title(title)

plt.show()

It looks like most of the samples have good quality.