In [None]:
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.models import resnet18, ResNet18_Weights
from torchvision import transforms
from IPython.display import Audio
import librosa
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
import tqdm

import random
import glob
import os
import ast

import sys
sys.path.append("..")
import utils

In [None]:
RANDOM_SEED = 21

# Seed every random number generator the notebook uses so that a
# Restart & Run All reproduces the same results.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Seeds all visible CUDA devices; a no-op when CUDA is unavailable.
torch.cuda.manual_seed_all(RANDOM_SEED)

# cuDNN settings for reproducibility: request deterministic kernels and
# disable the auto-tuner. With benchmark enabled cuDNN may select different
# algorithms from run to run, which defeats the deterministic flag above.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
is_in_kaggle_env = utils.get_is_in_kaggle_env()

data_path = '/kaggle/input/birdclef-2023/' if is_in_kaggle_env else '../data/'

device = 'cpu' if is_in_kaggle_env else utils.determine_device()

if not is_in_kaggle_env and not os.path.exists('../data'):
    print("Downloading data...")
    !kaggle competitions download -c 'birdclef-2023'
    !mkdir ../data
    !unzip -q birdclef-2023.zip -d ../data
    !rm birdclef-2023.zip

df_metadata_csv = pd.read_csv(f"{data_path}/train_metadata.csv")

audio_data_dir = f"{data_path}/train_audio/"

In [None]:
# Display the first three rows of the metadata table as a quick sanity check.
df_metadata_csv.head(3)

In [None]:
# Show the metadata row(s) whose `filename` contains the substring "XC321277".
df_metadata_csv[df_metadata_csv['filename'].str.contains("XC321277")]

In [None]:
# Inspect the first metadata row. Columns are addressed by name rather than
# by position so the cell keeps working if the CSV column order changes.
# NOTE(review): assumes `filename`, `primary_label` and `secondary_labels`
# are the columns at positions 11, 0 and 1 of train_metadata.csv — confirm.
audio_path = df_metadata_csv["filename"].iloc[0]
print(f"Audio path: {audio_path}")

primary_label = df_metadata_csv["primary_label"].iloc[0]
print(f"Primary label: {primary_label}")

secondary_labels = df_metadata_csv["secondary_labels"].iloc[0]
print(f"Secondary labels: {secondary_labels}")

# File ID = last path component with everything after the first '.' removed.
file_id = audio_path.split('/')[-1].split('.')[0]
print(f"File ID: {file_id}")

print(f"df length: {len(df_metadata_csv)}")

In [None]:
# Print column dtypes and non-null counts for the metadata table.
df_metadata_csv.info()

In [None]:
# Number of recordings per primary label.
class_counts = df_metadata_csv["primary_label"].value_counts()

# Rows belonging to classes with fewer than 3 recordings (i.e. 1 or 2).
rare_primary_labels = class_counts[class_counts < 3].index
two_or_less_samples_rows = df_metadata_csv[df_metadata_csv["primary_label"].isin(rare_primary_labels)]

# The messages state the threshold that is actually applied above (< 3).
print(f"Number of unique classes with less than 3 samples: {two_or_less_samples_rows['primary_label'].nunique()}")
print(f"Number of rows belonging to classes with less than 3 samples: {len(two_or_less_samples_rows)}")
print(f"Primary labels with less than 3 samples: {two_or_less_samples_rows['primary_label'].unique()}")

In [None]:
# Remove every row whose primary_label occurs fewer than three times
# according to the previously computed class_counts.
print(f"Number of rows before dropping: {len(df_metadata_csv)}")

labels_to_drop = class_counts.index[class_counts < 3]
is_rare_row = df_metadata_csv["primary_label"].isin(labels_to_drop)
df_metadata_csv = df_metadata_csv.loc[~is_rare_row]

print(f"Number of rows after dropping: {len(df_metadata_csv)}")

In [None]:
# Measure the duration (in seconds) of a 10% random sample of the recordings.
#
# The sample is materialised first so the progress-bar total is exactly the
# number of rows iterated: DataFrame.sample rounds frac * len, whereas int()
# truncates, so the two can differ by one. An explicit random_state makes
# re-running this cell draw the same rows.
sampled_rows = df_metadata_csv.sample(frac=0.1, random_state=RANDOM_SEED)

audio_lengths_s = []
for _, row in tqdm.tqdm(sampled_rows.iterrows(), total=len(sampled_rows)):
    audio_path = os.path.join(audio_data_dir, row["filename"])
    # sr=None keeps the file's native sampling rate. Only the duration is
    # needed here, so resampling every clip would be wasted work.
    audio, sr = librosa.load(audio_path, sr=None)
    audio_lengths_s.append(len(audio) / sr)

In [None]:
# Histogram of the sampled recording durations, labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.hist(audio_lengths_s, bins=100)
ax.set(
    title="Distribution of sampled audio lengths",
    xlabel="Audio length (s)",
    ylabel="Number of recordings",
)
plt.show()

In [None]:
# Same histogram with finer bins, zoomed in on the range from 0 to the 80th
# percentile so the bulk of the distribution is visible.
length_p80 = np.percentile(audio_lengths_s, 80)

fig, ax = plt.subplots()
ax.hist(audio_lengths_s, bins=1000)
ax.set_xlim(0, length_p80)
ax.set(
    title="Distribution of sampled audio lengths (up to 80th percentile)",
    xlabel="Audio length (s)",
    ylabel="Number of recordings",
)
plt.show()

In [None]:
# Longest, shortest and 80th-percentile duration (seconds) in the sample.
max(audio_lengths_s), min(audio_lengths_s), np.percentile(audio_lengths_s, 80)

In [None]:
# Mean and median duration (seconds) in the sample.
np.mean(audio_lengths_s), np.median(audio_lengths_s)

In [None]:
# Get unique classes
unique_primary_classes = df_metadata_csv.primary_label.unique()

# Each secondary_labels entry is parsed with ast.literal_eval and its
# elements are collected into one set.
secondary_classes = df_metadata_csv.secondary_labels.tolist()
unique_secondary_classes = {
    label
    for labels_literal in secondary_classes
    for label in ast.literal_eval(labels_literal)
}

# Every label that appears as either a primary or a secondary label.
primary_class_set = set(unique_primary_classes)
classes = primary_class_set | unique_secondary_classes
print(f"Number of classes: {len(classes)}")

# Labels that only ever occur as a secondary label.
classes_in_secondary_but_not_primary = unique_secondary_classes - primary_class_set
print(f"Number of classes in secondary but not primary: {len(classes_in_secondary_but_not_primary)}")