In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Make the `resnest` package importable: if the import fails because the module
# is missing, install it from the package directory under ../input.
try:
    import resnest
except ModuleNotFoundError:
    # IPython shell escape; -q keeps pip's output quiet.
    !pip install -q "../input/resnest50-fast-package/resnest-0.0.6b20200701/resnest"

In [None]:
# 匯入一些會用到的模組
import numpy as np
import librosa as lb
import soundfile as sf
import pandas as pd
import cv2
from pathlib import Path
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

from PIL import Image

import torch
from torch import nn
from  torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

import time
import resnest
from resnest.torch import resnest50

In [None]:
# Default values for the parameters used in this notebook, plus the input file paths
NUM_CLASSES = 397
SR = 32000  # audio sampling rate: 32 kHz
DURATION = 5  # length, in seconds, of each audio segment cut from a recording
THRESH = 0.1  # default score threshold: get_thresh_preds keeps classes whose score exceeds it

# NOTE(review): SPEC_SHAPE, FMIN and FMAX appear to be referenced only by the
# commented-out spectrogram export code further down — confirm before removing.
SPEC_SHAPE = (48, 128) # height x width
FMIN=5
FMAX=None


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)

TEST_AUDIO_ROOT = Path("../input/birdclef-2021/test_soundscapes")
SAMPLE_SUB_PATH = "../input/birdclef-2021/sample_submission.csv"
TARGET_PATH = None
    
# If the test directory holds no .ogg files, run on the labelled train soundscapes
# instead: no sample submission is used, and TARGET_PATH points at the labels.
if not len(list(TEST_AUDIO_ROOT.glob("*.ogg"))):
    TEST_AUDIO_ROOT = Path("../input/birdclef-2021/train_soundscapes")
    SAMPLE_SUB_PATH = None
    # SAMPLE_SUB_PATH = "../input/birdclef-2021/sample_submission.csv"
    TARGET_PATH = Path("../input/birdclef-2021/train_soundscape_labels.csv")
    
# TRAIN_AUDIO_ROOT = Path("../input/birdclef-2021/train_short_audio")
# TRAIN_AUDIO_IMAGES_SAVE_ROOT = Path("audio_images") # Where to save the mels images
# TRAIN_AUDIO_IMAGES_SAVE_ROOT.mkdir(exist_ok=True, parents=True)
# TRAIN_AUDIO_TO_IMAGES_SAVE_ROOT = Path("audio_to_images") # Where to save the mels images
# TRAIN_AUDIO_TO_IMAGES_SAVE_ROOT.mkdir(exist_ok=True, parents=True)

In [None]:
# Read the train-soundscape labels CSV into a DataFrame for the exploratory cells.
# TARGET_PATH is None whenever test soundscapes are present, and pd.read_csv(None)
# raises, so fall back to the labels file's fixed location in that case.
df = pd.read_csv(TARGET_PATH or "../input/birdclef-2021/train_soundscape_labels.csv")
df

In [None]:
# Show a summary of the DataFrame
df.info()

In [None]:
# Check, column by column, whether any NaN values exist (True means the column contains NaN)
df.isna().any()

In [None]:
# Count rows per recording site. The Series is passed as `x=` because seaborn >= 0.12
# binds the first positional argument to `data`, not to the plotted variable.
sns.countplot(x=df['site'])
# The provided soundscapes come from only two sites: COR and SSW

https://blog.csdn.net/zzc15806/article/details/79603994
- 參數介紹
    - sr：採樣率、取樣率
    - hop_length：幀移
    - overlapping：連續幀之間的重疊部分
    - n_fft：窗口大小、視窗大小
    - n_mels ：產生的梅爾帶數
    - fmin ：最低頻率（Hz）
    - fmax：最高頻率（以Hz為單位）。如果為None，則使用fmax = sr / 2.0
- 會返回y值
    - y ：音頻時間序列

In [None]:
class MelSpecComputer:
    """Callable that converts an audio signal into a mel spectrogram in decibels.

    The mel spectrogram is one way of extracting spectral features from audio.

    Args:
        sr: sampling rate of the audio passed to ``__call__``.
        n_mels: number of mel bands to generate.
        fmin: lowest frequency (Hz).
        fmax: highest frequency (Hz).
        **kwargs: extra keyword arguments forwarded to
            ``librosa.feature.melspectrogram``. ``n_fft`` defaults to ``sr // 10``
            and ``hop_length`` to ``sr // 40`` when not supplied.
    """

    def __init__(self, sr, n_mels, fmin, fmax, **kwargs):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        # Fill in the window/hop defaults only when the caller did not pass them.
        kwargs.setdefault("n_fft", self.sr // 10)
        kwargs.setdefault("hop_length", self.sr // (10 * 4))
        self.kwargs = kwargs

    def __call__(self, y):
        """Return the float32 dB-scaled mel spectrogram of the audio time series ``y``."""
        # `y` must be passed by keyword: librosa >= 0.10 made it keyword-only.
        melspec = lb.feature.melspectrogram(
            y=y, sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, **self.kwargs,
        )

        # Convert the power spectrogram (squared magnitude) to decibel (dB) units
        melspec = lb.power_to_db(melspec).astype(np.float32)
        return melspec

In [None]:
# mono：單聲道
# X 是一個數組
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Standardize an array and rescale it to the 0-255 ``uint8`` range.

    Args:
        X: numeric NumPy array (here: a single-channel spectrogram).
        eps: added to ``std`` before dividing, and also the minimum value range
            required for rescaling.
        mean: value(s) subtracted from ``X``; defaults to ``X.mean()``.
        std: value(s) ``X`` is divided by; defaults to ``X.std()``.

    Returns:
        ``uint8`` array with the shape of ``X``. It is all zeros when the range of
        the standardized values does not exceed ``eps``.
    """
    # `is None` instead of `or`: an explicit 0 is honoured, and array-valued
    # statistics no longer raise "truth value of an array is ambiguous".
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Min-max scale to [0, 255]; the uint8 cast truncates the fractional part.
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # (Nearly) constant input: return an all-zero image of the same shape.
        V = np.zeros_like(X, dtype=np.uint8)

    return V

def crop_or_pad(y, length):
    """Return 1-D signal ``y`` with exactly ``length`` samples.

    A shorter signal is zero-padded at the end, a longer one is truncated, and a
    signal that already has ``length`` samples is returned unchanged.
    """
    n_missing = length - len(y)
    if n_missing > 0:
        # Append exactly the missing number of zeros, keeping the input dtype.
        padding = np.zeros(n_missing, dtype=np.asarray(y).dtype)
        y = np.concatenate([y, padding])
    elif n_missing < 0:
        y = y[:length]
    return y

In [None]:
class BirdCLEFDataset(Dataset):
    """Dataset that turns each audio file into a stack of mel-spectrogram images.

    Item ``idx`` is built from the file in the ``idx``-th row of ``data``: the
    recording is cut into fixed-length segments, and every segment becomes a
    3-channel float32 image with values in [0, 1].

    Args:
        data: DataFrame with a ``"filepath"`` column.
        sr: target sampling rate.
        n_mels: number of mel bands.
        fmin: lowest mel frequency (Hz).
        fmax: highest mel frequency (Hz); defaults to ``sr // 2``.
        duration: segment length in seconds.
        step: hop between segment starts, in samples; defaults to the segment
            length (non-overlapping segments).
        res_type: resampling method passed to ``librosa.resample``.
        resample: whether to resample files whose rate differs from ``sr``.
    """

    def __init__(self, data, sr=SR, n_mels=128, fmin=5, fmax=None, duration=DURATION, step=None, res_type="kaiser_fast", resample=True):

        self.data = data

        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr//2

        self.duration = duration
        # Sample counts are used as range() bounds and slice indices, so force ints
        # (a fractional duration would otherwise produce a float).
        self.audio_length = int(self.duration*self.sr)
        self.step = int(step or self.audio_length)

        self.res_type = res_type
        self.resample = resample

        self.mel_spec_computer = MelSpecComputer(sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax)

    def __len__(self):
        """Number of files (rows of ``data``)."""
        return len(self.data)

    @staticmethod
    def normalize(image):
        """Scale a 0-255 image to [0, 1] float32 and repeat it over 3 channels."""
        image = image.astype("float32", copy=False) / 255.0
        image = np.stack([image, image, image])
        return image

    def audio_to_image(self, audio):
        """Convert one audio segment to a normalized 3-channel spectrogram image."""
        melspec = self.mel_spec_computer(audio)
        image = mono_to_color(melspec)
        image = self.normalize(image)
        return image

    def read_file(self, filepath):
        """Load a file and return the stacked images of all its full-length segments.

        Raises:
            ValueError: if the recording is shorter than one segment.
        """
        audio, orig_sr = sf.read(filepath, dtype="float32")  # read the sound file

        # soundfile returns a (frames, channels) array for multi-channel files;
        # average the channels so everything downstream sees a 1-D signal.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        if self.resample and orig_sr != self.sr:  # resample to the target rate
            # The sample rates are keyword-only in librosa >= 0.10.
            audio = lb.resample(audio, orig_sr=orig_sr, target_sr=self.sr, res_type=self.res_type)

        # Start offsets of every window that fits entirely inside the recording;
        # an incomplete trailing window is dropped.
        starts = range(0, len(audio) - self.audio_length + 1, self.step)
        audios = [audio[start:start + self.audio_length] for start in starts]

        if not audios:
            raise ValueError(
                f"{filepath}: audio has {len(audio)} samples, fewer than one "
                f"segment of {self.audio_length} samples"
            )

        images = [self.audio_to_image(audio) for audio in audios]
        images = np.stack(images)

        return images

    def __getitem__(self, idx):
        """Return the image stack for the file at row position ``idx``."""
        # Positional lookup: idx runs over range(len(self)), regardless of the
        # DataFrame's index labels.
        return self.read_file(self.data["filepath"].iloc[idx])

## 讀取 train_metadata.csv 並取出評分高的錄音檔

In [None]:
# train = pd.read_csv('../input/birdclef-2021/train_metadata.csv',)

# # only use high quality samples
# train = train.query('rating>=4')

# birds_count = {}
# for bird_species, count in zip(train.primary_label.unique(), 
#                                train.groupby('primary_label')['primary_label'].count().values):
#     birds_count[bird_species] = count

# most_represented_birds = [key for key,value in birds_count.items() if value >= 200] 

# TRAIN = train.query('primary_label in @most_represented_birds')

# 將音訊轉成梅爾頻譜圖

In [None]:
# def get_spectrograms(filepath, primary_label, output_dir):
    
#     # Open the file with lb (limited to the first 15 seconds)
#     sig, rate = lb.load(filepath, sr=SR, offset=None, duration=15)
    
#     # Split DURATION into five second chunks
#     sig_splits = []
#     for i in range(0, len(sig), int(DURATION * SR)):
#         split = sig[i:i + int(DURATION * SR)]

#         # End of DURATION?
#         if len(split) < int(DURATION * SR):
#             break
        
#         sig_splits.append(split)
        
#     # Extract mel spectrograms for each audio chunk
#     s_cnt = 0
#     saved_samples = []
#     for chunk in sig_splits:
        
#         hop_length = int(DURATION * SR / (SPEC_SHAPE[1] - 1))
#         mel_spec = lb.feature.melspectrogram(y=chunk, 
#                                                   sr=SR, 
#                                                   n_fft=1024, 
#                                                   hop_length=hop_length, 
#                                                   n_mels=SPEC_SHAPE[0], 
#                                                   fmin=FMIN, 
#                                                   fmax=FMAX)
    
#         mel_spec = lb.power_to_db(mel_spec, ref=np.max) 
        
#         # Normalize
#         mel_spec -= mel_spec.min()
#         mel_spec /= mel_spec.max()
        
#         # Save as image file
#         save_dir = os.path.join(output_dir, primary_label)
#         if not os.path.exists(save_dir):
#             os.makedirs(save_dir)
#         save_path = os.path.join(save_dir, filepath.rsplit(os.sep, 1)[-1].rsplit('.', 1)[0] + '_' + str(s_cnt) + '.png')
#         im = Image.fromarray(mel_spec * 255.0).convert("L")
#         im.save(save_path)
        
#         saved_samples.append(save_path)
#         s_cnt += 1
        
        
#     return saved_samples

# print('FINAL NUMBER OF AUDIO FILES IN TRAINING DATA:', len(TRAIN))

In [None]:
# samples = []
# with tqdm(total=len(TRAIN)) as pbar:
#     for idx, row in TRAIN.iterrows():
#         pbar.update(1)
        
#         if row.primary_label in most_represented_birds:
#             audio_file_path = os.path.join(TRAIN_AUDIO_ROOT, row.primary_label, row.filename)
#             samples += get_spectrograms(audio_file_path, row.primary_label, TRAIN_AUDIO_IMAGES_SAVE_ROOT)

# TRAIN_SPECS = shuffle(samples, random_state=1337)

In [None]:
# # Plot the first 12 spectrograms of TRAIN_SPECS
# plt.figure(figsize=(15, 7))
# for i in range(12):
#     spec = Image.open(TRAIN_SPECS[i])
#     plt.subplot(3, 4, i + 1)
#     plt.title(TRAIN_SPECS[i].split(os.sep)[-1])
#     plt.imshow(spec, origin='lower')

In [None]:
# List the .ogg files found directly inside the selected audio directory
list(TEST_AUDIO_ROOT.glob("*.ogg"))

In [None]:
# Show which audio directory the configuration cell selected
TEST_AUDIO_ROOT

In [None]:
# One row per soundscape file. The file stem must split on "_" into exactly three
# parts, which fill the id, site and date columns. sorted() pins the row order,
# which Path.glob does not guarantee, so reruns produce the same table.
data = pd.DataFrame(
    [(path.stem, *path.stem.split("_"), path) for path in sorted(Path(TEST_AUDIO_ROOT).glob("*.ogg"))],
    columns = ["filename", "id", "site", "date", "filepath"]
)
print(data.shape)
data.head(20)

In [None]:
df_train = pd.read_csv("../input/birdclef-2021/train_metadata.csv")

# Author's notes: train_metadata holds 62874 rows in total, covering 397 distinct labels
# print(len(df_train["primary_label"]))
# print(len(df_train["primary_label"].unique()))

# {label_id: label}; ids are assigned by enumerating the labels in sorted order
INV_LABEL_IDS = dict(enumerate(sorted(df_train["primary_label"].unique())))
# The same mapping with keys and values swapped: {label: label_id}
LABEL_IDS = {label: label_id for label_id, label in INV_LABEL_IDS.items()}

In [None]:
# Preview the first five rows of the training metadata
df_train.head(5)

In [None]:
# Show a summary of the training metadata DataFrame
df_train.info()

In [None]:
# Check, column by column, whether the training metadata contains NaN values
df_train.isna().any()

In [None]:
# In the given dataset the ratings (given by users) are mostly on the high side.
# The Series is passed as `x=` because seaborn >= 0.12 binds the first positional
# argument to `data`, not to the plotted variable.
sns.countplot(x=df_train['rating'])

In [None]:
# Wrap the soundscape table in the dataset, then show the number of files and the
# shape of the first item (indexing reads and converts the whole first file).
test_data = BirdCLEFDataset(data=data)
len(test_data), test_data[0].shape

In [None]:
def load_net(checkpoint_path, num_classes=NUM_CLASSES):
    """Build a ResNeSt-50 classifier and load its weights from ``checkpoint_path``.

    The final fully connected layer is replaced by one with ``num_classes``
    outputs. Every ``"model."`` substring is removed from the checkpoint's keys
    before loading. The network is returned on ``DEVICE`` in evaluation mode.
    """
    # Deserialize onto the CPU; the assembled network is moved to DEVICE below.
    state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    # Rename the entries in place so the loaded mapping object itself is reused.
    for old_name in list(state.keys()):
        state[old_name.replace("model.", "")] = state.pop(old_name)

    model = resnest50(pretrained=False)
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model.load_state_dict(state)
    return model.to(DEVICE).eval()

## 將CNN深度學習模型匯入

In [None]:
# Checkpoint files of the networks to ensemble (a single entry here)
checkpoint_paths = [
    Path("../input/kkiller-birdclef-models-public/birdclef_resnest50_fold0_epoch_10_f1_val_06471_20210417161101.pth"),
]


# One loaded network per checkpoint; load_net is given the path as a POSIX string
nets = [
        load_net(checkpoint_path.as_posix()) for checkpoint_path in checkpoint_paths
]

# Prints the repr of the list of loaded networks
print(nets)

In [None]:
@torch.no_grad()
def get_thresh_preds(out, thresh=None):
    """Return, for every row of ``out``, the class ids whose score exceeds ``thresh``.

    Args:
        out: 2-D tensor of scores, one row per sample and one column per class.
        thresh: score threshold; ``None`` selects the module-level ``THRESH``.

    Returns:
        One list of class ids per row, ordered from highest to lowest score.
    """
    # `is None` instead of `or`, so an explicit threshold of 0 is respected.
    if thresh is None:
        thresh = THRESH
    ranked = (-out).argsort(1)          # class ids by descending score
    counts = (out > thresh).sum(1)      # number of classes above the threshold per row
    # Convert to Python lists once rather than transferring row by row.
    return [row[:count] for row, count in zip(ranked.tolist(), counts.tolist())]

In [None]:
def get_bird_names(preds):
    """Map lists of class ids to space-separated label strings.

    Each element of ``preds`` is a list of class ids. An empty list becomes
    ``"nocall"``; otherwise the ids are looked up in ``INV_LABEL_IDS`` and joined
    with single spaces, keeping their order.
    """
    return [
        " ".join(INV_LABEL_IDS[bird_id] for bird_id in pred) if pred else "nocall"
        for pred in preds
    ]

In [None]:
def predict(nets, test_data, names=True):
    """Run every network on every item of ``test_data`` and average their scores.

    Args:
        nets: networks to ensemble; their sigmoid outputs are averaged.
        test_data: indexable dataset whose items are NumPy arrays (one batch of
            images per file).
        names: when true, each file's averaged scores are thresholded with the
            default threshold and converted to label strings; otherwise the
            averaged score tensor is kept as is.

    Returns:
        A list with one entry per dataset item.
    """
    outputs = []
    with torch.no_grad():
        for file_idx in tqdm(range(len(test_data))):
            batch = torch.from_numpy(test_data[file_idx]).to(DEVICE)
            # Ensemble: mean of the per-network sigmoid scores
            scores = sum(torch.sigmoid(model(batch)) for model in nets) / len(nets)
            outputs.append(get_bird_names(get_thresh_preds(scores)) if names else scores)
    return outputs

## 將測試數據丟入演算法中預測

In [None]:
# Run inference on every soundscape, keeping the averaged scores (names=False)
pred_probas = predict(nets, test_data, names=False)
print(len(pred_probas))

In [None]:
# Convert each file's scores to label strings, thresholding at THRESH
preds = [get_bird_names(get_thresh_preds(pred, thresh=THRESH)) for pred in pred_probas]

In [None]:
def preds_as_df(data, preds):
    """Flatten per-file predictions into a table with ``row_id`` and ``birds`` columns.

    ``data`` rows and ``preds`` entries are paired in order. The i-th prediction
    (1-based) of a file gets the row id ``"{id}_{site}_{5*i}"``. When
    ``SAMPLE_SUB_PATH`` is set, the result is aligned to the row ids of that file
    and rows without a prediction are filled with ``"nocall"``.
    """
    row_ids, birds = [], []
    for record, file_preds in zip(data.itertuples(index=False), preds):
        for segment_no, labels in enumerate(file_preds, start=1):
            row_ids.append(f"{record.id}_{record.site}_{5*segment_no}")
            birds.append(labels)

    sub = pd.DataFrame({"row_id": row_ids, "birds": birds})

    if SAMPLE_SUB_PATH:
        # Keep exactly the row ids (and their order) listed in the sample submission.
        sample_sub = pd.read_csv(SAMPLE_SUB_PATH, usecols=["row_id"])
        sub = sample_sub.merge(sub, on="row_id", how="left")
        sub["birds"] = sub["birds"].fillna("nocall")
    return sub

每段音頻將以5秒為一單位分割去做分類預測

故一筆十分鐘的音訊會有120筆資料，總共會產生120*20=2400行數據

In [None]:
# Assemble the submission table, print its shape and display it
sub = preds_as_df(data, preds)
print(sub.shape)
sub

In [None]:
# Write the submission file to the working directory, without the index column
sub.to_csv("submission.csv", index=False)

# 模型評分

In [None]:
def get_metrics(s_true, s_pred):
    """Compute set-based precision, recall and F1 between two label strings.

    Args:
        s_true: whitespace-separated ground-truth labels.
        s_pred: whitespace-separated predicted labels.

    Returns:
        Dict with ``f1``, ``prec``, ``rec``, the label counts ``n_true`` and
        ``n_pred``, and the number of shared labels ``n``.
    """
    true_labels = set(s_true.split())
    pred_labels = set(s_pred.split())
    n, n_true, n_pred = len(true_labels & pred_labels), len(true_labels), len(pred_labels)

    # An empty label string yields an empty set; score that side as 0 instead of
    # raising ZeroDivisionError.
    prec = n/n_pred if n_pred else 0.0
    rec = n/n_true if n_true else 0.0
    f1 = 2*prec*rec/(prec + rec) if prec + rec else 0

    return {"f1": f1, "prec": prec, "rec": rec, "n_true": n_true, "n_pred": n_pred, "n": n}

In [None]:
# Score the predictions against the ground-truth labels. This only runs when
# TARGET_PATH is set, i.e. when the labelled train soundscapes were used.
if TARGET_PATH:
    sub_target = pd.read_csv(TARGET_PATH)
    # After the merge, birds_x holds the true labels and birds_y the predictions.
    sub_target = sub_target.merge(sub, how="left", on="row_id")
    
    # Non-null counts of both columns (the second value used to repeat birds_x).
    print(sub_target["birds_x"].notnull().sum(), sub_target["birds_y"].notnull().sum())
    assert sub_target["birds_x"].notnull().all()
    assert sub_target["birds_y"].notnull().all()
    
    df_metrics = pd.DataFrame([get_metrics(s_true, s_pred) for s_true, s_pred in zip(sub_target.birds_x, sub_target.birds_y)])
    
    # Mean of every metric over all rows
    print(df_metrics.mean())