In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import typing as tp
import yaml
import random
import os
import sys
import soundfile as sf
import librosa
import librosa.display
import cv2
import matplotlib.pyplot as plt
import time
import pickle
import glob
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
# import resnest.torch as resnest_torch

from torchvision import models

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
# from resnet import ResNet, Bottleneck

from albumentations.core.transforms_interface import DualTransform, BasicTransform
import albumentations as albu

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
# Dataset locations.
# The defaults are the original machine-specific absolute paths. Set the
# RFCX_INPUT_ROOT / RFCX_FEAT_DIR environment variables to run the notebook
# on another machine without editing this cell.
INPUT_ROOT = Path(
    os.environ.get("RFCX_INPUT_ROOT", "/home/knikaido/work/Rainforest-Connection/data")
)
RAW_DATA = INPUT_ROOT / "rfcx-species-audio-detection"
TRAIN_AUDIO_DIR = RAW_DATA / "train"
TEST_AUDIO_DIR = RAW_DATA / "test"

# FEAT_DIR deliberately stays a plain string that ends with a path separator:
# other cells build paths by string concatenation (e.g. FEAT_DIR + '*.npy').
# os.path.join(..., "") appends the separator only when it is missing, so the
# default value is unchanged and an override without a trailing slash still works.
FEAT_DIR = os.path.join(
    os.environ.get("RFCX_FEAT_DIR", '/home/knikaido/work/Rainforest-Connection/Git/feature/05/'),
    "",
)

In [3]:
# Read the annotation table train_tp.csv from RAW_DATA into `train`;
# the bare `train` at the end renders the frame as the cell output.
train_tp_csv = RAW_DATA / "train_tp.csv"
train = pd.read_csv(train_tp_csv)
# Disabled variant (kept for reference): also read train_fp.csv, flag each
# table with a `tp` column and concatenate them.
# train['tp'] = 1
# train_fp = pd.read_csv(RAW_DATA / "train_fp.csv")
# train_fp['tp'] = 0
# whole = pd.concat([train, train_fp])
train

Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max
0,003bec244,14,1,44.5440,2531.250,45.1307,5531.25
1,006ab765f,23,1,39.9615,7235.160,46.0452,11283.40
2,007f87ba2,12,1,39.1360,562.500,42.2720,3281.25
3,0099c367b,17,4,51.4206,1464.260,55.1996,4565.04
4,009b760e6,10,1,50.0854,947.461,52.5293,10852.70
...,...,...,...,...,...,...,...
1211,fe8d9ac40,13,1,53.4720,93.750,54.0960,843.75
1212,fea6b438a,4,1,43.5787,2531.250,45.7653,4031.25
1213,ff2eb9ce5,0,1,15.2267,5906.250,16.0213,8250.00
1214,ffb8d8391,5,1,14.3467,4781.250,16.6987,10406.20


In [4]:
# Gather every .npy file directly under FEAT_DIR, order the paths
# lexicographically, and show how many were found.
wav_pathes = glob.glob(FEAT_DIR + '*.npy')
wav_pathes.sort()
len(wav_pathes)

1132

In [5]:
# tgt_path_list = []
# for i in tqdm(range(len(train))):
#     for j in range(len(wav_pathes)):
#         if train['recording_id'][i] in wav_pathes[j]:
#             tgt_path_list.append(wav_pathes[j])

In [6]:
# Convert the sorted list of feature-file paths into a NumPy array.
tat_path_np = np.array(wav_pathes)

In [7]:
# tat_path_list_count = pd.Series(tat_path_np).value_counts().index.tolist()

In [8]:
# Wrap the path array in a pandas Series (despite the `_df` suffix this is a
# Series, not a DataFrame).
tat_path_list_df = pd.Series(data=tat_path_np)

In [9]:
# Number of entries in the path Series.
tat_path_list_df.shape[0]

1132

In [10]:
# Label the Series 'path' and display it.
tat_path_list_df = tat_path_list_df.rename('path')
tat_path_list_df

0       /home/knikaido/work/Rainforest-Connection/Git/...
1       /home/knikaido/work/Rainforest-Connection/Git/...
2       /home/knikaido/work/Rainforest-Connection/Git/...
3       /home/knikaido/work/Rainforest-Connection/Git/...
4       /home/knikaido/work/Rainforest-Connection/Git/...
                              ...                        
1127    /home/knikaido/work/Rainforest-Connection/Git/...
1128    /home/knikaido/work/Rainforest-Connection/Git/...
1129    /home/knikaido/work/Rainforest-Connection/Git/...
1130    /home/knikaido/work/Rainforest-Connection/Git/...
1131    /home/knikaido/work/Rainforest-Connection/Git/...
Name: path, Length: 1132, dtype: object

In [11]:
# for i in range(len(tat_path_list_df)):
#     print(tat_path_list_df[i][-15:])

In [12]:
# This cell used to start with `train.sort_values('recording_id').head(20)`.
# That expression was neither assigned nor the cell's last expression, so its
# result was silently discarded; it has been removed because it suggested a
# sort that never happened. `train` keeps its existing row order here.
# NOTE(review): if sorting by recording_id was actually intended, assign the
# sorted frame explicitly before resetting the index.
#
# Renumber the index 0..n-1 (old index dropped). Rebinding instead of
# `inplace=True` keeps the cell safe to re-run and chain-friendly.
train = train.reset_index(drop=True)
train

Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max
0,003bec244,14,1,44.5440,2531.250,45.1307,5531.25
1,006ab765f,23,1,39.9615,7235.160,46.0452,11283.40
2,007f87ba2,12,1,39.1360,562.500,42.2720,3281.25
3,0099c367b,17,4,51.4206,1464.260,55.1996,4565.04
4,009b760e6,10,1,50.0854,947.461,52.5293,10852.70
...,...,...,...,...,...,...,...
1211,fe8d9ac40,13,1,53.4720,93.750,54.0960,843.75
1212,fea6b438a,4,1,43.5787,2531.250,45.7653,4031.25
1213,ff2eb9ce5,0,1,15.2267,5906.250,16.0213,8250.00
1214,ffb8d8391,5,1,14.3467,4781.250,16.6987,10406.20


In [13]:
# Build one feature-file path per annotation row:
#   FEAT_DIR + <recording_id> + '.npy'
# Iterating the `recording_id` column directly avoids constructing a full row
# Series via `train.iloc[i]` on every step; the resulting list is the same.
# tqdm is kept so the progress output matches the number of rows processed.
names = [
    FEAT_DIR + recording_id + '.npy'
    for recording_id in tqdm(train['recording_id'])
]

100%|██████████| 1216/1216 [00:00<00:00, 8891.01it/s]


In [14]:
# Show how many feature paths were built (one per row of `train`).
len(names)

1216

In [15]:
# Attach the per-row feature path as the `name` column, then display the frame.
train = train.assign(name=names)
train

Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max,name
0,003bec244,14,1,44.5440,2531.250,45.1307,5531.25,/home/knikaido/work/Rainforest-Connection/Git/...
1,006ab765f,23,1,39.9615,7235.160,46.0452,11283.40,/home/knikaido/work/Rainforest-Connection/Git/...
2,007f87ba2,12,1,39.1360,562.500,42.2720,3281.25,/home/knikaido/work/Rainforest-Connection/Git/...
3,0099c367b,17,4,51.4206,1464.260,55.1996,4565.04,/home/knikaido/work/Rainforest-Connection/Git/...
4,009b760e6,10,1,50.0854,947.461,52.5293,10852.70,/home/knikaido/work/Rainforest-Connection/Git/...
...,...,...,...,...,...,...,...,...
1211,fe8d9ac40,13,1,53.4720,93.750,54.0960,843.75,/home/knikaido/work/Rainforest-Connection/Git/...
1212,fea6b438a,4,1,43.5787,2531.250,45.7653,4031.25,/home/knikaido/work/Rainforest-Connection/Git/...
1213,ff2eb9ce5,0,1,15.2267,5906.250,16.0213,8250.00,/home/knikaido/work/Rainforest-Connection/Git/...
1214,ffb8d8391,5,1,14.3467,4781.250,16.6987,10406.20,/home/knikaido/work/Rainforest-Connection/Git/...


In [16]:
# Display a view of `train` with one row per recording_id (first occurrence
# kept, which is the pandas default).
# NOTE(review): the result is only shown, never assigned, so `train` still
# holds every row afterwards and anything saved from `train` includes the
# duplicated recording_ids. Confirm whether the de-duplicated frame was meant
# to be kept.
train.drop_duplicates(subset=['recording_id'])

Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max,name
0,003bec244,14,1,44.5440,2531.250,45.1307,5531.25,/home/knikaido/work/Rainforest-Connection/Git/...
1,006ab765f,23,1,39.9615,7235.160,46.0452,11283.40,/home/knikaido/work/Rainforest-Connection/Git/...
2,007f87ba2,12,1,39.1360,562.500,42.2720,3281.25,/home/knikaido/work/Rainforest-Connection/Git/...
3,0099c367b,17,4,51.4206,1464.260,55.1996,4565.04,/home/knikaido/work/Rainforest-Connection/Git/...
4,009b760e6,10,1,50.0854,947.461,52.5293,10852.70,/home/knikaido/work/Rainforest-Connection/Git/...
...,...,...,...,...,...,...,...,...
1211,fe8d9ac40,13,1,53.4720,93.750,54.0960,843.75,/home/knikaido/work/Rainforest-Connection/Git/...
1212,fea6b438a,4,1,43.5787,2531.250,45.7653,4031.25,/home/knikaido/work/Rainforest-Connection/Git/...
1213,ff2eb9ce5,0,1,15.2267,5906.250,16.0213,8250.00,/home/knikaido/work/Rainforest-Connection/Git/...
1214,ffb8d8391,5,1,14.3467,4781.250,16.6987,10406.20,/home/knikaido/work/Rainforest-Connection/Git/...


In [17]:
# Persist `train` (with its `name` column) as a pickle inside RAW_DATA.
pickle_path = RAW_DATA / 'train_gby_mel_raw_uni.pkl'
train.to_pickle(str(pickle_path))
# Disabled: would also pickle the path Series; `output_dir` is not defined in
# the cells shown here.
# tat_path_list_df.to_pickle(str(output_dir / 'train_gby_wavpath.pkl'))