A_step2_inference_BirdClef

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import StratifiedKFold
import cv2
import os
import matplotlib.pyplot as plt
from math import ceil

import warnings
warnings.filterwarnings("ignore")

In [None]:
# repeat for shorter audio
def repeat_one_axis(x_, nb):
    """Return the contents of ``x_`` repeated ``nb`` times as a flat array.

    A leading axis is added to ``x_``, that axis is repeated ``nb`` times and
    the result is flattened, e.g. ``[1, 2]`` with ``nb=3`` gives
    ``[1, 2, 1, 2, 1, 2]``.
    """
    with_leading_axis = x_[None, :]
    stacked = np.repeat(with_leading_axis, repeats=nb, axis=0)
    return stacked.flatten()

# Quick sanity check: the two-element vector should come back four times over.
aa = np.asarray([1, 2])
repeat_one_axis(aa, nb=4)

In [None]:
ONLY_SCORED = True  # NOTE(review): not read anywhere in the visible notebook.


# Keep only files whose name really ends in ".ogg"; a substring test would
# also accept names such as "clip.ogg.bak" or "frogg.txt".
TEST_AUDIOS = [
    _name
    for _name in os.listdir('../input/birdclef-2022/test_soundscapes')
    if _name.endswith('.ogg')
]

# One row per test soundscape: file name, full path and extension-less id.
sub = pd.DataFrame({"filename": TEST_AUDIOS})
sub['path'] = '../input/birdclef-2022/test_soundscapes/' + sub['filename']
# regex=False: '.ogg' is a literal suffix. Interpreted as a regular expression
# (the default on older pandas) the dot matches ANY character, so e.g.
# "frogg_1.ogg" would lose "rogg" as well.
sub['file_id'] = sub['filename'].str.replace('.ogg', '', regex=False)

In [None]:
# Preview the first rows of the test-file table (filename, path, file_id).
sub.head()

In [None]:
#train.primary_label.value_counts()

## PARAMETERS

In [None]:
# Sampling rate every clip is resampled to (samples per second).
SR = 32000
# Length of one prediction window; DURATION * SR gives the window in samples.
DURATION = 5
# Number of mel bands requested from the spectrogram extractor.
NMELS = 128

## MAKING TF RECORDS

In [None]:
import librosa as lb
import soundfile as sf
import tensorflow as tf


In [None]:
def get_audio(filename):
    """Load an audio file as a 1-D float32 signal sampled at ``SR``.

    Parameters
    ----------
    filename : str
        Path of the audio file, read with ``soundfile``.

    Returns
    -------
    numpy.ndarray
        The first channel of the recording, resampled to ``SR`` when the
        file's native rate differs.
    """
    audio, orig_sr = sf.read(filename, dtype="float32")
    # Select the first channel BEFORE resampling: the 2-D layout indexed here
    # is (frames, channels) and librosa resamples along the last axis, so
    # resampling first would stretch the channel axis instead of time.
    if audio.ndim > 1:
        audio = np.ascontiguousarray(audio[:, 0])
    if orig_sr != SR:
        # Keyword arguments: librosa >= 0.10 no longer accepts the sampling
        # rates positionally; the keyword names are the same on older releases.
        audio = lb.resample(audio, orig_sr=orig_sr, target_sr=SR, res_type="kaiser_fast")
    return audio
#=======================================================
class MelSpecComputer:
    """Callable that turns a waveform into a dB-scaled mel spectrogram.

    Parameters
    ----------
    sr : int
        Sampling rate of the waveforms that will be passed in.
    n_mels : int
        Number of mel bands.
    fmin, fmax :
        Frequency bounds forwarded to ``librosa.feature.melspectrogram``.
    **kwargs :
        Extra keyword arguments forwarded to ``melspectrogram``. ``n_fft``
        defaults to ``sr // 10`` and ``hop_length`` to ``sr // 40`` when not
        supplied.
    """

    def __init__(self, sr, n_mels, fmin, fmax, **kwargs):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        # Only fill in the STFT sizes the caller did not specify.
        kwargs.setdefault("n_fft", self.sr // 10)
        kwargs.setdefault("hop_length", self.sr // (10 * 4))
        self.kwargs = kwargs

    def __call__(self, y):
        """Return the float32 dB mel spectrogram of waveform ``y``."""
        # ``y`` is passed by keyword: librosa >= 0.10 rejects it positionally,
        # and older releases accept the keyword form as well.
        melspec = lb.feature.melspectrogram(
            y=y,
            sr=self.sr,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
            **self.kwargs,
        )
        return lb.power_to_db(melspec).astype(np.float32)
#================================================

In [None]:
# Pick a tf.distribute strategy: TPUStrategy when a TPU cluster can be
# resolved, otherwise TensorFlow's default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from either line above is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the resolved cluster and initialise it before building the
    # strategy object.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Shorthand used by the tf.data pipeline below (passed to ``prefetch``).
AUTO = tf.data.experimental.AUTOTUNE
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
def _bytes_feature(value):
    """Wrap a single string / bytes value in a ``tf.train.Feature``.

    Eager tensors are converted with ``.numpy()`` first because ``BytesList``
    cannot unpack a string held inside an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        value = value.numpy()
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
    """Wrap an iterable of floats in a ``tf.train.Feature``.

    ``value`` is handed to ``FloatList`` as-is (it is not wrapped in a list),
    so callers must pass a sequence rather than a bare scalar.
    """
    wrapped = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature``."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)
#==============================================#
def train_serialize_example(feature0, feature1, feature2):
    """Serialise one record to a ``tf.train.Example`` byte string.

    ``feature0`` is stored under ``'filename'`` (bytes), ``feature1`` under
    ``'time'`` (int64) and ``feature2`` under ``'audio'`` (bytes).
    """
    features = tf.train.Features(
        feature={
            'filename': _bytes_feature(feature0),
            'time': _int64_feature(feature1),
            'audio': _bytes_feature(feature2),
        }
    )
    return tf.train.Example(features=features).SerializeToString()
#==============================================

In [None]:
# Shared mel-spectrogram extractor used for every window.
EXTRACTOR = MelSpecComputer(sr=SR, n_mels=NMELS, fmin=0, fmax=None)
# Added to the min-max denominator so a constant spectrogram cannot divide by zero.
EPS = 1e-5
# Create the output directory if needed. exist_ok keeps re-runs of this cell
# safe without listing the whole working directory first.
os.makedirs('./data', exist_ok=True)
# Number of samples in one prediction window.
WAVE_LENGTH = DURATION * SR

def make_tf_record(df):
    """Write every soundscape in ``df`` to ``./data/test.tfrec`` as mel images.

    Each audio file is cut into consecutive windows of ``WAVE_LENGTH`` samples.
    Every window becomes one record holding the source ``filename``, the
    window's ``time`` (``(index + 1) * DURATION``) and ``audio``: the PNG bytes
    of the min-max-scaled mel spectrogram replicated over three channels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``'path'`` (file to read) and ``'filename'`` columns.

    Returns
    -------
    int
        Always ``0``.
    """
    with tf.io.TFRecordWriter('./data/test.tfrec') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            filepath = row['path']
            filename = row['filename']
            audio = get_audio(filepath)
            # Number of windows needed to cover the whole recording.
            nb_batches = ceil(audio.shape[0] / WAVE_LENGTH)

            for cnt in range(nb_batches):
                audio_batch = audio[cnt * WAVE_LENGTH:(cnt + 1) * WAVE_LENGTH]
                if len(audio_batch) < WAVE_LENGTH:
                    if cnt == 0:
                        # Whole recording is shorter than one window: loop it
                        # until it covers the window, then trim. ceil (rather
                        # than round) guarantees enough repeats, so the clip
                        # is exactly WAVE_LENGTH samples long.
                        # NOTE(review): confirm the training pipeline pads
                        # short clips the same way.
                        rep = ceil(WAVE_LENGTH / len(audio_batch))
                        audio_batch = repeat_one_axis(audio_batch, rep)[:WAVE_LENGTH]
                    else:
                        # Trailing partial window: use the last full window of
                        # the recording instead (overlaps the previous one).
                        audio_batch = audio[-WAVE_LENGTH:]

                mel = EXTRACTOR(audio_batch)
                # Min-max scale to 0..255 so the spectrogram fits an 8-bit image.
                _min, _max = mel.min(), mel.max()
                mel = 255 * (mel - _min) / (_max - _min + EPS)
                mel = mel.astype(np.uint8)
                img = np.stack([mel, mel, mel], axis=-1)

                example = train_serialize_example(
                    str.encode(filename),
                    (cnt + 1) * DURATION,
                    cv2.imencode('.png', img)[1].tobytes(),
                )
                writer.write(example)
    return 0

In [None]:
# Serialise every test soundscape listed in `sub` into ./data/test.tfrec.
make_tf_record(sub)

## READBACK

In [None]:
# Side length (pixels) the decoded spectrogram image is resized to in parse_test.
IMG_SIZE = 128

def read_test_tfrecord(example):
    """Parse one serialised record into ``(filename, time, audio)``.

    ``filename`` and ``audio`` are scalar strings and ``time`` is a scalar
    int64, matching the keys written by ``train_serialize_example``.
    """
    schema = {
        'audio': tf.io.FixedLenFeature([], tf.string),
        'filename': tf.io.FixedLenFeature([], tf.string),
        'time': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, schema)
    return parsed['filename'], parsed['time'], parsed["audio"]
#=====
def parse_test(fname, cnt, img):
    """Decode the PNG bytes in ``img`` into a model-ready float image.

    The image is decoded with three channels, scaled to ``[0, 1]`` and resized
    to ``IMG_SIZE x IMG_SIZE``; ``fname`` and ``cnt`` pass through unchanged.
    """
    decoded = tf.image.decode_png(img, channels=3)
    scaled = tf.cast(decoded, tf.float32) / 255.0
    resized = tf.image.resize(scaled, [IMG_SIZE, IMG_SIZE])
    return fname, cnt, resized
#===========
def make_test_dataset(filenames, batch_size=64):
    """Build the batched, prefetched inference dataset from TFRecord files.

    Records are parsed with ``read_test_tfrecord``, decoded with
    ``parse_test``, grouped into batches of ``batch_size`` and prefetched.
    """
    return (
        tf.data.TFRecordDataset(filenames)
        .map(read_test_tfrecord)
        .map(parse_test)
        .batch(batch_size)
        .prefetch(AUTO)
    )
#==================

In [None]:
# Number of spectrogram images per inference batch.
# NOTE(review): 516 may be a typo for 512 — confirm; left unchanged here.
BATCH_SIZE = 516
# Inference dataset over the records written by make_tf_record.
ds_test  = make_test_dataset(['./data/test.tfrec'], batch_size=BATCH_SIZE)

In [None]:
# Load the five saved models w0.h5 ... w4.h5 that form the ensemble.
NETS = list(
    map(
        tf.keras.models.load_model,
        (f'../input/bird-cnn/w{idx}.h5' for idx in range(5)),
    )
)

In [None]:
# summary() prints the architecture itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
NETS[0].summary()

In [None]:
# Run the ensemble over every batch, collecting file names, window end times
# and the averaged class probabilities.
FN, CNT, PRED = [], [], []
for fn, cnt, img in tqdm(iter(ds_test)):
    FN.append(fn.numpy().astype(str))
    CNT.append(cnt.numpy())
    # Average over however many models were loaded, instead of a hard-coded 5
    # that silently mis-scales the probabilities if NETS ever changes size.
    pred = sum(net.predict(img, verbose=0) for net in NETS) / len(NETS)
    PRED.append(pred)

# Flatten the per-batch lists into one array per field.
FN = np.concatenate(FN)
CNT = np.concatenate(CNT)
PRED = np.concatenate(PRED)

In [None]:
# Number of classes predicted by the models (one probability column each).
NB_LABELS = 21
# Wide table of probabilities: columns prob-0 ... prob-(NB_LABELS - 1).
dg = pd.DataFrame(PRED, columns=[f"prob-{idx}" for idx in range(NB_LABELS)])
# One row per (file, window) with its probability columns attached by index.
df = pd.DataFrame({"file_id": FN, "end_time": CNT}).join(dg)

In [None]:
# Probability each class would get under a uniform prediction; the same value
# is used as the decision threshold when `target` is computed below.
1 / NB_LABELS

In [None]:
# Reshape to long format: one row per (file_id, end_time, label) with its
# probability in the `prob` column.
df = pd.wide_to_long(
    df,
    stubnames=['prob'],
    i=['file_id', 'end_time'],
    j='label',
    sep='-',
)
df = df.reset_index()

# Map the numeric label index to its bird code, then drop the index column.
df_lab = pd.read_csv('../input/birdclef-trials/labels.csv')
df_lab.columns = ['bird', 'NB_BIRDS', 'label']
df = df.merge(df_lab[['label', 'bird']], on='label').drop(columns='label')

In [None]:
# Preview the long-format predictions after the label merge.
df.head()

In [None]:
#
# Strip the extension so file_id matches the submission row_id format.
# regex=False: '.ogg' is a literal; as a regular expression (the default on
# older pandas) the dot would match any character.
df['file_id'] = df['file_id'].str.replace('.ogg', '', regex=False)
# A bird counts as present when its probability beats the uniform baseline.
# (A fixed threshold of 0.26 was also tried.)
df['target'] = df['prob'] > 1. / NB_LABELS
df['row_id'] = df['file_id'] + '_' + df['bird'] + '_' + df['end_time'].astype(str)

df = df.sort_values(by=['file_id', 'end_time', 'bird']).reset_index(drop=True)

In [None]:
# Show the first 21 rows (21 == NB_LABELS) of the sorted table.
df.head(21)

In [None]:
# Count how many rows are predicted positive.
df.target.sum()

In [None]:
# Keep only the two submission columns; copy() detaches the result from df.
df_sub = df[['row_id','target']].copy()

In [None]:
# Preview the first 21 submission rows.
df_sub.head(21)

In [None]:
# (number of rows, number of columns) of the submission table.
df_sub.shape

In [None]:
# Write the submission file without the DataFrame index column.
df_sub.to_csv('submission.csv', index=False)