![tcs_1x](https://user-images.githubusercontent.com/50532530/121583499-f6fcfe80-ca4d-11eb-875f-c79d63fca3b5.jpeg)

In [None]:
import numpy as np
import librosa as lb
import librosa.display as lbd
import soundfile as sf
from  soundfile import SoundFile
import pandas as pd
from  IPython.display import Audio
from pathlib import Path

from matplotlib import pyplot as plt

from tqdm.notebook import tqdm
import joblib, json

from  sklearn.model_selection  import StratifiedKFold

In [None]:
# Position in PART_INDEXES of the start boundary of the subset to process;
# meant to be changed to compute mels on another subset.
# NOTE(review): the row selections visible in this notebook use the literal
# bound 15718 and never read PART_ID -- confirm before relying on it.
PART_ID = 0
# Row boundaries that split the train set into 4 consecutive subsets.
PART_INDEXES = [0, 15718, 31436, 47154, 62874]

In [None]:
# Audio settings: SR is passed to librosa/the converter as the sample rate and
# DURATION * SR is the length of one chunk in samples.
SR = 32_000
DURATION = 7
# Random state for the stratified fold split.
SEED = 666

# Input locations.
DATA_ROOT = Path("../input/birdclef-2021")
TRAIN_AUDIO_ROOT = DATA_ROOT / "train_short_audio"

# Output folder for the mel images; created up front so later saves cannot
# fail on a missing directory.
TRAIN_AUDIO_IMAGES_SAVE_ROOT = Path("audio_images")
TRAIN_AUDIO_IMAGES_SAVE_ROOT.mkdir(parents=True, exist_ok=True)

In [None]:
def get_audio_info(filepath):
    """Return basic properties of an audio file.

    The file is opened with ``SoundFile`` and closed again before returning.

    Returns:
        dict with keys ``"frames"`` (frame count), ``"sr"`` (sample rate) and
        ``"duration"`` (``frames / sr`` as a float).
    """
    with SoundFile(filepath) as audio_file:
        n_frames, sample_rate = audio_file.frames, audio_file.samplerate
    return {
        "frames": n_frames,
        "sr": sample_rate,
        "duration": float(n_frames) / sample_rate,
    }

In [None]:
# Scratch cell: evaluates the index that follows PART_ID. The value is not
# stored anywhere (in a notebook it is only shown as the cell output).
PART_ID+1

In [None]:
import os

In [None]:
def make_df(n_splits=5, seed=SEED, nrows=None):
    """Build the training table for the first data part.

    Reads ``train_metadata.csv``, keeps rows 0..15717 and adds the columns
    ``label_id``, ``filepath``, the audio properties returned by
    ``get_audio_info`` (``frames``, ``sr``, ``duration``) and ``fold``.

    Args:
        n_splits: number of stratified folds.
        seed: random state for the shuffled fold split.
        nrows: forwarded to ``pd.read_csv``; ``None`` reads the whole file.

    Returns:
        ``(LABEL_IDS, df)`` where ``LABEL_IDS`` maps each primary label to an
        integer id and ``df`` is the enriched table with a 0..n-1 index.
    """
    df = pd.read_csv(DATA_ROOT / "train_metadata.csv", nrows=nrows)

    # Ids are assigned over every label that was read, before subsetting, so
    # they do not depend on which rows are kept below.
    LABEL_IDS = {
        label: label_id
        for label_id, label in enumerate(sorted(df["primary_label"].unique()))
    }

    # NOTE(review): the bound is a literal equal to PART_INDEXES[0]:PART_INDEXES[1];
    # PART_ID is not consulted here.
    # .copy() so the column assignments below do not write into a slice of the
    # full table (SettingWithCopyWarning); the index is reset so the
    # column-wise concat lines up row by row whatever slice is taken.
    df = df.iloc[0:15718].copy().reset_index(drop=True)

    df["label_id"] = df["primary_label"].map(LABEL_IDS)
    df["filepath"] = [
        str(TRAIN_AUDIO_ROOT / primary_label / filename)
        for primary_label, filename in zip(df["primary_label"], df["filename"])
    ]

    # Read the audio properties of every file with 4 parallel jobs.
    pool = joblib.Parallel(4)
    mapper = joblib.delayed(get_audio_info)
    tasks = [mapper(filepath) for filepath in df["filepath"]]
    audio_info = pd.DataFrame(pool(tqdm(tasks)), index=df.index)
    df = pd.concat([df, audio_info], axis=1, sort=False)

    # Each row gets the number of the fold in which it is a validation sample.
    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    df["fold"] = -1
    for fold, (_, val_set) in enumerate(skf.split(np.arange(len(df)), y=df["label_id"].values)):
        df.loc[df.index[val_set], "fold"] = fold

    return LABEL_IDS, df

In [None]:
# Build the metadata table; the label -> id mapping returned first is not kept.
_, df2 = make_df(nrows=None)

In [None]:
# Table providing the `Cluster_point` column that is copied onto df2 in the
# next cell; only the first 15718 rows are kept, the same literal bound that
# make_df uses.
df = pd.read_csv("../input/train-meta-2/gandfaatgaya.csv")
df = df[:15718]

In [None]:
# Carry the cluster id of every recording over to the metadata table (the
# assignment aligns on the index) and continue with the enriched table.
df2['Cluster_point'] = df['Cluster_point']
df = df2

# Folders searched for `<primary_label>/<clip>` files. Order matters: when a
# clip exists in several folders, the earliest folder in this list wins.
paths = [
    '../input/240255-noise-signal/noise1',
    '../input/3060-signal-noise/noise1',
    '../input/120135signalandnoise/noise1',
    '../input/135150signalandnoise/noise1',
    '../input/150180-signal-and-noise/noise1',
    '../input/180210-signal-and-noise/noise1',
    '../input/210240signalandnoise/noise1',
    '../input/255270noisesignal/noise1',
    '../input/270300-signal-and-noise/noise1',
    '../input/300330-signal-and-noise/noise1',
    '../input/360397signalandnoise/noise1',
    '../input/6090-signal-and-noise/noise1',
    '../input/90120-signal-noise/noise1',
    '../input/signals-and-noise-part-1/noise1',
    '../input/330345signalandnoise/noise1',
    '../input/345360signalandnoise/noise1',
]

# Index every clip once instead of re-listing the folders for each row:
# (label, clip name minus its last 4 characters) -> clip path.
# The 4 characters are presumably the extension (e.g. ".ogg"/".wav") -- the
# same trimming is applied to df['filename'] below so the two sides compare.
noise_index = {}
for path in paths:
    for label in os.listdir(path):
        label_dir = path + '/' + label
        if not os.path.isdir(label_dir):
            continue
        for clip in os.listdir(label_dir):
            # setdefault keeps the first hit, i.e. the earliest folder.
            noise_index.setdefault((label, clip[:-4]), label_dir + '/' + clip)

noise = [
    noise_index.get((label, filename[:-4]))
    for label, filename in zip(df['primary_label'], df['filename'])
]

# Fail with an explicit message instead of a column length mismatch.
n_missing = sum(found is None for found in noise)
if n_missing:
    raise ValueError(
        f"{n_missing} of {len(noise)} recordings have no matching noise clip under `paths`"
    )

df['noise filepath'] = noise

# Pool of noise clips per cluster, keyed by the cluster value itself so a row
# always receives the pool of its own cluster regardless of the order in which
# cluster ids first appear in the table.
noise_cluster = {
    cluster: clips.tolist()
    for cluster, clips in df.groupby('Cluster_point')['noise filepath']
}

# Every row carries the whole pool of its cluster (rows of one cluster share
# the same list object).
noise_cluster_final = [noise_cluster[cluster] for cluster in df['Cluster_point']]

df['final_noise'] = noise_cluster_final

In [None]:
# Scratch cell: number of entries collected in `noise` by the previous cell.
len(noise)

In [None]:
# Scratch cell: preview the first rows of the enriched metadata table.
df.head()

In [None]:
class MelSpecComputer:
    """Callable that converts a waveform into a dB-scaled mel spectrogram.

    Args:
        sr: sample rate handed to ``librosa.feature.melspectrogram``.
        n_mels: number of mel bands.
        fmin: lowest frequency of the mel filter bank.
        fmax: highest frequency of the mel filter bank.
        **kwargs: extra keyword arguments forwarded to ``melspectrogram``.
            ``n_fft`` defaults to ``sr // 10`` and ``hop_length`` to
            ``sr // 40`` when not supplied.
    """

    def __init__(self, sr, n_mels, fmin, fmax, **kwargs):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        kwargs.setdefault("n_fft", self.sr // 10)
        kwargs.setdefault("hop_length", self.sr // (10 * 4))
        self.kwargs = kwargs

    def __call__(self, y):
        """Return the mel spectrogram of ``y`` in dB as a float32 array."""
        # `y` is passed by keyword: recent librosa releases (0.10+) accept the
        # arguments of melspectrogram by keyword only, and the keyword form
        # also works on older releases.
        melspec = lb.feature.melspectrogram(
            y=y,
            sr=self.sr,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
            **self.kwargs,
        )
        return lb.power_to_db(melspec).astype(np.float32)

In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Standardize an array and rescale it to the 0..255 ``uint8`` range.

    Args:
        X: numeric numpy array (e.g. a spectrogram).
        eps: added to ``std`` to avoid dividing by zero; also the smallest
            value range that is still rescaled.
        mean: value(s) subtracted from ``X``; ``None`` uses ``X.mean()``.
            An explicit ``0`` is honoured, and arrays that broadcast against
            ``X`` are accepted.
        std: value(s) ``X`` is divided by (plus ``eps``); ``None`` uses
            ``X.std()``. Same rules as ``mean``.

    Returns:
        ``uint8`` array with the shape of ``X``. When the standardized values
        span no more than ``eps`` the result is all zeros.
    """
    # `is None` rather than `or`: 0 is a legitimate statistic and the truth
    # value of an array is ambiguous.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Min-max scaling; the fractional part is truncated by the cast.
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V

def crop_or_pad(y, length, is_train=True, start=None):
    """Return ``y`` cut or zero-padded to exactly ``length`` samples.

    Args:
        y: 1-D sequence of samples.
        length: target number of samples.
        is_train: only used when cropping without an explicit ``start``:
            ``True`` picks a random offset, ``False`` crops from offset 0.
        start: explicit crop offset; ``0`` is honoured.

    Returns:
        ``y`` itself when it already has ``length`` samples, a slice of it
        when it is longer, or a zero-padded array of the same dtype when it
        is shorter.
    """
    if len(y) < length:
        y = np.asarray(y)
        # Zeros are appended in y's own dtype so float32 audio is not silently
        # promoted to float64. (The tiling step that used to follow the
        # padding ran on an already full-length array and changed nothing.)
        y = np.concatenate([y, np.zeros(length - len(y), dtype=y.dtype)])

    elif len(y) > length:
        if start is None:
            if is_train:
                # +1 because randint's upper bound is exclusive and
                # len(y) - length is itself a valid offset.
                start = np.random.randint(len(y) - length + 1)
            else:
                start = 0

        y = y[start:start + length]

    return y

In [None]:
class AudioToImage:
    """Cut a recording into fixed-length chunks and turn each into a uint8 mel image.

    Args:
        sr: target sample rate.
        n_mels: number of mel bands.
        fmin: lowest mel frequency.
        fmax: highest mel frequency; defaults to ``sr // 2``.
        duration: chunk length in seconds.
        step: distance in samples between chunk starts; defaults to the chunk
            length (no overlap).
        res_type: resampling method passed to ``librosa.resample``.
        resample: whether to resample files whose rate differs from ``sr``.
    """

    def __init__(self, sr=SR, n_mels=128, fmin=0, fmax=None, duration=DURATION, step=None, res_type="kaiser_fast", resample=True):

        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr // 2

        self.duration = duration
        # int() keeps the chunk length usable as a slice bound even when
        # `duration` is given as a float.
        self.audio_length = int(self.duration * self.sr)
        self.step = step or self.audio_length

        self.res_type = res_type
        self.resample = resample

        self.mel_spec_computer = MelSpecComputer(sr=self.sr, n_mels=self.n_mels, fmin=self.fmin,
                                                 fmax=self.fmax)

    def audio_to_image(self, audio, row):
        """Mix noise from ``row.final_noise`` into one chunk and return its uint8 mel image."""
        # Work on a copy so the noise mixing can never leak into the shared
        # recording buffer: chunks are views of one array and overlap whenever
        # step < audio_length.
        audio = add_noise(audio.copy(), row.final_noise)
        melspec = self.mel_spec_computer(audio)
        return mono_to_color(melspec)

    def __call__(self, row, save=True):
        """Convert the recording described by ``row`` into a stack of chunk images.

        ``row`` must expose ``filepath``, ``primary_label``, ``filename`` and
        ``final_noise``. With ``save=True`` the stack is written to
        ``TRAIN_AUDIO_IMAGES_SAVE_ROOT/<primary_label>/<filename>.npy`` and
        nothing is returned; otherwise ``(row.filename, images)`` is returned.
        """
        # NOTE(review): the slicing below assumes a 1-D (mono) signal --
        # confirm that no multi-channel files are fed in.
        audio, orig_sr = sf.read(row.filepath, dtype="float32")
        if self.resample and orig_sr != self.sr:
            # Sample rates go by keyword: recent librosa releases (0.10+) no
            # longer accept them positionally.
            audio = lb.resample(audio, orig_sr=orig_sr, target_sr=self.sr, res_type=self.res_type)

        # Chunk starts run while a full chunk still fits; a recording shorter
        # than one chunk yields a single short chunk that is padded below.
        stop = max(1, len(audio) - self.audio_length + 1)
        audios = [audio[i:i + self.audio_length] for i in range(0, stop, self.step)]
        audios[-1] = crop_or_pad(audios[-1], length=self.audio_length)
        images = np.stack([self.audio_to_image(chunk, row) for chunk in audios])

        if save:
            path = TRAIN_AUDIO_IMAGES_SAVE_ROOT/f"{row.primary_label}/{row.filename}.npy"
            path.parent.mkdir(exist_ok=True, parents=True)
            np.save(str(path), images)
        else:
            return row.filename, images

In [None]:
import os

In [None]:
# Frame geometry read by add_noise. Tied to DURATION (instead of repeating the
# literal 7) so it cannot drift from the chunk length AudioToImage uses by
# default.
dur_frame = DURATION
len_frame = dur_frame * SR
# Number of 512-sample hops in one frame, plus one.
# NOTE(review): n_frame is not read by any cell visible in this notebook.
n_frame = len_frame // 512 + 1
import cv2

In [None]:
from librosa.core import spectrum
from scipy.ndimage.morphology import binary_dilation, binary_erosion
def signal_noise_split(audio):
    """Split a waveform into "signal" and "noise" samples with a spectrogram mask.

    Spectrogram cells below 3x their row median or 3x their column median are
    zeroed and the remaining positive cells set to 1. The binary map is cleaned
    with a 4x4 erosion followed by a 4x4 dilation, reduced to one flag per
    column, widened, and expanded to one flag per sample.

    Returns:
        ``(signal, noise)``: the samples of ``audio`` where the mask is set,
        and the samples where it is not.
    """
    fft_size, hop = 2048, 512
    S, _ = spectrum._spectrogram(y=audio, power=1.0, n_fft=fft_size, hop_length=hop, win_length=fft_size)

    # Both medians come from the untouched spectrogram, before any zeroing.
    median_per_col = np.median(S, axis=0, keepdims=True)
    median_per_row = np.median(S, axis=1, keepdims=True)
    S[S < median_per_row * 3] = 0.0
    S[S < median_per_col * 3] = 0.0
    S[S > 0] = 1

    # Erosion then dilation with the same 4x4 block.
    square = np.ones((4, 4))
    S = binary_dilation(binary_erosion(S, structure=square), structure=square)

    # A column is active when any of its cells survived; widen the active runs.
    active = binary_dilation(S.any(axis=0), structure=np.ones(4), iterations=2)

    # One flag per sample: repeat each column flag `hop` times, then dilate
    # with a shifted window of fft_size - hop samples.
    reach = fft_size - hop
    mask = binary_dilation(np.repeat(active, hop), structure=np.ones(reach), origin=-reach // 2)
    mask = mask[:len(audio)]

    return audio[mask], audio[~mask]
def add_noise(signal,i,n=4):
    """Mix randomly chosen noise clips into a copy of ``signal``.

    ``n`` candidate clips are drawn (with replacement) from the pool ``i``.
    Each candidate is used with probability 0.5, scaled by its own random gain
    drawn from [0, 0.5), and added at a random offset when it is shorter than
    the signal.

    Args:
        signal: 1-D waveform; it is not modified.
        i: pool of noise file paths to draw from.
        n: number of candidate clips.

    Returns:
        A new array with the selected noise mixed in. With an empty pool or an
        empty signal the copy is returned unchanged.
    """
    # Never write into the caller's array: callers may pass views that share
    # memory with other chunks.
    signal = np.array(signal, copy=True)
    n_signal = len(signal)
    if len(i) == 0 or n_signal == 0:
        return signal

    p_noise = np.random.uniform(0, 1, n)
    i_noise = np.where(p_noise > 0.5)[0]
    alpha_noise = np.random.uniform(0, 0.5, n)
    path_noise = np.random.choice(i, n)

    for k in i_noise:
        # `duration` is in seconds; nothing beyond the signal length can be
        # used, so only that much audio is decoded.
        noise, _ = lb.load(path_noise[k], sr=SR, mono=True, duration=n_signal / SR, res_type='kaiser_fast')
        noise = noise[:n_signal]
        len_noise = len(noise)

        # +1 because randint's upper bound is exclusive; a clip as long as the
        # signal always lands at offset 0.
        i_start = np.random.randint(n_signal - len_noise + 1)
        signal[i_start: i_start + len_noise] += noise * alpha_noise[k]
    return signal
def signal_noise_ratio(spec):
    """Return the fraction of spectrogram cells kept by median thresholding.

    Works on a copy of ``spec``. Cells below 1.25x their row median or 1.15x
    their column median are zeroed and the remaining positive cells set to 1;
    the map is then smoothed with a 3x3 median blur and a 3x3 morphological
    closing. The result is the sum of the map divided by its number of cells.

    NOTE(review): ``spec`` is handed to OpenCV as-is, so it presumably needs a
    dtype OpenCV accepts (e.g. float32) -- confirm against the caller.
    """
    spec = spec.copy()

    col_median = np.median(spec, axis=0, keepdims=True)
    row_median = np.median(spec, axis=1, keepdims=True)

    spec[spec < row_median * 1.25] = 0.0
    spec[spec < col_median * 1.15] = 0.0
    spec[spec > 0] = 1.0

    spec = cv2.medianBlur(spec, 3)
    spec = cv2.morphologyEx(spec, cv2.MORPH_CLOSE, np.ones((3, 3), np.float32))

    # spec.size is the product of all dimensions, so the 2-D and 3-D cases are
    # handled without a catch-all `except` that could hide unrelated errors.
    snr = spec.sum() / spec.size

    return snr

In [None]:
def get_audios_as_images(df):
    """Convert every recording listed in ``df`` to mel images with 2 parallel jobs.

    Each row is handed to one ``AudioToImage`` converter whose chunk starts
    are ``int(DURATION*0.666*SR)`` samples apart. The converter is called with
    its default ``save=True``; nothing is returned.
    """
    to_image = joblib.delayed(AudioToImage(step=int(DURATION*0.666*SR)))
    jobs = [to_image(record) for record in df.itertuples(index=False)]

    joblib.Parallel(n_jobs=2)(tqdm(jobs))

In [None]:
# Scratch cell: evaluates DURATION as an int; the value is not stored.
int(DURATION)

In [None]:
# Run the conversion for every row of df. The converter saves by default, so
# this writes one .npy per recording under TRAIN_AUDIO_IMAGES_SAVE_ROOT.
get_audios_as_images(df)