In [None]:
from fastai.vision.all import *
import librosa as librosa

# Loading Specs

In [None]:
def chunk_to_spec(chunk, SPEC_HEIGHT=64,SPEC_WIDTH=256, rate=32000, FMIN=200, FMAX=12500):
    """Convert a 5-second audio chunk into a dB-scaled mel spectrogram.

    Parameters
    ----------
    chunk : audio samples, forwarded to librosa as ``y``.
    SPEC_HEIGHT : number of mel bands (``n_mels``).
    SPEC_WIDTH : target number of time frames. The hop length is derived from it
        under the assumption that ``chunk`` spans 5 seconds. Must be >= 2.
    rate : sample rate of ``chunk`` in Hz. Used both as ``sr`` and to derive the
        hop length, so chunks loaded at other rates are handled correctly.
    FMIN, FMAX : lower / upper frequency bound (Hz) of the mel filter bank.

    Returns
    -------
    The mel power spectrogram converted to decibels with ``ref=np.max``, i.e.
    expressed relative to the spectrogram's own maximum.

    Raises
    ------
    ValueError
        If ``SPEC_WIDTH`` is smaller than 2 (the hop length would be undefined
        or negative).
    """
    if SPEC_WIDTH < 2:
        raise ValueError(f"SPEC_WIDTH must be at least 2, got {SPEC_WIDTH}")

    # Spread 5 seconds of samples over (SPEC_WIDTH - 1) hops so a full-length
    # chunk yields roughly SPEC_WIDTH frames.
    hop_length = int(rate * 5 / (SPEC_WIDTH - 1))

    mel_spec = librosa.feature.melspectrogram(y=chunk,
                                              sr=rate,
                                              n_fft=1024,
                                              hop_length=hop_length,
                                              n_mels=SPEC_HEIGHT,
                                              fmin=FMIN,
                                              fmax=FMAX)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

In [None]:
# Load the per-recording metadata table. Later cells unpack each row into four
# values (file path, length, label, '#'-separated peak times) and read a 'label' column.
df = pd.read_csv(filepath_or_buffer='../input/peak-identification/info_df.csv')
print(df.shape)
df.head(n=3)

In [None]:
# Preview: pick one random recording and show the spectrogram of the 5-second
# window around its first listed peak.
# `label` gets its own name so it is not silently overwritten by the audio array `y` below.
# (`l` appears to be the recording length in samples at 32 kHz — TODO confirm against info_df.csv.)
fn, l, label, peaks = df.sample().values[0]

# Centre the window on the peak (peak - 2.5 s), but never run past the end of the file.
# NOTE(review): the inner max(..., 2.5) means the window never starts before 2.5 s, so a
# peak in the first 2.5 s of a recording falls outside the window — confirm this is intended.
start_time = min(l/32000 - 5, max(float(peaks.split('#')[0])-2.5, 2.5))
# Recordings shorter than 5 s would otherwise produce a negative offset.
start_time = max(start_time, 0.0)

y, sr = librosa.load(fn, sr=32000, offset=start_time, duration=5)
plt.imshow(chunk_to_spec(y, SPEC_HEIGHT=128), cmap='inferno');

# DataLoaders

In [None]:
class TitledImage(fastuple):
    """Tuple of (image, title) that can render itself with the title shown."""

    def show(self, ctx=None, **kwargs):
        # Hand the whole tuple to fastai's helper, forwarding the drawing context
        # and any extra display options unchanged.
        show_titled_image(self, ctx=ctx, **kwargs)

class ClipTransform(ItemTransform):
    """Turn a row position of the metadata frame into a (spectrogram, label id) pair.

    Each call to `encodes` loads a 5-second clip around one of the recording's
    listed peaks (chosen at random, favouring the first ones), converts it to a
    mel spectrogram and stacks it into three identical channels.

    Attributes:
        df: the metadata frame passed at construction; rows unpack into
            (file path, length, label, '#'-separated peak times).
        vocab / o2i: sorted unique labels and the label -> integer id mapping.
    """

    def __init__(self, df):
        self.df=df
        self.vocab,self.o2i = uniqueify(df['label'], sort=True, bidir=True)

    def encodes(self, i, from_np=False):
        """Load and featurise the clip for row position `i`.

        `from_np` is accepted for interface compatibility but is not used.
        Returns a tuple (3-channel spectrogram tensor, integer label id).
        """
        # Use the frame this transform was built with, not whatever `df`
        # happens to be bound to in the notebook's global namespace.
        # NOTE(review): `.iloc` is positional, while the caller passes index labels;
        # the two only agree when the frame has a default RangeIndex — confirm.
        f, l, label, peaks = self.df.iloc[i].values

        peak_times = peaks.split('#')
        # Weighted draw: earlier peaks (listed first) are chosen more often.
        clip_num = random.choice([0, 0, 0, 1, 1, 2, 3, 4, 5, 6])
        # Recordings with fewer than 7 listed peaks fall back to their last peak
        # instead of raising IndexError.
        clip_num = min(clip_num, len(peak_times) - 1)

        # Centre the 5 s window on the peak without running past the end of the file.
        start_time = min(l/32000 - 5, max(float(peak_times[clip_num])-2.5, 2.5))
        # Recordings shorter than 5 s would otherwise yield a negative offset.
        start_time = max(start_time, 0.0)

        y, sr = librosa.load(f, sr=32000, offset=start_time, duration=5)
        spec = chunk_to_spec(y,SPEC_HEIGHT=112,SPEC_WIDTH=224)

        # Shift so the minimum is 0, then scale by 80: librosa's power_to_db limits
        # the dynamic range to 80 dB by default, so this maps values into [0, 1].
        spec -= np.min(spec)
        spec /= 80
        spec =  torch.unsqueeze(tensor(spec), 0)
        spec = torch.cat([spec, spec, spec]) # Stack three channels to simulate RGB if using a pretrained model
        return spec, self.o2i[label]

    def decodes(self, x):
        """Map an encoded (spectrogram, label id) pair back to a displayable TitledImage."""
        return TitledImage(x[0],self.vocab[x[1]])


# Build train/validation DataLoaders over row indices of `df`; ClipTransform turns
# each index into a (spectrogram, label id) pair on the fly.
SPLIT_SEED = 42  # fixed so the 80/20 split is identical on every Restart & Run All

df_small = df  # currently the full frame
clip_tfm = ClipTransform(df)

# Seeded 80/20 split. Note this only fixes which rows go where; the clip drawn
# per row inside ClipTransform.encodes is still random.
train =  df_small.sample(frac=0.8, random_state=SPLIT_SEED)
train_idx = list(train.index)
valid_idx = list(df_small[~df_small.index.isin(train.index)].index)
print('train and val size', len(train_idx), len(valid_idx))

train_tl= TfmdLists(train_idx, clip_tfm)
valid_tl= TfmdLists(valid_idx, clip_tfm)
dls = DataLoaders.from_dsets(train_tl, valid_tl, bs=16)
dls = dls.cuda()

# Sanity check: pull one batch and display a few decoded samples.
xb, yb = dls.one_batch()
print(xb.shape)
dls.show_batch(max_n=3)

In [None]:
# ResNet-18 learner trained with focal loss; accuracy is tracked, the loss graph is
# drawn during training and per-epoch metrics are written out by CSVLogger.
learn = cnn_learner(
    dls,
    models.resnet18,
    loss_func=FocalLossFlat(),
    metrics=[accuracy],
    cbs=[ShowGraphCallback(), CSVLogger()],
)

In [None]:
# Status message marking that the learner has been built and training can start.
print('Model created and ready for training')

In [None]:
# Make every layer trainable, then run 3 one-cycle epochs with learning rates
# spread from 1e-4 to 1e-3 across the layer groups.
learn.unfreeze()
learn.fit_one_cycle(n_epoch=3, lr_max=slice(1e-4, 1e-3))

In [None]:
# Draw the recorded loss curves and write the resulting figure to disk.
learn.recorder.plot_loss()
plt.savefig(fname='loss_plot.png')

In [None]:
# Checkpoint the learner's current state under the name 'stage-1'.
learn.save(file='stage-1')

In [None]:
# CSVLogger cannot be pickled, so detach it before exporting the learner.
learn.remove_cb(CSVLogger)
learn.export(fname='baseline_3e.pkl')

In [None]:
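# Optional: log training runs to Weights & Biases. Uncomment the lines below to enable.
# !pip install -q wandb
# import wandb
# wandb.init()
# from fastai.callback.wandb import *
# # Then pass cbs=WandbCallback() when creating the learner to log the run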