In [35]:
from fastai.data.all import *
from fastai.vision.all import *
from IPython.utils import io
import librosa
sys.path.append('../')

In [2]:
# Dataset locations, all relative to the notebook's working directory.
train_path = Path("./Dataset/Dataset/IRMAS_Training_Data")
# "noi" sub-folder inside the training data (presumably noise-only clips — TODO confirm).
noise_path = Path("./Dataset/Dataset/IRMAS_Training_Data/noi")
valid_path = Path("./Dataset/Dataset/IRMAS_Validation_Data")
test_path = Path("./Dataset/Dataset/IRMAS_Test_Data")
# Common parent of the folders above; this is the root scanned for .wav files further down.
grand_path = Path("./Dataset/Dataset")

In [3]:
# Callable that collects '.wav' files under a given root; recurse=True so sub-folders are searched too.
get_song_files = FileGetter(extensions='.wav', recurse=True)

In [4]:
def is_IRMAS_train(pat: Path):
    """Return True when the string form of `pat` contains "IRMAS_Training_Data"."""
    return "IRMAS_Training_Data" in str(pat)

def is_IRMAS_valid(pat: Path):
    """Return True when the string form of `pat` contains "IRMAS_Validation_Data"."""
    return "IRMAS_Validation_Data" in str(pat)
    
def is_IRMAS_test(pat: Path):
    """Return True when the string form of `pat` contains "IRMAS_Test_Data"."""
    return "IRMAS_Test_Data" in str(pat)

In [5]:
def get_IRMAS_train_label(pat: Path):
    """Extract the label from the first "[...]" tag in a training file name.

    Only `pat.name` (the final path component) is searched. The tag body must
    be non-empty and may not contain square brackets or parentheses.

    Returns:
        A one-element list with the tag's text (brackets stripped), or an
        empty list when the name carries no such tag.
    """
    # Raw string: the pattern is unchanged, but "\[" / "\]" are no longer
    # invalid *string* escapes (a SyntaxWarning on recent Python versions).
    tag = re.search(r"\[[^(\[\])]+\]", pat.name)
    if tag is None:
        return []
    # Drop the surrounding "[" and "]".
    return [tag.group()[1:-1]]

In [6]:
def get_IRMAS_valid_label(pat: Path):
    """Read the labels for `pat` from the ".txt" file that sits next to it.

    The annotation file has the same path as `pat` with its extension swapped
    for ".txt"; its whitespace-separated tokens are returned as a list.
    """
    stem, _ext = os.path.splitext(str(pat))
    with open(stem + ".txt") as annotation:
        content = annotation.read()
    return content.split()

In [7]:
def get_label(pat: Path):
    """Return the list of labels for `pat`.

    Paths recognised by `is_IRMAS_train` are labelled from their file name;
    every other path is labelled from its side-car ".txt" file.
    """
    reader = get_IRMAS_train_label if is_IRMAS_train(pat) else get_IRMAS_valid_label
    return reader(pat)

def get_single_label(pat: Path):
    """Return a list holding at most the first label of `pat` (empty if it has none)."""
    labels = get_label(pat)
    return labels[:1]

In [8]:
# STFT settings handed to librosa.stft in get_spec; trailing comments keep the alternative values.
n_fft = 512  # 1024
hop_length = 256  # 512
# NOTE(review): f_min / f_max are not referenced by any code visible in this notebook.
f_min = 20
f_max = 8000
# Sample rate assumed by the clipping/windowing helpers: a 3-second clip is sample_rate*3 samples.
sample_rate = 44100

In [9]:
def get_song(pat: Path):
    """Load the audio file at `pat` with librosa and return only the samples.

    `sr=None` is forwarded to `librosa.load`; the rate it reports is dropped.
    NOTE(review): downstream code sizes clips with the module-level
    `sample_rate` — confirm every file really uses that rate.
    """
    samples, _reported_rate = librosa.load(pat, sr=None)
    return samples
class ToSong(Transform):
    """Turn a `Path` into loaded audio samples; any other input is returned untouched."""
    def encodes(self, song):
        # Non-Path inputs (e.g. already-loaded sample arrays) pass straight through.
        return get_song(song) if isinstance(song, Path) else song

In [10]:
def extend_to_3sr(song):
    """Right-pad `song` with float32 zeros up to `sample_rate*3` samples.

    Inputs that are already that long (or longer) get nothing appended and
    are not truncated.
    """
    missing = max(sample_rate*3 - len(song), 0)
    padding = np.zeros(missing, dtype="float32")
    return np.concatenate([song, padding])

In [11]:
class RandomClip(Transform):
    """Cut a `sample_rate*3`-sample clip at a random offset, zero-padding short input.

    `split_idx=0` follows fastai's convention for transforms that should run
    on the training subset only (presumably — confirm against fastcore docs).
    """
    split_idx=0
    def encodes(self, song):
        clip_len = sample_rate*3
        # Number of admissible start offsets; at least 1 so short songs start at 0.
        n_starts = max(len(song) - clip_len + 1, 1)
        start = np.random.randint(n_starts)
        return extend_to_3sr(song[start:start + clip_len])
        
class CenterClip(Transform):
    """Cut the central `sample_rate*3`-sample clip of a song, zero-padding short input.

    `split_idx=1` follows fastai's convention for transforms that should run
    on the validation subset only (presumably — confirm against fastcore docs).
    """
    split_idx=1
    def encodes(self, song):
        clip_len = sample_rate*3
        # Clamp at 0: for songs shorter than the clip the centred offset is
        # negative, and a negative slice start counts from the END of the
        # array, which used to keep only the tail of the song.
        start = max((len(song) - clip_len) // 2, 0)
        return extend_to_3sr(song[start:start + clip_len])

# Handles to the `encodes` attribute of throw-away instances, presumably so the
# clipping logic can be called directly outside a fastai pipeline.
# NOTE(review): neither name is used by any code visible in this notebook.
random_clip = RandomClip().encodes
center_clip = CenterClip().encodes

In [13]:
def get_spec(song):
    """Compute a dB-scaled magnitude spectrogram of `song`.

    The STFT uses the module-level `n_fft` and `hop_length`; magnitudes are
    converted with `librosa.amplitude_to_db`, using the maximum magnitude as
    the reference (`ref=np.max`).
    """
    magnitude = np.abs(librosa.stft(song, n_fft=n_fft, hop_length=hop_length))
    return librosa.amplitude_to_db(magnitude, ref=np.max)

In [22]:
class ToSpec(ItemTransform):
    """Replace the first element of an item with its spectrogram image.

    The first element is run through `get_spec` and then `PILImage.create`;
    all remaining elements are carried over unchanged.
    """
    def __init__(self):
        steps = [get_spec, PILImage.create]
        self.pipe = Pipeline(steps)
    def encodes(self, item):
        # List concatenation below assumes `item` arrives as a list
        # (presumably how ItemTransform hands it over — TODO confirm).
        spec_img = self.pipe(item[0])
        return [spec_img] + item[1:]

In [15]:
def AccuracyMulti(tresh=0.5):
    """Build a multi-label accuracy metric that binarises predictions at `tresh`.

    The returned function `acc(x, y)` marks every entry of `x` greater than
    `tresh` as 1 and the rest as 0, then returns one minus the mean absolute
    difference to `y`.
    NOTE(review): `x` is compared as-is; if the learner passes raw
    (pre-sigmoid) outputs, `tresh` is not a probability threshold — confirm.
    """
    def acc(x, y):
        hard = (x > tresh).float()
        mismatch = (hard - y).abs().float()
        return 1 - mismatch.mean()
    return acc

tensor(0.8333)

In [16]:
# Input pipeline: load audio from a Path, then clip it to 3 seconds.
# RandomClip has split_idx=0 and CenterClip split_idx=1, so presumably only one
# of the two runs for any given subset — confirm against fastai's split_idx rules.
get_song_tfms = [ToSong(), RandomClip(), CenterClip()]
# Target pipeline: list of label strings -> multi-category -> one-hot vector.
get_label_tfms = [get_label, MultiCategorize(), OneHotEncode()]

In [17]:
def get_dataset(items, splitter=RandomSplitter()):
    """Wrap `items` in a fastai `Datasets` with the song and label pipelines.

    `splitter` is called on `items` and its result is passed as `splits`.
    """
    tfm_pipelines = [get_song_tfms, get_label_tfms]
    return Datasets(items, tfm_pipelines, splits=splitter(items))

In [23]:
# Split on whether the path contains "IRMAS_Test_Data"; every other .wav under
# grand_path (training, validation and "noi" folders) lands in the other split.
# NOTE(review): assuming FuncSplitter sends True items to the validation split — confirm.
splitter = FuncSplitter(lambda x: is_IRMAS_test(x))
items = get_song_files(grand_path)
train_dset = get_dataset(items, splitter)

# Per-item transforms applied before tensor conversion in get_dataloader:
# turn the clipped audio into a spectrogram image, then squish it to a fixed size.
after_augm = [
    ToSpec(),
    Resize((256, 156), method=ResizeMethod.Squish),
]

def get_dataloader(ds, after_augm=after_augm):
    """Create shuffled DataLoaders (batch size 64) from `ds`.

    The item-level transforms are `after_augm` followed by `ToTensor()` and
    `IntToFloatTensor()`; the `after_augm` list itself is not modified.
    """
    tensor_tfms = [ToTensor(), IntToFloatTensor()]
    return ds.dataloaders(bs=64, after_item=after_augm + tensor_tfms, shuffle=True)


In [25]:
# Build the DataLoaders and a resnet18 learner, then restore previously saved weights.
dls = get_dataloader(train_dset)
# The accuracy metric binarises predictions at 0.8 (see AccuracyMulti).
learn = vision_learner(dls, resnet18, pretrained=True, metrics=AccuracyMulti(tresh=0.8))
# Loads the checkpoint named 'MLBLCLA_model'; the trailing ';' hides the returned Learner repr.
learn.load('MLBLCLA_model');

<fastai.learner.Learner at 0x7f5ed53d5c30>

In [27]:
# Evaluate the loaded model; the output below is presumably [loss, AccuracyMulti metric].
learn.validate()

(#2) [0.30292537808418274,0.8910554051399231]

In [28]:
def get_sl_windows(song, step=sample_rate):
    """Yield sliding 3-second windows (`sample_rate*3` samples) over `song`.

    Window starts are `0, step, 2*step, ...` while the start is below
    `len(song) - sample_rate - step`. Each window is zero-padded to full
    length by `extend_to_3sr`, so the last one may be partly silence.

    When `song` is too short to produce any start, a single window holding
    the (padded) beginning of the song is yielded. This used to be a window
    of pure silence, which discarded the audio altogether.
    """
    win_len = sample_rate*3
    starts = range(0, len(song) - sample_rate - step, step)
    if len(starts) == 0:
        # Short input: still classify the audio we have instead of silence.
        yield extend_to_3sr(song[:win_len])
        return
    for start in starts:
        yield extend_to_3sr(song[start:start + win_len])

In [36]:
def predict(learn, song, step=sample_rate, tresh=0.8, perc=0.25):
    """Predict the labels present in `song` by voting over sliding windows.

    Args:
        learn: learner providing `predict` and `dls.vocab`.
        song: sequence of audio samples.
        step: hop between consecutive windows, in samples.
        tresh: per-window cut-off applied to `learn.predict(...)[1]`.
        perc: a label is kept when it fires in more than this fraction of windows.

    Returns:
        The entries of `learn.dls.vocab` whose vote count exceeds
        `perc * number_of_windows`.
    """
    instr = np.zeros(len(learn.dls.vocab))
    n_windows = 0
    # `step` is now forwarded; before, it was accepted but silently ignored and
    # the windows always used get_sl_windows' own default.
    for sl in get_sl_windows(song, step):
        n_windows += 1
        # Swallow whatever learn.predict prints for each window.
        with io.capture_output():
            # NOTE(review): element [1] of the predict result is compared to
            # `tresh`; if that element is already a decoded (thresholded)
            # prediction rather than probabilities, `tresh` has little effect — confirm.
            instr[learn.predict(sl)[1] > tresh] += 1
    return learn.dls.vocab[instr > n_windows*perc]

In [37]:
# Smoke test: run the sliding-window prediction on the first collected file.
predict(learn, get_song(items[0]))

(#1) ['gel']