In [None]:
# %load create_object_audio_files_true.py
from __future__ import division
import os
import cv2
from collections import defaultdict
import numpy as np
from shutil import copyfile

from librosa import load
from librosa.output import write_wav
from librosa.feature import melspectrogram
from librosa.display import specshow
from librosa import power_to_db

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

def generate_spectrogram(wav_slice, sample_rate, start, end, spec_out_file):
    """Render the mel spectrogram of an audio slice and save it as an image.

    Args:
        wav_slice: Audio samples passed to ``melspectrogram`` as ``y``.
        sample_rate: Sampling rate passed to ``melspectrogram`` as ``sr``.
        start: Unused; kept so existing callers keep working.
        end: Unused; kept so existing callers keep working.
        spec_out_file: Destination path handed to ``plt.savefig``.
    """
    # 256/400 inches at 400 dpi is a nominal 256x256 pixel canvas; the final
    # image size may differ because of bbox_inches='tight'.
    fig = plt.figure(figsize=(256.0/400.0, 256.0/400.0), dpi=400)
    try:
        plt.axis('off')
        S = melspectrogram(y=wav_slice, sr=sample_rate)
        # Power spectrogram converted to dB relative to its own maximum.
        specshow(power_to_db(S, ref=np.max))
        plt.savefig(spec_out_file, dpi=400, bbox_inches='tight', pad_inches=0)
    finally:
        # plt.clf() only clears the figure; without close() every call leaves
        # a figure registered in pyplot and memory grows with each row.
        plt.close(fig)

def process_split(in_file, in_path, out_path, split):
    """Extract per-object image crops, audio slices and spectrograms.

    Reads the tab-separated file ``in_file`` (first line is skipped as a
    header). Every remaining line must split into exactly 13 tab-separated
    fields: file_id, img_file, box0, box1, box2, box3, wav_file, start, end,
    org_obj, mm_obj, score and one trailing field that is ignored.

    For each row this writes, under ``out_path/split``:
      * ``img/<file_id>.jpg``  - the image cropped to rows box1:box3 and
        columns box0:box2,
      * ``wav/<file_id>.wav``  - the audio between ``start`` and ``end``,
        loaded at 16 kHz,
      * ``spec/<file_id>.png`` - the spectrogram of that audio slice.

    Args:
        in_file: Path to the TSV file describing the objects.
        in_path: Root directory holding ``<split>/img`` and ``<split>/wav``.
        out_path: Root directory for the generated files.
        split: Name of the sub-directory (e.g. a dataset split) to process.

    Returns:
        The number of data rows processed.

    Raises:
        IOError: If an input image cannot be read.
        ValueError: If a row does not contain exactly 13 fields or a numeric
            field cannot be parsed.
    """
    wav_in_path = os.path.join(in_path, split, 'wav')
    img_in_path = os.path.join(in_path, split, 'img')
    wav_out_path = os.path.join(out_path, split, 'wav')
    img_out_path = os.path.join(out_path, split, 'img')
    spec_out_path = os.path.join(out_path, split, 'spec')

    # cv2.imwrite returns False instead of raising when the target directory
    # is missing, so make sure the output directories exist up front.
    for directory in (wav_out_path, img_out_path, spec_out_path):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    # NOTE(review): write_wav comes from librosa.output, which newer librosa
    # releases no longer ship - confirm the pinned librosa version.
    ln_cnt = 0
    with open(in_file, 'r') as tsv:
        next(tsv, None)  # skip the header line (no-op on an empty file)
        for line in tsv:
            ln_cnt += 1
            (file_id, img_file, box0, box1, box2, box3, wav_file,
             start, end, org_obj, mm_obj, score, _) = line.split('\t')
            box0, box1, box2, box3 = [
                int(float(value)) for value in (box0, box1, box2, box3)
            ]
            # start/end are divided by 1000 before being used as offset and
            # duration - presumably milliseconds to seconds; TODO confirm.
            start, end = float(start) / 1000, float(end) / 1000

            img_in_file = os.path.join(img_in_path, img_file)
            img_out_file = os.path.join(img_out_path, file_id+'.jpg')
            image = cv2.imread(img_in_file, cv2.IMREAD_COLOR)
            if image is None:
                # imread signals failure by returning None; fail with the
                # offending path rather than an opaque TypeError on slicing.
                raise IOError('Could not read image: {}'.format(img_in_file))
            img_box = image[box1:box3, box0:box2, :]
            cv2.imwrite(img_out_file, img_box)

            wav_in_file = os.path.join(wav_in_path, wav_file)
            wav_out_file = os.path.join(wav_out_path, file_id+'.wav')
            wav_slice, sample_rate = load(
                wav_in_file, sr=16000, offset=start, duration=end-start)
            write_wav(wav_out_file, wav_slice, sample_rate)

            spec_out_file = os.path.join(spec_out_path, file_id+'.png')
            generate_spectrogram(
                wav_slice, sample_rate, start, end, spec_out_file)
            print(ln_cnt)
    return ln_cnt
# Driver: run the extraction once per dataset split. The input and output
# roots are the same for every split, so they are set once before the loop.
in_path = 'small_dataset'
out_path = 'small_dataset_objects'
for split in ('train', 'test', 'val'):
    in_file = 'small_dataset_objects/{}_split.tsv'.format(split)
    process_split(in_file, in_path, out_path, split)
