In [None]:
# Mount Google Drive inside the Colab VM; later cells read and write
# files under '/content/gdrive/My Drive'.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
import os
import numpy as np
import librosa
%matplotlib inline  
import matplotlib.pyplot as plt
import librosa.display
import pickle 
import re #regular expression
import tensorflow as tf
from functools import reduce
def raw_to_mel(audio, sampling_rate, window_size, hop_length, n_freqs):
    """Convert a raw waveform into a log-scaled mel spectrogram.

    Parameters:
        audio: 1D numpy array holding the waveform.
        sampling_rate: Sampling rate of the waveform.
        window_size: Passed to the STFT as n_fft.
        hop_length: Step between successive STFT windows.
        n_freqs: Number of mel bins to produce.

    Returns:
        Natural-log mel power spectrogram, bins x time.

    """
    stft_matrix = librosa.stft(audio, n_fft=window_size,
                               hop_length=hop_length, center=True)
    # Squared magnitude = power spectrum, which is what gets mel-filtered.
    power_spectrum = np.abs(stft_matrix)**2
    mel_power = librosa.feature.melspectrogram(S=power_spectrum,
                                               sr=sampling_rate,
                                               n_mels=n_freqs)
    # The small offset keeps the log finite where a bin is exactly zero.
    return np.log(mel_power + 1e-11)

def raw_to_pcen(audio, sampling_rate, window_size, hop_length, n_freqs):
    """Convert a raw waveform into a PCEN-normalised mel spectrogram.

    The PCEN settings (time_constant=0.285, everything else left at the
    librosa defaults) have not been tuned and might not be optimal.

    Parameters:
        audio: 1D numpy array holding the waveform.
        sampling_rate: Sampling rate of the waveform.
        window_size: Passed to the STFT as n_fft.
        hop_length: Step between successive STFT windows.
        n_freqs: Number of mel bins to produce.

    Returns:
        PCEN spectrogram, bins x time.

    """
    stft_matrix = librosa.stft(audio, n_fft=window_size,
                               hop_length=hop_length, center=True)
    # Magnitude (not power) spectrum is fed to the mel filterbank here.
    magnitude = np.abs(stft_matrix)
    mel_magnitude = librosa.feature.melspectrogram(S=magnitude,
                                                   sr=sampling_rate,
                                                   n_mels=n_freqs)
    return librosa.pcen(mel_magnitude, sr=sampling_rate,
                        hop_length=hop_length, time_constant=0.285)

def chs_to_inds(char_list, mapping):
    """Translate a sequence of characters into their vocabulary indices.

    Parameters:
        char_list: List of characters (or a string).
        mapping: Dict mapping characters to indices.

    Returns:
        List of indices, one per input character, in input order.

    Raises:
        KeyError: If a character is not a key of mapping.
    """
    indices = []
    for symbol in char_list:
        indices.append(mapping[symbol])
    return indices
  
  
def trim_audio(audio, sr, start, end):
    """Cut the segment between two time stamps out of an audio signal.

    Parameters:
        audio: 1D sequence of samples (numpy array or list).
        sr: Sampling rate of audio, in samples per second.
        start: Segment start, in seconds (float).
        end: Segment end, in seconds (float).

    Returns:
        The samples from round(start*sr) up to (not including)
        round(end*sr), or None if the segment is out of boundaries,
        i.e. it starts before the first sample or ends after the last one.

    Raises:
        AssertionError: If start is not smaller than end.
    """
    assert start < end, "End time step should be bigger then start time step."
    first = int(round(start * sr))
    last = int(round(end * sr))
    # Without this check a negative start would wrap around to the end of
    # the signal and an end past the signal would be silently truncated,
    # so the caller would get a segment that does not match the request.
    if first < 0 or last > len(audio):
        return None
    return audio[first:last]

def mel_plot(S):
    """Display a spectrogram in decibels on a 10x4 inch figure.

    S is converted with librosa.amplitude_to_db relative to its maximum
    and drawn with a log-frequency y axis and a time x axis.

    NOTE(review): the figure is titled 'Mel spectrogram' and the original
    comment spoke of power, yet the conversion is amplitude_to_db and the
    axis is 'log' rather than 'mel' -- confirm what kind of input is
    expected here.

    Parameters:
        S: 2D spectrogram array to plot.

    Returns:
        Whatever plt.show() returns.
    """
    plt.figure(figsize=(10,4))
    decibels = librosa.amplitude_to_db(S, ref=np.max)
    librosa.display.specshow(decibels, y_axis='log', x_axis='time')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')
    plt.tight_layout()
    shown = plt.show()
    return shown

def audio_stack(audio_dir, audio_path, trim=True, trim_seconds=2.0):
    '''
    Load every audio file of a folder with librosa's default settings.

    Each file is decoded once. If trim is true, trim_seconds of audio are
    cut from the beginning and from the end of every recording by slicing
    the decoded samples (a recording shorter than 2 * trim_seconds becomes
    an empty array instead of raising).

    Parameters:
        audio_dir: Iterable of file names inside audio_path.
        audio_path: Directory that contains the files.
        trim: Whether to drop the leading and trailing margin.
        trim_seconds: Length of each margin in seconds (default 2.0).

    Returns:
        List of audio time series
        Sampling rate of the last loaded file (None if audio_dir is empty)
        List of audio durations in seconds (after trimming)
    '''
    audio_list = []  # audio time-series
    t_list = []  # time duration
    sr = None  # stays None when there is nothing to load
    for audio in audio_dir:
        path = os.path.join(audio_path, audio)
        y, sr = librosa.load(path)
        if trim:
            margin = int(round(trim_seconds * sr))
            # max() keeps the slice well-formed for very short recordings.
            y = y[margin:max(len(y) - margin, margin)]
        audio_list.append(y)
        t_list.append(librosa.get_duration(y=y, sr=sr))

    if sr is not None:
        # librosa.load resamples to its default rate, so sr is the same
        # for all files.
        print('Sample rate: {} Hz'.format(sr))
    return audio_list, sr, t_list
  
# Folder on the mounted Drive that holds the training recordings.
train_audio_path = "/content/gdrive/My Drive/DAMP"

# os.listdir returns names in arbitrary order; sort them so the order of
# the loaded recordings is the same on every run.
audio_dir_train = os.listdir(train_audio_path)
audio_dir_train = sorted(audio_dir_train)

audio_list_train, sr_train, t_list_train = audio_stack(
    audio_dir_train,
    train_audio_path,
)



def remove_header(read_txt, write_txt):
    '''
    Remove header from lyrics transcription.

    Every input line is split at its first space: the part before it is
    the header, the part after it is the lyrics. Only the lyrics are
    written out, one output line per input line. A line that contains no
    space has no lyrics part and is written as an empty line, so the
    line numbering of the output still matches the input.

    Parameters:
        read_txt: Lyrics transcription text file that contains header
        write_txt: New txt file with lyrics without header

    Returns:
        None
    '''
    # Context managers guarantee both files are closed (and the output
    # flushed) before the function returns, so the result can be read
    # back immediately.
    with open(read_txt, 'r') as source, open(write_txt, 'w') as target:
        for raw_line in source:
            _header, separator, lyrics = raw_line.partition(' ')
            if not separator:
                # Header-only or blank line: keep the line slot, drop the text.
                lyrics = '\n' if raw_line.endswith('\n') else ''
            target.write(lyrics)

    return None


def map_text2char(read_txt, vocab_path='damp_vocab'):
    '''
    Build the character vocabulary of a text file and pickle it.

    Every run of characters other than A-Z, a-z and the apostrophe
    (newlines included) is replaced by a single space and the text is
    lower-cased. The distinct remaining characters are sorted and numbered
    from 3 upwards, so the same text always yields the same indices.
    Indices 0, 1 and 2 are reserved for "<PAD>", "<S>" and "</S>".

    Parameters:
        read_txt: Txt file to be input
        vocab_path: File the mapping is pickled to (default 'damp_vocab')

    Returns:
        Dict mapping each character / special token to its index. For an
        empty file it contains only the three special tokens.
    '''
    chars = set()
    with open(read_txt) as text:
        for line in text:
            chars.update(re.sub('[^A-Za-z\']+', ' ', line).lower())

    # Build the mapping once, after all lines were seen. Sorting makes the
    # numbering independent of set iteration order, which can differ
    # between interpreter sessions.
    mapping = {ch: idx for idx, ch in enumerate(sorted(chars), start=3)}
    mapping["<PAD>"] = 0
    mapping["<S>"] = 1
    mapping["</S>"] = 2

    # save vocab
    with open(vocab_path, 'wb') as fp:
        pickle.dump(mapping, fp)

    return mapping

def enumerate_lines(file):
    """Read a text file and return its lines as a list.

    Each element is one line of the file, with its trailing newline
    (if any) kept.
    """
    with open(file) as handle:
        lines = handle.readlines()
    return lines

Sample rate: 22050 Hz


In [None]:
# Strip the headers from the raw transcription, then load the lyric lines
# and build the character vocabulary from the header-free file.
train_remove_header = remove_header('/content/gdrive/My Drive/train.txt','DAMP_train_lyrics.txt')
train_enum = enumerate_lines('DAMP_train_lyrics.txt')
train_mapping = map_text2char('DAMP_train_lyrics.txt')
print(train_mapping)
print(train_enum[0])


# Preview the first (up to) 10 examples: cleaned lyric line + spectrogram shape.
for i in range(min(10, len(audio_list_train), len(train_enum))):
    logmel = raw_to_mel(audio_list_train[i], sr_train, window_size=2048, hop_length=512, n_freqs=128)
    # train_enum[i] is ONE lyric line (a string); clean and encode it as a
    # whole. Iterating over it would walk through single characters.
    # The pattern is the same one map_text2char uses, so every character
    # that survives cleaning is guaranteed to be in train_mapping.
    line = re.sub('[^A-Za-z\']+', ' ', train_enum[i]).lower()
    mapped = [train_mapping["<S>"]] + chs_to_inds(line, train_mapping) + [train_mapping["</S>"]]
    print(line,end='')
    print(logmel.shape)

{'t': 3, 'd': 4, 'c': 5, 'z': 6, 'l': 7, 'r': 8, 'k': 9, 'u': 10, 'j': 11, 'e': 12, 'g': 13, 'w': 14, 'm': 15, 'f': 16, "'": 17, 'n': 18, 'x': 19, 'v': 20, 'o': 21, 'q': 22, 'b': 23, 'i': 24, 's': 25, ' ': 26, 'p': 27, 'h': 28, 'a': 29, 'y': 30, '<PAD>': 0, '<S>': 1, '</S>': 2}
IT'S BIGGER IT'S BIGGER THAN YOU AND YOU ARE NOT ME 

it's bigger it's bigger than you and you are not me  (128, 339)
the lengths that i will go to the distance in your eyes oh no  (128, 464)
set it up that's me in the corner that's me in the spotlight  (128, 558)
spotlight losing my religion trying to keep a view and i don't know if i can do it  (128, 352)
much i haven't said enough i thought that i heard you laughing  (128, 361)
i thought that i heard you sing i think i thought i saw you try  (128, 525)
every whisper of every waking hour i'm choosing my confessions  (128, 394)
brought me to my knees failed what if all these fantasies come flailing around  (128, 408)
now i've said too much i thought that i hear

In [None]:
def create_tfrecords(out_filename, enum_list, audio_list, path_to_save='',
                     sampling_rate=None, mapping=None):
    """Process audio and lyric lines into a TFRecords data file.

    For every index i one tf.train.Example is written, containing
        "seq": the character indices of lyric line enum_list[i], wrapped
               in the <S> / </S> indices,
        "mel": the flattened log-mel spectrogram of audio_list[i],
        "mel_shape": the shape of that spectrogram before flattening.

    Parameters:
        out_filename: Name of the output file, without the ".tfrecords"
            extension.
        enum_list: List of lyric lines (strings); entry i belongs to
            audio_list[i].
        audio_list: List of 1D audio arrays (test/train).
        path_to_save: Directory to write to; it is created if missing.
            The current working directory is used when empty.
        sampling_rate: Sampling rate of the audio. Defaults to the
            notebook-level sr_train.
        mapping: Dict mapping characters and the "<S>" / "</S>" tokens to
            indices. Defaults to the notebook-level train_mapping.
    """
    if sampling_rate is None:
        sampling_rate = sr_train
    if mapping is None:
        mapping = train_mapping

    if path_to_save:
        print("tfrecords will be saved here:", path_to_save)
        if not os.path.exists(path_to_save):
            os.makedirs(path_to_save)
        target_path = path_to_save
    else:
        print("Current working directory is used to save tfrecords:", os.getcwd())
        target_path = os.getcwd()

    record_path = os.path.join(target_path, out_filename + ".tfrecords")
    with tf.python_io.TFRecordWriter(record_path) as writer:
        for i in range(len(audio_list)):
            logmel = raw_to_mel(audio_list[i], sampling_rate, window_size = 2048, hop_length = 512, n_freqs = 128)

            # enum_list[i] is a single lyric line: encode it as a whole.
            # (Looping over it would visit single characters and keep only
            # the encoding of the last one.) Digits are replaced as well,
            # matching the cleaning map_text2char applies when it builds
            # the vocabulary, so the lookup cannot hit an unknown character.
            line = re.sub('[^A-Za-z\']+', ' ', enum_list[i]).lower()
            mapped_seq = [mapping["<S>"]] + chs_to_inds(line, mapping) + [mapping["</S>"]]

            # FloatList needs a flat sequence; the shape is stored next to
            # it so the reader can restore the 2D layout.
            flatten_logmel = logmel.reshape(-1)

            tfex = tf.train.Example(features=tf.train.Features(feature={
                "seq": tf.train.Feature(int64_list=tf.train.Int64List(value=mapped_seq)),
                "mel": tf.train.Feature(float_list=tf.train.FloatList(value=flatten_logmel)),
                "mel_shape": tf.train.Feature(int64_list=tf.train.Int64List(value=logmel.shape))

            }))
            writer.write(tfex.SerializeToString())
    print("Saved to: {}".format(record_path))

In [None]:
def parse_seq(example_proto):
    """
    Decode one serialized example of the stored .tfrecords data -- import
    this in your training script.

    "mel" is stored flat and is reshaped here using the 2-element
    "mel_shape" feature stored with it.

    Parameters:
        example_proto: Protocol buffer of single example.
    Return:
        Tuple of Tensors: the logmel spectrogram (restored to its stored
        shape) and the parsed sequence.
    """
    feature_spec = {
        "seq": tf.VarLenFeature(tf.int64),
        "mel": tf.VarLenFeature(tf.float32),
        "mel_shape": tf.FixedLenFeature([2], tf.int64),
    }
    parsed = tf.parse_single_example(example_proto, feature_spec)

    def _densify(sparse):
        # VarLenFeature yields a SparseTensor; turn it into a dense one.
        return tf.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values)

    mel = tf.reshape(_densify(parsed["mel"]), parsed["mel_shape"])
    seq = _densify(parsed["seq"])
    return mel, seq

In [None]:
# Write the training set to "train.tfrecords" on the mounted Drive.
create_tfrecords(
    out_filename='train',
    enum_list=train_enum,
    audio_list=audio_list_train,
    path_to_save="/content/gdrive/My Drive",
)

tfrecords will be saved here: /content/gdrive/My Drive
Saved to: /content/gdrive/My Drive/train.tfrecords


In [None]:

# Read the written records back and look at one padded batch.
out_path_training = "/content/gdrive/My Drive/train"
data = (
    tf.data.TFRecordDataset(out_path_training + ".tfrecords")
    .map(parse_seq)
    # Batches of 4; the time axis of the spectrograms and the sequences
    # are padded to the longest item of the batch (128 rows are fixed).
    .padded_batch(4, padded_shapes=([128, None],[None]))
)

iterator = data.make_initializable_iterator()
next_batch = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)
    for i in range(1):
        example = sess.run([next_batch])
        print(example)
        mel_batch, seq_batch = example[0]
        print("Mel spectr. - {}".format(mel_batch.shape), "Sequence - {}".format(seq_batch.shape))

[(array([[[ -4.8699765,  -4.2327137,  -4.8743734, ...,   0.       ,
           0.       ,   0.       ],
        [ -3.7408812,  -3.2661116,  -3.6742923, ...,   0.       ,
           0.       ,   0.       ],
        [ -1.6931499,  -2.327741 ,  -2.8682632, ...,   0.       ,
           0.       ,   0.       ],
        ...,
        [-14.530863 , -15.9171   , -25.328186 , ...,   0.       ,
           0.       ,   0.       ],
        [-14.555361 , -15.941596 , -25.32828  , ...,   0.       ,
           0.       ,   0.       ],
        [-14.570981 , -15.957215 , -25.328272 , ...,   0.       ,
           0.       ,   0.       ]],

       [[ -4.7134166,  -5.2503133,  -4.632083 , ...,   0.       ,
           0.       ,   0.       ],
        [ -3.0904372,  -3.3358045,  -2.8856297, ...,   0.       ,
           0.       ,   0.       ],
        [ -3.5631924,  -3.182612 ,  -3.3891728, ...,   0.       ,
           0.       ,   0.       ],
        ...,
        [-23.194237 , -24.276678 , -25.32822  , ...,

In [None]:
# Number of training recordings that audio_stack loaded.
print(len(audio_list_train))

47


In [None]:
def convert_to_numpy(input, output_path,npy_name):
    """Save data as a .npy file.

    Parameters:
        input: Data to store; it is converted with np.asarray first.
        output_path: Directory the file is written to.
        npy_name: File name without the '.npy' extension.
    """
    as_array = np.asarray(input)
    destination = os.path.join(output_path, npy_name) + '.npy'
    np.save(destination, as_array)
    



(47,)


"\n    os.mkdir(output_path)  \n    new_img_list = save(img_list, img_affine, output_path, label)\n    images = three_to_two(path = output_path + '/*')\n    \n    if label=='FALSE':\n        img = min_max_norm(images)\n        np.save(npy_name+'.npy', img)\n    else:\n        img_lbl = label_outliers(images)\n        np.save(npy_name+'.npy', img_lbl)\n"