In [None]:
"""
To replicate the idea from this paper, https://arxiv.org/pdf/1905.04348.pdf:
    Multiclass Language Identification using Deep Learning on Spectral Images of Audio Signals
Found this dataset:
    Kaggle Spoken Language Identification (https://www.kaggle.com/toponowicz/spoken-language-identification)
Borrowed some code from our Community Git to convert audio to spectrograms:
    https://github.com/datarobot-community/tutorials-for-data-scientists/blob/master/VisualAI/Python/VisualAI%20Heartbeats/heartbeat_visual_AI.ipynb

Steve Cultrera, 1/7/2022

"""

In [3]:
import librosa

In [86]:
import os
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import csv
from random import random
import shutil

In [27]:
class AudioConf:
    """Preprocessing settings shared by every audio-to-spectrogram conversion.

    The attributes are read by ``raw_audio_frames_to_logmelspectrogram`` and
    passed straight to ``librosa.feature.melspectrogram``.
    """
    # Sample rate (Hz) the mel filter bank is built for.  It must equal the
    # rate of the samples actually handed to librosa: ``librosa.load`` resamples
    # to 22050 Hz unless told otherwise, so the old value of 44100 described
    # the audio incorrectly and skewed the mel frequency axis.
    sampling_rate = 22050
    # 'hann' is SciPy's canonical name for this window; the legacy 'hanning'
    # alias is not guaranteed to be accepted by every SciPy release.
    window_function = 'hann'
    # Hop size as a fraction of the window: a smaller value means more overlap
    # between successive windows and therefore more frames.
    hop_length_percentage = 0.25
    window_length = 2048  # length of the FFT window, in samples
    # Number of samples between successive frames (512 with the values above).
    hop_length = int(window_length * hop_length_percentage)
        

def scale_minmax(X, min=0.0, max=255.0):
    """Linearly rescale ``X`` into ``[min, max]`` and cast to ``uint8``.

    Used to turn dB values into image pixel values from 0 to 255.

    Parameters
    ----------
    X : array-like
        Values to rescale.
    min, max : float
        Lower and upper bound of the output range (the names shadow the
        built-ins but are kept because callers may pass them by keyword).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X`` and dtype ``uint8``; fractional
        results are truncated by the cast.  A constant input (for example a
        silent clip) maps every element to ``min`` instead of dividing by zero.
    """
    X = np.asarray(X)
    lowest = X.min()
    value_range = X.max() - lowest
    if value_range == 0:
        # No contrast to stretch: avoid 0/0 -> NaN, whose uint8 cast is undefined.
        return np.full(X.shape, min).astype(np.uint8)
    X_std = (X - lowest) / value_range
    X_scaled = X_std * (max - min) + min
    return X_scaled.astype(np.uint8)


def raw_audio_frames_to_logmelspectrogram(raw_audio, conf):
    """Convert raw audio samples into an RGB log-mel-spectrogram image array.

    Parameters
    ----------
    raw_audio : numpy.ndarray
        Audio samples, as returned by ``librosa.load``.
    conf : object
        Settings holder exposing ``sampling_rate``, ``hop_length``,
        ``window_length`` and ``window_function`` (e.g. ``AudioConf``).

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with a trailing axis of 3 (RGB), coloured with the
        'magma' colormap, suitable for ``PIL.Image.fromarray``.
    """
    # The audio is passed as ``y=``: newer librosa releases make the arguments
    # of melspectrogram keyword-only and reject a positional signal.
    melspectrogram = librosa.feature.melspectrogram(
        y=raw_audio,
        sr=conf.sampling_rate,
        hop_length=conf.hop_length,
        n_fft=conf.window_length,
        window=conf.window_function,
    )

    # Power -> dB, then stretch to 0..255; scale_minmax already returns uint8.
    logmelspectrogram = scale_minmax(librosa.power_to_db(melspectrogram))

    # Calling the colormap on an integer array looks each value up directly in
    # the colour table and yields RGBA floats in [0, 1].
    rgba_logmelspectrogram = plt.get_cmap('magma')(logmelspectrogram)

    # Drop the alpha channel and convert back to 8-bit pixel values.
    rgb_logmelspectrogram = (rgba_logmelspectrogram[..., :3] * 255).astype('uint8')
    return rgb_logmelspectrogram

## Just some tests to get one file working

In [13]:
# Root folder holding the extracted Kaggle archive and the generated output.
# Set the SLI_PROJECT_DIR environment variable to run the notebook on another
# machine; the original author's path stays as the default.
project_dir = os.environ.get(
    "SLI_PROJECT_DIR",
    "/Users/steve.cultrera/Documents/kaggle_spoken_language_identification",
)

# A single training clip used to try the conversion before the batch run.
one_file = os.path.join(project_dir, "archive/train/train", "de_f_5d2e7f30d69f2d1d86fd05f3bbe120c2.fragment1.flac")



In [14]:
# Load at the rate the spectrogram settings assume, so the samples and the mel
# filter bank built from AudioConf.sampling_rate always agree.
y, sr = librosa.load(one_file, sr=AudioConf.sampling_rate)

In [15]:
sr

22050

In [29]:
# Convert the loaded samples into an RGB spectrogram array using the shared settings.
logmelspectrogram = raw_audio_frames_to_logmelspectrogram(y, AudioConf)

In [30]:
# Wrap the uint8 RGB array in a PIL image so it can be inspected and saved.
pillow_image = Image.fromarray(logmelspectrogram)

In [31]:
# Sanity-check output; the relative path is resolved against the current working directory.
pillow_image.save("test.png", "PNG")

In [61]:
pillow_image.size

(431, 128)

In [63]:
# Pixel-count comparison: a 224x224 image vs. the 431x128 spectrogram shown above.
# The spectrograms aren't that much bigger than "max"
# (presumably a 224x224 maximum/expected image size downstream -- TODO confirm).
print(224*224, 431*128)

50176 55168


In [65]:
print(random())

0.37984711672670657


## Process an entire directory

In [82]:
#-- Toggled between these two configurations (train vs. test)...
#-- NOTE: sample_rate here is the fraction of files to convert (compared against
#-- random() in the conversion loop), not an audio sampling rate.

#-- all the files from train came to ~7GB, so let's halve that:
#input_DIR = os.path.join(project_dir, "archive/train/train")
#sample_rate = 0.5

input_DIR = os.path.join(project_dir, "archive/test/test")
sample_rate = 1

In [83]:
# CSV index of the generated images; it sits next to the "image" folder inside
# kaggle_sli_data, the directory that is zipped at the end of the notebook.
output_csv = os.path.join(project_dir, "kaggle_sli_data", "kaggle_data.csv")

In [84]:
input_DIR

'/Users/steve.cultrera/Documents/kaggle_spoken_language_identification/archive/test/test'

In [85]:
# Convert every sampled .flac file in input_DIR into a spectrogram PNG and
# record one CSV row per image (language, gender, file id, relative image path).

# Make sure the output folders exist before anything is written into them;
# this also creates the parent kaggle_sli_data folder that holds the CSV.
image_dir = os.path.join(project_dir, "kaggle_sli_data", "image")
os.makedirs(image_dir, exist_ok=True)

with open(output_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["language", "gender", "file_id", "image"])

    # os.listdir() returns entries in arbitrary order; sorting keeps the CSV
    # rows in a stable order from one run to the next.
    for filename in sorted(os.listdir(input_DIR)):

        # random() is in [0, 1), so sample_rate = 1 keeps every .flac file and
        # sample_rate = 0.5 keeps roughly half of them.
        if not filename.endswith(".flac") or random() > sample_rate:
            continue

        # File names look like "<language>_<gender>_<id>.flac", e.g. "de_f_....flac".
        language = filename[:2]
        gender = filename[3:4]
        # NOTE: file_id keeps the ".flac" suffix, so images are named "<id>.flac.png".
        file_id = filename[5:]

        #-- generate the image
        # Load at the rate the spectrogram settings assume so the samples and
        # the mel filter bank agree.
        y, sr = librosa.load(os.path.join(input_DIR, filename), sr=AudioConf.sampling_rate)
        logmelspectrogram = raw_audio_frames_to_logmelspectrogram(y, AudioConf)
        pillow_image = Image.fromarray(logmelspectrogram)
        output_image_filename = file_id + ".png"
        pillow_image.save(os.path.join(image_dir, output_image_filename))

        #-- finally, write the row to csv now that we have everything
        writer.writerow([language, gender, file_id, "image/" + output_image_filename])

In [88]:
#-- make zipped file which can be imported into platform
# The archive name is anchored to project_dir so the zip lands in the same
# place regardless of the kernel's current working directory.
shutil.make_archive(
    os.path.join(project_dir, "kaggle_sli_data_TEST"),
    'zip',
    os.path.join(project_dir, "kaggle_sli_data"),
)

'/Users/steve.cultrera/Documents/kaggle_spoken_language_identification/kaggle_sli_data_TEST.zip'