In [None]:
# % pylab inline
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import glob 
import matplotlib.pyplot as plt
import shutil

This function plots the waveform of an audio file. It expects the file's name (including its path) as its argument.

In [None]:
def showSpectrogram(filename):
    """Plot the time-domain waveform of an audio file.

    Despite its name, this draws a waveform (amplitude over time), not a
    spectrogram.

    Parameters
    ----------
    filename : str
        Path of the audio file; passed unchanged to ``librosa.load``.
    """
    data, sampling_rate = librosa.load(filename)
    plt.figure(figsize=(12, 4))
    # librosa 0.10 removed ``display.waveplot`` in favour of
    # ``display.waveshow``. Prefer ``waveplot`` when it exists so older
    # installations keep rendering exactly as before, and fall back to
    # ``waveshow`` on newer ones instead of raising AttributeError.
    plot_waveform = getattr(librosa.display, 'waveplot', None)
    if plot_waveform is None:
        plot_waveform = librosa.display.waveshow
    plot_waveform(data, sr=sampling_rate)

In [None]:
# Example: plot the waveform of a single clip (path is relative to the
# current working directory).
showSpectrogram('./Audio_Data/Chinese/clips/common_voice_zh-CN_18659733.mp3')

Below are 2 functions, create_spectrogram & create_mfcc_spectrogram.

These functions create an image from an audio file (specified by the argument filename, which should be the path of the audio file including its name) and store it under the name and path given by the argument newFilePath, which should be the path of the image including its name.

We ended up using only the simple create_spectrogram function, which creates Mel Spectrogram images. We also tried training the model on MFCC feature images of the sound, produced by the create_mfcc_spectrogram function, but the results weren't as good. The code in the main CNN_model notebook uses the files produced by the create_spectrogram function as used below.

In [None]:
def create_spectrogram(filename,newFilePath):
    """Render the Mel spectrogram of an audio file and save it as an image.

    The image has no axes, ticks or frame, so only the spectrogram itself
    ends up in the saved file.

    Parameters
    ----------
    filename : str
        Path of the audio file to read.
    newFilePath : str
        Path (including file name and extension) the image is written to.
    """
    # Switch matplotlib's interactive mode off (this is a global setting).
    plt.interactive(False)
    # sr=None is passed so librosa does not resample to its default rate.
    clip, sample_rate = librosa.load(filename, sr=None)
    fig = plt.figure(figsize=[0.72, 0.72])
    try:
        ax = fig.add_subplot(111)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        ax.set_frame_on(False)
        S = librosa.feature.melspectrogram(y=clip, sr=sample_rate)
        # Convert power to decibels relative to the loudest bin, then draw
        # on the axes created above rather than on the implicit current axes.
        librosa.display.specshow(librosa.power_to_db(S, ref=np.max), ax=ax)
        fig.savefig(newFilePath, dpi=400, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when feature extraction or saving
        # fails; this function is called in loops over thousands of clips.
        # Only this figure is closed, so unrelated open figures survive.
        plt.close(fig)
    
def create_mfcc_spectrogram(filename,newFilePath):
    """Render the MFCC features of an audio file and save them as an image.

    The image has no axes, ticks or frame, so only the feature map itself
    ends up in the saved file.

    Parameters
    ----------
    filename : str
        Path of the audio file to read.
    newFilePath : str
        Path (including file name and extension) the image is written to.
    """
    # Switch matplotlib's interactive mode off (this is a global setting).
    plt.interactive(False)
    # sr=None is passed so librosa does not resample to its default rate.
    clip, sample_rate = librosa.load(filename, sr=None)
    fig = plt.figure(figsize=[0.72, 0.72])
    try:
        ax = fig.add_subplot(111)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        ax.set_frame_on(False)
        S = librosa.feature.mfcc(y=clip, sr=sample_rate, n_mfcc=40)
        # NOTE(review): power_to_db is meant for non-negative power
        # spectrograms, while MFCCs can be negative; negative coefficients
        # are presumably clamped here. Kept as-is so the generated images
        # do not change - confirm whether plotting S directly was intended.
        librosa.display.specshow(librosa.power_to_db(S, ref=np.max), ax=ax)
        fig.savefig(newFilePath, dpi=400, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when feature extraction or saving
        # fails; this function is called in loops over thousands of clips.
        # Only this figure is closed, so unrelated open figures survive.
        plt.close(fig)

Just a quick test of these functions.

In [None]:
# Smoke test: render the same clip with both helpers and write the resulting
# images to the current directory.
create_mfcc_spectrogram('./Audio_Data/Chinese/clips/common_voice_zh-CN_18659734.mp3', './test1.jpg')
create_spectrogram('./Audio_Data/Chinese/clips/common_voice_zh-CN_18659734.mp3', './test2.jpg') # Mel spectrogram image

These are the 5 languages (the uncommented list) that we ended up using to train the final model. To transform the data of more languages into images, add those languages to this array.

In [None]:
# Wider language list (labelled "test cases" by the original author), kept
# commented out for reference:
# Languages = [
#     'French',
#     'German',
#     'Dutch',
#     'Russian',
#     'Spanish',
#     'Italian',
#     'Turkish',
#     'Persian',
#     'Swedish',
#     'Mongolian',
#     'Chinese'
#     ]

# Languages whose clips are converted to images by the cells below.
Languages = ['Turkish', 'Dutch', 'Swedish', 'Mongolian', 'Persian']

This block can be used to get Mel Spectrogram feature images for the samples in the specified range (currently 3500 to 5000) of each language listed above in the Languages array.

In [None]:
# Build the Mel-spectrogram images for every language: the clips at positions
# 3500-4999 of the directory listing are written as JPEGs to
# ./Image_Data/<language>/.
for language in Languages:
    Image_data_path = f'./Image_Data/{language}'
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(Image_data_path, exist_ok=True)

    clips_path = f'./Audio_Data/{language}/clips'
    # NOTE: os.listdir returns entries in arbitrary order. The order is left
    # untouched on purpose so this cell selects from the same listing as the
    # other cells that index into it.
    files = os.listdir(clips_path)
    # Slicing instead of indexing: a language with fewer clips yields a
    # shorter selection rather than an IndexError halfway through the run.
    selected = files[3500:5000]
    if len(selected) < 1500:
        print(f'Warning: {language} has only {len(selected)} of the 1500 requested clips')
    for clip_name in selected:
        # splitext drops the extension whatever its length.
        image_name = os.path.splitext(clip_name)[0]
        create_spectrogram(f'{clips_path}/{clip_name}', f'{Image_data_path}/{image_name}.jpg')

    print(f'{language} done')

In [None]:
# Mount Google Drive at /content/drive. This needs the google.colab package,
# i.e. a Colab runtime.
# NOTE(review): the other cells read ./Audio_Data relative to the working
# directory, and this mount sits after the first conversion loop - confirm
# the intended cell order and data location.
from google.colab import drive
drive.mount('/content/drive')

This block can be used to get the test Mel Spectrogram feature images, using 375 audio samples taken after the first 1500 (listing indices 1501–1875) of each language specified above in the Languages array.

In [None]:
# Build the Mel-spectrogram test images for every language: the 375 clips at
# positions 1501-1875 of the directory listing are written as JPEGs to
# ./Image_Data_Test/<language>/.
for language in Languages:
    Image_data_path = f'./Image_Data_Test/{language}'
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(Image_data_path, exist_ok=True)

    clips_path = f'./Audio_Data/{language}/clips'
    # NOTE: os.listdir returns entries in arbitrary order. The order is left
    # untouched on purpose so this cell selects from the same listing as the
    # other cells that index into it.
    files = os.listdir(clips_path)
    # Slicing instead of indexing: a language with fewer clips yields a
    # shorter selection rather than an IndexError halfway through the run.
    selected = files[1501:1876]
    if len(selected) < 375:
        print(f'Warning: {language} has only {len(selected)} of the 375 requested clips')
    for clip_name in selected:
        # splitext drops the extension whatever its length.
        image_name = os.path.splitext(clip_name)[0]
        create_spectrogram(f'{clips_path}/{clip_name}', f'{Image_data_path}/{image_name}.jpg')

    print(f'{language} done')

This block can be used to get MFCC feature images for the first 1500 samples of each language specified above in the Languages array.

In [None]:
# Build the MFCC images for every language: the first 1500 clips of the
# directory listing are written as JPEGs to ./Image_Data_MFCC/<language>/.
for language in Languages:
    Image_data_path = f'./Image_Data_MFCC/{language}'
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(Image_data_path, exist_ok=True)

    clips_path = f'./Audio_Data/{language}/clips'
    # NOTE: os.listdir returns entries in arbitrary order. The order is left
    # untouched on purpose so this cell selects from the same listing as the
    # other cells that index into it.
    files = os.listdir(clips_path)
    # Slicing instead of indexing: a language with fewer clips yields a
    # shorter selection rather than an IndexError halfway through the run.
    selected = files[:1500]
    if len(selected) < 1500:
        print(f'Warning: {language} has only {len(selected)} of the 1500 requested clips')
    for clip_name in selected:
        # splitext drops the extension whatever its length.
        image_name = os.path.splitext(clip_name)[0]
        create_mfcc_spectrogram(f'{clips_path}/{clip_name}', f'{Image_data_path}/{image_name}.jpg')

    print(f'{language} done')

This block can be used to get the test MFCC feature images, using 375 audio samples taken after the first 1500 (listing indices 1501–1875) of each language specified above in the Languages array.

In [None]:
# Build the MFCC test images for every language: the 375 clips at positions
# 1501-1875 of the directory listing are written as JPEGs to
# ./Image_Data_MFCC_Test/<language>/.
for language in Languages:
    Image_data_path = f'./Image_Data_MFCC_Test/{language}'
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(Image_data_path, exist_ok=True)

    clips_path = f'./Audio_Data/{language}/clips'
    # NOTE: os.listdir returns entries in arbitrary order. The order is left
    # untouched on purpose so this cell selects from the same listing as the
    # other cells that index into it.
    files = os.listdir(clips_path)
    # Slicing instead of indexing: a language with fewer clips yields a
    # shorter selection rather than an IndexError halfway through the run.
    selected = files[1501:1876]
    if len(selected) < 375:
        print(f'Warning: {language} has only {len(selected)} of the 375 requested clips')
    for clip_name in selected:
        # splitext drops the extension whatever its length.
        image_name = os.path.splitext(clip_name)[0]
        # Fix: this previously called create_spectrogram, which filled the
        # MFCC test folder with Mel-spectrogram images.
        create_mfcc_spectrogram(f'{clips_path}/{clip_name}', f'{Image_data_path}/{image_name}.jpg')

    print(f'{language} done')

This was used to get the average length of the audio files for each language. It was calculated using only the first 375 files, because for bigger numbers it starts to take quite a bit of time.

In [None]:
# Report the total and average duration of up to the first 375 clips in each
# language's directory listing.
for language in Languages:
    clips_path = f'./Audio_Data/{language}/clips'
    files = os.listdir(clips_path)
    # Slicing keeps this working when a language has fewer than 375 clips.
    sample = files[:375]
    totalLength = 0

    for clip_name in sample:
        # NOTE(review): newer librosa releases appear to rename the
        # `filename` keyword to `path` - confirm against the installed version.
        totalLength += librosa.get_duration(filename=f'{clips_path}/{clip_name}')

    # Average over the clips actually read; guard against an empty directory.
    avgLength = totalLength / len(sample) if sample else 0
    print(f'{language} done with totalLength: {totalLength} & avgLength: {avgLength}')