In [7]:
import os
import time
import tarfile

import numpy as np
import pandas as pd

import librosa
import librosa.display

import IPython.display as ipd

import matplotlib.pyplot as plt
#%matplotlib inline

In [8]:
def listen_clip(idx):
    """Print the transcription of clip ``idx`` and return a player for its audio.

    Uses the notebook-level ``clips_dir``, ``paths`` and ``sentences``.

    Parameters
    ----------
    idx
        Key used to index both ``paths`` and ``sentences``.

    Returns
    -------
    IPython.display.Audio
        Audio widget for the clip file; shown when it is the last expression of a cell.
    """
    clip_file = os.path.join(clips_dir, paths[idx])
    print(sentences[idx])
    return ipd.Audio(clip_file)

def clean_audio(audio, top_db=18, frame_length=4096, hop_length=2048, min_relative_energy=10):
    """Trim a signal to the span between its first and last high-energy intervals.

    The signal is split into intervals with ``librosa.effects.split``. When more
    than one interval is found, each interval's energy (sum of squared samples)
    is expressed as a percentage of the most energetic interval, and intervals
    that do not exceed ``min_relative_energy`` are ignored when choosing the cut
    points. Everything between the start of the first remaining interval and the
    end of the last one is kept, including anything in between.

    Parameters
    ----------
    audio : np.ndarray
        Audio samples, sliced along the first axis.
    top_db, frame_length, hop_length
        Forwarded unchanged to ``librosa.effects.split``.
    min_relative_energy : float
        Percentage of the largest interval energy an interval must exceed to be
        used as a cut boundary (10 percent was found heuristically).

    Returns
    -------
    np.ndarray
        ``audio[start:end]`` for the selected span. ``audio`` is returned
        unchanged when the split yields no interval at all.
    """
    intervals = librosa.effects.split(
        audio, top_db=top_db, frame_length=frame_length, hop_length=hop_length
    )
    no_intervals = len(intervals)

    if no_intervals == 0:
        # Nothing to anchor a cut on; leave the signal as it is.
        return audio

    # Default: span from the first interval to the last one. This is the whole
    # answer for a single interval and the fallback when no interval passes the
    # energy threshold.
    first_interval_idx, last_interval_idx = 0, no_intervals - 1

    if no_intervals > 1:
        energies = np.array(
            [np.sum(np.square(audio[start:end])) for start, end in intervals]
        )
        max_energy = energies.max()

        if max_energy > 0:  # avoid 0/0 when every interval is all zeros
            relative_energies = 100 * energies / max_energy
            high_energy_indices = np.flatnonzero(relative_energies > min_relative_energy)

            if high_energy_indices.size:
                first_interval_idx = high_energy_indices[0]
                last_interval_idx = high_energy_indices[-1]

    cut_start_idx = intervals[first_interval_idx, 0]
    cut_end_idx = intervals[last_interval_idx, 1]

    return audio[cut_start_idx:cut_end_idx]

In [30]:
# Dataset configuration.
# Exactly one dataset is active at a time; choose it here. Each branch defines
# the same family of names (dataset_name, dataset_dir, clips_dir, pickle_dir,
# df, sentences, codes, and paths where available) so a later cell never sees
# values left over from another dataset.
DATASET = "common_voice"  # one of: "common_voice", "ti20", "METUbet"

if DATASET == "common_voice":
    # Mozilla Turkish Dataset
    dataset_name = "cv-corpus-5.1-2020-06-22"

    # NOTE(review): the other datasets live under ../../Datasets and a recorded
    # output of this notebook shows ..\..\Datasets for this corpus as well;
    # confirm that ../data/Datasets is the intended location.
    dataset_dir = os.path.join("..","data","Datasets",dataset_name,"tr")

    clips_dir = os.path.join(dataset_dir,"clips")

    # Despite its name, pickle_dir is the path of the pickle *file*.
    pickle_dir = os.path.join(dataset_dir,"cv-corpus-5.1-2020-06-22_validated_simple_ordered.pkl")
    df = pd.read_pickle(pickle_dir)

    IDs = df["client_id"]
    paths = df["path"]
    sentences = df["sentence"]
    codes = df['encoded']

elif DATASET == "ti20":
    tt = "test"  # selects the ti20 sub-folder and the matching pickle file
    dataset_name = "ti20"

    dataset_dir = os.path.join("..","..","Datasets","ti20",tt)

    clips_dir = os.path.join(dataset_dir,"clean")

    pickle_dir = os.path.join(dataset_dir,"ti20_"+tt+"_coded.pkl")
    df = pd.read_pickle(pickle_dir)

    paths = df["path"]
    sentences = df["sentence"]
    codes = df['encoded']

elif DATASET == "METUbet":
    dataset_name = "METUbet"

    dataset_dir = os.path.join('..','..',"Datasets","METUbet","data")

    clips_dir = os.path.join(dataset_dir,'speech-text')

    pickle_dir = os.path.join(dataset_dir,'METUbet_encoded.pkl') # Original annotations
    df = pd.read_pickle(pickle_dir)

    # NOTE(review): no "path" column is read for METUbet, so cells that index
    # `paths` cannot be used with this dataset as-is — confirm how its clips
    # should be located.
    sentences = df["turkish_sentence"]
    codes = df['encoded']

else:
    raise ValueError("Unknown DATASET: {!r}".format(DATASET))

# Later cells use both spellings; keep them pointing at the same clips directory.
clip_dir = clips_dir

**Spectrogram Output Directories**

In [4]:
# One output directory per spectrogram type, all under <dataset_dir>/spectrograms.
spectrogram_root = os.path.join(dataset_dir, "spectrograms")

amp_spectrogram_dir = os.path.join(spectrogram_root, 'amplitude')
dB_spectrogram_dir = os.path.join(spectrogram_root, 'dB')
power_spectrogram_dir = os.path.join(spectrogram_root, 'power')

spectrogram_dirs = [amp_spectrogram_dir, dB_spectrogram_dir, power_spectrogram_dir]

# Create whichever of the directories do not exist yet.
for d in spectrogram_dirs:
    os.makedirs(d, exist_ok=True)

## Single File Read

In [None]:
# Print the transcription of one clip and show its audio player as the cell output.
idx = 100
listen_clip(idx)

In [None]:
# Load one clip at 16 kHz and plot its waveform.
idx = 100

# The file path gets its own name: `clip_dir` is the clips *directory* that the
# duration and export cells further down join file names onto, so it must not be
# overwritten with a single file's path here.
audio_path = os.path.join(clips_dir,paths[idx])
audio,sr = librosa.load(audio_path,sr=16000)

fig, ax = plt.subplots(figsize=(20,8))
# NOTE(review): newer librosa releases replace `waveplot` with `waveshow` —
# confirm against the installed version.
librosa.display.waveplot(audio, sr=sr, ax=ax)
ax.set(title="{} Waveform".format(paths[idx]))

In [None]:
def plot_waveform_with_cuts(audio,sr,start_cut_time,end_cut_time,idx):
    """Plot a clip's waveform and mark the start and end cut times.

    A new 10x5 figure is created, the waveform is drawn on it, and a green
    vertical line is drawn at each cut time, spanning plus/minus the largest
    absolute sample value. The title is built from the notebook-level
    ``paths[idx]``.

    Parameters
    ----------
    audio : np.ndarray
        Samples to plot.
    sr : int
        Sample rate passed to the waveform plot.
    start_cut_time, end_cut_time
        x positions of the two markers (same units as the plot's time axis).
    idx
        Key into ``paths`` used for the title.
    """
    fig, ax = plt.subplots(figsize=(10,5))
    librosa.display.waveplot(audio, sr=sr, ax=ax)
    ax.set(title="{} Waveform".format(paths[idx]))

    # Make the markers as tall as the loudest sample, in both directions.
    peak = np.max(np.abs(audio))

    for cut_time in (start_cut_time, end_cut_time):
        ax.vlines(cut_time, -peak, peak, colors='g')

# Compute Spectrogram Variations for a Single Recording

In [None]:
# Compute the three spectrogram variants for one recording.
audio_path = os.path.join(clips_dir,paths[1])
# NOTE(review): this cell loads at 44100 Hz while the other cells use 16000 Hz —
# confirm which rate is intended.
audio,sr = librosa.load(audio_path,sr=44100)

audio_clean = clean_audio(audio)

# Magnitude of the STFT: 512-point FFT and window, hop of 512 // 3 = 170 samples.
amplitude_spectrogram = np.abs(librosa.stft(audio_clean,n_fft=512,hop_length=512 // 3,win_length=512))

# dB relative to the largest magnitude. `ref` is passed by keyword because newer
# librosa releases no longer accept it positionally; the keyword form works in both.
dB_spectrogram = librosa.amplitude_to_db(amplitude_spectrogram, ref=np.max)

# Convert the dB values back to power (relative to the peak, since the dB
# values are referenced to the maximum).
power_spectrogram = librosa.db_to_power(dB_spectrogram, ref=1.0)

In [None]:
# Amplitude spectrogram on a log-frequency axis.
fig, ax = plt.subplots(figsize=(20,8))
# Pass the sample rate and hop length used for the STFT; otherwise specshow
# computes the time and frequency axes from its own defaults.
img = librosa.display.specshow(amplitude_spectrogram,
                               sr=sr, hop_length=512 // 3,
                               y_axis='log', x_axis='time', ax=ax)
ax.set_title('Amplitude spectrogram')

In [None]:
# dB spectrogram on a log-frequency axis, with a colorbar in dB.
fig, ax = plt.subplots(figsize=(20,8))
# Pass the sample rate and hop length used for the STFT; otherwise specshow
# computes the time and frequency axes from its own defaults.
img = librosa.display.specshow(dB_spectrogram,
                               sr=sr, hop_length=512 // 3,
                               y_axis='log', x_axis='time', ax=ax)
ax.set_title('dB spectrogram')
fig.colorbar(img, ax=ax, format="%+2.0f dB")

In [None]:
# Power spectrogram on a log-frequency axis.
fig, ax = plt.subplots(figsize=(20,8))
# Pass the sample rate and hop length used for the STFT; otherwise specshow
# computes the time and frequency axes from its own defaults.
img = librosa.display.specshow(power_spectrogram,
                               sr=sr, hop_length=512 // 3,
                               y_axis='log', x_axis='time', ax=ax)
ax.set_title('Power spectrogram')
# The values are powers, not decibels, so the colorbar uses plain numbers.
fig.colorbar(img, ax=ax)

# Multiple Spectrogram Extraction

### Create spectrograms and write them to CSV

Read each audio clip, trim it with `clean_audio`, compute its amplitude, dB and power spectrograms, and write each spectrogram to a CSV file.

In [6]:
# Walk the clips directory, trim every listed clip, compute its amplitude / dB /
# power spectrograms and write each one to a CSV file in the matching directory.
SAMPLE_RATE = 16000      # every clip is resampled to this rate on load
N_FFT = 512              # FFT size and window length
HOP_LENGTH = N_FFT // 3  # 170 samples
REPORT_EVERY = 1000      # print the elapsed time after this many files

# `name in paths` on a pandas Series tests the index labels, not the values, so
# file names are matched against a set built from the values instead.
wanted_files = set(paths)

counter = 0
start_time = time.time()
audio_lengths = []  # number of samples kept per clip after trimming

for root,_,files in os.walk(clips_dir):

    for file in files:

        if file not in wanted_files or file.split('.')[-1] not in ('wav','mp3'):
            continue

        audio_path = os.path.join(root,file)

        audio,sr = librosa.load(audio_path,sr=SAMPLE_RATE)

        audio_clean = clean_audio(audio)

        audio_lengths.append(len(audio_clean))

        amplitude_spectrogram = np.abs(librosa.stft(audio_clean,n_fft=N_FFT,hop_length=HOP_LENGTH,win_length=N_FFT))
        # `ref` by keyword: works in librosa releases where it is keyword-only too.
        dB_spectrogram = librosa.amplitude_to_db(amplitude_spectrogram, ref=np.max)
        power_spectrogram = librosa.db_to_power(dB_spectrogram, ref=1.0)

        # Same base name (text before the first ".") in all three directories.
        file_name = file.split(".")[0]+".csv"
        np.savetxt(os.path.join(amp_spectrogram_dir,file_name),amplitude_spectrogram,delimiter=',')
        np.savetxt(os.path.join(dB_spectrogram_dir,file_name),dB_spectrogram,delimiter=',')
        np.savetxt(os.path.join(power_spectrogram_dir,file_name),power_spectrogram,delimiter=',')

        counter += 1
        if counter == REPORT_EVERY:
            print("Time passed: {:.2f}s".format(time.time()-start_time))
            counter = 0
            start_time = time.time()

# Uses the constant rather than the loop variable `sr`, which does not exist
# when no file was processed.
print('Total audio length {:.2f} hours'.format(sum(audio_lengths)/SAMPLE_RATE/60/60))

Time: 590.10s
Time: 641.59s
Time: 690.73s
Time: 667.50s
Time: 611.72s
Time: 587.86s
Time: 644.11s
Time: 661.13s
Time: 683.43s
Time: 658.77s
Time: 643.85s
Time: 597.68s
Time: 621.28s
Time: 626.61s
Time: 652.88s
Time: 669.46s
Time: 644.78s
Time: 626.91s
Time: 635.84s
Time: 619.86s


# Compress the folders

In [29]:
# Display the directory that holds the dB spectrograms.
dB_spectrogram_dir

'..\\..\\Datasets\\cv-corpus-5.1-2020-06-22\\tr\\spectrograms\\dB'

In [31]:
# Pack each listed spectrogram directory into
# <dataset_dir>/spectrograms/<dataset_name>_<kind>_spectrograms.tar.gz.
spectrogram_dirs = [dB_spectrogram_dir]
for d in spectrogram_dirs:

    start_time = time.time()

    # Last path component (e.g. "dB"). os.path handles the platform's separator,
    # unlike splitting on a literal backslash.
    spectrogram_kind = os.path.basename(os.path.normpath(d))

    output_filename = '{}_{}_spectrograms.tar.gz'.format(dataset_name,spectrogram_kind)

    # Despite its name, output_dir is the path of the archive *file*.
    output_dir = os.path.join(dataset_dir,"spectrograms",output_filename)

    with tarfile.open(output_dir, "w:gz") as tar:
        # arcname keeps only the directory name inside the archive.
        tar.add(d, arcname=spectrogram_kind)

    print("Total time: {:.2f} mins".format((time.time()-start_time)/60))

Total time: 92.93 mins


# Delete every spectrogram file whose base name (text before the first ".")
# does not belong to a clip listed in `paths`.
# WARNING: this permanently removes files from the spectrogram directories.
path_list = [p.split('.')[0] for p in paths.tolist()]

# Set lookup keeps the per-file membership test constant-time.
wanted_names = set(path_list)

for spec_dir in spectrogram_dirs:

    for root,_,files in os.walk(spec_dir):

        for file in files:

            if file.split('.')[0] not in wanted_names:

                file_path = os.path.join(root,file)
                os.remove(file_path)

# Pack the dB spectrogram directory into dB_spectrograms.tar.gz in the current
# working directory and report how long it took (in seconds).
start_time = time.time()

output_filename = "dB_spectrograms.tar.gz"

with tarfile.open(output_filename, "w:gz") as tar:
    # The archive holds the dB spectrograms, so add `dB_spectrogram_dir`;
    # arcname keeps only the directory name inside the archive.
    tar.add(dB_spectrogram_dir, arcname=os.path.basename(dB_spectrogram_dir))

print("Total time: {}".format(time.time()-start_time))

# Transcription length (number of characters) of every sentence.
lengths = [len(sentence) for sentence in sentences]

**Duration calculations**

# Duration in seconds of every clip listed in `paths`.
# librosa.load is called without `sr`, so each clip is loaded at librosa's
# default sample rate and the returned `sr` is used for the conversion.
duration = []

for path in paths:
    audio_path = os.path.join(clip_dir, path)
    audio, sr = librosa.load(audio_path)
    duration.append(len(audio) / sr)
    

In [None]:
# Total duration of all clips, in seconds.
tot = sum(duration)

In [None]:
# Number of clips whose duration was measured.
len(duration)

In [None]:
# Total duration converted from seconds to hours.
tot/60/60

In [None]:
# Prints the full list of per-clip durations (in seconds).
# NOTE(review): this dumps one number per clip; consider showing a slice or summary instead.
print(duration)

In [None]:
# CSV file name for the clip at paths[0]: the text before the first "." plus ".csv".
name = paths[0].split(".")[0]+".csv"

In [None]:
# Compute the power spectrogram of the clip at paths[0] (untrimmed) and write it
# to `name` in the current working directory.
audio_path = os.path.join(clip_dir,paths[0])
audio,sr = librosa.load(audio_path)

# Magnitude of the STFT with a 512-point FFT (librosa's default hop and window).
amplitude_spectrogram = np.abs(librosa.stft(audio,n_fft=512))

# dB relative to the largest magnitude. `ref` is passed by keyword because newer
# librosa releases no longer accept it positionally; the keyword form works in both.
dB_spectrogram = librosa.amplitude_to_db(amplitude_spectrogram, ref=np.max)

power_spectrogram = librosa.db_to_power(dB_spectrogram, ref=1.0)

np.savetxt(name,power_spectrogram,delimiter=',')