In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'  # O '0,1' para usar múltiples GPUs
import pandas as pd
from io import StringIO
import fnmatch
import soundfile as sf
import numpy as np
import random
from tqdm import tqdm

In [2]:
#!wget https://www.openslr.org/resources/12/dev-clean.tar.gz
#!wget https://www.openslr.org/resources/17/musan.tar.gz

In [3]:
#!tar -xvf dev-clean.tar.gz
#!tar -xvf musan.tar.gz

In [4]:
def preprocess_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    lines = lines[11:]
    lines = [line.replace('|CBW|', 'CBW').strip() for line in lines]
    lines[0] = lines[0].replace(';', '')
    cleaned_text = '\n'.join(lines)
    df = pd.read_csv(StringIO(cleaned_text), sep='|', engine='python')
    df.columns = [col.strip() for col in df.columns]
    for col in df.columns:
        df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else x)
    return df

In [5]:
base_path = "LibriSpeech/"
speakers_txt = base_path + "SPEAKERS.TXT"

In [6]:
def find_files_with_extensions(root_dir, extensions):
    matched_files = []
    for root, dirs, files in os.walk(root_dir):
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        for pattern in extensions:
            for filename in fnmatch.filter(files, pattern):
                if not filename.startswith('.'):
                    matched_files.append(os.path.join(root, filename))

    return matched_files



In [7]:
files_list = list(find_files_with_extensions("LibriSpeech/",["*.flac"]))
print(len(files_list))

2703


In [8]:
def ls_to_df(files_list):
    df = pd.DataFrame(files_list, columns=['Path'])
    df_expanded = df['Path'].str.split('/', expand=True)
    df_expanded.columns = [f'Folder_{i}' for i in range(df_expanded.shape[1])]
    df_expanded['Full_Path'] = df.apply(lambda row: '/'.join(row.astype(str)), axis=1)
    return df_expanded

In [9]:
df_list_files = ls_to_df(files_list)
speakers_info = preprocess_file(speakers_txt)


In [10]:
speakers_info

Unnamed: 0,ID,SEX,SUBSET,MINUTES,NAME
0,14,F,train-clean-360,25.03,Kristin LeMoine
1,16,F,train-clean-360,25.11,Alys AtteWater
2,17,M,train-clean-360,25.04,Gord Mackenzie
3,19,F,train-clean-100,25.19,Kara Shallenberg
4,20,F,train-other-500,30.07,Gesine
...,...,...,...,...,...
2479,8975,F,train-clean-100,25.11,Daisy Flaim
2480,9000,M,train-other-500,27.26,Ramon Escamilla
2481,9022,F,train-clean-360,25.17,Claire M
2482,9023,F,train-clean-360,25.19,P. J. Morgan


In [11]:
speakers_info["SECONDS"] = speakers_info["MINUTES"] * 60

In [12]:
speakers_info

Unnamed: 0,ID,SEX,SUBSET,MINUTES,NAME,SECONDS
0,14,F,train-clean-360,25.03,Kristin LeMoine,1501.8
1,16,F,train-clean-360,25.11,Alys AtteWater,1506.6
2,17,M,train-clean-360,25.04,Gord Mackenzie,1502.4
3,19,F,train-clean-100,25.19,Kara Shallenberg,1511.4
4,20,F,train-other-500,30.07,Gesine,1804.2
...,...,...,...,...,...,...
2479,8975,F,train-clean-100,25.11,Daisy Flaim,1506.6
2480,9000,M,train-other-500,27.26,Ramon Escamilla,1635.6
2481,9022,F,train-clean-360,25.17,Claire M,1510.2
2482,9023,F,train-clean-360,25.19,P. J. Morgan,1511.4


In [13]:
speakers_info["SEX"].value_counts()

SEX
M    1283
F    1201
Name: count, dtype: int64

In [14]:
speakers_info["ID"].unique().shape

(2484,)

In [15]:
df_list_files

Unnamed: 0,Folder_0,Folder_1,Folder_2,Folder_3,Folder_4,Full_Path
0,LibriSpeech,dev-clean,8842,302196,8842-302196-0010.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...
1,LibriSpeech,dev-clean,8842,302196,8842-302196-0011.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...
2,LibriSpeech,dev-clean,8842,302196,8842-302196-0008.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...
3,LibriSpeech,dev-clean,8842,302196,8842-302196-0012.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...
4,LibriSpeech,dev-clean,8842,302196,8842-302196-0009.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...
...,...,...,...,...,...,...
2698,LibriSpeech,dev-clean,2078,142845,2078-142845-0031.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...
2699,LibriSpeech,dev-clean,2078,142845,2078-142845-0006.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...
2700,LibriSpeech,dev-clean,2078,142845,2078-142845-0017.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...
2701,LibriSpeech,dev-clean,2078,142845,2078-142845-0032.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...


In [18]:
df_list_files["Folder_2"] = df_list_files["Folder_2"].astype("int")

speakers_info["ID"] = speakers_info["ID"].astype("int")

df_speakers_list = df_list_files[["Folder_2","Folder_3","Folder_4", "Full_Path"]].merge(
    speakers_info[speakers_info["SUBSET"] == "dev-clean"], left_on = "Folder_2", right_on="ID")
df_speakers_list

Unnamed: 0,Folder_2,Folder_3,Folder_4,Full_Path,ID,SEX,SUBSET,MINUTES,NAME,SECONDS
0,8842,302196,8842-302196-0010.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0
1,8842,302196,8842-302196-0011.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0
2,8842,302196,8842-302196-0008.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0
3,8842,302196,8842-302196-0012.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0
4,8842,302196,8842-302196-0009.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0
...,...,...,...,...,...,...,...,...,...,...
2698,2078,142845,2078-142845-0031.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...,2078,M,dev-clean,8.03,Kathy Caver,481.8
2699,2078,142845,2078-142845-0006.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...,2078,M,dev-clean,8.03,Kathy Caver,481.8
2700,2078,142845,2078-142845-0017.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...,2078,M,dev-clean,8.03,Kathy Caver,481.8
2701,2078,142845,2078-142845-0032.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...,2078,M,dev-clean,8.03,Kathy Caver,481.8


In [19]:

def detect_vad_times(audio_signal, threshold_db=-40):
    # Convertir la amplitud de la señal a decibelios
    amplitude = np.abs(audio_signal)
    amplitude_db = 20 * np.log10(amplitude + 1e-6)  # Añadir una pequeña constante para evitar log10(0)
    
    # Aplicar umbral para obtener una máscara booleana
    vad_mask = amplitude_db > threshold_db
    
    # Detectar los cambios de estado en la máscara booleana
    vad_changes = np.diff(vad_mask.astype(int))
    
    # Encontrar los índices donde ocurre un cambio de 1 (inicio de voz)
    vad_start_indices = np.where(vad_changes == 1)[0] + 1  # Añadir 1 para corregir el cambio de índice
    
    return vad_start_indices.tolist()

In [20]:
results = []

for index, row in df_speakers_list.iterrows():
    file_path = row['Full_Path']
    audio_signal, sr = sf.read(file_path)
    # Detectar los segundos de actividad de voz
    vad_times = detect_vad_times(audio_signal)

    # Agregar los resultados a la lista
    results.append({'File_Path': file_path, 'VAD_Times': vad_times})

# Convertir los resultados en un DataFrame
vad_results = pd.DataFrame(results)


In [21]:
df_speakers_list_vad = df_speakers_list.merge(vad_results, right_on = "File_Path", left_on="Full_Path")
df_speakers_list_vad

Unnamed: 0,Folder_2,Folder_3,Folder_4,Full_Path,ID,SEX,SUBSET,MINUTES,NAME,SECONDS,File_Path,VAD_Times
0,8842,302196,8842-302196-0010.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0,LibriSpeech/dev-clean/8842/302196/8842-302196-...,"[6593, 6596, 6602, 6607, 6610, 6642, 6646, 664..."
1,8842,302196,8842-302196-0011.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0,LibriSpeech/dev-clean/8842/302196/8842-302196-...,"[195, 204, 221, 233, 240, 245, 251, 257, 274, ..."
2,8842,302196,8842-302196-0008.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0,LibriSpeech/dev-clean/8842/302196/8842-302196-...,"[4048, 4051, 4252, 4255, 4486, 4533, 4540, 454..."
3,8842,302196,8842-302196-0012.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0,LibriSpeech/dev-clean/8842/302196/8842-302196-...,"[4535, 4540, 5494, 5501, 5528, 5531, 5539, 554..."
4,8842,302196,8842-302196-0009.flac,LibriSpeech/dev-clean/8842/302196/8842-302196-...,8842,F,dev-clean,8.10,Mary J,486.0,LibriSpeech/dev-clean/8842/302196/8842-302196-...,"[245, 261, 270, 277, 283, 286, 292, 301, 307, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2698,2078,142845,2078-142845-0031.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...,2078,M,dev-clean,8.03,Kathy Caver,481.8,LibriSpeech/dev-clean/2078/142845/2078-142845-...,"[1, 7, 22, 130, 138, 146, 155, 166, 171, 180, ..."
2699,2078,142845,2078-142845-0006.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...,2078,M,dev-clean,8.03,Kathy Caver,481.8,LibriSpeech/dev-clean/2078/142845/2078-142845-...,"[843, 925, 927, 930, 934, 937, 941, 1007, 1012..."
2700,2078,142845,2078-142845-0017.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...,2078,M,dev-clean,8.03,Kathy Caver,481.8,LibriSpeech/dev-clean/2078/142845/2078-142845-...,"[236, 310, 681, 689, 691, 700, 809, 817, 824, ..."
2701,2078,142845,2078-142845-0032.flac,LibriSpeech/dev-clean/2078/142845/2078-142845-...,2078,M,dev-clean,8.03,Kathy Caver,481.8,LibriSpeech/dev-clean/2078/142845/2078-142845-...,"[62, 66, 69, 83, 87, 90, 121, 130, 133, 137, 1..."


In [22]:
from multiprocessing.pool import ThreadPool

def get_audio_segment(audio_signal, start_time, segment_duration_sr):
    end_index = start_time + segment_duration_sr
    segment = audio_signal[start_time:end_index]
    return segment

def select_random_valid_vad(vad_times, max_start_time, segment_duration_sr):
    
    valid_vad_times = [time for time in vad_times if (time) + (segment_duration_sr) <= max_start_time]
    return random.choice(valid_vad_times) if valid_vad_times else None

def create_segments(df, output_dir,segment_duration, iterations = 10000):
    def wrapper(identifier):
        num_rows = len(df)
        index = 0
        x =  0
        while x < iterations:
            #print(x)
            row = df.iloc[index]
            file_path = row['File_Path']
            vad_times = row['VAD_Times']
            
            audio_signal, sr = sf.read(file_path, always_2d=False)
            start_time = select_random_valid_vad(vad_times, len(audio_signal), segment_duration * sr)
            
            if start_time is not None:
                segment = get_audio_segment(audio_signal, start_time, segment_duration * sr)
                sf.write("{}/{}_{}.wav".format(output_dir,identifier,x),segment,sr)
                x+=1
                #pbar.update(1)
            else:
                print("No valid VAD time")
            index = (index + 1) if index + 1 < num_rows else 0
    return wrapper


In [23]:
from sklearn.model_selection import train_test_split

In [24]:
speakers_ids = speakers_info["ID"].unique()
train_ids, test_ids = train_test_split(speakers_ids, test_size=0.2, random_state=21)
train_df = df_speakers_list_vad[df_speakers_list_vad["ID"].isin(train_ids)]
test_df = df_speakers_list_vad[df_speakers_list_vad["ID"].isin(test_ids)]

In [25]:
def check_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


In [26]:
segment_duration = 1

iterations_train = 8000
repetitions_train = 10
output_train = "Dataset/speech/train"

iterations_test = 2000
repetitions_test = 10
output_test = "Dataset/speech/test"


check_directory(output_train)
check_directory(output_test)

speech_urls = np.array(find_files_with_extensions("Dataset/speech",["*.wav","*.flac"]))

if len(speech_urls) == 0:
    with ThreadPool(repetitions_train) as pool:
        pool.map(create_segments(train_df, output_train, segment_duration, iterations_train), range(repetitions_train))

    with ThreadPool(repetitions_test) as pool:
        pool.map(create_segments(test_df, output_test, segment_duration, iterations_test), range(repetitions_test))
        

In [27]:
files_list_noises = list(find_files_with_extensions("musan/noise/free-sound/",["*.wav","*.flac"]))
df_noises = ls_to_df(files_list_noises)
df_train_noises, df_test_noises = train_test_split(df_noises, test_size=0.2, random_state=21)

In [28]:
def get_random_audio_segment(audio_signal, segment_duration_sr):
    max_start_index = len(audio_signal) - segment_duration_sr
    start_index = random.randint(0, max_start_index)
    end_index = start_index + segment_duration_sr
    segment = audio_signal[start_index:end_index]
    return segment


In [29]:
segment_duration = 1


output_noise_train = "Dataset/noise/train"
iterations_noise_train = 8000
repetitions_noise_train = 10


output_noise_test = "Dataset/noise/test"
iterations_noise_test = 2000
repetitions_noise_test = 10

check_directory(output_noise_train)
check_directory(output_noise_test)


def create_segment_noise(df, segment_duration, output_folder, iterations = 10000):
    def wrapper(identifier):
        num_rows = len(df)
        index = 0
        x = 0
        while x < iterations:
            row = df.iloc[index]
            file_path = row['Full_Path']
            audio_signal, sr = sf.read(file_path, always_2d=False)
            if len(audio_signal) > segment_duration * sr:
                segment = get_random_audio_segment(audio_signal, segment_duration * sr)
                sf.write("{}/{}_{}.wav".format(output_folder,identifier,x),segment,sr)
                x+=1
            index = (index + 1) if index + 1 < num_rows else 0
    return wrapper

noise_urls = np.array(find_files_with_extensions("Dataset/noise",["*.wav","*.flac"]))

if len(noise_urls) == 0:
    with ThreadPool(repetitions_noise_train) as pool:
        pool.map(create_segment_noise(df_train_noises, segment_duration, output_noise_train, iterations_noise_train), range(repetitions_noise_train))

    with ThreadPool(repetitions_noise_test) as pool:
        pool.map(create_segment_noise(df_test_noises, segment_duration, output_noise_test, iterations_noise_test), range(repetitions_noise_test))
        

In [30]:
import os ,fnmatch
import math
def create_data_from_dir(speech_dir, noise_dir, n_samples):
    speech_urls = np.array(random.choices(find_files_with_extensions(speech_dir,["*.wav","*.flac"]), k = n_samples))
    noise_urls = np.array(random.choices(find_files_with_extensions(noise_dir,["*.wav","*.flac"]), k = n_samples))
    return np.array(list(zip(speech_urls,noise_urls)))


mix_urls_train = create_data_from_dir("Dataset/speech/train",
                                     "Dataset/noise/train", 100000)

mix_urls_valid = create_data_from_dir("Dataset/speech/test",
                                     "Dataset/noise/test", 100000)


In [31]:

from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.metrics import *
import tensorflow as tf

# https://github.com/guillaume-chevalier/HAR-stacked-residual-bidir-LSTMs
def generic_model(timesteps,len_inputs,hidden,layers,cell,n_fc,fc_size,resnet,use_relu,batch_norm):
    inputs = Input(shape=[timesteps,len_inputs,1])
    x = Reshape([timesteps, len_inputs])(inputs)
    if use_relu:
        x = ReLU()(x)

    rnn = LSTM(hidden,return_sequences=True) if cell == "LSTM" else Bidirectional(LSTM(hidden,return_sequences=True))
    x = rnn(x)
    
    if use_relu:
        x = ReLU()(x)

    for i in range(layers - 1):
        rnn = LSTM(hidden,return_sequences=True) if cell == "LSTM" else Bidirectional(LSTM(hidden,return_sequences=True))
        r = rnn(x)
        
        if use_relu:
            x  =ReLU()(x)

        x = Add()([r,x]) if resnet else r

        ### si es la penultima 
        if batch_norm:
            x = BatchNormalization()(x)
    

    x = Flatten()(x)
    for i in range(n_fc):
        x = Dense(fc_size, activation='relu')(x)

    x = Dense(timesteps * len_inputs * 2, activation=None)(x)

    x = Reshape([timesteps,len_inputs,2])(x)
    x = Softmax(axis=-1)(x)
    #x = Concatenate(axis=-1)([x, x, x, x, x])
    model = Model(inputs=inputs, outputs=x)
    return model


2024-09-05 18:23:48.730726: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [32]:
from scipy import signal
from signaltransform import *

def prepare_signals(v,n,fac1,fac2):
    v = 1 / np.abs(v).max() * v
    n = 1 / np.abs(n).max() * n
    v = v * float(fac1) 
    n = n * float(fac2) 
    return v, n

def feature_extractor0(input_feature, audio_len, apply_vad, vad_threshold, nperseg = 512, noverlap = 256):
    def extract_features(data):
        speech, noise= data
        v,_ = sf.read(speech)
        n,_ = sf.read(noise)

        v = v[:audio_len]
        n = n[:audio_len]

        v, n = prepare_signals(v, n,  random.uniform(.8, 1), random.uniform(.8, 1))
        mix = (v + n) / 2.0

        _, _, complex_v = signal.stft(v, fs=16000, nperseg=nperseg, noverlap=noverlap)
        _, _, complex_n = signal.stft(n, fs=16000, nperseg=nperseg, noverlap=noverlap)
        _, _, complex_mix = signal.stft(mix, fs=16000, nperseg=nperseg, noverlap=noverlap)

        mag_v = np.abs(complex_v)
        mag_n = np.abs(complex_n)
        mag_mix = np.abs(complex_mix)
        angles_mix = np.angle(complex_mix)
        mag_mix_db = db(mag_mix,10000.,10000.)

        if input_feature == "mag":
            x = reduce_standarize_mag(mag_mix)
        elif input_feature == "mag_db":
            x = reduce_standarize(mag_mix_db)

        VAD = vad(mag_mix_db, vad_threshold)
        
        if not apply_vad:
            VAD_t = np.ones_like(VAD)


        y = np.array((mag_v > mag_n)).astype(np.float32)

        y_t_c = np.stack([np.transpose(y,[1,0]),np.transpose(1 - y,[1,0]),np.transpose(VAD,[1,0]),np.transpose(x,[1,0]),np.transpose(mag_mix,[1,0]),np.transpose(angles_mix,[1,0])])
        
        return  np.transpose(x,[1,0])[...,np.newaxis],np.transpose(y_t_c,[1,2,0])

    return extract_features


In [33]:
feature_extractor = feature_extractor0("mag_db", 16384, True, 40.0, 512, 256)

In [34]:
x, _ = feature_extractor(mix_urls_train[0])
timesteps = x.shape[0]
len_inputs =  x.shape[1]

In [35]:
model = generic_model(timesteps ,len_inputs, 160,6 ,"BLSTM" ,2 ,128 , True, True, True)

2024-09-05 18:23:54.444361: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-05 18:23:54.524907: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-05 18:23:54.525121: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [65]:
%load_ext autoreload
%autoreload 2
from losses import *
from metrics2 import *

In [66]:
from tensorflow.keras.metrics import BinaryAccuracy as BA

In [67]:
loss = MSE
metric = binary_accuracy_c
optimizer  = Adam(1e-3)

model.compile(
    optimizer= optimizer,
    loss= loss,
    metrics= [metric])


In [68]:
from tensorflow.keras.utils import Sequence
class XYGenerator(Sequence):
    def __init__(self, ids, batch_size, data,feature_extractor):
        'Initialization'
        self.ids = ids
        self.data = data
        self.batch_size = batch_size
        self.feature_extractor = feature_extractor

    def __len__(self):
        'Denotes the number of batches per epoch'
        return len(self.ids)

    def __getitem__(self, index):
        #carga de batches
        X = []
        Y = []
        for i in range(index * self.batch_size, (index + 1) * self.batch_size):
            x, y = self.feature_extractor(self.data[i])
            X.append(x)
            Y.append(y)
        return np.array(X),np.array(Y)


In [69]:
batch_size = 50
epochs = 50
total_items = len(mix_urls_train)
num_batches = int(total_items/batch_size)

training_generator = XYGenerator(range(num_batches), batch_size, mix_urls_train, feature_extractor)

valid_total_items = len(mix_urls_valid)
valid_batch_size = batch_size
valid_num_batches = int(valid_total_items/valid_batch_size)

test_generator = XYGenerator(range(valid_num_batches),valid_batch_size,mix_urls_valid,feature_extractor)


In [70]:
model.fit(training_generator, validation_data= test_generator, epochs= epochs)

Epoch 1/50


2024-09-05 17:12:16.659831: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2024-09-05 17:12:16.917089: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x76ba5c03e9a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-09-05 17:12:16.917127: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2080 SUPER, Compute Capability 7.5
2024-09-05 17:12:16.931241: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-09-05 17:12:17.089549: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

KeyboardInterrupt: 

In [71]:
def create_predictions(predicted,x_list,y_list, nperseg = 512, noverlap = 256):
    i = 0
    for predicted_, x , y in zip(predicted,x_list,y_list):
        print(y.shape)
        mag_mix = y[...,4:5]
        angles_mix = y[...,5:6]
        complex_mix = mag_mix * np.exp(angles_mix*1j)
        print(complex_mix.shape)
        print(type(complex_mix))
        #exit(0)
        print(predicted_.shape)
        print(complex_mix.shape)
        _, source_recovered_a = signal.istft(np.transpose(predicted_[...,0],[1,0]) * np.transpose(complex_mix[...,0],[1,0]) , fs=16000 ,nperseg= nperseg, noverlap = noverlap)
        _, source_recovered_b = signal.istft(np.transpose(1 - predicted_[...,0],[1,0]) * np.transpose(complex_mix[...,0],[1,0]) , fs=16000 ,nperseg= nperseg, noverlap = noverlap)
        _, source_recovered_x = signal.istft(np.transpose(complex_mix[...,0],[1,0]) , fs=16000 ,nperseg= nperseg, noverlap = noverlap)
        sf.write("{}_a.wav".format(i),source_recovered_a,16000)
        sf.write("{}_b.wav".format(i),source_recovered_b,16000)
        sf.write("{}_x.wav".format(i),source_recovered_x,16000)
        i+=1

def create_predictions_pred(predicted,x_list,pred_folder, nperseg, noverlap):
    i = 0
    for predicted_list, complex_list  in zip(predicted,x_list):
        recovered_a = []
        recovered_b = []
        recovered_x = []
        for predicted_, complex_mix in zip(predicted_list,complex_list):
            _, source_recovered_a = signal.istft(np.transpose(predicted_[...,0],[1,0]) * np.transpose(complex_mix[...,0],[1,0]) , fs=16000 ,nperseg= nperseg, noverlap = noverlap)
            _, source_recovered_b = signal.istft(np.transpose(1 - predicted_[...,0],[1,0]) * np.transpose(complex_mix[...,0],[1,0]) , fs=16000 ,nperseg= nperseg, noverlap = noverlap)
            _, source_recovered_x = signal.istft(np.transpose(complex_mix[...,0],[1,0]) , fs=16000 ,nperseg= nperseg, noverlap = noverlap)
            recovered_a.append(source_recovered_a)
            recovered_b.append(source_recovered_b)
            recovered_x.append(source_recovered_x)

        sf.write("{}/{}_a.wav".format(pred_folder,i),np.array(recovered_a).reshape(-1),16000)
        sf.write("{}/{}_b.wav".format(pred_folder,i),np.array(recovered_b).reshape(-1),16000)
        sf.write("{}/{}_x.wav".format(pred_folder,i),np.array(recovered_x).reshape(-1),16000)
        i+=1

In [72]:
pred_batch_size = 15
pred_total_items = 15 
pred_num_batches = int((pred_total_items/pred_batch_size))
predGen = XYGenerator(range(pred_num_batches),pred_batch_size,mix_urls_valid[0:15],feature_extractor) # Change the 3 to range(x+1) where x is the number of npy files you want to train on
predicted = model.predict(predGen)
pred_gen = [[x,y] for x,y in predGen]

create_predictions(predicted,pred_gen[0][0],pred_gen[0][1])


(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)
(64, 257, 1)
<class 'numpy.ndarray'>
(64, 257, 2)
(64, 257, 1)
(64, 257, 6)