# **Audio-to-Audio Generator LSTM**

In [3]:
# Mount Google Drive at /content/gdrive so the files stored there can be read
# and written through ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
import torchaudio
import torch
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import IPython
import librosa
import numpy as np
from scipy.io.wavfile import write

In [4]:
# Folder on the mounted Google Drive that holds the input songs.
directory_path = '/content/gdrive/MyDrive/deep_learning/uas/input_music'

In [5]:
# Collect the full path of every regular file in the input directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep dataset indices (e.g. which song ends up at index 0) stable between runs.
file_names = sorted(os.listdir(directory_path))

# Sub-directories are skipped: only regular files can be decoded as audio.
list_of_songs = [
    full_path
    for full_path in (os.path.join(directory_path, file_name) for file_name in file_names)
    if os.path.isfile(full_path)
]

In [6]:
def load_and_preprocess_audio(file_path):
    """Load one audio file and return the magnitude of its STFT.

    Args:
        file_path: Path of the audio file to decode.

    Returns:
        The element-wise absolute value of ``librosa.stft`` applied to the
        decoded signal (phase information is discarded).
    """
    # sr=None asks librosa to keep the file's own sampling rate; the rate
    # itself is not needed here and is dropped.
    waveform, _native_rate = librosa.load(file_path, sr=None)
    complex_stft = librosa.stft(waveform)
    return np.abs(complex_stft)

In [7]:
def create_dataset(file_paths, max_frames=6000):
    """Build one array of equally wide spectrograms from a list of audio files.

    Args:
        file_paths: Iterable of audio file paths, processed in order.
        max_frames: Number of columns every spectrogram is forced to have.

    Returns:
        ``np.array`` of the per-file spectrograms, in the order of
        ``file_paths``.
    """
    def _fit_width(matrix):
        # Wider than the target: keep only the leading columns.
        n_frames = matrix.shape[1]
        if n_frames > max_frames:
            return matrix[:, :max_frames]
        # Otherwise zero-pad on the right of the column axis up to the target width.
        missing = max_frames - n_frames
        return np.pad(matrix, ((0, 0), (0, missing)), mode='constant')

    return np.array([
        _fit_width(load_and_preprocess_audio(path))
        for path in file_paths
    ])

In [8]:
# Build the dataset array: one fixed-width magnitude spectrogram per collected song.
music_dataset = create_dataset(list_of_songs)

  audio, _ = librosa.load(file_path, sr=None)  # Load audio file
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, _ = librosa.load(file_path, sr=None)  # Load audio file
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, _ = librosa.load(file_path, sr=None)  # Load audio file
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, _ = librosa.load(file_path, sr=None)  # Load audio file
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, _ = librosa.load(file_path, sr=None)  # Load audio file
	Deprecated as of librosa version 0.10.0.
	It will be removed in libros

In [9]:
input_shape = music_dataset.shape  # Shape of the whole dataset array (songs, spectrogram rows, frames) -- not of a single spectrogram

In [10]:
# Show the dataset dimensions; indices 1 and 2 are used to size the model.
print(input_shape)

(100, 1025, 6000)


In [11]:
# Stacked-LSTM network that maps one spectrogram to an output of the same shape.
# Each sample has shape (input_shape[1], input_shape[2]); both LSTMs return full
# sequences so the final Dense layer is applied at every step.
# NOTE(review): with this layout the LSTMs step along axis 1 of each sample (the
# spectrogram rows, i.e. frequency bins) and treat the frames as features. If the
# recurrence is meant to run over time, the data would need transposing -- confirm.
model = tf.keras.Sequential([
    layers.Input(shape=(input_shape[1], input_shape[2])),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(32, return_sequences=True),
    layers.Dropout(0.2),
    # The targets are raw STFT magnitudes: non-negative but not limited to 1.
    # A sigmoid output could never reach values above 1, so softplus (positive,
    # unbounded) is used instead.
    layers.Dense(input_shape[2], activation='softplus')
])

In [12]:
# The training targets are the inputs themselves. This binds a second name to
# the same array; no copy is made.
target_dataset = music_dataset

In [13]:
# Configure training: Adam optimizer, mean-squared-error loss.
model.compile(optimizer='adam', loss='mean_squared_error')

In [14]:
# Train on the first 10 songs only. Inputs and targets must contain the same
# number of samples, so the targets are sliced with the same range as the inputs
# (previously all targets were passed against 10 inputs).
# validation_split=0.2 reserves a fraction of these samples for validation.
model.fit(music_dataset[:10], target_dataset[:10], epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c63c821e920>

In [15]:
import soundfile as sf

In [16]:
def generate_audio(model, seed_input):
    """Run the model once on a single seed sample.

    Args:
        model: Object exposing ``predict``; it is called exactly once.
        seed_input: Array-like data for one sample, without a batch axis.

    Returns:
        The value returned by ``model.predict`` for the one-element batch
        (the batch axis is not removed).
    """
    # Prepend a length-1 batch axis so the sample looks like a batch of one.
    single_item_batch = np.expand_dims(seed_input, axis=0)
    return model.predict(single_item_batch)

# Seed the generator with the first song's complete spectrogram. create_dataset()
# already forces every song to the same number of frames, so no extra slicing is
# needed to match the model's input width.
seed_input = music_dataset[0]
generated_audio = generate_audio(model, seed_input)

# Convert the generated spectrogram back to the time domain.
# The prediction holds magnitudes only (no phase), so a plain inverse STFT would
# treat every bin as zero-phase; Griffin-Lim estimates a phase before inverting.
generated_audio_waveform = librosa.griffinlim(generated_audio.squeeze())

# The songs were loaded with sr=None (native rate), so write the result at the
# seed song's own sample rate instead of assuming 44.1 kHz.
sample_rate = librosa.get_samplerate(list_of_songs[0])

# Save generated audio
sf.write('/content/gdrive/MyDrive/deep_learning/uas/test.wav', generated_audio_waveform, sample_rate)

