In [32]:
import os
import tensorflow as tf
import numpy as np
import math
from random import sample, shuffle
from PIL import Image
import matplotlib.pyplot as plt
from src.DataGenerator import AudioDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D , Flatten, Reshape, Conv2DTranspose, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from src.helper_functions import plot_reconstruction
import seaborn as sns

In [39]:
data_gen = AudioDataGenerator(
    directory='data/Spotify/comp_pngs/', 
    image_size=(128,512), 
    color_mode='rgb',
    batch_size=32,
    sample_size=100000,
    shuffle=True,
    train_test_split=True, 
    test_size=0.02,
    output_channel_index=0,
    output_size=(128,128))


Found 98000 files for Training set
Found 2000 files for Test set


In [34]:
img_width = 128
img_height = 128
kernel_size = 5
strides = 2

In [109]:
class Time_Freq_Autoencoder(tf.keras.Model):
    
    def __init__(self, latent_dim, num_channels):
        super(Time_Freq_Autoencoder, self).__init__()
        self.latent_dim = latent_dim
        self.time_encoder = Sequential([
            Reshape(target_shape=(128,128)),
            tf.keras.layers.Conv1D(input_shape=(img_height, img_width), filters=64, kernel_size=kernel_size, padding="same", strides=strides, activation='relu'),
            BatchNormalization(axis=-1),
            tf.keras.layers.Conv1D(filters=64, kernel_size=kernel_size, padding="same", strides=strides, activation='relu'),
            BatchNormalization(axis=-1),
            tf.keras.layers.Conv1D(filters=128, kernel_size=kernel_size, padding="same", strides=strides, activation='relu'),
            BatchNormalization(axis=-1),
            tf.keras.layers.Conv1D(filters=256, kernel_size=kernel_size, padding="same", strides=strides, activation='relu'),
            BatchNormalization(axis=-1),
            Flatten(),
            Dense(2048, activation='relu'),
            Dense(units=latent_dim//2)
        ])
        self.freq_encoder = Sequential([
            Reshape(target_shape=(128,128)),
            tf.keras.layers.Conv1D(input_shape=(img_height, img_width), filters=64, kernel_size=5, padding="same", strides=strides, activation='relu'),
            BatchNormalization(axis=-1),
            tf.keras.layers.Conv1D(filters=64, kernel_size=kernel_size, padding="same", strides=strides, activation='relu'),
            BatchNormalization(axis=-1),
            tf.keras.layers.Conv1D(filters=128, kernel_size=kernel_size, padding="same", strides=strides, activation='relu'),
            BatchNormalization(axis=-1),
            tf.keras.layers.Conv1D(filters=256, kernel_size=kernel_size, padding="same", strides=strides, activation='relu'),
            BatchNormalization(axis=-1),
            Flatten(),
            Dense(2048, activation='relu'),
            Dense(units=latent_dim//2)
        ])
        self.decoder = Sequential([
            tf.keras.layers.InputLayer(input_shape=(latent_dim)),
            Dense(units=16384, activation='relu'),
            BatchNormalization(axis=-1),
            Reshape(target_shape=(8,8,256)),
            Conv2DTranspose(filters=256, kernel_size=kernel_size, strides=strides, padding="same", activation="relu"),
            BatchNormalization(axis=-1),
            Conv2DTranspose(filters=128, kernel_size=kernel_size, strides=strides, padding="same", activation="relu"),
            BatchNormalization(axis=-1),
            Conv2DTranspose(filters=64, kernel_size=kernel_size,  strides=strides, padding="same", activation="relu"),
            BatchNormalization(axis=-1),
            Conv2DTranspose(filters=32, kernel_size=kernel_size,  strides=strides, padding="same", activation="relu"),
            BatchNormalization(axis=-1),
            Conv2DTranspose(filters=num_channels, kernel_size=kernel_size, padding="same", activation='sigmoid'),
        ])
        
    def encode(self, x):
        x_1 = x
        x_2 = tf.transpose(x, perm=[0,2,1,3])
        encoded_time = self.time_encoder(x_1)
        encoded_freq = self.freq_encoder(x_2)
        encoded = tf.keras.layers.Concatenate(axis=1)([encoded_time, encoded_freq])
        return encoded
        
    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
autoencoder = Time_Freq_Autoencoder(256, 1)

opt = Adam(learning_rate=1e-3)

autoencoder.compile(optimizer=opt, loss=tf.keras.losses.mse)

In [None]:
# hist = autoencoder.fit(data_gen.train,
#                        batch_size=data_gen.batch_size,
#                        epochs=10,
#                        validation_data=data_gen.test)

In [None]:
hist_2 = autoencoder.fit(data_gen.train,
                       batch_size=data_gen.batch_size,
                       epochs=5,
                       validation_data=data_gen.test)

In [77]:
autoencoder_path = 'data/autoencoder_512dim_time_freq'
autoencoder.save(autoencoder_path)



INFO:tensorflow:Assets written to: data/autoencoder_512dim_time_freq/assets


INFO:tensorflow:Assets written to: data/autoencoder_512dim_time_freq/assets


In [108]:
test_img = data_gen.take(20)[0]
prediction = autoencoder(test_img)

plot_reconstruction(test_img, prediction, 1)

ValueError: Exception encountered when calling layer "sequential_52" (type Sequential).

A `Concatenate` layer should be called on a list of at least 1 input. Received: input_shape=(32, 128, 128, 1)

Call arguments received by layer "sequential_52" (type Sequential):
  • inputs=tf.Tensor(shape=(32, 128, 128, 1), dtype=float32)
  • training=False
  • mask=None

In [73]:
autoencoder.encoder(test_img)

<tf.Tensor: shape=(32, 256), dtype=float32, numpy=
array([[ 911.20306, -862.2692 , -397.7018 , ..., -863.3008 , -368.2759 ,
        -526.147  ],
       [ 891.67053, -904.7645 , -444.9764 , ..., -939.6398 , -309.77386,
        -780.13275],
       [1043.9781 , -926.9036 , -317.7887 , ..., -856.2506 , -233.06314,
        -683.01117],
       ...,
       [ 707.19745, -734.52325, -285.39926, ..., -927.082  , -312.66   ,
        -624.9271 ],
       [ 879.4098 , -906.7295 , -463.8893 , ..., -963.3282 , -493.59512,
        -707.8858 ],
       [ 871.25256, -794.2508 , -569.2462 , ..., -954.0515 , -423.87006,
        -680.7016 ]], dtype=float32)>

In [78]:
from src.LatentSpace import LatentSpace

latent_space = LatentSpace(autoencoder_path=autoencoder_path,
                        image_dir='data/Spotify/comp_pngs/',
                        tracks_feather_path='data/all_tracks.feather', latent_dims=512, output_size=(128, 128)
                        )

AttributeError: 'NoneType' object has no attribute 'get'

In [79]:
latent_space.build()

try:
    os.mkdir('data/autoencoder_512dim_time_freq')
except:
    pass
latent_space.save('data/autoencoder_512dim_time_freq')

Getting predictions from autoencoder...


AttributeError: 'Time_Freq_Autoencoder' object has no attribute 'encoder'