# Deep Learning for Media 
#### MPATE-GE 2039 - DM-GY 9103 

---

## Visual and Audio Genre Classification of Movie Trailers

**Demo:**

Here is a demo on a small dataset.

The dataset is available in this [folder](https://drive.google.com/drive/folders/1NA-uPe9AM5WsIAboSjoM6eW-d8C1YFZu?usp=share_link). Add the folder to a folder named `dl4m_datasets` in your Google Drive before running the notebook.






['The Desperate Hour', 764835, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], '1MWECKYcyJk'] ['Thriller']


['The Munsters', 804413, [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], 'mRE0PUvgsKo'] ['Comedy']


['The Lucky One', 77877, [0, 0, 0, 0, 1, 0, 0, 1, 0, 0], 't2LI5OOifsQ'] ['Drama', 'Romance']


['Body Brokers', 656940, [0, 0, 0, 1, 0, 0, 0, 0, 0, 1], 'j8T0GIpfxr0'] ['Crime', 'Thriller']


['Wonder Park', 400157, [0, 1, 1, 0, 0, 0, 0, 0, 0, 0], '5MRSBuQt51A'] ['Adventure', 'Comedy']

['Wild Indian', 656991, [0, 0, 0, 1, 1, 0, 0, 0, 0, 1], 'ou6hNBf8FKI'] ['Crime', 'Drama', 'Thriller']


<div class="markdown-google-sans">

## **Frames**
</div>

Uses a VGG16 model pre-trained on ImageNet to classify, by genre, 10 frames selected from the middle of each trailer.

In [None]:
# Fix the random seed for reproducibility
from numpy.random import seed
import numpy as np

# Seed NumPy's global random generator.
seed(123)
import tensorflow as tf
# Seed Keras/TensorFlow with the same value as NumPy above.
tf.keras.utils.set_random_seed(123)
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True remounts even if it
# is already mounted. Every path used later in the notebook lives under this mount.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
import keras
import numpy as np

# NOTE(review): Counter does not appear to be used in the visible cells.
from collections import Counter

# Root of the course datasets on the mounted Drive.
root_dir = "/content/drive/My Drive/dl4m_datasets/"

# Folder that holds the project's helper modules and model artifacts.
modules_path = os.path.join(root_dir, "trailer_dataset", "trailer_model_data")

# Make the helper modules stored in modules_path importable; this must run
# before the `utils` / `models` imports below.
import sys
sys.path.append(modules_path)

import utils as u # to use the functions from utils, do u.function()
import models as m


import json

# genres.json is read into labels_dict below.
# presumably it maps each trailer ID to its multi-hot genre vector — confirm against the file.
genres_path = os.path.join(modules_path, "genres.json")

labels_dict = {}

# Open the JSON file
with open(genres_path, 'r') as f:
    # Load the JSON data as a dictionary
    labels_dict = json.load(f)

# Dataset root. The trailing slash matters: later cells build paths by plain
# string concatenation (e.g. data_home + "demo_slides/audio").
data_home = '/content/drive/My Drive/dl4m_datasets/trailer_dataset/' 

# Only warns when the dataset folder is missing; execution continues.
if not os.path.exists(data_home):
  print('You should download the dataset first, unzip all its folders and '
  'locate it in the data_home folder.\nFollow the instructions in the practice '
  'of our class #6.')


In [None]:
conv_base = m.load_conv_model()

In [None]:
conv_base.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 180, 180, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 180, 180, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 180, 180, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 90, 90, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 90, 90, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 90, 90, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 45, 45, 128)       0     

In [None]:
# Saved Keras model that is applied to the extracted frame features further down.
extraction_path = os.path.join(data_home, "trailer_model_data", "feature_extraction.keras")
# Frame size; matches the (180, 180, 3) input shown in the conv_base summary.
target_shape = (180, 180)
batch_size = 7
# NOTE(review): data_home is re-assigned here to the same value it already has
# (it was used on the first line of this cell); the assignment is redundant.
data_home = '/content/drive/My Drive/dl4m_datasets/trailer_dataset/' 
transf_model = keras.models.load_model(extraction_path)



# Demo trailers as [title, numeric id, multi-hot genre vector, video id].
# The genre vector has 10 entries; the genre name lists defined later in the
# notebook give the column order (Action ... Thriller).
# NOTE(review): the demo folders also contain a trailer (LJbe3ovg-TQ) that is not
# listed here, so find_movie_by_genre returns None for it.
data = [
    ['The Desperate Hour', 764835, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], '1MWECKYcyJk'],
    ['The Munsters', 804413, [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], 'mRE0PUvgsKo'],
    ['The Lucky One', 77877, [0, 0, 0, 0, 1, 0, 0, 1, 0, 0], 't2LI5OOifsQ'],
    ['Body Brokers', 656940, [0, 0, 0, 1, 0, 0, 0, 0, 0, 1], 'j8T0GIpfxr0'],
    ['Wonder Park', 400157, [0, 1, 1, 0, 0, 0, 0, 0, 0, 0], '5MRSBuQt51A'],
    ['Wild Indian', 656991, [0, 0, 0, 1, 1, 0, 0, 0, 0, 1], 'ou6hNBf8FKI']
]

def find_movie_by_genre(genre_list, data):
  """
  Return the title of the first movie whose genre vector equals `genre_list`.

  Parameters
  ----------
  genre_list : list
      Genre vector to look for; compared with `==` against each entry's third field.
  data : list of list
      Movie entries whose first field is the title and third field is the genre vector.

  Returns
  -------
  The title of the first matching entry, or None when no entry matches.
  """
  matching_titles = (entry[0] for entry in data if entry[2] == genre_list)
  return next(matching_titles, None)

# Folder with the demo frames: <data_home>/demo_slides/frames.
demo_path = os.path.join(data_home, 'demo_slides', 'frames')

# Build the frame dataset with the project helper from utils.
# shuffle=False is passed explicitly; a later cell groups every 10 consecutive rows
# as one trailer, which presumably relies on frames staying in order —
# TODO confirm against u.create_dataset.
demo_dataset = u.create_dataset(demo_path, batch_size, target_shape, shuffle=False)

In [None]:
# Run the demo frames through the convolutional base, then persist both arrays
# next to the model artifacts so the next cell can reload them.
demo_features, demo_labels = u.get_features_and_labels(demo_dataset, conv_base)

for array_file, array_values in (('demo_features.npy', demo_features),
                                 ('demo_labels.npy', demo_labels)):
    save_path = os.path.join(data_home, "trailer_model_data", array_file)
    np.save(save_path, array_values)



In [None]:
# Reload the features/labels written by the previous cell.
demo_features = np.load(os.path.join(data_home, "trailer_model_data", 'demo_features.npy'))
demo_labels = np.load(os.path.join(data_home, "trailer_model_data", 'demo_labels.npy'))

# Score every row of the demo features and store the raw (un-thresholded) scores;
# the comparison section reloads this file and thresholds it again.
save_path = os.path.join(data_home, "trailer_model_data", 'frames_demo_predictions.npy')
vis_y_pred = transf_model.predict(demo_features)
np.save(save_path, vis_y_pred)


# Binarise the scores with a 0.35 threshold.
# NOTE(review): the variable name says "_30" but the threshold used is .35.
transf_model_y_pred_30 = (vis_y_pred > .35)
print("Prediction \t\t\t Labels \t\t\t Movie")
# One output line per row: predicted vector, true label vector and the movie title.
# The title is looked up by label vector, so rows whose labels are not in `data` print None.
for i in range(len(demo_features)):
  pred_arr = [int(x) for x in transf_model_y_pred_30[i]]
  print(pred_arr, list(demo_labels[i]), find_movie_by_genre(list(demo_labels[i]), data))







Prediction 			 Labels 			 Movie
[1, 1, 1, 0, 1, 0, 0, 0, 0, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 1, 0, 0, 1, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 1, 0, 0, 1, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 0, 0, 0, 0, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 0, 0, 0, 0, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 0, 0, 0, 0, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 1, 0, 0, 1, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 0, 0, 0, 0, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 1, 0, 0, 1, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 0, 0, 0, 0, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] The Desperate Hour
[1, 1, 1, 0, 1, 1, 0, 0, 0, 1] [0, 0, 0, 0, 0, 1, 0, 0, 1, 0] None
[1, 1, 1, 0, 1, 1, 0, 0, 1, 1] [0, 0, 0, 0, 0, 1, 0, 0, 1, 0] None
[1, 1, 1, 0, 1, 1, 0, 0,

<div class="markdown-google-sans">

## **Audio**
</div>



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import librosa
import json
import keras
import seaborn as sns
from tcn import TCN

In [None]:
# NOTE(review): the import cell above already runs `from tcn import TCN`, so on a
# fresh runtime this install has to be executed before that cell.
!pip install keras-tcn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Audio Utils

In [None]:
# Having issues running with utils.py, so using here


# -*- coding: utf-8 -*-
"""utils_audio_edited.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1bJGc4ADfWAhuq5ksBTKuMIgDPY5C1QSH
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import librosa
import json
import keras
from tensorflow.keras.models import model_from_json

# Must have mapping of id to array of genre labels
# e.g. for id 1, [0,0,1,0,1,0]
# with open('genres.json') as f:
#   label_mapper = json.load(f)

def load_data_first(data_home, corrupt_ids, track_ids = None,
                    genres_path='/content/drive/My Drive/dl4m_datasets/trailer_dataset/trailer_model_data/genres.json'):
    """
    List the tracks found in a directory together with their genre labels,
    skipping IDs that are known to be unusable.

    The track ID of a file is its name without the extension.

    Parameters
    ----------
    data_home : str
        Directory that contains the audio files.
    corrupt_ids : iterable of str or None
        Track IDs to leave out. None is treated as "nothing to skip".
    track_ids : list of str, optional
        Currently unused; kept so the signature stays compatible with existing calls.
    genres_path : str, optional
        Path to the JSON file mapping track ID -> label vector. Defaults to the
        project's genres.json on Google Drive.

    Returns
    -------
    audio_file_paths : list of str
        The kept track IDs (file names WITHOUT extension - same content as `ids`).
    labels : list
        Label vector of each kept track, in the same order.
    ids : list of str
        The kept track IDs.

    Raises
    ------
    KeyError
        If a file's ID has no entry in the label mapping.

    Example
    -------
    >>> test_data, test_labels, test_ids = load_data_first("/path/to/audio", ["bad_id"])
    """

    with open(genres_path) as f:
        label_mapper = json.load(f)

    skipped_ids = set(corrupt_ids or ())

    audio_file_paths = []
    labels = []
    ids = []
    print(data_home)

    # Order follows os.listdir, i.e. it is whatever the file system reports.
    for file in os.listdir(data_home):
        # Strip the extension whatever its length (the original assumed 4 characters).
        file_str = os.path.splitext(str(file))[0]
        if file_str not in skipped_ids:
            audio_file_paths.append(file_str)
            ids.append(file_str)
            labels.append(label_mapper[file_str])

    return audio_file_paths, labels, ids

def load_data(data_home, track_ids = None,
              genres_path='/content/drive/My Drive/dl4m_datasets/trailer_dataset/trailer_model_data/genres.json'):
    """
    Return the audio file names in a directory and their genre labels,
    optionally restricted to a set of track IDs.

    The track ID of a file is its name without the extension. Both `str` and
    `bytes` inputs are accepted: the generators in this notebook call this
    function with `bytes` arguments (they call `.decode()` on `data_home`),
    and direct calls normally pass `str`.

    Parameters
    ----------
    data_home : str or bytes
        Directory that contains the audio files.
    track_ids : iterable of str or bytes, optional
        Track IDs to keep. If None (default), every file in the directory is kept.
    genres_path : str, optional
        Path to the JSON file mapping track ID -> label vector. Defaults to the
        project's genres.json on Google Drive.

    Returns
    -------
    audio_file_paths : list of str
        File names (with extension, relative to `data_home`) of the kept tracks.
    labels : list
        Label vector of each kept track, in the same order.

    Raises
    ------
    KeyError
        If a kept file's ID has no entry in the label mapping.

    Example
    -------
    >>> audio_file_paths, labels = load_data("/path/to/audio", track_ids=["track_1", "track_2"])
    """

    with open(genres_path) as f:
        label_mapper = json.load(f)

    # Normalise to str so the directory listing comes back as str as well.
    data_home = os.fsdecode(data_home)

    wanted_ids = None
    if track_ids is not None:
        wanted_ids = {x.decode() if isinstance(x, bytes) else str(x) for x in track_ids}

    audio_file_paths = []
    labels = []

    # Order follows os.listdir, i.e. it is whatever the file system reports.
    for file in os.listdir(data_home):
        file_str = os.path.splitext(file)[0]
        if wanted_ids is None or file_str in wanted_ids:
            audio_file_paths.append(file)
            labels.append(label_mapper[file_str])

    return audio_file_paths, labels

def window_audio(audio, sample_rate, audio_seg_size, segments_overlap):
    """
    Segment audio into equally sized, overlapping windows.

    Windows start every `audio_seg_size - segments_overlap` seconds, beginning at
    sample 0 and continuing while the start position lies inside the signal.
    Every window that runs past the end of the signal is zero-padded on the
    right, so all returned windows have exactly the same length. (Previously a
    window that overran the signal before the final iteration was returned
    short and unpadded.)

    Parameters
    ----------
    audio : np.ndarray
        The 1D audio signal to be segmented.
    sample_rate : int
        The sampling rate of the audio signal.
    audio_seg_size : float
        The duration of each window in seconds.
    segments_overlap : float
        The duration of the overlap between consecutive windows in seconds.
        Must be smaller than `audio_seg_size`.

    Returns
    -------
    audio_windows : list of np.ndarray
        The windows, each `int(sample_rate * audio_seg_size)` samples long.
        An empty signal yields a single all-zero window.

    Raises
    ------
    ValueError
        If the window size is not positive or the overlap is not smaller than
        the window (the start position would never advance).

    Example
    -------
    >>> import librosa
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> audio_windows = window_audio(y, sr, audio_seg_size=1, segments_overlap=0.5)
    """
    # x samples / second * y seconds = z samples
    window_size = int(sample_rate * audio_seg_size)
    overlap_size = int(sample_rate * segments_overlap)
    hop_size = window_size - overlap_size

    if window_size <= 0 or hop_size <= 0:
        raise ValueError(
            "audio_seg_size must be positive and larger than segments_overlap "
            f"(window={window_size} samples, overlap={overlap_size} samples)"
        )

    audio = np.asarray(audio)
    audio_windows = []
    start_pos = 0

    while True:
        audio_window = audio[start_pos : start_pos + window_size]

        # Zero-pad any window that extends beyond the audio length
        # (np.pad keeps the dtype of the signal).
        missing = window_size - len(audio_window)
        if missing > 0:
            audio_window = np.pad(audio_window, (0, missing))
        audio_windows.append(audio_window)

        # Advance by the hop; stop once the next start is outside the signal.
        start_pos += hop_size
        if start_pos >= len(audio):
            break

    return audio_windows

def compute_mel_spectrogram(audio, sample_rate=22050, n_mels=128, hop_length=512):
    """
    Compute the Mel spectrogram of an audio signal, expressed in decibels.

    Parameters
    ----------
    audio : np.ndarray
        Input audio signal as a 1D numpy array.
    sample_rate : int, optional
        Sampling rate of the audio signal, by default 22050.
    n_mels : int, optional
        Number of Mel bands to generate, by default 128.
    hop_length : int, optional
        Number of samples between successive frames, by default 512.

    Returns
    -------
    np.ndarray
        dB-scaled Mel spectrogram as a 2D numpy array.

    """
    # Power Mel spectrogram first, then conversion to the dB scale
    # (librosa defaults are used for everything not listed here).
    mel_power = librosa.feature.melspectrogram(
        y=audio,
        sr=sample_rate,
        hop_length=hop_length,
        n_mels=n_mels,
    )
    return librosa.power_to_db(mel_power)

def win_generator(data_home, augment, track_ids=None, sample_rate=22050, pitch_shift_steps=2,
                   n_mels=128, hop_length=512, audio_seg_size=1, segments_overlap=0.5, shuffle=True):
    """
    Generator function that yields Mel spectrograms and labels from the specified dataset, with optional data augmentation.
    Audio is broken down in small windows, the spectrogram is computed and yielded along with the label.
    The label of the window is assumed to be the same as the label for the entire track.

    Parameters
    ----------
    data_home : str or bytes
        Directory that contains the audio files. The value is handed to
        `load_data` unchanged and decoded only to build file paths.
    augment : bool
        Whether to apply data augmentation (pitch shifting) to the audio.
    track_ids : list of str, optional
        Track IDs to load; passed straight to `load_data`.
    sample_rate : int, optional
        The sample rate at which to load the audio, by default 22050.
    pitch_shift_steps : int, optional
        The number of steps by which to shift the pitch for data augmentation, by default 2.
    n_mels : int, optional
        The number of Mel bands to generate, by default 128.
    hop_length : int, optional
        The number of samples between successive frames, by default 512.
    audio_seg_size : float, optional
        The size of audio segments in seconds, by default 1.
    segments_overlap : float, optional
        The overlap between audio segments in seconds, by default 0.5.
    shuffle : bool, optional
        Whether to shuffle the tracks before iterating, by default True.

    Yields
    ------
    spectrogram : np.ndarray
        Mel spectrogram of one window with a trailing channel axis,
        shape (n_mels, n_frames, 1).
    label : np.ndarray
        The label vector of the track the window came from.

    Example
    -------
    >>> data_home = "/path/to/data_directory/"
    >>> augment = False
    >>> track_ids = ["track_1", "track_2"]
    >>> generator = win_generator(data_home, augment, track_ids)
    >>> for spectrogram, label in generator:
    ...     # Process spectrogram and label
    """

    # Get list of audio file names and their corresponding labels
    audio_file_paths, labels = load_data(data_home, track_ids=track_ids)

    # Convert labels to numpy array
    labels = np.array(labels)

    # Shuffle data
    if shuffle:
        idxs = np.random.permutation(len(labels))
        audio_file_paths = [audio_file_paths[i] for i in idxs]
        labels = labels[idxs]

    # Accept both bytes and str directories.
    audio_dir = os.fsdecode(data_home)

    # Every track is cut to at most this many seconds so tracks have a common maximum length.
    clip_seconds = 29

    # Spectrogram shape of a full-length window. The frame count follows librosa's
    # default centred framing (1 + n_samples // hop_length); with the default
    # arguments this is the (128, 44, 1) that used to be hard-coded here.
    window_samples = int(sample_rate * audio_seg_size)
    expected_shape = (int(n_mels), 1 + window_samples // int(hop_length), 1)

    for idx in range(len(audio_file_paths)):

        # Load audio at given sample_rate and label
        label = labels[idx]
        print("file path ", audio_file_paths[idx])
        audio, _ = librosa.load(os.path.join(audio_dir, audio_file_paths[idx]), sr=sample_rate)

        audio = audio[:clip_seconds*sample_rate]

        # Apply augmentation
        # NOTE(review): pitch_shift_audio is not defined in the cells visible here;
        # augment=True needs it to be defined elsewhere - confirm.
        if augment:
            audio = pitch_shift_audio(audio, sample_rate, pitch_shift_steps)

        # Compute audio windowing
        audio_windows = window_audio(audio, sample_rate, audio_seg_size, segments_overlap)

        # Loop over windows
        for window in audio_windows:

            # Compute Mel spectrogram and add the channel axis
            spectrogram = compute_mel_spectrogram(window, sample_rate, n_mels, hop_length)
            spectrogram = np.expand_dims(spectrogram, axis=2)

            # Skip windows whose spectrogram does not have the expected shape
            # (e.g. a window shorter than the others).
            if spectrogram.shape != expected_shape:
                print(audio_file_paths[idx])
                print("is a BAD ID")
                continue

            yield spectrogram, label

def create_dataset(data_generator, input_args, input_shape, label_shape=(10,)):
    """
    Create a TensorFlow dataset from a data generator function along with the specified input arguments and shape.

    Each element of the dataset is an (input, label) pair: a float32 tensor of
    shape `input_shape` and a uint8 tensor of shape `label_shape`.

    Parameters
    ----------
    data_generator : callable
        The data generator function to use for creating the dataset.
    input_args : list
        A list containing the arguments to be passed to the data generator function.
    input_shape : tuple
        A tuple representing the shape of the input data.
    label_shape : tuple, optional
        Shape of the label tensor, by default (10,) - the length of the genre
        vectors used in this notebook. Use () for scalar labels.

    Returns
    -------
    dataset : tf.data.Dataset
        A TensorFlow dataset created from the data generator function.

    Example
    -------
    >>> def sample_generator():
    ...     for _ in range(10):
    ...         yield np.random.random((4, 4, 1)), np.zeros(10, dtype=np.uint8)
    >>> input_args = []
    >>> input_shape = (4, 4, 1)
    >>> dataset = create_dataset(sample_generator, input_args, input_shape)
    """

    dataset = tf.data.Dataset.from_generator(
      data_generator,
      args=input_args,
      output_signature=(
          tf.TensorSpec(shape=input_shape, dtype=tf.float32),
          tf.TensorSpec(shape=label_shape, dtype=tf.uint8)))

    return dataset

def wav_generator(data_home, augment, track_ids=None, sample_rate=22050, pitch_shift_steps=2, shuffle=True):
    """
    Generator that yields whole-track waveforms of a fixed length together with their labels.

    Each track is loaded at `sample_rate`, cut to 29 seconds and yielded only if
    it is exactly 29 * sample_rate samples long; shorter tracks are skipped
    silently. (The length check used to be hard-coded to 464000 samples, i.e.
    16 kHz, which dropped every track at any other sample rate.)

    Parameters
    ----------
    data_home : str or bytes
        Directory that contains the audio files. The value is handed to
        `load_data` unchanged and decoded only to build file paths.
    augment : bool
        Whether to apply data augmentation (pitch shifting) to the audio.
    track_ids : list of str, optional
        Track IDs to load; passed straight to `load_data`.
    sample_rate : int, optional
        The sample rate at which to load the audio, by default 22050.
    pitch_shift_steps : int, optional
        The number of steps by which to shift the pitch for data augmentation, by default 2.
    shuffle : bool, optional
        Whether to shuffle the tracks before iterating, by default True. Pass
        False when the output order has to match an external list of labels.

    Yields
    ------
    audio : np.ndarray
        Waveform of shape (29 * sample_rate,).
    label : np.ndarray
        The label vector of the track.
    """

    audio_file_paths, labels = load_data(data_home, track_ids=track_ids)
    print([audio_file_paths,labels])

    # Convert labels to numpy array
    labels = np.array(labels)

    # Shuffle data
    if shuffle:
        idxs = np.random.permutation(len(labels))
        audio_file_paths = [audio_file_paths[i] for i in idxs]
        labels = labels[idxs]

    # Accept both bytes and str directories.
    audio_dir = os.fsdecode(data_home)

    # Number of samples every yielded clip must have.
    clip_samples = 29 * int(sample_rate)

    for idx in range(len(audio_file_paths)):

        # Load audio at given sample_rate and label
        label = labels[idx]
        print("file path ", audio_file_paths[idx])
        audio, _ = librosa.load(os.path.join(audio_dir, audio_file_paths[idx]), sr=sample_rate)

        # Shorten audio to 29s so every clip has the same duration
        audio = audio[:clip_samples]

        # Apply augmentation
        # NOTE(review): pitch_shift_audio is not defined in the cells visible here;
        # augment=True needs it to be defined elsewhere - confirm.
        if augment:
            audio = pitch_shift_audio(audio, sample_rate, pitch_shift_steps)

        # Tracks shorter than the clip length cannot be batched; skip them.
        if audio.shape != (clip_samples,):
            continue

        yield audio, label

def extract_yamnet_embedding(wav_data, yamnet):
    """
    Run YAMNet on a waveform and keep only the embeddings.

    Parameters
    ----------
    wav_data : np.ndarray
        The audio signal to be processed.
    yamnet : callable
        The pre-trained YAMNet model. Calling it must return exactly three
        values, of which the second is the embeddings.

    Returns
    -------
    The second of the three values returned by `yamnet(wav_data)`.
    """
    # The model returns (scores, embeddings, spectrogram); only the middle one is used.
    _scores, embeddings, _spectrogram = yamnet(wav_data)
    return embeddings

def reload_tcn(model_path, weights_path, optimizer, loss, metrics):
    """
    Reload a TCN model from a JSON file and restore its weights.
    Preferred method when dealing with custom layers.

    Parameters
    ----------
    model_path : str
        The path to the JSON file containing the model architecture.
    weights_path : str
        The path to the model weights file.
    optimizer : str or tf.keras.optimizers.Optimizer
        The optimizer to use when compiling the model.
    loss : str or tf.keras.losses.Loss
        The loss function to use when compiling the model.
    metrics : list of str or tf.keras.metrics.Metric
        The list of metrics to use when compiling the model.

    Returns
    -------
    reloaded_model : tf.keras.Model
        The reloaded, compiled model with the restored weights.

    Example
    -------
    >>> model_path = 'path/to/saved_model.json'
    >>> weights_path = 'path/to/saved_weights.h5'
    >>> optimizer = 'adam'
    >>> loss = 'sparse_categorical_crossentropy'
    >>> metrics = ['accuracy']
    >>> reloaded_model = reload_tcn(model_path, weights_path, optimizer, loss, metrics)
    """
    from tensorflow.keras.models import model_from_json

    # Read the architecture inside a context manager so the file handle is
    # closed (it used to be left open).
    with open(model_path, 'r') as json_file:
        loaded_json = json_file.read()

    # The architecture contains the custom TCN layer, so it has to be
    # registered explicitly when rebuilding the model from JSON.
    reloaded_model = model_from_json(loaded_json, custom_objects={'TCN': TCN})

    reloaded_model.compile(optimizer=optimizer,
                           loss=loss,
                           metrics=metrics)
    # restore weights
    reloaded_model.load_weights(weights_path)

    return reloaded_model



### Audio Processing

In [None]:
# Must have id/label mapping
# e.g. for id 1, [0,0,1,0,1,0]

# Folder with the model artifacts (genres.json, saved model JSON and weights).
model_data_home = '/content/drive/My Drive/dl4m_datasets/trailer_dataset/trailer_model_data/'


# NOTE(review): load_data_first and load_data read genres.json themselves, so
# label_mapper is not used by them; it is kept here for inspection.
with open(os.path.join(model_data_home, 'genres.json')) as f:
  label_mapper = json.load(f)

In [None]:
# Some IDs cause model to crash - remove these
corrupt_ids = ["gYbW1F_c9eM", "VW-F1H-Nonk", "j9N0nvBITzk", "xNstK5rbzcw", "5tGgqyhCIXQ", "t2LI5OOifsQ", "lcwmDAYt22k", "RGyrxamYhUA", "vlEwqBrbPPU"]

In [None]:
# load in test data 
test_data, test_labels, test_ids = load_data_first(data_home+"demo_slides/audio", corrupt_ids)

/content/drive/My Drive/dl4m_datasets/trailer_dataset/demo_slides/audio


In [None]:
# create dataset
sample_rate = 16000  # 16kHz for Yamnet
augment = False
input_shape = (29*sample_rate,)
# Arguments are passed to wav_generator positionally:
#   data_home, augment, track_ids, sample_rate, pitch_shift_steps, shuffle
# shuffle must be False here: test_labels / test_ids are in directory order, and
# with the generator's default shuffle=True the prediction rows come out in a
# different order than the labels they are compared with.
# pitch_shift_steps (2, the generator's default) is only listed to reach `shuffle`.
input_args_test  = [data_home+'demo_slides/audio/', augment, test_ids, sample_rate, 2, False]
dataset_test= create_dataset(wav_generator, input_args_test, input_shape)


In [None]:
# look at dataset
dataset_test

<_FlatMapDataset element_spec=(TensorSpec(shape=(464000,), dtype=tf.float32, name=None), TensorSpec(shape=(10,), dtype=tf.uint8, name=None))>

In [None]:
# import yamnet
import tensorflow_hub as hub
yamnet = hub.load('https://tfhub.dev/google/yamnet/1')

In [None]:
# Define a mapping function to extract embeddings
def map_function(audio, label):
    """Replace a waveform with its YAMNet embeddings; the label passes through unchanged."""
    embeddings = extract_yamnet_embedding(audio, yamnet)
    return embeddings, label

In [None]:
# Load the best checkpoint of the model and set hyperparameters

# Compile settings handed to reload_tcn.
optimizer = 'adam'
loss = "BinaryCrossentropy"
metrics = ["accuracy"]
# Architecture (JSON) and weights (HDF5) of the saved model.
model_path = (os.path.join(model_data_home, "yamnet_model.json"))
model_weights = (os.path.join(model_data_home, "yamnet_weights.h5"))
# NOTE: this re-assigns batch_size, which the frames section set to 7.
batch_size = 32


tcn_yamnet_reloaded = reload_tcn(model_path, model_weights, optimizer, loss, metrics)

# Predict on the test set: each waveform is mapped to YAMNet embeddings
# (map_function), batched, and scored by the reloaded model.
# The rows of `predictions` follow the order in which the generator yields tracks.
predictions = tcn_yamnet_reloaded.predict(dataset_test.prefetch(tf.data.AUTOTUNE).map(map_function).batch(batch_size))

[['1MWECKYcyJk.wav', 'LJbe3ovg-TQ.wav', 'mRE0PUvgsKo.wav', 'j8T0GIpfxr0.wav', '5MRSBuQt51A.wav', 'ou6hNBf8FKI.wav'], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 0, 0, 0, 0, 1]]]
file path  LJbe3ovg-TQ.wav


  audio, _ = librosa.load(data_home.decode()+audio_file_paths[idx], sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


file path  j8T0GIpfxr0.wav


  audio, _ = librosa.load(data_home.decode()+audio_file_paths[idx], sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


file path  5MRSBuQt51A.wav


  audio, _ = librosa.load(data_home.decode()+audio_file_paths[idx], sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


file path  1MWECKYcyJk.wav


  audio, _ = librosa.load(data_home.decode()+audio_file_paths[idx], sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


file path  mRE0PUvgsKo.wav


  audio, _ = librosa.load(data_home.decode()+audio_file_paths[idx], sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


file path  ou6hNBf8FKI.wav


  audio, _ = librosa.load(data_home.decode()+audio_file_paths[idx], sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)




In [None]:
# look at predictions
predictions
print(len(predictions))

6


In [None]:
# set decision threshold
y_pred = (predictions > .5)

In [None]:
# look at labels
test_labels

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
 [0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 1, 0, 0, 0, 0, 1]]

In [None]:
# look at predictions
y_pred

array([[False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False,  True, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False],
       [False, False,  True, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
         True]])

In [None]:
# make class report
from sklearn.metrics import classification_report
# Column order of the 10-element label vectors.
genre_names = ['Action', 'Adventure', 'Comedy', 'Crime', 'Drama', 'Horror', 'Mystery', 'Romance', 'Science Fiction', 'Thriller']
# output_dict=True returns the report as a dict; it is stored in `cr` and not
# displayed by this cell.
# NOTE(review): the report is only meaningful if the rows of y_pred are in the
# same order as test_labels - confirm the generator order.
cr = classification_report(y_true = test_labels, y_pred = y_pred, target_names= genre_names, output_dict = True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# y_pred is the boolean array produced by thresholding `predictions` at .5, so
# the file written here holds booleans, not raw scores (unlike
# frames_demo_predictions.npy, which stores the raw scores).
audio_y_pred = y_pred

save_path = os.path.join(data_home, "trailer_model_data", 'audio_demo_predictions.npy')
np.save(save_path, audio_y_pred)

<div class="markdown-google-sans">

## **Comparing Results**
</div>

Comparing the predictions of the frame model (VGG16 pre-trained on ImageNet) with the predictions of the audio model (YAMNet embeddings).

In [None]:
# Number of consecutive frame rows that belong to one trailer.
FRAMES_PER_TRAILER = 10

# Audio predictions: one row per trailer. The saved array is thresholded at .5
# (a no-op if it already holds booleans).
save_path = os.path.join(data_home, "trailer_model_data", 'audio_demo_predictions.npy')
audio_pred = np.load(save_path)
audio_pred = (audio_pred > .5)

# Frame predictions: one row per frame, raw scores thresholded at .35.
save_path = os.path.join(data_home, "trailer_model_data", 'frames_demo_predictions.npy')
frames_pred = np.load(save_path)
frames_pred = (frames_pred > .35)

# Boolean arrays -> lists of 0/1 ints.
converted_vis_preds = frames_pred.astype(int).tolist()
converted_audio_preds = audio_pred.astype(int).tolist()

# Majority vote over the frames of each trailer: a genre is predicted for the
# trailer when at least half of its frames predict it.
vis_averages = []
for i in range(0, len(converted_vis_preds), FRAMES_PER_TRAILER):
    group = converted_vis_preds[i:i+FRAMES_PER_TRAILER]
    majority = np.mean(group, axis=0) >= 0.5
    vis_averages.append(majority.astype(int).tolist())

# The label of a trailer is the label of its first frame (the label average
# that used to be computed here was never used).
each_label = [list(demo_labels[i]) for i in range(0, len(demo_labels), FRAMES_PER_TRAILER)]




In [None]:
import pandas as pd

# Column order of the 10-element prediction / label vectors.
genres = ['Action', 'Adventure', 'Comedy', 'Crime', 'Drama', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller']
row_names = ["Image Predictions", "Audio Predictions", "Actuals"]

# One table per trailer: frame-based prediction, audio-based prediction and the
# true labels, with the movie title as the name of the index.
# NOTE(review): row i of the audio predictions is assumed to belong to the same
# trailer as group i of the frame predictions - confirm both are in the same order.
comparison_tables = []
for i in range(len(vis_averages)):
  # Title looked up by label vector; None when the labels are not in `data`.
  name = find_movie_by_genre(each_label[i], data)

  # Build the whole table in one constructor call instead of growing an empty
  # frame row by row.
  df = pd.DataFrame(
      [vis_averages[i], converted_audio_preds[i], each_label[i]],
      columns=genres,
      index=row_names,
  ).rename_axis(name)

  comparison_tables.append(df)



In [None]:
comparison_tables[0]

Unnamed: 0_level_0,Action,Adventure,Comedy,Crime,Drama,Horror,Mystery,Romance,Sci-Fi,Thriller
The Desperate Hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Image Predictions,1,1,1,0,1,0,0,0,0,1
Audio Predictions,0,0,0,0,0,0,0,0,0,0
Actuals,0,0,0,0,0,0,0,0,0,1


In [None]:
comparison_tables[1]

Unnamed: 0,Action,Adventure,Comedy,Crime,Drama,Horror,Mystery,Romance,Sci-Fi,Thriller
Image Predictions,1,1,1,0,1,1,0,0,0,1
Audio Predictions,0,0,0,0,0,0,0,0,0,0
Actuals,0,0,0,0,0,1,0,0,1,0


In [None]:
comparison_tables[2]

Unnamed: 0_level_0,Action,Adventure,Comedy,Crime,Drama,Horror,Mystery,Romance,Sci-Fi,Thriller
The Munsters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Image Predictions,1,1,1,0,1,0,0,0,0,1
Audio Predictions,0,0,1,0,0,0,0,0,0,0
Actuals,0,0,1,0,0,0,0,0,0,0


In [None]:
comparison_tables[3]

Unnamed: 0_level_0,Action,Adventure,Comedy,Crime,Drama,Horror,Mystery,Romance,Sci-Fi,Thriller
Body Brokers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Image Predictions,1,1,1,0,1,0,0,0,0,1
Audio Predictions,0,0,0,0,0,0,0,0,0,0
Actuals,0,0,0,1,0,0,0,0,0,1


In [None]:
comparison_tables[4]

Unnamed: 0_level_0,Action,Adventure,Comedy,Crime,Drama,Horror,Mystery,Romance,Sci-Fi,Thriller
Wonder Park,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Image Predictions,1,1,1,0,1,0,0,0,0,1
Audio Predictions,0,0,1,0,0,0,0,0,0,0
Actuals,0,1,1,0,0,0,0,0,0,0


In [None]:
comparison_tables[5]

Unnamed: 0_level_0,Action,Adventure,Comedy,Crime,Drama,Horror,Mystery,Romance,Sci-Fi,Thriller
Wild Indian,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Image Predictions,1,1,1,0,1,1,0,0,1,1
Audio Predictions,0,0,0,0,0,0,0,0,0,1
Actuals,0,0,0,1,1,0,0,0,0,1
