This notebook uses different models to predict emotions in music.  

The models are trained on two datasets ([DEAM](https://cvml.unige.ch/databases/DEAM/), [EmoMusic](https://cvml.unige.ch/databases/emoMusic/)), each combined with three types of deep embeddings ([AudioSet-VGGish](https://essentia.upf.edu/models.html#audioset-vggish), [Discogs-EffNet](https://essentia.upf.edu/models.html#discogs-effnet), and [MSD-MusiCNN](https://essentia.upf.edu/models.html#msd-musicnn)), giving 6 models in total.  

Then, we use Spotify's API to analyze the emotion prediction of the audio.

## Install Libraries

In [None]:
!pip install essentia-tensorflow

In [2]:
import os
import json
from essentia import Pool
from essentia.standard import (
    MonoLoader,
    TensorflowPredict,
    TensorflowPredictMusiCNN,
)

## Load audio file

In [None]:
# Load every audio file found under `audio_folder`.
# The embeddings models work with input audio at a 16 kHz sample rate,
# so MonoLoader is asked to resample while loading.
audio_folder = "./audio.002"
allAudios = os.walk(audio_folder)
audio = []

# sorted() orders the walk tuples by directory path, and the file names are
# sorted per directory, so the files are always loaded in the same order.
# `dirpath` replaces the former loop variable `dir`, which shadowed the builtin.
for dirpath, _dirnames, audiofile in sorted(allAudios):
  for audioname in sorted(audiofile):
    audio_path = os.path.join(dirpath, audioname)
    print(audioname)

    audio_resample = MonoLoader(filename=audio_path, sampleRate=16000)()    # resample audio to 16k Hz.
    audio.append(audio_resample)
    # Print a short summary instead of dumping the whole list of sample arrays.
    print(f"  loaded {len(audio_resample)} samples ({len(audio)} file(s) so far)")

## Predicting

### DEAM-AudioSet-VGGish

In [6]:
from essentia.standard import TensorflowPredictVGGish

# Graph files: the DEAM arousal/valence model and the AudioSet-VGGish
# embeddings model whose output feeds it.
av_model_path = "./essentia-models/deam-vggish-audioset-1/deam-vggish-audioset-1.pb"
pretrained_model_path = "./essentia-models/audioset-vggish-3.pb"

# No patch_size / patch_hop_size are passed for VGGish;
# only the name of the embeddings output layer is configured.
output_layer = "model/vggish/embeddings"

# Build the embeddings extractor once; the prediction loop calls it per audio file.
embeddings_model = TensorflowPredictVGGish(graphFilename=pretrained_model_path, output=output_layer)

In [7]:
# Load the arousal-valence model; inference is run with TensorflowPredict().

# Read the input and output layer names from the model's metadata file.
# The context manager closes the file as soon as the JSON has been parsed.
metadata_path = "./essentia-models/deam-vggish-audioset-1/deam-vggish-audioset-1.json"
with open(metadata_path, "r") as metadata_file:
    metadata = json.load(metadata_file)

# The first entry of each schema list names the layer to feed / to read.
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

In [8]:
# Compute embeddings and arousal/valence predictions for every loaded track.
for track in audio:
  embeddings = embeddings_model(track)

  # Reshape the embeddings to (N, 1, 1, D) before storing them in the pool.
  embedding_dim = embeddings.shape[1]
  feature = embeddings.reshape(-1, 1, 1, embedding_dim)

  pool = Pool()
  pool.set(input_layer, feature)
  raw_output = av_model(pool)[output_layer]
  # NOTE(review): squeeze() drops every size-1 axis; if a clip produced a single
  # row, the mean below would no longer be taken per output column - confirm.
  predictions = raw_output.squeeze()

  # Average the predictions along the first axis to get one arousal-valence
  # representation for the entire song.
  song_prediction = predictions.mean(axis=0)
  print(f"prediction: {song_prediction}")  # [valence, arousal]

prediction: [6.114302 6.247788]
prediction: [5.8908854 5.915308 ]
prediction: [5.951558  5.8261423]
prediction: [5.610529  5.3558865]
prediction: [5.47648  5.353496]
prediction: [4.6646414 4.892181 ]
prediction: [4.4502335 4.613028 ]
prediction: [4.0791144 4.3315163]


### DEAM-Discogs-EffNet

In [9]:
# Graph files: the DEAM arousal/valence model and the Discogs-EffNet
# embeddings model whose output feeds it.
av_model_path = "./essentia-models/deam-effnet-discogs-1/deam-effnet-discogs-1.pb"
embeddings_model_path = "./essentia-models/effnet-discogs-1.pb"

# Layer names of the embeddings graph.
input_layer = "melspectrogram"
output_layer = "onnx_tf_prefix_BatchNormalization_496/add_1"

# Patch size of 128 with a hop of half a patch (64).
patch_size = 128
patch_hop_size = patch_size // 2

# Build the embeddings extractor once; the prediction loop calls it per audio file.
# NOTE(review): the EffNet graph is loaded through TensorflowPredictMusiCNN here;
# Essentia also provides TensorflowPredictEffnetDiscogs - confirm this is intended.
embeddings_model = TensorflowPredictMusiCNN(
    graphFilename=embeddings_model_path,
    patchSize=patch_size,
    patchHopSize=patch_hop_size,
    input=input_layer,
    output=output_layer,
)

In [10]:
# Load the arousal-valence model; inference is run with TensorflowPredict().

# Read the input and output layer names from the model's metadata file.
# The context manager closes the file as soon as the JSON has been parsed.
metadata_path = "./essentia-models/deam-effnet-discogs-1/deam-effnet-discogs-1.json"
with open(metadata_path, "r") as metadata_file:
    metadata = json.load(metadata_file)

# The first entry of each schema list names the layer to feed / to read.
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

In [11]:
# Compute embeddings and arousal/valence predictions for every loaded track.
for track in audio:
  embeddings = embeddings_model(track)

  # Reshape the embeddings to (N, 1, 1, D) before storing them in the pool.
  embedding_dim = embeddings.shape[1]
  feature = embeddings.reshape(-1, 1, 1, embedding_dim)

  pool = Pool()
  pool.set(input_layer, feature)
  raw_output = av_model(pool)[output_layer]
  # NOTE(review): squeeze() drops every size-1 axis; if a clip produced a single
  # row, the mean below would no longer be taken per output column - confirm.
  predictions = raw_output.squeeze()

  # Average the predictions along the first axis to get one arousal-valence
  # representation for the entire song.
  song_prediction = predictions.mean(axis=0)
  print(f"prediction: {song_prediction}")  # [valence, arousal]

prediction: [6.271964  5.9751506]
prediction: [6.7898927 5.7247977]
prediction: [5.960185  5.8681226]
prediction: [5.488956  4.9948626]
prediction: [5.575601 5.563194]
prediction: [5.054484 5.153976]
prediction: [4.4691358 4.671598 ]
prediction: [4.191529  4.4826646]


### DEAM-MSD-MusiCNN

In [12]:
# Graph files: the DEAM arousal/valence model and the MSD-MusiCNN
# embeddings model whose output feeds it.
av_model_path = "./essentia-models/deam-musicnn-msd-1/deam-musicnn-msd-1.pb"
pretrained_model_path = "./essentia-models/msd-musicnn-1.pb"

# Layer names of the embeddings graph.
input_layer = "model/Placeholder"
output_layer = "model/dense/BiasAdd"

# Patch size of 187 with a hop of half a patch (integer division: 93).
patch_size = 187
patch_hop_size = patch_size // 2

# Build the embeddings extractor once; the prediction loop calls it per audio file.
embeddings_model = TensorflowPredictMusiCNN(
    graphFilename=pretrained_model_path,
    patchSize=patch_size,
    patchHopSize=patch_hop_size,
    input=input_layer,
    output=output_layer,
)

In [13]:
# Load the arousal-valence model; inference is run with TensorflowPredict().

# Read the input and output layer names from the model's metadata file.
# The context manager closes the file as soon as the JSON has been parsed.
metadata_path = "./essentia-models/deam-musicnn-msd-1/deam-musicnn-msd-1.json"
with open(metadata_path, "r") as metadata_file:
    metadata = json.load(metadata_file)

# The first entry of each schema list names the layer to feed / to read.
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

In [14]:
# Compute embeddings and arousal/valence predictions for every loaded track.
for track in audio:
  embeddings = embeddings_model(track)

  # Reshape the embeddings to (N, 1, 1, D) before storing them in the pool.
  embedding_dim = embeddings.shape[1]
  feature = embeddings.reshape(-1, 1, 1, embedding_dim)

  pool = Pool()
  pool.set(input_layer, feature)
  raw_output = av_model(pool)[output_layer]
  # NOTE(review): squeeze() drops every size-1 axis; if a clip produced a single
  # row, the mean below would no longer be taken per output column - confirm.
  predictions = raw_output.squeeze()

  # Average the predictions along the first axis to get one arousal-valence
  # representation for the entire song.
  song_prediction = predictions.mean(axis=0)
  print(f"prediction: {song_prediction}")  # [valence, arousal]

prediction: [5.6215873 5.9229913]
prediction: [5.8505416 5.637046 ]
prediction: [6.593474  6.1359262]
prediction: [5.8707404 5.360821 ]
prediction: [5.841827 5.894704]
prediction: [5.2682815 5.4188595]
prediction: [4.658526  4.7978835]
prediction: [3.9800584 4.175116 ]


### EmoMusic-AudioSet-VGGish

In [15]:
# Graph files: the EmoMusic arousal/valence model and the AudioSet-VGGish
# embeddings model whose output feeds it.
av_model_path = "./essentia-models/emomusic-vggish-audioset-1/emomusic-vggish-audioset-1.pb"
pretrained_model_path = "./essentia-models/audioset-vggish-3.pb"

# No patch_size / patch_hop_size are passed for VGGish;
# only the name of the embeddings output layer is configured.
output_layer = "model/vggish/embeddings"

# Build the embeddings extractor once; the prediction loop calls it per audio file.
embeddings_model = TensorflowPredictVGGish(graphFilename=pretrained_model_path, output=output_layer)

In [16]:
# Load the arousal-valence model; inference is run with TensorflowPredict().

# Read the input and output layer names from the model's metadata file.
# The context manager closes the file as soon as the JSON has been parsed.
metadata_path = "./essentia-models/emomusic-vggish-audioset-1/emomusic-vggish-audioset-1.json"
with open(metadata_path, "r") as metadata_file:
    metadata = json.load(metadata_file)

# The first entry of each schema list names the layer to feed / to read.
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

In [17]:
# Compute embeddings and arousal/valence predictions for every loaded audio.
for i in audio:
  embeddings = embeddings_model(i)

  # Reshape the embeddings to (N, 1, 1, D) before storing them in the pool.
  feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])

  # Create a fresh Pool per track, as the other prediction cells do. Without
  # this line the cell silently reused the `pool` left over from an earlier
  # cell and failed with a NameError when that cell had not been run.
  pool = Pool()
  pool.set(input_layer, feature)
  predictions = av_model(pool)[output_layer].squeeze()

  # Estimate the average of the predictions to get an arousal-valence
  # representation for the entire song.
  print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]

prediction: [6.161597  6.0167737]
prediction: [5.6637535 6.418614 ]
prediction: [6.295356 6.213693]
prediction: [4.951011  5.9048142]
prediction: [5.5317736 5.681318 ]
prediction: [4.5389795 4.650194 ]
prediction: [5.1369176 4.438523 ]
prediction: [4.5149903 4.1252203]


### EmoMusic-Discogs-EffNet

In [18]:
# Graph files: the EmoMusic arousal/valence model and the Discogs-EffNet
# embeddings model whose output feeds it.
av_model_path = "./essentia-models/emomusic-effnet-discogs-1/emomusic-effnet-discogs-1.pb"
embeddings_model_path = "./essentia-models/effnet-discogs-1.pb"

# Layer names of the embeddings graph.
input_layer = "melspectrogram"
output_layer = "onnx_tf_prefix_BatchNormalization_496/add_1"

# Patch size of 128 with a hop of half a patch (64).
patch_size = 128
patch_hop_size = patch_size // 2

# Build the embeddings extractor once; the prediction loop calls it per audio file.
# NOTE(review): the EffNet graph is loaded through TensorflowPredictMusiCNN here;
# Essentia also provides TensorflowPredictEffnetDiscogs - confirm this is intended.
embeddings_model = TensorflowPredictMusiCNN(
    graphFilename=embeddings_model_path,
    patchSize=patch_size,
    patchHopSize=patch_hop_size,
    input=input_layer,
    output=output_layer,
)

In [19]:
# Load the arousal-valence model; inference is run with TensorflowPredict().

# Read the input and output layer names from the model's metadata file.
# The context manager closes the file as soon as the JSON has been parsed.
metadata_path = "./essentia-models/emomusic-effnet-discogs-1/emomusic-effnet-discogs-1.json"
with open(metadata_path, "r") as metadata_file:
    metadata = json.load(metadata_file)

# The first entry of each schema list names the layer to feed / to read.
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

In [20]:
# Compute embeddings and arousal/valence predictions for every loaded track.
for track in audio:
  embeddings = embeddings_model(track)

  # Reshape the embeddings to (N, 1, 1, D) before storing them in the pool.
  embedding_dim = embeddings.shape[1]
  feature = embeddings.reshape(-1, 1, 1, embedding_dim)

  pool = Pool()
  pool.set(input_layer, feature)
  raw_output = av_model(pool)[output_layer]
  # NOTE(review): squeeze() drops every size-1 axis; if a clip produced a single
  # row, the mean below would no longer be taken per output column - confirm.
  predictions = raw_output.squeeze()

  # Average the predictions along the first axis to get one arousal-valence
  # representation for the entire song.
  song_prediction = predictions.mean(axis=0)
  print(f"prediction: {song_prediction}")  # [valence, arousal]

prediction: [7.0017533 6.36807  ]
prediction: [6.1376977 6.7136407]
prediction: [6.2747164 5.6342397]
prediction: [4.7400765 5.74673  ]
prediction: [5.5805    5.6627116]
prediction: [5.074748 4.960502]
prediction: [4.879995 4.312363]
prediction: [4.668727  4.1893415]


### EmoMusic-MSD-MusiCNN

In [21]:
# Graph files: the EmoMusic arousal/valence model and the MSD-MusiCNN
# embeddings model whose output feeds it.
av_model_path = "./essentia-models/emomusic-musicnn-msd-1/emomusic-musicnn-msd-1.pb"
pretrained_model_path = "./essentia-models/msd-musicnn-1.pb"

# Layer names of the embeddings graph.
input_layer = "model/Placeholder"
output_layer = "model/dense/BiasAdd"

# MusiCNN uses a different patch size than the EffNet cells (187 instead of 128);
# the hop is again half a patch (integer division: 93).
patch_size = 187
patch_hop_size = patch_size // 2

# Build the embeddings extractor once; the prediction loop calls it per audio file.
embeddings_model = TensorflowPredictMusiCNN(
    graphFilename=pretrained_model_path,
    patchSize=patch_size,
    patchHopSize=patch_hop_size,
    input=input_layer,
    output=output_layer,
)

In [22]:
# Load the arousal-valence model; inference is run with TensorflowPredict().

# Read the input and output layer names from the model's metadata file.
# The context manager closes the file as soon as the JSON has been parsed.
metadata_path = "./essentia-models/emomusic-musicnn-msd-1/emomusic-musicnn-msd-1.json"
with open(metadata_path, "r") as metadata_file:
    metadata = json.load(metadata_file)

# The first entry of each schema list names the layer to feed / to read.
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

In [23]:
# Compute embeddings and arousal/valence predictions for every loaded track.
for track in audio:
  embeddings = embeddings_model(track)

  # Reshape the embeddings to (N, 1, 1, D) before storing them in the pool.
  embedding_dim = embeddings.shape[1]
  feature = embeddings.reshape(-1, 1, 1, embedding_dim)

  pool = Pool()
  pool.set(input_layer, feature)
  raw_output = av_model(pool)[output_layer]
  # NOTE(review): squeeze() drops every size-1 axis; if a clip produced a single
  # row, the mean below would no longer be taken per output column - confirm.
  predictions = raw_output.squeeze()

  # Average the predictions along the first axis to get one arousal-valence
  # representation for the entire song.
  song_prediction = predictions.mean(axis=0)
  print(f"prediction: {song_prediction}")  # [valence, arousal]

prediction: [6.269694 5.799119]
prediction: [5.071554 6.268912]
prediction: [6.6763215 6.54575  ]
prediction: [4.2763968 5.8142323]
prediction: [5.871333 6.014273]
prediction: [5.2285833 5.0768723]
prediction: [5.3695736 4.3264213]
prediction: [4.6982937 4.1428885]


### Spotify

In [25]:
import yaml

# Folder holding one Spotify API annotation file per track.
api_folder = "./annotations-spotifyapi.002"
allAnnot = os.walk(api_folder, topdown=False)


# Visit the annotation files in sorted order (directories, then file names).
for dirs, dirames, apifile in sorted(allAnnot):
  for api in sorted(apifile):
    api_path = os.path.join(dirs, api)

    # Close each annotation file right after parsing instead of leaking
    # one open handle per file.
    with open(api_path, "r") as api_file:
      metadata = yaml.safe_load(api_file)

    valence = metadata["audio_features"]["valence"]
    # Spotify's "energy" audio feature is used as the arousal value.
    arousal = metadata["audio_features"]["energy"]
    print([valence, arousal])

[0.955, 0.73]
[0.228, 0.73]
[0.176, 0.955]
[0.365, 0.671]
[0.469, 0.64]
[0.748, 0.894]
[0.573, 0.241]
[0.461, 0.225]
