In [None]:
from google.colab import userdata
from google.colab import drive

my_secret_key = userdata.get("comet_ml_api_key")

drive.mount("/content/drive/")

In [None]:
! pip install --upgrade comet_ml --quiet
from comet_ml import start

experiment = start(
    api_key=my_secret_key, project_name="wav2vec-mer", workspace="nikzagl"
)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/nikzagl/wav2vec-mer/568e0c39b6ff4cc1a2152e153ff43dd3

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
%cd "/content/drive/MyDrive/MER Project"

/content/drive/MyDrive/MER Project


In [4]:
import fnmatch
import os

files = []
for root, dirnames, filenames in os.walk("MP3-Example"):
    for filename in fnmatch.filter(filenames, "*mp3"):
        files.append(os.path.join(root, filename))

In [None]:
import re

track_ids = list()
for file in files:
    track_id = re.split(r"\.|-", file)[-2]
    track_ids.append(track_id)

In [6]:
import pandas as pd

musicinfo_df = pd.read_csv("Music Info.csv")

In [7]:
track_ids = pd.DataFrame(track_ids).rename({0: "track_id"}, axis=1)
print(track_ids)
track_ids = track_ids.join(musicinfo_df.set_index("track_id"), on="track_id")

                track_id
0     TRAFNQO12903CBB254
1     TRAAEJQ128F92C484E
2     TRACTQD128F14B0F9D
3     TRANLAK128F429F8D0
4     TRAMMYK128E07936F9
...                  ...
1495  TRXJGBY128F930137D
1496  TRSVTIE128F428079E
1497  TRVBLFJ128F426AAB9
1498  TRXMDGW128F426CDB3
1499  TRPYIKK128F932B961

[1500 rows x 1 columns]


In [8]:
def classify_emotions(energy, valence):
    e = energy - 0.5
    v = valence - 0.5
    if v > 0 and e > 0:
        return "Joy"
    elif v < 0 and e > 0:
        return "Anger"
    elif v < 0 and e < 0:
        return "Sad"
    else:
        return "Pleasure"

In [9]:
emotions = track_ids.apply(lambda x: classify_emotions(x.energy, x.valence), axis=1)

In [10]:
emotions

Unnamed: 0,0
0,Joy
1,Joy
2,Sad
3,Anger
4,Joy
...,...
1495,Joy
1496,Pleasure
1497,Joy
1498,Sad


In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
le = LabelEncoder()

In [13]:
labels = le.fit_transform(emotions)

In [14]:
labels

array([1, 1, 3, ..., 1, 3, 0])

In [15]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [16]:
tensor_df = torch.load("waveforms_tensor.pt")

  tensor_df = torch.load("waveforms_tensor.pt")


In [17]:
from sklearn.model_selection import train_test_split

tensor_df_train, tensor_df_test, labels_train, labels_test = train_test_split(
    tensor_df.T, labels, random_state=42
)

In [18]:
train_dataset = TensorDataset(tensor_df_train, torch.tensor(labels_train))

In [19]:
train_dataset[0]

(tensor([ 0.0746,  0.0559, -0.0754,  ..., -0.0258, -0.0304,  0.0467]),
 tensor(2))

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base-960h", num_labels=4
).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import random

hyper_params = {"seed": 42, "batch_size": 16, "num_epochs": 25, "learning_rate": 1e-5}


# Logging hyperparamters
experiment.log_parameters(hyper_params)
random.seed(hyper_params["seed"])
torch.manual_seed(hyper_params["seed"])

<torch._C.Generator at 0x79d9ee0affd0>

In [23]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
import numpy as np

# Получение весов классов
class_weights = compute_class_weight(
    "balanced", classes=np.unique(labels_train), y=labels_train
)

In [None]:
train_loader = DataLoader(
    train_dataset, batch_size=hyper_params["batch_size"], shuffle=True
)
test_dataset = TensorDataset(tensor_df_test, torch.tensor(labels_test))
test_loader = DataLoader(
    test_dataset, shuffle=False, batch_size=hyper_params["batch_size"]
)
# Оптимизатор
optimizer = torch.optim.Adam(model.parameters(), hyper_params["learning_rate"])
loss_function = torch.nn.CrossEntropyLoss(
    weight=torch.tensor(class_weights).float().to(device)
)
num_epochs = 25
model.train()
# Цикл обучения
step = 0
with experiment.train():
    for epoch in range(num_epochs):
        losses = list()
        model.train()
        for audio, labels in train_loader:
            optimizer.zero_grad()
            audio = audio.to(device)
            labels = labels.to(device)
            outputs = model(audio)
            loss = loss_function(outputs.logits, labels)
            losses.append(loss.item())
            print(loss)
            experiment.log_metric(name="loss", value=loss.item(), step=step)
            loss.backward()
            optimizer.step()
            step += 1
            del audio
            del labels
        model.eval()
        accuracy_score = 0
        labels_pred = list()
        for audio, labels in test_loader:
            audio = audio.to(device)
            _, outputs = torch.max(model(audio).logits, 1)
            labels_pred += outputs.cpu().tolist()
        f1_weighted = f1_score(labels_test, labels_pred, average="weighted")
        print(f1_weighted)
        experiment.log_metric(
            name="f1_score_weighted", value=f1_weighted, epoch=epoch + 1
        )

tensor(1.3785, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3873, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3908, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3864, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4009, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3768, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3756, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3874, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3863, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3817, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3663, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3807, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3779, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3861, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4033, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3888, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3421, device='cuda:0', grad_fn=

In [None]:
model.eval()

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
labels_pred = list()
for audio, labels in test_loader:
    audio = audio.to(device)
    _, outputs = torch.max(model(audio).logits, 1)
    labels_pred += outputs.cpu().tolist()
f1_weighted = f1_score(labels_test, labels_pred, average="weighted")
print(f1_weighted)

In [None]:
print(classification_report(labels_test, labels_pred))
experiment.log_confusion_matrix(confusion_matrix(labels_test, labels_pred))

In [None]:
torch.save(model.state_dict(), "/content/drive/MyDrive/MER Project/wav2vec_weights.pth")

In [None]:
experiment.end()