In [1]:
import os
import torch
import torchaudio
import numpy as np
import pandas as pd
import torch.nn as nn
from transformers import (
    AutoFeatureExtractor,
    Wav2Vec2ForSequenceClassification

)
from IPython.display import  Audio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def process_audio(speech_array, sr, target_sample_rate, processor, clip_seconds=3):
    """Turn a raw waveform into a fixed-length input for the classifier.

    Steps, in order: down-mix to mono by averaging over dim 0, resample to
    ``target_sample_rate`` when ``sr`` differs, right-pad with zeros or truncate
    to exactly ``clip_seconds`` seconds, then run ``processor`` on the result.

    Args:
        speech_array: Waveform tensor shaped ``(channels, samples)``. A 1-D
            ``(samples,)`` tensor is accepted and treated as a single channel.
            ``torch.mean`` requires a floating-point (or complex) dtype.
        sr: Sample rate of ``speech_array`` in Hz.
        target_sample_rate: Sample rate in Hz that is passed on to ``processor``.
        processor: Callable invoked as ``processor(waveform, sampling_rate=...,
            do_normalize=True, return_tensors="pt")``; its result must expose
            ``input_values``. Presumably a Hugging Face feature extractor --
            confirm against the caller.
        clip_seconds: Length of the output clip in seconds. Defaults to 3,
            the value that used to be hard-coded.

    Returns:
        ``processor(...).input_values[0]`` with all singleton dimensions
        squeezed out.

    Raises:
        ValueError: If ``target_sample_rate * clip_seconds`` is not at least
            one sample.
    """
    fixed_length = int(target_sample_rate * clip_seconds)
    if fixed_length <= 0:
        raise ValueError(
            "target_sample_rate * clip_seconds must be at least one sample, "
            f"got {target_sample_rate} * {clip_seconds}"
        )

    # A 1-D mono signal has no channel axis; averaging it over dim 0 would
    # collapse the time axis, so promote it to (1, samples) first.
    if speech_array.dim() == 1:
        speech_array = speech_array.unsqueeze(0)

    # Average the channels, keeping a leading channel axis of size 1.
    speech_array = torch.mean(speech_array, dim=0, keepdim=True)

    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        speech_array = resampler(speech_array)

    # Force every clip to the same number of samples.
    n_samples = speech_array.shape[1]
    if n_samples < fixed_length:
        speech_array = torch.nn.functional.pad(speech_array, (0, fixed_length - n_samples))
    else:
        speech_array = speech_array[:, :fixed_length]

    features = processor(
        speech_array,
        sampling_rate=target_sample_rate,
        do_normalize=True,
        return_tensors="pt",
    ).input_values[0]
    return features.squeeze()

In [3]:
# Hub checkpoint used for both the feature extractor and the classifier backbone.
model_path = 'facebook/wav2vec2-xls-r-300m'

# Two-class head: index 0 is "real", index 1 is "fake".
label2id = {'real': 0, 'fake': 1}
# Inverse mapping, derived from label2id so the two dictionaries cannot drift apart.
id2label = {idx: name for name, idx in label2id.items()}

feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
class Wav2VecClassificationModel(nn.Module):
    """Wrap a classification network so that it returns class probabilities.

    The wrapped network is stored as ``self.model``. That attribute name is a
    prefix of every key in this module's ``state_dict``, so renaming it would
    stop previously saved checkpoints from loading.
    """

    def __init__(self, model):
        """Store ``model``, a module whose output supports ``['logits']`` lookup."""
        super().__init__()
        self.model = model

    def forward(self, input_values):
        """Run the wrapped model and return softmax probabilities.

        Args:
            input_values: Input forwarded unchanged to the wrapped model.

        Returns:
            A tensor with the same shape as the wrapped model's logits, where
            the entries along the last dimension sum to 1.
        """
        logits = self.model(input_values)['logits']
        # The dimension is given explicitly: relying on softmax's implicit
        # dimension choice is deprecated and emits a UserWarning on every call.
        return nn.functional.softmax(logits, dim=-1)

In [5]:
# Wrap the backbone and switch to eval mode explicitly, so inference does not
# depend on whatever train/eval state the wrapped model happens to be in.
model_w2v = Wav2VecClassificationModel(model=model).eval()

# map_location='cpu' lets a checkpoint saved from a GPU load on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers, so the
# file cannot execute arbitrary code while loading.
checkpoint_state = torch.load(
    'W2V/models/checkpoint_epoch_10baseline_w2v-xls-300-dataaug-multistep-lr.pt',
    map_location='cpu',
    weights_only=True,
)
# Kept as the last expression so the cell still displays the key-matching report.
model_w2v.load_state_dict(checkpoint_state)

<All keys matched successfully>

In [6]:
# Decode the two demo clips (fake first, then real). Each torchaudio.load call
# yields a (waveform, sample_rate) pair, unpacked here in one step.
(audio_fake, sr_fake), (audio_real, sr_real) = (
    torchaudio.load(wav_path)
    for wav_path in ('dedicatoria_fake.wav', 'dedicatoria_real.wav')
)

In [7]:
# Caption for the real recording, followed by an inline player for it.
print("Amostra real")
real_player = Audio('dedicatoria_real.wav')
# The bare last expression is what the notebook renders as the cell output.
real_player

Amostra real


In [8]:
# Caption for the fake recording, followed by an inline player for it.
print("Amostra Fake")
fake_player = Audio('dedicatoria_fake.wav')
# The bare last expression is what the notebook renders as the cell output.
fake_player

Amostra Fake


In [9]:
# Both clips go through the same preprocessing at the same target rate (in Hz).
TARGET_SAMPLE_RATE = 16000

sample_fake = process_audio(
    speech_array=audio_fake,
    sr=sr_fake,
    target_sample_rate=TARGET_SAMPLE_RATE,
    processor=feature_extractor,
)
sample_real = process_audio(
    speech_array=audio_real,
    sr=sr_real,
    target_sample_rate=TARGET_SAMPLE_RATE,
    processor=feature_extractor,
)


In [10]:
# Score the fake clip. Gradient tracking is switched off for the forward pass.
with torch.no_grad():
    # unsqueeze(0) adds the leading batch dimension.
    sample_fake_preds = model_w2v(sample_fake.unsqueeze(0))

# Row 0 is the only item in the batch; column 0 is "real" and column 1 is "fake".
fake_clip_p_real, fake_clip_p_fake = sample_fake_preds[0].tolist()
print("Probabilidades para a amostra fake:")
print(f"Real: {fake_clip_p_real:.4f}    Fake: {fake_clip_p_fake:.4f}")
print(f"Classe predita: {id2label[sample_fake_preds.argmax().item()]}")



Probabilidades para a amostra fake:
Real: 0.0001    Fake: 0.9999
Classe predita: fake


  return self._call_impl(*args, **kwargs)


In [11]:
# Score the real clip. Gradient tracking is switched off for the forward pass.
with torch.no_grad():
    # unsqueeze(0) adds the leading batch dimension.
    sample_real_preds = model_w2v(sample_real.unsqueeze(0))
    # The header names the real sample; it used to repeat the fake-sample caption.
    print("Probabilidades para a amostra real:")
    # Column 0 is the "real" probability and column 1 the "fake" probability.
    print(f"Real: {sample_real_preds[0,0]:.4f}    Fake: {sample_real_preds[0,1]:.4f}")
    print(f"Classe predita: {id2label[sample_real_preds.argmax().item()]}")

Probabilidades para a amostra fake:
Real: 0.9998    Fake: 0.0002
Classe predita: real
