<a href="https://colab.research.google.com/github/rsidorchuk93/audio/blob/main/audio_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audio processing

In [None]:
!pip install transformers pydub librosa resampy -q

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the audio files used below are
# read from paths underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Upload audio file and play it

In [None]:
from IPython.display import Audio

# Path of the recording on the mounted Google Drive
audio_file = '/content/drive/My Drive/test/audio/me_neutral.ogg'

# Embed an audio player for the file.
# - The path is passed explicitly as `filename=` so that a missing file fails
#   with a clear file-not-found error instead of being interpreted as raw data.
# - No `rate` is passed: it only applies when raw sample arrays are given; an
#   encoded file such as .ogg carries its own sampling rate.
Audio(filename=audio_file)

## Transcribe with the OpenAI Whisper transformer

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torchaudio

# Checkpoint used for both the processor and the model, and the sampling rate
# the audio is converted to before feature extraction.
WHISPER_CHECKPOINT = "openai/whisper-tiny.en"
WHISPER_SAMPLING_RATE = 16000

# load model and processor
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_CHECKPOINT)

# read audio file; torchaudio returns the waveform tensor and the file's own sampling rate
audio_file = '/content/drive/My Drive/test/audio/me_neutral.ogg'
input_signal, original_sampling_rate = torchaudio.load(audio_file)

# down-mix to a single channel by averaging, so multi-channel recordings are
# not reduced to their first channel only (a mono file is left unchanged);
# keepdim=True preserves the (channels, samples) layout
input_signal = input_signal.mean(dim=0, keepdim=True)

# resample audio data
resampler = torchaudio.transforms.Resample(original_sampling_rate, WHISPER_SAMPLING_RATE)
input_signal = resampler(input_signal)
sampling_rate = WHISPER_SAMPLING_RATE

# preprocess audio: the processor takes the 1-D mono waveform as a numpy array
# NOTE(review): Whisper's feature extractor is understood to work on fixed
# 30-second windows, so longer recordings may be truncated — confirm before
# using this cell on long files.
input_features = processor(
    input_signal.numpy()[0],
    sampling_rate=sampling_rate,
    return_tensors="pt",
).input_features

# generate token ids
predicted_ids = model.generate(input_features)
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

print(transcription)



[' Hello, my name is Roman, how are you doing?']


## Recognize emotions

In [None]:
import torch
import resampy
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2Processor,
)

# load model and processor
# A plain speech-recognition checkpoint such as "facebook/wav2vec2-base-960h"
# has no trained classification head: loading it into
# Wav2Vec2ForSequenceClassification leaves the projector/classifier weights
# randomly initialised, so the predicted "emotion" is arbitrary. Use a
# checkpoint fine-tuned for emotion recognition instead.
model_name_or_path = "superb/wav2vec2-base-superb-er"
# This checkpoint ships a feature extractor only (no tokenizer), so the
# feature extractor is loaded directly; it is called the same way as a
# processor: processor(signal, sampling_rate=..., return_tensors="pt").
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name_or_path)
model.eval()  # inference only

# define label mapping
# The class-index -> label order is taken from the checkpoint's own config so
# it always matches the trained classifier head; the checkpoint's abbreviated
# label names are expanded to readable ones (unknown names are kept as-is).
_EMOTION_NAMES = {"neu": "neutral", "hap": "happy", "sad": "sad", "ang": "angry"}
label_mapping = {
    int(class_id): _EMOTION_NAMES.get(label, label)
    for class_id, label in model.config.id2label.items()
}

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForSequenceClassification: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['projector.weight', 'classifier.bias', 'projector.bias', 'wav2vec2.masked_spec_embed', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be 

In [None]:
# Sampling rate the audio is converted to before it is passed to the processor
target_sampling_rate = 16000

# read audio file; torchaudio returns the waveform tensor and the file's own sampling rate
audio_file = '/content/drive/My Drive/test/audio/me_positive.ogg'
input_signal, sampling_rate = torchaudio.load(audio_file)

# down-mix to mono by averaging the channels (a mono file is left unchanged),
# rather than silently dropping every channel but the first
input_signal = input_signal.mean(dim=0).numpy()

# resample audio
input_signal = resampy.resample(input_signal, sampling_rate, target_sampling_rate)

# preprocess audio
input_values = processor(
    input_signal,
    sampling_rate=target_sampling_rate,
    return_tensors="pt",
).input_values

# run the classifier; gradients are not needed for inference, so skip
# building the autograd graph
with torch.no_grad():
    logits = model(input_values).logits

# index of the highest-scoring class
predicted_class = torch.argmax(logits, dim=-1).item()

# map predicted class to label
predicted_label = label_mapping[predicted_class]

print("Predicted Emotion: ", predicted_label)

Predicted Emotion:  neutral
