In [1]:
import torch
import onnx
import toml
import librosa
import lazycon
import time
import os, sys

import coremltools as ct
import numpy as np
import soundfile as sf
import sounddevice as sd

sys.path.append('../experiments')
import core

scikit-learn version 1.3.0 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.
Torch version 2.0.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.0.0 is the most recent version that has been tested.


In [2]:
def create_features(
    data: np.ndarray,
    hop_length_coef: float = 0.01,
    win_length_coef: float = 0.02,
    sample_rate: int = 16000,
    n_mels: int = 64,
) -> np.ndarray:
    """
    Compute a log-scaled Mel spectrogram from a waveform.

    With the default arguments this yields the standard speech features used
    as model input: 64 Mel filterbanks calculated from 20 ms windows with a
    10 ms hop between consecutive windows.

    Args:
        data: audio samples to analyse; must not be empty.
        hop_length_coef: hop between frames, in seconds.
        win_length_coef: analysis window (FFT size), in seconds.
        sample_rate: sampling rate of `data`, in Hz.
        n_mels: number of Mel bands.

    Returns:
        The Mel power spectrogram converted to decibels relative to its
        maximum value.

    Raises:
        AttributeError: if `data` is empty.
    """
    # Guard clause: there is nothing to analyse in an empty signal.
    # The exception type is kept as AttributeError for backward compatibility.
    if len(data) == 0:
        raise AttributeError("Cannot compute features: the input waveform is empty")

    # Convert the window / hop durations from seconds to samples.
    hop_length = int(sample_rate * hop_length_coef)
    win_length = int(sample_rate * win_length_coef)

    spec = librosa.feature.melspectrogram(
        y=data,
        sr=sample_rate,
        hop_length=hop_length,
        n_fft=win_length,
        n_mels=n_mels,
    )
    # ref=np.max makes the loudest bin 0 dB and every other bin negative.
    mel_spec = librosa.power_to_db(spec, ref=np.max)

    return mel_spec

In [3]:
def create_features_for_audio(
    wav_name: str,
    hop_length_coef: float = 0.01,
    win_length_coef: float = 0.02,
    sample_rate: int = 16000,
    n_mels: int = 64,
) -> np.ndarray:
    """
    Load an audio file and compute its log-scaled Mel spectrogram.

    The file is read (and resampled to `sample_rate`) with librosa and the
    samples are passed to `create_features`, so both entry points produce
    features with exactly the same parameters.

    Args:
        wav_name: path of the audio file to load.
        hop_length_coef: hop between frames, in seconds.
        win_length_coef: analysis window (FFT size), in seconds.
        sample_rate: rate the audio is resampled to on load, in Hz.
        n_mels: number of Mel bands.

    Returns:
        The Mel power spectrogram in decibels, as returned by
        `create_features`.

    Raises:
        AttributeError: if the loaded audio contains no samples.
    """
    data, rate = librosa.load(wav_name, sr=sample_rate)
    # Delegate to the shared implementation instead of duplicating it.
    return create_features(
        data,
        hop_length_coef=hop_length_coef,
        win_length_coef=win_length_coef,
        sample_rate=rate,
        n_mels=n_mels,
    )

In [4]:
def index2name(
    index: int
) -> str:
    """
    Map a predicted class index to its emotion label.

    Args:
        index: class index in the range 0..3.

    Returns:
        One of "angry", "sad", "neutral" or "positive".

    Raises:
        AttributeError: if `index` is outside the range of known classes.
    """
    class_dict = {0: "angry", 1: "sad", 2: "neutral", 3: "positive"}

    # The upper bound is exclusive: len(class_dict) itself is not a valid key.
    if not 0 <= index < len(class_dict):
        raise AttributeError(
            f"Unknown class index {index}; expected 0..{len(class_dict) - 1}"
        )

    return class_dict[index]

In [5]:
# Directory that holds both the training config and the saved weights.
dir_path = "./model/"
# File name of the saved weights inside `dir_path`.
model_name = "podcasts_finetune_old_w_lr_1e-3_try1"
# Torch device the model and its inputs are moved to.
device = "cpu"

In [19]:
# Both the training config and the weights file must exist before loading;
# fail fast with a clear error instead of crashing later inside torch.load.
config_path = os.path.join(dir_path, "train.config")
if not os.path.exists(config_path):
    raise FileNotFoundError(f"No train.config in {dir_path}")

model_path = os.path.join(dir_path, model_name)
if not os.path.exists(model_path):
    raise FileNotFoundError(f"There is no saved model {model_path}. Nothing to inference")

# Build the model from the training config.
# NOTE(review): presumably the config resolves the model class via the
# `core` package imported at the top of the notebook; confirm.
cfg = lazycon.load(config_path)
model = cfg.model

model.to(device)
# Map the checkpoint onto the configured device rather than a hard-coded one.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source.
model.load_state_dict(torch.load(model_path, map_location=torch.device(device)))
# Run inference in float64; the inputs must use the same dtype.
model = model.double()
# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()



ConvSelfAttentionMobileNet(
  (features): Sequential(
    (0): _DeprecatedConvBNAct(
      (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=4, bias=False)
          (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(4, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(

In [20]:
# List the available audio devices so the recording input and playback output can be checked.
sd.query_devices()

> 0 Внешний микрофон, Core Audio (1 in, 0 out)
< 1 Внешние наушники, Core Audio (0 in, 2 out)
  2 Микрофон MacBook Pro, Core Audio (1 in, 0 out)
  3 Динамики MacBook Pro, Core Audio (0 in, 2 out)

In [21]:
# Capture parameters for the microphone recording.
fs = 16000
duration = 5  # seconds

# sd.rec is started first; the message below is printed right after it returns.
myrecording = sd.rec(frames=duration * fs, samplerate=fs, channels=1, dtype='float64')
print("Recording Audio")
sd.wait()  # wait for the recording to finish before playing it back
print("Audio recording complete , Play Audio")
sd.play(myrecording, samplerate=fs)

Recording Audio
Audio recording complete , Play Audio


In [29]:
# Transpose the recording and keep its first row (the single recorded channel) as the waveform.
feat = create_features(myrecording.T[0])

In [24]:
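# Alternative input: build the features from a WAV file on disk instead of the microphone recording.
# feat = create_features_for_audio('wavs/c9780b567a8de31862971aa5412bf834.wav')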

In [30]:
print("Calculating predicts")
# The model is converted with .double() when it is loaded, so the input is cast
# to float64 explicitly: features built from a file loaded by librosa may be
# float32 and would otherwise fail with a dtype mismatch.
# The two unsqueeze calls add two leading axes of size 1
# (presumably batch and channel; confirm against the model definition).
inputs = (
    torch.from_numpy(feat)
    .to(device=device, dtype=torch.float64)
    .unsqueeze(0)
    .unsqueeze(0)
)

# Inference only: no gradients are needed.
with torch.no_grad():
    probs = model(inputs)

Calculating predicts


In [31]:
# Index of the highest score along axis 1 for each row of the model output.
pred_class = probs.cpu().numpy().argmax(axis=1)
# Report the label for the first (and here only) row.
index2name(pred_class[0])

'neutral'

In [32]:
# Display the raw model output for the recording.
probs

tensor([[-2.4723, -0.7901,  1.3804, -0.7780]], dtype=torch.float64)