In [None]:
from tensorflow import keras
import os
import numpy as np
import librosa
import sounddevice as sd
import IPython.display as ipd

In [None]:
# Load the Keras model stored at the relative path 'nnet'; it is later passed to
# predict_language() to classify the recorded clip.
model = keras.models.load_model('nnet')

In [None]:
# Load the mean / standard-deviation arrays used by predict_language() to
# standardise the extracted features (presumably computed on the training set).
# For an .npz archive np.load returns an NpzFile that keeps the file open, so a
# context manager is used to close it once both arrays have been read.
with np.load('X_train_mean_std.npz') as loaded:
    X_train_mean = loaded['X_train_mean']
    X_train_std = loaded['X_train_std']

In [None]:
def aavg(input):
    """Return the mean of the absolute values of ``input``.

    The reduction runs over all elements with ``keepdims=True``, so an array
    input yields an array with the same number of dimensions, each of size 1.
    """
    magnitudes = np.absolute(input)
    return np.mean(magnitudes, keepdims=True)

def sdev(input):
    """Return the standard deviation of all elements of ``input``.

    Uses NumPy's default (population) standard deviation and ``keepdims=True``,
    so an array input yields an array with every dimension reduced to size 1.
    """
    return np.std(
        input,
        axis=None,
        keepdims=True,
    )

def energy(input):
    """Return the sum of squared elements of ``input``.

    The input is multiplied by ``1.0`` before squaring so that integer samples
    are promoted to floating point. The sum uses ``keepdims=True``, so an array
    input yields an array with every dimension reduced to size 1.
    """
    as_float = input * 1.0
    squared = np.square(as_float)
    return np.sum(squared, keepdims=True)

def mfcc(input, rate=22050, sampling=5):
    """Return the flattened MFCC matrix of a decimated copy of ``input``.

    Parameters
    ----------
    input : array
        Audio samples; must support slicing and multiplication by a float
        (presumably a 1-D mono signal -- confirm against callers).
    rate : int, optional
        Sampling rate of ``input`` in Hz. Defaults to 22050.
    sampling : int, optional
        Decimation step: only every ``sampling``-th sample is kept.
        Defaults to 5.

    Returns
    -------
    numpy.ndarray
        The output of ``librosa.feature.mfcc`` flattened to one dimension.
    """
    # Keep every `sampling`-th sample (plain slicing, no low-pass filtering).
    signal = input[::sampling]
    # `* 1.0` promotes integer samples to floating point. The audio is passed
    # as the keyword `y` because librosa >= 0.10 no longer accepts it
    # positionally; the keyword form also works on older releases.
    coeffs = librosa.feature.mfcc(y=signal * 1.0, sr=int(rate / sampling))
    # Flatten into a one-dimensional feature vector.
    return coeffs.flatten()

def combo(input):
    """Build the combined feature vector for ``input``.

    Concatenates, in this order, the outputs of ``aavg``, ``sdev``, ``energy``
    and ``mfcc`` applied to the same signal.
    """
    extractors = (aavg, sdev, energy, mfcc)
    pieces = [extract(input) for extract in extractors]
    return np.concatenate(pieces)

# Score Audio

In [None]:
# Record a single-channel clip with sounddevice and show a playback widget.
duration = 10  # recording length in seconds
rec_rate = 22050  # sampling rate in Hz; equals the default `rate` of mfcc()
rec = sd.rec(
    frames=int(duration * rec_rate),
    samplerate=rec_rate,
    channels=1,
    blocking=True,
)
# Last expression of the cell: renders the player for the first channel.
ipd.Audio(data=rec[:, 0], rate=rec_rate)

In [None]:
def predict_language(rec, model):
    """Predict the language label for a recording.

    Parameters
    ----------
    rec : array
        Recording indexed as ``rec[:, 0]``, i.e. only the first column
        (channel) is used.
    model : object
        Model exposing ``predict``; its output is reduced with ``argmax`` over
        the last axis to obtain a class index.

    Returns
    -------
    str
        One of ``'tedesco'``, ``'inglese'`` or ``'spagnolo'`` (Italian for
        German, English and Spanish).

    Notes
    -----
    Reads the module-level ``X_train_mean`` and ``X_train_std`` arrays to
    standardise the features.
    """
    labels = {0: 'tedesco', 1: 'inglese', 2: 'spagnolo'}
    eps = 0.001  # small offset added to both numerator and denominator

    raw = np.array(combo(rec[:, 0]))
    scaled = (raw - X_train_mean + eps) / (X_train_std + eps)
    # The model is fed a batch holding exactly one feature vector.
    batch = scaled.reshape(1, scaled.shape[0])
    scores = model.predict(batch)
    winner = scores.argmax(axis=-1)[0]
    return labels[winner]
    

In [None]:
# Classify the clip held in `rec` with the loaded model and print the label.
print(predict_language(rec, model))

inglese
