In [71]:
import glob
import os

In [72]:
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [73]:
DATA_DIR = os.path.abspath('./letras')

In [76]:
def windows(data, window_size):
    start = 0    
    while start < len(data):
        yield int(start), int(start + window_size)
        start += (window_size / 2)


def get_features(audio_file):
    X, sample_rate = librosa.load(audio_file)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr = sample_rate).T, axis=0)    
    return mfccs,chroma,mel,contrast,tonnetz
    
        
def extract_features(data_dir, file_ext="*.wav", bands = 60, frames = 41):
    
    features, labels = np.empty((0, 193)), np.empty(0)
    
    for dirpath, dirnames, filenames in list(os.walk(data_dir))[1:]:
        for fn in sorted(filenames):
            audio_file = os.path.join(dirpath, fn)            
            label = os.path.dirname(audio_file).split("/")[-1]
            mfccs,chroma,mel,contrast,tonnetz = get_features(audio_file)            
            
            ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
            features = np.vstack([features, ext_features])
            
            labels = np.append(labels, label)
            
    return np.array(features), np.array(labels, dtype=np.str)

In [75]:
features, labels = extract_features(DATA_DIR)

In [107]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=0)

In [140]:
knn = KNeighborsClassifier(n_neighbors = 5)

In [141]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [142]:
#0.9130

In [143]:
knn.score(X_test, y_test)

0.89010989010989006

In [144]:
def extract_audio_file_features(audio_file):
    ft = np.empty((0, 193))
    mfccs,chroma,mel,contrast,tonnetz = get_features(audio_file)            
    ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
    ft = np.array(np.vstack([ft, ext_features]))
    return ft    

In [145]:
def get_model_data_file(letter, n):
    audio_file = os.path.join(DATA_DIR, letter, str(n).zfill(3) + '.wav')
    return audio_file

## Performance dos caracteres alfabéticos

In [146]:
letters = range(ord('a'), ord('z') + 1)
for i in letters:
    char = chr(i)    
    n = 1000
    valid = 0
    for i in range(1, n + 1):
        try:
            audio_file = get_model_data_file(char, i)            
            audio_features = extract_audio_file_features(audio_file)            
            prediction = knn.predict(audio_features)
            if prediction == char:        
                valid += 1
        except:
            print(char, ':', round(float(valid) / i, 2))
            break            

a : 0.91
b : 0.87
c : 0.94
d : 0.73
e : 0.95
f : 0.69
g : 0.85
h : 0.86
i : 0.64
j : 0.9
k : 0.97
l : 0.88
m : 0.94
n : 0.96
o : 0.85
p : 0.94
q : 0.92
r : 0.96
s : 0.65
t : 0.9
u : 0.97
v : 0.94
w : 0.97
x : 0.97
y : 0.92
z : 0.92


## Performance dos caracteres numéricos

In [147]:
for i in range(1, 9):
    char = str(i)    
    n = 1000
    valid = 0
    for i in range(1, n + 1):
        try:
            audio_file = get_model_data_file(char, i)            
            audio_features = extract_audio_file_features(audio_file)            
            prediction = knn.predict(audio_features)
            if prediction == char:        
                valid += 1
        except:
            print(char, ':', round(float(valid) / i, 2))
            break            

1 : 0.57
2 : 0.8
3 : 0.93
4 : 0.89
5 : 0.81
6 : 0.71
7 : 0.85
8 : 0.83


## Teste com dados reais

In [159]:
data = sorted([i for i in os.listdir() if i.endswith('.wav')])
for audio_file in data:
    audio_features = extract_audio_file_features(audio_file)            
    prediction = knn.predict(audio_features)    
    print(prediction)    

['q']
['4']
['v']
['3']
['5']
['5']
