In [5]:
import librosa
import librosa.display
import IPython.display as ipd
import numpy as np
import sklearn
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib
import matplotlib.pyplot as plt
import latexify as lt
import os
from thinkdsp import read_json
#LOSO Validation
loo = LeaveOneOut()

LR with advanced features (MFCC, Chroma and Melspectogram)

In [6]:
n95 = pd.read_csv("n95data-p1.csv").dropna()
print("Total Data Points = ",len(n95))
n95.head(7)

Total Data Points =  11


Unnamed: 0.1,Unnamed: 0,Filename,rPEF,rFEV1,rFVC,rFEVFVC,tPEF,tFEV1,tFVC,tFEVFVC,PEFdiff,FEVdiff,FVCdiff,age,height
0,2,006_FVC_N95_1.json,9.426188,1.957211,2.547454,76.830098,5.57,2.77,3.02,91.721854,-3.856188,0.812789,0.472546,27,155
1,10,012_FVC_N95_2.json,9.541296,1.602125,2.243371,71.415974,5.47,2.4,2.72,88.235294,-4.071296,0.797875,0.476629,25,163
2,15,Anubhav_FVC_N95Mask_1.json,9.9108,2.477272,3.786664,65.420948,6.05,3.33,4.33,76.905312,-3.8608,0.852728,0.543336,21,180
3,18,Rishiraj_FVC_N95Mask_1.json,12.438241,2.250131,2.983454,75.420321,8.3,3.02,3.49,86.532951,-4.138241,0.769869,0.506546,28,165
4,21,Rohit_FVC_N95Mask_2.json,12.282675,2.874539,3.737664,76.907372,7.79,3.65,4.24,86.084906,-4.492675,0.775461,0.502336,21,170
5,14,014_FVC_N95_2.json,10.651584,1.759281,2.005181,87.736791,6.16,2.41,2.52,95.634921,-4.491584,0.650719,0.514819,25,165
6,16,017_FVC_N95_2.json,9.194612,1.098786,3.0489,36.038764,5.03,1.98,2.42,81.818182,-4.164612,0.881214,-0.6289,55,160


In [8]:
def get_features(filepath):
    wave = read_json(filepath=filepath)

    #get mfccs 
    mfc_coefficients = np.mean(librosa.feature.mfcc(y=wave.ys, n_mfcc=13, sr=16000).T,axis=0)
    melspectogram = np.mean(librosa.feature.melspectrogram(y=wave.ys, sr=16000, n_mels=128, fmax=8000).T,axis=0)
    stft_spectrogram=np.abs(librosa.stft(wave.ys))
    chromagram=np.mean(librosa.feature.chroma_stft(S=stft_spectrogram, sr=16000).T,axis=0)
    
    feature_matrix_r = np.hstack((mfc_coefficients,melspectogram,chromagram))
    
    #feature_matrix = np.hstack((chromagram, melspectrogram, mfc_coefficients))
    
    #add the estimated param as a feature
    #feature_matrix = np.hstack((feature_matrix_r, FEV1values))
    return feature_matrix_r


def load_data(lungParam):
    '''
    feature: it is either 'FEV1', 'FVC' or 'PEF'
    '''
    #get the ground truth
    y=  np.array([i for i in n95['t'+lungParam]])
    
    #prepare to store a lot of features in X
    X=[]
    count = 0
    for file in n95['Filename']:
        features = get_features("n95data/"+file)
        X.append(features)
        count += 1
        # '\r' + end='' results in printing over same line
        #print('\r' + f' Processed {count}/{22} audio samples',end=' ')
    #print("All data loaded in X and Y, Please add the estimated features using np.htack")
    X =  np.array(X)
    
    #add the estiamted lung param as a feature
    estiamtedValues = np.atleast_2d(n95['r'+lungParam].to_numpy()).T
    X = np.hstack((X, estiamtedValues))
    # Return arrays to plug into sklearn's cross-validation algorithms
    return X, np.array(y)

In [9]:
#ML inits
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)

In [10]:
X, Y = load_data('FEV1')
loo.get_n_splits(X)
mse=0
pe= [] 
for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    reg = regressor.fit(X_train, y_train)
    #reg = LinearRegression().fit(X_train, y_train)
    #print("Learned coef is = ",reg.coef_[0])
    #print("Learned intercept is = ",reg.intercept_)
    print("Predicted= ", reg.predict(X_test))
    print("True = ", y_test)
    #mse = mse+mean_squared_error(reg.predict(X_test), y_test)
    pe.append(np.abs((y_test-reg.predict(X_test))/reg.predict(X_test)))
print("PE Mean = ", np.mean(pe));
print("PE STD = ", np.std(pe))

TRAIN: [ 1  2  3  4  5  6  7  8  9 10] TEST: [0]
Predicted=  [2.6656]
True =  [2.77]
TRAIN: [ 0  2  3  4  5  6  7  8  9 10] TEST: [1]
Predicted=  [2.7328]
True =  [2.4]
TRAIN: [ 0  1  3  4  5  6  7  8  9 10] TEST: [2]
Predicted=  [2.9163]
True =  [3.33]
TRAIN: [ 0  1  2  4  5  6  7  8  9 10] TEST: [3]
Predicted=  [2.7245]
True =  [3.02]
TRAIN: [ 0  1  2  3  5  6  7  8  9 10] TEST: [4]
Predicted=  [2.8252]
True =  [3.65]
TRAIN: [ 0  1  2  3  4  6  7  8  9 10] TEST: [5]
Predicted=  [2.6344]
True =  [2.41]
TRAIN: [ 0  1  2  3  4  5  7  8  9 10] TEST: [6]
Predicted=  [2.9443]
True =  [1.98]
TRAIN: [ 0  1  2  3  4  5  6  8  9 10] TEST: [7]
Predicted=  [2.8096]
True =  [2.71]
TRAIN: [ 0  1  2  3  4  5  6  7  9 10] TEST: [8]
Predicted=  [2.3883]
True =  [3.31]
TRAIN: [ 0  1  2  3  4  5  6  7  8 10] TEST: [9]
Predicted=  [2.7236]
True =  [2.09]
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10]
Predicted=  [2.9984]
True =  [2.04]
PE Mean =  0.18995870668675438
PE STD =  0.11967992361910698
