In [1]:
import opensmile 
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import random
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn import svm
# Root directory of the project data. Defaults to the original location but can
# be overridden with the OPENSMILE_BASE_PATH environment variable, so the
# notebook does not have to be edited to run on another machine.
# os.path.join(..., "") guarantees a trailing separator, so paths built as
# `base_path + 'relative/path'` keep working.
base_path = os.path.join(os.environ.get("OPENSMILE_BASE_PATH") or "/home/ubuntu/", "")

## Read saved dataframes to generate features and labels

In [17]:
# Path of the saved LJ sample dataframe (CSV), relative to the project root.
data_path_lj = f"{base_path}testing-code/opensmile-feature-importance/smile_dfs/LJ_sample_11200.csv"
# Load the whole table; feature and label columns are separated in later cells.
df = pd.read_csv(filepath_or_buffer=data_path_lj)

In [18]:
# Sanity check on the loaded frame: (number of rows, number of columns).
df.shape

(11200, 6378)

## Brute-force feature screening: one logistic regression per feature

In [34]:
# Architectures to evaluate: every `type` value that is not one of the
# 'Original for <arch>' reference groups. Filtering by prefix replaces the
# positional `[7:]` slice, which only worked while the seven reference groups
# happened to appear first in the file. Order of first appearance is kept.
archs = [t for t in df['type'].unique() if not str(t).startswith('Original for ')]

In [69]:
def get_single_arch_data(df, arch, test_size=0.1, random_state=42):
    """Build a standardised train/test split for one architecture.

    Rows whose ``type`` equals ``'Original for ' + arch`` are stacked on top of
    the rows whose ``type`` equals ``arch``. The bookkeeping columns
    (``file``, ``type``, ``start``, ``end``) are dropped, ``label`` becomes the
    target and every remaining column is treated as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``file``, ``type``, ``start``, ``end`` and ``label``
        columns plus the feature columns.
    arch : str
        Value of the ``type`` column identifying the architecture.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``; the default reproduces the split
        this function has always produced.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The feature
        matrices are the outputs of ``StandardScaler``; the targets are the
        corresponding slices of the ``label`` column.

    Raises
    ------
    ValueError
        If no row of ``df`` belongs to ``arch`` or to its reference group.
    """
    # Bracket access avoids any ambiguity of attribute-style column lookup.
    originals = df[df['type'] == 'Original for ' + arch]
    generated = df[df['type'] == arch]

    # Keep the originals-first order: the seeded shuffle in train_test_split
    # is positional, so changing the row order would change the split.
    subset = pd.concat([originals, generated])
    if subset.empty:
        raise ValueError(
            "No rows found for architecture {!r} (or 'Original for {}')".format(arch, arch)
        )
    subset = subset.drop(columns=['file', 'type', 'start', 'end'])

    y = subset['label']
    X = subset.drop(columns=['label'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training part only so that no test-set statistics
    # leak into the features the models are trained on.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test
    

In [62]:
def run_lr_model(X_train_scaled, X_test_scaled, y_train, y_test, features):
    """Score every feature on its own with a one-variable logistic regression.

    For each column ``i`` a fresh ``LogisticRegression(solver='liblinear')`` is
    fitted on that single training column and evaluated on the same column of
    the test matrix.

    Parameters
    ----------
    X_train_scaled, X_test_scaled : array-like, 2-D
        Training and test feature matrices. Anything ``numpy.asarray`` can turn
        into a 2-D array is accepted, including DataFrames.
    y_train, y_test : array-like
        Targets aligned with the rows of the two matrices.
    features : sequence
        Feature names, one per column and in column order. Only the length is
        used here, but it must match the matrices so that the returned
        accuracies line up with the names.

    Returns
    -------
    list of float
        Test accuracy of the model for each feature, in column order.

    Raises
    ------
    ValueError
        If either matrix is not 2-D or its column count differs from
        ``len(features)``.
    """
    # asarray makes positional column indexing valid for DataFrame inputs too.
    train_matrix = np.asarray(X_train_scaled)
    test_matrix = np.asarray(X_test_scaled)

    n_features = len(features)
    for label, matrix in (("X_train_scaled", train_matrix), ("X_test_scaled", test_matrix)):
        if matrix.ndim != 2:
            raise ValueError("{} must be 2-D, got {} dimension(s)".format(label, matrix.ndim))
        # A mismatch would otherwise attach accuracies to the wrong feature
        # names, or fail later with an opaque IndexError.
        if matrix.shape[1] != n_features:
            raise ValueError(
                "{} has {} columns but {} feature names were given".format(
                    label, matrix.shape[1], n_features
                )
            )

    accuracies = []

    for col in range(n_features):
        model_lr = LogisticRegression(solver='liblinear')
        # [:, [col]] keeps the (n_samples, 1) shape the estimator requires.
        model_lr.fit(train_matrix[:, [col]], y_train)
        yhat = model_lr.predict(test_matrix[:, [col]])
        accuracies.append(accuracy_score(y_test, yhat))

    return accuracies

In [70]:
# Feature columns are everything except the bookkeeping and target columns.
features = df.columns.drop(['file', 'type', 'start', 'end', 'label']).to_list()
# One row per feature; an accuracy column per architecture is added below.
nbf_df = pd.DataFrame({'features': features})

for arch in archs:
    # Split and scale the rows of this architecture and its reference group.
    X_train_scaled, X_test_scaled, y_train, y_test = get_single_arch_data(df, arch)
    # Store the single-feature test accuracies under the architecture's name.
    nbf_df[arch] = run_lr_model(
        X_train_scaled, X_test_scaled, y_train, y_test, features
    )
    

In [94]:
# Same per-feature screening as above, but on all rows of `df` pooled together
# instead of one architecture at a time.
y = df['label'].copy()
X = df.drop(columns=['file', 'type', 'start', 'end', 'label']).copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Standard scaling of the input data, fitted on the training part only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reuse the shared helper rather than a copy of its loop. The feature list is
# taken from X itself, so this cell does not rely on a `features` variable
# left over from an earlier cell.
accuracies = run_lr_model(X_train_scaled, X_test_scaled, y_train, y_test, X.columns.to_list())

nbf_df['all_archs'] = accuracies


In [98]:
# Show the features whose accuracy exceeds 0.5 in every accuracy column
# (column 0 holds the feature names, so it is excluded from the comparison).
nbf_df.loc[nbf_df.iloc[:, 1:].gt(0.5).all(axis=1)]

Unnamed: 0,features,ljspeech_full_band_melgan,ljspeech_melgan,ljspeech_melgan_large,ljspeech_multi_band_melgan,ljspeech_parallel_wavegan,ljspeech_waveglow,ljspeech_hifiGAN,all_archs
5,audspec_lengthL1norm_sma_quartile3,0.5375,0.51875,0.51875,0.63125,0.69375,0.5125,0.6125,0.561607
65,pcm_RMSenergy_sma_quartile1,0.55625,0.5125,0.6,0.5375,0.6,0.5125,0.5375,0.544643
1247,pcm_fftMag_spectralFlux_sma_iqr2-3,0.54375,0.56875,0.55625,0.6,0.63125,0.53125,0.58125,0.576786
1312,pcm_fftMag_spectralEntropy_sma_percentile99.0,0.55,0.7375,0.83125,0.50625,0.5875,0.58125,0.5125,0.611607
1436,pcm_fftMag_spectralSlope_sma_percentile99.0,0.55,0.58125,0.59375,0.575,0.675,0.525,0.50625,0.560714
1671,mfcc_sma[5]_lpc2,0.55,0.86875,0.86875,0.58125,0.525,0.5125,0.50625,0.61875
1702,mfcc_sma[6]_lpc2,0.51875,0.8875,0.90625,0.61875,0.575,0.6,0.525,0.602679
1733,mfcc_sma[7]_lpc2,0.50625,0.8625,0.7875,0.55,0.5375,0.58125,0.575,0.611607
1792,mfcc_sma[9]_lpgain,0.58125,0.81875,0.8875,0.64375,0.5375,0.58125,0.54375,0.616964
1860,mfcc_sma[12]_range,0.5375,0.5125,0.5625,0.6,0.63125,0.625,0.55,0.576786


In [95]:
# Keep the features whose accuracy exceeds 0.5 in every accuracy column
# (column 0 holds the feature names, so it is excluded from the comparison).
selected_feats = nbf_df.loc[nbf_df.iloc[:, 1:].gt(0.5).all(axis=1)]

In [96]:
# (number of selected features, number of columns in nbf_df).
selected_feats.shape

(60, 9)

In [97]:
# Alphabetical list of the selected feature names.
sorted(selected_feats['features'])

['audSpec_Rfilt_sma_de[0]_iqr1-2',
 'audspec_lengthL1norm_sma_amean',
 'audspec_lengthL1norm_sma_quartile3',
 'audspec_lengthL1norm_sma_rqmean',
 'jitterDDP_sma_flatness',
 'jitterDDP_sma_percentile1.0',
 'jitterDDP_sma_quartile1',
 'jitterLocal_sma_flatness',
 'jitterLocal_sma_percentile1.0',
 'jitterLocal_sma_quartile1',
 'logHNR_sma_pctlrange0-1',
 'logHNR_sma_percentile99.0',
 'logHNR_sma_quartile3',
 'logHNR_sma_range',
 'mfcc_sma[10]_peakDistStddev',
 'mfcc_sma[12]_range',
 'mfcc_sma[12]_stddevRisingSlope',
 'mfcc_sma[14]_percentile99.0',
 'mfcc_sma[14]_stddevFallingSlope',
 'mfcc_sma[5]_lpc2',
 'mfcc_sma[6]_lpc2',
 'mfcc_sma[7]_lpc2',
 'mfcc_sma[9]_lpgain',
 'mfcc_sma[9]_meanRisingSlope',
 'mfcc_sma_de[12]_pctlrange0-1',
 'mfcc_sma_de[2]_lpc4',
 'mfcc_sma_de[3]_lpc3',
 'mfcc_sma_de[3]_lpgain',
 'mfcc_sma_de[5]_lpgain',
 'mfcc_sma_de[6]_lpc3',
 'mfcc_sma_de[9]_lpgain',
 'pcm_RMSenergy_sma_amean',
 'pcm_RMSenergy_sma_quartile1',
 'pcm_fftMag_spectralEntropy_sma_percentile99.0',
 '

In [100]:
# Feature columns are everything except the bookkeeping and target columns.
features = df.columns.drop(['file', 'type', 'start', 'end', 'label']).to_list()

In [110]:
# Feature names containing 'mfcc_sma_'. The trailing underscore matches names
# such as 'mfcc_sma_de[1]_range' but not 'mfcc_sma[5]_lpc2'.
list(filter(lambda name: 'mfcc_sma_' in name, features))

['mfcc_sma_de[1]_range',
 'mfcc_sma_de[1]_maxPos',
 'mfcc_sma_de[1]_minPos',
 'mfcc_sma_de[1]_quartile1',
 'mfcc_sma_de[1]_quartile2',
 'mfcc_sma_de[1]_quartile3',
 'mfcc_sma_de[1]_iqr1-2',
 'mfcc_sma_de[1]_iqr2-3',
 'mfcc_sma_de[1]_iqr1-3',
 'mfcc_sma_de[1]_percentile1.0',
 'mfcc_sma_de[1]_percentile99.0',
 'mfcc_sma_de[1]_pctlrange0-1',
 'mfcc_sma_de[1]_stddev',
 'mfcc_sma_de[1]_skewness',
 'mfcc_sma_de[1]_kurtosis',
 'mfcc_sma_de[1]_meanSegLen',
 'mfcc_sma_de[1]_maxSegLen',
 'mfcc_sma_de[1]_minSegLen',
 'mfcc_sma_de[1]_segLenStddev',
 'mfcc_sma_de[1]_upleveltime25',
 'mfcc_sma_de[1]_upleveltime50',
 'mfcc_sma_de[1]_upleveltime75',
 'mfcc_sma_de[1]_upleveltime90',
 'mfcc_sma_de[1]_risetime',
 'mfcc_sma_de[1]_leftctime',
 'mfcc_sma_de[1]_lpgain',
 'mfcc_sma_de[1]_lpc0',
 'mfcc_sma_de[1]_lpc1',
 'mfcc_sma_de[1]_lpc2',
 'mfcc_sma_de[1]_lpc3',
 'mfcc_sma_de[1]_lpc4',
 'mfcc_sma_de[2]_range',
 'mfcc_sma_de[2]_maxPos',
 'mfcc_sma_de[2]_minPos',
 'mfcc_sma_de[2]_quartile1',
 'mfcc_sma_de[2]

In [4]:
# Target and feature matrix over all rows of `df`.
y = df['label'].copy()
X = df.drop(columns=['file', 'type', 'start', 'end', 'label']).copy()

# Seed the shuffle, as the other splits in this notebook do; without it every
# run produces a different split and the results cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)