In [1]:
import opensmile 
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import random
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn import svm
# Root directory under which the saved feature tables live.
# Defaults to the original location but can be redirected on another machine
# by setting SMILE_BASE_PATH. Joining with "" guarantees a trailing separator,
# which the plain string concatenation used to build data paths relies on.
base_path = os.path.join(os.environ.get("SMILE_BASE_PATH", "/home/ubuntu/"), "")

## Read saved dataframes to generate features and labels

In [2]:
# Load the previously saved feature table (LJ sample) from CSV into `df`.
data_path_lj = f"{base_path}testing-code/opensmile-feature-importance/smile_dfs/LJ_sample_11200.csv"
df = pd.read_csv(data_path_lj)

In [3]:
# Sanity check: (rows, columns) of the loaded table.
df.shape

(11200, 6378)

In [4]:
# Target vector: the 'label' column.
y = df['label'].copy()
# Feature matrix: everything except the label and the per-row metadata columns.
X = df.drop(columns=['file', 'type', 'start', 'end','label']).copy()

# 80/20 hold-out split. The seed is fixed so that every downstream feature
# selector sees the same train/test partition on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Standardise the features. The scaler statistics are learned from the
# training split only and then reused unchanged on the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Names of the feature columns (label and metadata columns removed), as a list.
features = df.columns.drop(['file', 'type', 'start', 'end','label']).to_list()

Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator. Scikit-learn exposes feature selection routines as objects that implement the transform method:

## SelectKBest

In [10]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [11]:
# Keep the 50 features with the highest mutual information with the class label.
# mutual_info_classif is a randomised estimator, so its seed is pinned here to
# make the selected feature set reproducible between runs.
from functools import partial

selector = SelectKBest(partial(mutual_info_classif, random_state=42), k=50)
selector.fit(X_train_scaled, y_train)

SelectKBest(k=50, score_func=<function mutual_info_classif at 0x7fe540890430>)

In [13]:
# Length of the boolean support mask returned by the fitted selector
# (one entry per input feature column).
len(selector.get_support())

6373

In [17]:
# Names of the feature columns retained by SelectKBest: index the feature
# column labels with the selector's boolean support mask.
df.columns.drop(['file', 'type', 'start', 'end','label'])[selector.get_support()]

Index(['mfcc_sma[5]_lpc1', 'mfcc_sma[6]_lpc0', 'mfcc_sma[6]_lpc1',
       'mfcc_sma[7]_lpc0', 'mfcc_sma[7]_lpc1', 'mfcc_sma[8]_lpc1',
       'mfcc_sma[9]_lpgain', 'mfcc_sma[9]_lpc0', 'mfcc_sma[9]_lpc1',
       'mfcc_sma[10]_lpc0', 'mfcc_sma[10]_lpc1', 'mfcc_sma[11]_lpgain',
       'mfcc_sma[11]_lpc1', 'mfcc_sma[12]_lpgain', 'mfcc_sma[12]_lpc1',
       'mfcc_sma[13]_lpgain', 'mfcc_sma_de[3]_lpc1', 'mfcc_sma_de[3]_lpc2',
       'mfcc_sma_de[5]_lpc0', 'mfcc_sma_de[5]_lpc1', 'mfcc_sma_de[5]_lpc2',
       'mfcc_sma_de[6]_lpgain', 'mfcc_sma_de[6]_lpc0', 'mfcc_sma_de[6]_lpc1',
       'mfcc_sma_de[6]_lpc2', 'mfcc_sma_de[6]_lpc3', 'mfcc_sma_de[7]_lpc0',
       'mfcc_sma_de[7]_lpc1', 'mfcc_sma_de[7]_lpc2', 'mfcc_sma_de[8]_lpgain',
       'mfcc_sma_de[8]_lpc0', 'mfcc_sma_de[8]_lpc1', 'mfcc_sma_de[8]_lpc2',
       'mfcc_sma_de[9]_lpgain', 'mfcc_sma_de[9]_lpc1',
       'mfcc_sma_de[10]_lpgain', 'mfcc_sma_de[10]_lpc0',
       'mfcc_sma_de[10]_lpc1', 'mfcc_sma_de[11]_lpgain',
       'mfcc_sma_de[11]_

 ## Recursive Feature Elimination (RFE)

In [25]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import warnings
# Suppress all Python warnings from this point on in the kernel session.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# numerical warnings from every later cell. Presumably added to silence
# warnings raised while fitting LogisticRegression below; confirm, and
# consider narrowing it to that specific warning category.
warnings.filterwarnings("ignore")

In [28]:
# Recursive feature elimination with a logistic-regression estimator:
# remove 10 features per iteration until 50 remain.
rfe_selector = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=50,
    step=10,
)
rfe_selector.fit(X_train_scaled, y_train)

RFE(estimator=LogisticRegression(), n_features_to_select=50, step=10)

In [29]:
# Names of the feature columns retained by RFE: index the feature column
# labels with the selector's boolean support mask.
df.columns.drop(['file', 'type', 'start', 'end','label'])[rfe_selector.get_support()]

Index(['audspec_lengthL1norm_sma_iqr1-3', 'pcm_RMSenergy_sma_de_stddev',
       'pcm_fftMag_fband250-650_sma_percentile1.0',
       'pcm_fftMag_spectralFlux_sma_quartile3',
       'pcm_fftMag_spectralFlux_sma_iqr1-2',
       'pcm_fftMag_spectralEntropy_sma_iqr2-3',
       'pcm_fftMag_spectralVariance_sma_quartile1',
       'pcm_fftMag_spectralSkewness_sma_quartile3',
       'pcm_fftMag_spectralHarmonicity_sma_quartile2', 'mfcc_sma[1]_range',
       'mfcc_sma[1]_quartile3', 'mfcc_sma[2]_quartile2', 'mfcc_sma[5]_lpc1',
       'mfcc_sma[6]_lpc2', 'mfcc_sma[7]_quartile3', 'mfcc_sma[9]_lpc0',
       'mfcc_sma[11]_lpc1', 'audSpec_Rfilt_sma_de[22]_iqr1-3',
       'pcm_fftMag_spectralFlux_sma_de_pctlrange0-1',
       'mfcc_sma_de[1]_quartile3', 'mfcc_sma_de[3]_lpc1',
       'mfcc_sma_de[5]_lpgain', 'voicingFinalUnclipped_sma_quartile3',
       'jitterLocal_sma_quartile2', 'jitterDDP_sma_amean',
       'jitterDDP_sma_flatness', 'jitterDDP_sma_posamean',
       'jitterDDP_sma_quartile1', 'jitter

In [None]:
from sklearn.feature_selection import RFECV

# Random forest fitted on the full scaled training set; this fitted instance's
# feature_importances_ are read by the plotting cell further down.
cv_estimator = RandomForestClassifier(random_state=42)
cv_estimator.fit(X_train_scaled, y_train)

# Cross-validated recursive feature elimination: 5 folds, 50 features removed
# per step, candidate subsets scored by accuracy.
cv_selector = RFECV(
    cv_estimator,
    step=50,
    cv=5,
    scoring='accuracy',
).fit(X_train_scaled, y_train)

In [None]:
# Feature-only view of the table (label and metadata columns removed).
tdf = df.drop(columns=['file', 'type', 'start', 'end','label'])

# Boolean mask over the feature columns: True where RFECV kept the feature.
rfecv_mask = cv_selector.get_support()
# Boolean indexing replaces the manual loop (which also shadowed the builtin `bool`).
rfecv_features = tdf.columns[rfecv_mask].to_list()

# Plain ASCII quotes: the original typographic quotes were a SyntaxError.
print('Optimal number of features :', cv_selector.n_features_)
print('Best features :', rfecv_features)

n_features = tdf.shape[1]

# Plot only the most important features. Drawing one bar and one tick label
# per feature column for the whole table is unreadable in an 8x8 figure.
importances = cv_estimator.feature_importances_
top_n = min(30, n_features)
# argsort is ascending, so the largest importance ends up as the top bar.
top_idx = np.argsort(importances)[-top_n:]

plt.figure(figsize=(8,8))
plt.barh(range(top_n), importances[top_idx], align='center')
plt.yticks(np.arange(top_n), tdf.columns.values[top_idx])
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.title('Top {} random-forest feature importances'.format(top_n))
plt.show()