# Boomerang Audio Analysis with OpenSmile Emobase Toolkit

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_selection import  SelectKBest

### Combine data from Multiple Files

In [2]:
!ls ../sox_features/

11-14-2014-1020-G.csv  11-17-2014-1148-G.csv  11-20-2014-300.csv
11-14-2014-1148-G.csv  11-18-2014-0515.csv    11-21-2014-0315.csv
11-14-2014-320-G.csv   11-18-2014-1130.csv    11-21-2014-1015.csv
11-14-2014-540-G.csv   11-18-2014-330.csv     11-21-2014-1145.csv
11-17-2014-0130-G.csv  11-19-2014-1145.csv    11-21-2014-810AM.csv
11-17-2014-0600-G.csv  11-19-2014-605.csv
11-17-2014-1010-G.csv  11-20-2014-0435.csv


In [3]:
directory = '../sox_features/'
features_data =  []
for element in os.listdir(directory):
    data = pd.read_csv(directory+element,index_col=None)
    
        
    features_data.append(data)

sox_feature_frame = pd.concat(features_data)
    
    

In [4]:
# Sanity Check to see if the columsn match
for element in np.where(sox_feature_frame['class_y'] == sox_feature_frame['class_x'],'yes','no') :
    if element == 'no':
        print('Data Frame ERROR! Do not proceed!')

In [5]:
columns = ['name_x','name_y','class_x']
sox_feature_frame.drop(columns,inplace=True,axis=1)

In [6]:
# Remove IG Rows from the dataframe

sox_feature_frame = sox_feature_frame[sox_feature_frame.class_y != 'IG']

In [7]:
sox_feature_frame.shape

(369, 1980)

In [8]:
labels_distinct =  sox_feature_frame['class_y'].unique()
labels_distinct

array(['A', 'C', 'P'], dtype=object)

In [9]:
labels = sox_feature_frame['class_y']

In [10]:
sox_feature_frame.drop(['class_y'],inplace=True,axis=1)

### Preprocessing

Convert all Non-Numeric(String) Labels into Numbers. 

In [11]:
df_labels = pd.DataFrame(labels)
label_dummies = pd.get_dummies(df_labels)

In [12]:
label_dummies.head()

Unnamed: 0,class_y_A,class_y_C,class_y_P
0,1,0,0
1,1,0,0
2,0,1,0
3,1,0,0
4,0,1,0


### Feature Selection

In [126]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
new_t = sel.fit_transform(sox_feature_frame)

In [127]:
new_t.shape

(369, 894)

In [444]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import decomposition
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE


sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(new_t, labels)
X_res.shape


(525, 894)

In [337]:
import pandas as pd
pd.Series(y_res).value_counts()


A    175
P    175
C    175
dtype: int64

### Evaluation of Boomerang Problem with Random Forests having all three Classes

In [486]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

import logging
logging.basicConfig(level=logging.WARN)
import sklearn.metrics

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.20, random_state=42)

forest = RandomForestClassifier(n_estimators=20)

forest.fit(X_train,y_train)
print(classification_report(y_test, forest.predict(X_test)))
predicted = forest.predict(X_test)
confusion_matrix(y_test, predicted)


             precision    recall  f1-score   support

          A       0.74      0.79      0.76        33
          C       0.87      0.77      0.81        43
          P       0.91      1.00      0.95        29

avg / total       0.84      0.84      0.84       105



array([[26,  5,  2],
       [ 9, 33,  1],
       [ 0,  0, 29]])

In [487]:
print("Cohens Kappa",sklearn.metrics.cohen_kappa_score(y_test,predicted))

Cohens Kappa 0.755747126437


### Evaluation of Boomerangs with A lumped with C

In [490]:
# Replace C to A
y_res[y_res=='A'] = 'C'

In [491]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.10, random_state=42)

forest = RandomForestClassifier(n_estimators=20)

forest.fit(X_train,y_train)
print(classification_report(y_test, forest.predict(X_test)))
predicted =  forest.predict(X_test)
confusion_matrix(y_test,predicted)


             precision    recall  f1-score   support

          C       1.00      0.95      0.97        39
          P       0.88      1.00      0.93        14

avg / total       0.97      0.96      0.96        53



array([[37,  2],
       [ 0, 14]])

In [492]:
print("Cohens Kappa",sklearn.metrics.cohen_kappa_score(y_test,predicted))

Cohens Kappa 0.907180385289


## Evaluation of Boomerangs with A lumped with P

In [442]:
y_res = temp_y

In [358]:
# Replace C to A
y_res[y_res=='A'] = 'P'

In [440]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.10, random_state=42)

forest = RandomForestClassifier(n_estimators=20)

forest.fit(X_train,y_train)
print(classification_report(y_test, forest.predict(X_test)))
predicted =  forest.predict(X_test)
confusion_matrix(y_test,predicted)


             precision    recall  f1-score   support

          C       0.87      0.83      0.85        24
          P       0.87      0.90      0.88        29

avg / total       0.87      0.87      0.87        53



array([[20,  4],
       [ 3, 26]])

In [441]:
print("Cohens Kappa",sklearn.metrics.cohen_kappa_score(y_test,predicted))

Cohens Kappa 0.732516222062


## Comparision of Values

| Method | Three Classes | A lumped with C | A Lumped with P|
| ------ | --------------|-----------------|----------------|
| Time Distance old |  0.82 | 0.85 | Not reported|
| Raw Audio | 0.712 | 0.826 |0.7406 |
| Boomerangs Sox | 0.75 | 0.9071 | 0.7325|