# Boomerang Audio Analysis with Raw Audio

In [3]:
import pandas as pd
import seaborn as sns
import datetime
import numpy as np
%matplotlib inline
import matplotlib.pylab as plt
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

import logging
logging.basicConfig(level=logging.WARN)
import sklearn.metrics

In [4]:
import os
from natsort import natsorted

path ='../csv_files/' # use your path
allFiles = [path+file for file in natsorted(os.listdir(path)) if not file.startswith('TBLT_')]
allFiles
print("Total Files",len(allFiles))

Total Files 19


In [18]:
data = pd.DataFrame()
list_ = []
file_id_list = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=None)
    df.columns = ['id','time','P1','P2','label']
    dt = df.loc[:,['id','label']].drop_duplicates()
    dt['act_id'] = dt["id"].astype(int)
    dt['act_id'] = dt['act_id'] - min(dt['id']) + 1
    dt['file'] = file_.split("/")[2]
    file_id_list.append(dt)
    list_.append(df)
data = pd.concat(list_)
file_list = pd.concat(file_id_list)
file_list.columns = ['id','label','actual_id','Filename']

In [20]:
data.head()

Unnamed: 0,id,time,P1,P2,label
0,1,0.0,0.0,0.0,P
1,1,0.000125,0.0,0.0,P
2,1,0.00025,0.0,0.0,P
3,1,0.000375,0.0,0.0,P
4,1,0.0005,0.0,0.0,P


In [None]:
data.columns = ['time','P1','P2','label','id']

In [21]:
# Convert to Milliseconds
data['time'] = data['time'] * 1e+3


In [22]:
# Take one sample for every 1000 samples
data = data.iloc[::1000, :]

In [23]:
# Sort data frame by id and time
data = data.sort_values(['id', 'time'], ascending=[True, True])

In [24]:
id_values = data.id.unique()

In [25]:
data.head()

Unnamed: 0,id,time,P1,P2,label
0,1,0.0,0.0,0.0,P
1000,1,125.0,0.0,0.0,P
2000,1,250.0,0.0,0.0,P
3000,1,375.0,0.0,0.0,P
4000,1,500.0,0.0,0.0,P


In [26]:
file_list.head()

Unnamed: 0,id,label,actual_id,Filename
0,1,P,1,11-14-2014-320-G.csv
313784,2,C,2,11-14-2014-320-G.csv
3067441,3,C,3,11-14-2014-320-G.csv
5025970,4,C,4,11-14-2014-320-G.csv
7463963,5,A,5,11-14-2014-320-G.csv


In [27]:
file_list.loc[file_list['id']==1,["actual_id","Filename"]]

Unnamed: 0,actual_id,Filename
0,1,11-14-2014-320-G.csv


In [None]:
# Print Time series graphs
import matplotlib
#pdf_doc = matplotlib.backends.backend_pdf.PdfPages('A-plots.pdf')
for i in id_values:
    if data[data.id == i].label.unique()[0] == 'P':
        data[(data['id'] == i)][['time', 'P1', 'P2']].plot(x='time', title="Label at: "+str(file_list.loc[file_list['id']==i,["actual_id","Filename"]])+
                                                      "--"+data[data.id == i].label.unique()[0], figsize=(12, 6));
        
        #pdf_doc.savefig(sub_data_plot.get_figure())
        
#pdf_doc.close()

In [None]:
t = data[(data['id'] == 1)][['time', 'P1', 'P2']].plot(x='time', title="Label at: "+str(file_list.loc[file_list['id']==1,["actual_id","Filename"]])+
                                                      "--"+data[data.id == i].label.unique()[0], figsize=(12, 6));

In [None]:
t

In [34]:
extraction_settings = ComprehensiveFCParameters()

In [35]:
data = data[data.label != 'IG']
data_input = data[['time','P1','P2','id']]
data_input.id.unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        28,  29,  30,  31,  32,  33,  34,  35,  41,  42,  43,  44,  45,
        46,  47,  48,  49,  50,  51,  52,  53,  56,  57,  58,  59,  60,
        61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 107, 108, 110, 111, 112, 113, 114, 116, 117, 118, 119, 120,
       121, 122, 123, 124, 125, 126, 127, 128, 130, 131, 132, 133, 134,
       135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
       149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161,
       162, 163, 164, 165, 166, 167, 168, 170, 171, 172, 174, 175, 176,
       177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
       190, 191, 192, 193, 195, 196, 197, 198, 199, 200, 201, 20

In [36]:
# Fix minor label issues with data
data.at[data.label=='C ','label'] = 'C'
data.at[data.label=='A  ','label'] = 'A'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [37]:
data_label = data[['label','id']]
data_label = data_label.drop_duplicates()['label']

In [38]:
pd.Series(data_label).value_counts()

C    175
A    143
P     51
Name: label, dtype: int64

In [39]:
data_label = data_label.as_matrix().flatten()

In [40]:
X = extract_features(data_input, 
                     column_id='id', column_sort='time',
                     default_fc_parameters=extraction_settings,
                     impute_function= impute)

Feature Extraction: 100%|██████████| 738/738 [16:47<00:00,  1.37s/it]   
 'P1__friedrich_coefficients__m_3__r_30__coeff_1'
 'P1__friedrich_coefficients__m_3__r_30__coeff_2'
 'P1__friedrich_coefficients__m_3__r_30__coeff_3'
 'P1__max_langevin_fixed_point__m_3__r_30'
 'P2__friedrich_coefficients__m_3__r_30__coeff_0'
 'P2__friedrich_coefficients__m_3__r_30__coeff_1'
 'P2__friedrich_coefficients__m_3__r_30__coeff_2'
 'P2__friedrich_coefficients__m_3__r_30__coeff_3'
 'P2__max_langevin_fixed_point__m_3__r_30'] did not have any finite values. Filling with zeros.


In [41]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369 entries, 1 to 605
Columns: 1576 entries, P1__abs_energy to P2__variance_larger_than_standard_deviation
dtypes: float64(1576)
memory usage: 4.4 MB


In [None]:
X.shape

In [104]:
X_filtered = extract_relevant_features(data_input, data_label, 
                                       column_id='id', column_sort='time', 
                                       default_fc_parameters=extraction_settings)

Feature Extraction: 100%|██████████| 738/738 [17:46<00:00,  1.44s/it]   
 'P1__friedrich_coefficients__m_3__r_30__coeff_1'
 'P1__friedrich_coefficients__m_3__r_30__coeff_2'
 'P1__friedrich_coefficients__m_3__r_30__coeff_3'
 'P1__max_langevin_fixed_point__m_3__r_30'
 'P2__friedrich_coefficients__m_3__r_30__coeff_0'
 'P2__friedrich_coefficients__m_3__r_30__coeff_1'
 'P2__friedrich_coefficients__m_3__r_30__coeff_2'
 'P2__friedrich_coefficients__m_3__r_30__coeff_3'
 'P2__max_langevin_fixed_point__m_3__r_30'] did not have any finite values. Filling with zeros.








In [105]:
X_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369 entries, 1 to 605
Columns: 477 entries, P1__fft_coefficient__coeff_93__attr_"abs" to P1__partial_autocorrelation__lag_8
dtypes: float64(477)
memory usage: 1.3 MB


In [116]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
new_t = sel.fit_transform(X_filtered)

In [117]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import decomposition
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE


sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(new_t, data_label)
X_res.shape


    



(525, 305)

In [108]:
import pandas as pd
pd.Series(y_res).value_counts()


A    175
P    175
C    175
dtype: int64

## Evaluation of Boomerang Problem With Random Forests having all three predictors

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y_res, test_size=0.20, random_state=42)






forest = RandomForestClassifier(n_estimators=20)

forest.fit(X_train,y_train)
print(classification_report(y_test, forest.predict(X_test)))
predicted = forest.predict(X_test)
confusion_matrix(y_test, predicted)

             precision    recall  f1-score   support

          A       0.87      0.76      0.81        45
          C       0.71      0.89      0.79        27
          P       0.84      0.82      0.83        33

avg / total       0.82      0.81      0.81       105



array([[34,  7,  4],
       [ 2, 24,  1],
       [ 3,  3, 27]])

In [148]:
sklearn.metrics.cohen_kappa_score(y_test,predicted)

0.71217105263157898

## Evaluation of Boomerangs with A lumped with C

In [110]:
# Replace C to A
y_res[y_res=='A'] = 'C'

In [None]:
y_res

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y_res, test_size=0.10, random_state=42)/

forest = RandomForestClassifier(n_estimators=20)

forest.fit(X_train,y_train)
print(classification_report(y_test, forest.predict(X_test)))
predicted =  forest.predict(X_test)
confusion_matrix(y_test,predicted)

             precision    recall  f1-score   support

          C       0.94      0.94      0.94        36
          P       0.88      0.88      0.88        17

avg / total       0.92      0.92      0.92        53



array([[34,  2],
       [ 2, 15]])

In [114]:
sklearn.metrics.cohen_kappa_score(y_test,predicted)

0.82679738562091498

## Evaluation of Boomerangs with A lumped with P

In [150]:
# Replace C to A
y_res[y_res=='A'] = 'P'

In [239]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y_res, test_size=0.10, random_state=42)

forest = RandomForestClassifier(n_estimators=20)

forest.fit(X_train,y_train)
print(classification_report(y_test, forest.predict(X_test)))
predicted =  forest.predict(X_test)
confusion_matrix(y_test,predicted)

             precision    recall  f1-score   support

          C       0.78      0.88      0.82        16
          P       0.94      0.89      0.92        37

avg / total       0.89      0.89      0.89        53



array([[14,  2],
       [ 4, 33]])

In [240]:
sklearn.metrics.cohen_kappa_score(y_test,predicted)

0.74061990212071782