In [1]:
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pywt
import biosppy

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.ensemble import BaggingClassifier

from sklearn.svm import LinearSVC

In [2]:
files = map(np.load, glob.glob("../export/data/physio_data*.npz"))

In [3]:
def apply_wavelet(chunk, w_mother='db3', w_maxlev=None):

    w = pywt.Wavelet(w_mother)        

    if w_maxlev is None:
        w_maxlev = pywt.dwt_max_level(len(chunk), w.dec_len)

    # extract only the wavelet approximation coefficient from
    # the latest meaningful level
    return pywt.downcoef('a', chunk, w, level=w_maxlev)


def reconstruct_wavelet(feature, w_maxlev, w_lenchunk, w_mother='db3'):
    
    w = pywt.Wavelet(w_mother)        

    # try the reconstruction from the considered coefficient
    rec_chunk = pywt.upcoef('a', feature, w, level=w_maxlev , take=w_lenchunk)

    # smooth the reconstruction
    rec_chunk = biosppy.tools.smoother(rec_chunk)['signal']

    return rec_chunk

    
def extract_feat(signal, window=(2,1), fps=25, show=False):

    chunk_len = int(fps) * window[0]
    overlap_len = chunk_len - (int(fps) * window[1])
    
    # split signal in chunks and apply the related feature extractor
    feature =  biosppy.signals.tools.windower(signal=signal, size=chunk_len, step=1, fcn=apply_wavelet)
    
    feature = pd.DataFrame(feature[1])

    return feature


def get_dataset():
    
    # load data

    n_subject = len(files)

    skip_sample = 100
    n_samples = 1500

    data = pd.DataFrame()

    for f in files:

        session_info = f["session_info"].tolist()

        mediafile = session_info["mediaFile"]
        sid = int(session_info["sessionId"])
        label = int(session_info["feltVlnc"])

        print "load data for {}".format(sid)

        d = pd.DataFrame()

        for signal in ['ecg', 'eda', 'resp', 'skt']:
            s = f[signal].tolist()
            s_fps = s["fps"]
            s_data = s["processed"][skip_sample:skip_sample + n_samples]
            coef = extract_feat(s_data, fps=s_fps, window=(1,1))
            d = pd.concat([d, coef], axis=1)
            
        d["label"] = label

        # concatenate all subjects' data
        data = pd.concat([data, d], ignore_index=True)

    # remove NaN values with median for each feature 
    data.fillna(data.median(), inplace=True)

    X = data.drop(["label"], axis=1)
    Y = data["label"]
    
    # reduce class numbers
    Y[Y==1] = 0
    Y[Y==2] = 0
    Y[Y==3] = 0
    Y[Y==5] = 0
    Y[Y==7] = 0
    Y[Y==12] = 0

    Y[Y==4] = 1
    Y[Y==6] = 1
    Y[Y==8] = 1
    Y[Y==9] = 1
    Y[Y==11] = 1
    
    return X, Y

# Create classifier and test prediction
def create_and_test_classifier(X, Y):
    
    clf = LinearSVC(C=1, tol=1e-4, dual=False, fit_intercept=False, class_weight='balanced', verbose=False)
    cv = KFold(n_splits=10, shuffle=False, random_state=0)
    
    n_estimators = 10
    bagging = BaggingClassifier(clf, max_samples=1.0 / n_estimators, n_estimators=n_estimators, n_jobs=-1, verbose=True)

    scores = cross_val_score(bagging, X, Y.values.ravel(), cv=cv, n_jobs=-1, verbose=True, scoring='balanced_accuracy')

    return scores

In [4]:
X, Y = get_dataset()
print X.shape
print Y.shape

load data for 398
load data for 408
load data for 420
load data for 426
load data for 430
load data for 524
load data for 530
load data for 542
load data for 546
load data for 548
load data for 786
load data for 798
load data for 800
load data for 810
load data for 814
load data for 920
load data for 926
load data for 932
load data for 944
load data for 948
load data for 1052
load data for 1054
load data for 1066
load data for 10
load data for 1178
load data for 1188
load data for 1200
load data for 1206
load data for 1210
load data for 132
load data for 142
load data for 152
load data for 1562
load data for 1586
load data for 1588
load data for 1592
load data for 1598
load data for 160
load data for 166
load data for 1698
load data for 1702
load data for 1712
load data for 1714
load data for 1726
load data for 1952
load data for 1956
load data for 1962
load data for 1974
load data for 2086
load data for 2088
load data for 2090
load data for 20
load data for 2100
load data for 2118
loa

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documen

(165424, 36)
(165424,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
scores = create_and_test_classifier(X, Y)
print "\taccuracy: {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	accuracy: 0.52 (+/- 0.18)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.8min finished
