Steps:
* Re-interpolate the datasets so that all observations have the same length (20 timestamps)
* Roughly estimate parameters of classifiers with both datasets separately and together
* Compute the first integral (cumulative sum) of each signal and add it to the datasets
* Compare results

In [1]:
import csv
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
%matplotlib notebook

In [2]:
from scipy.interpolate import interp1d
from copy import deepcopy

In [3]:
def inter(Obs, ndots=100):
    """Resample every observation in ``Obs`` to ``ndots`` points.

    Each observation is laid out on an evenly spaced grid over [0, 100] with
    one node per original sample, fitted with a first-order spline
    (``kind='slinear'``) and evaluated on a second evenly spaced grid of
    ``ndots`` nodes over the same interval.

    Parameters
    ----------
    Obs : iterable of numpy arrays
        Observations to resample; the source grid size is ``shape[0]``.
    ndots : int, optional
        Number of points in every resampled observation (default 100).

    Returns
    -------
    list
        One resampled array per observation, in input order.
    """
    def resample(series):
        # Source and target grids share the [0, 100] span, so only the
        # sampling density changes.
        source_grid = np.linspace(0, 100, num=series.shape[0], endpoint=True)
        spline = interp1d(source_grid, series, kind='slinear')
        target_grid = np.linspace(0, 100, num=ndots, endpoint=True)
        return spline(target_grid)

    return [resample(series) for series in Obs]

In [4]:
def integrate(data):
    """Return a deep copy of ``data`` with every observation cumulatively summed.

    ``data`` is a three-level mapping (block -> signal type -> axis) whose
    leaves are indexable collections of observations. The input is left
    untouched; each observation in the copy is replaced by ``np.cumsum`` of
    itself.
    """
    integrated = deepcopy(data)
    for signals in integrated.values():
        for axes in signals.values():
            for observations in axes.values():
                # Replace element by element so the container itself is kept.
                for position, observation in enumerate(observations):
                    observations[position] = np.cumsum(observation)
    return integrated

In [5]:
def read_data(filename):
    """Read observations from a CSV file into a nested dictionary.

    Every non-empty row is one observation. The first column is discarded.
    The last column holds the block label: its first character is stripped
    and the remainder is converted to ``int`` when it consists only of
    digits. The columns in between are parsed as floats and de-interleaved
    in groups of six: acc x, acc y, acc z, gyr x, gyr y, gyr z.

    Parameters
    ----------
    filename : str
        Path of the CSV file (``','`` delimiter, ``'|'`` quote character).

    Returns
    -------
    dict
        ``data[block][signal][axis]`` is a list with one float numpy array
        per observation, where ``signal`` is ``'acc'`` or ``'gyr'`` and
        ``axis`` is ``'x'``, ``'y'`` or ``'z'``.

    Raises
    ------
    ValueError
        If a value column cannot be parsed as a float.
    """
    import sys  # local import: only needed to choose the csv-compatible mode

    # Position of each channel inside one interleaved group of six values.
    channels = (('acc', 'x'), ('acc', 'y'), ('acc', 'z'),
                ('gyr', 'x'), ('gyr', 'y'), ('gyr', 'z'))

    # The csv module expects a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filename, 'rb')
    else:
        csvfile = open(filename, 'r', newline='')

    data = {}
    with csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            if not row:
                # Blank lines (e.g. a trailing newline) carry no observation.
                continue
            block = row[-1][1:]
            if block.isdigit():
                block = int(block)
            if block not in data:
                data[block] = dict(
                    (signal, {'x': [], 'y': [], 'z': []})
                    for signal in ('gyr', 'acc'))
            # A list (not a lazy ``map``) so numpy builds a float array on
            # both Python 2 and Python 3.
            values = np.array([float(value) for value in row[1:-1]])
            for offset, (signal, axis) in enumerate(channels):
                data[block][signal][axis].append(
                    np.array(values[offset::len(channels)]))
    return data

In [6]:
def df_from_dict_interpoleted(data, ndots=20, use_gyro=True):
    """Build a flat feature table from the nested observation dictionary.

    Every observation of every block becomes one row. Each selected signal
    axis is resampled to ``ndots`` points with ``inter`` and the resampled
    axes are laid side by side.

    Parameters
    ----------
    data : dict
        ``data[block][signal][axis]`` -> list of 1-D arrays, with signals
        ``'acc'`` / ``'gyr'`` and axes ``'x'`` / ``'y'`` / ``'z'``.
    ndots : int, optional
        Number of resampled points per axis (default 20).
    use_gyro : bool, optional
        When False only the accelerometer axes are used.

    Returns
    -------
    pandas.DataFrame
        Columns are ``'block'`` followed by ``'<signal>_<axis>_<i>'`` for
        ``i`` in ``range(ndots)``; the index is a fresh 0..n-1 range.
        The ``'block'`` column keeps the label exactly as stored in ``data``.
        It is attached after the numeric stacking because stacking a
        non-numeric label together with the features would coerce every
        feature value to a string.
    """
    signal_types = ['acc', 'gyr'] if use_gyro else ['acc']
    axes = ['x', 'y', 'z']
    feature_columns = ['_'.join((signal_type, axis, str(ind)))
                       for signal_type in signal_types
                       for axis in axes
                       for ind in range(ndots)]
    columns = ['block'] + feature_columns

    # Collect one frame per block and concatenate once at the end; growing a
    # DataFrame with ``append`` inside the loop is quadratic and the method no
    # longer exists in recent pandas releases.
    frames = []
    for block in data:
        resampled = [inter(data[block][signal_type][axis], ndots)
                     for signal_type in signal_types
                     for axis in axes]
        if len(resampled[0]) == 0:
            # A block without observations contributes no rows.
            continue
        frame = pd.DataFrame(np.hstack(resampled), columns=feature_columns)
        frame.insert(0, 'block', block)
        frames.append(frame)

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [7]:
# data1.csv: your recordings; data2.csv: my recordings (I don't have a gyroscope).
data1, data2 = map(read_data, ('data1.csv', 'data2.csv'))

In [8]:
import os

# Machine-specific MinGW runtime directory that is put in front of PATH before
# the imports below run.
# NOTE(review): presumably required so that xgboost can locate its runtime
# DLLs on this Windows machine -- confirm, and adjust the path per machine.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'

# Only touch PATH when the directory really exists, use the platform's own
# separator (a hard-coded ';' corrupts PATH on non-Windows hosts) and do not
# prepend the same entry again when the cell is re-run.
if os.path.isdir(mingw_path):
    current_path = os.environ.get('PATH', '')
    if mingw_path not in current_path.split(os.pathsep):
        if current_path:
            os.environ['PATH'] = mingw_path + os.pathsep + current_path
        else:
            os.environ['PATH'] = mingw_path
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn.dummy import DummyClassifier
from xgboost.sklearn import XGBClassifier
from sklearn import grid_search
from sklearn.metrics import accuracy_score

In [9]:
# Resampled raw signals: accelerometer + gyroscope, then accelerometer only.
df1 = df_from_dict_interpoleted(data1)
df2 = df_from_dict_interpoleted(data2)
df1_acc = df_from_dict_interpoleted(data1, use_gyro=False)
df2_acc = df_from_dict_interpoleted(data2, use_gyro=False)

# Integrate each dataset once and reuse the result for both derived tables.
data1_int = integrate(data1)
data2_int = integrate(data2)
df1_vel = df_from_dict_interpoleted(data1_int, use_gyro=False)
df2_vel = df_from_dict_interpoleted(data2_int, use_gyro=False)

# Raw and integrated features side by side. Both halves use the same column
# names, so these frames deliberately carry duplicated columns (including two
# 'block' columns).
df1i = pd.concat([df1, df_from_dict_interpoleted(data1_int)], axis=1)
df2i = pd.concat([df2, df_from_dict_interpoleted(data2_int)], axis=1)

# Stack both datasets. ``pd.concat`` replaces ``DataFrame.append``, which was
# removed in pandas 2.0. ``reset_index()`` (without drop) keeps the old index
# as an 'index' column, exactly as before.
df = pd.concat([df1, df2]).reset_index()
dfi = pd.concat([df1i, df2i]).reset_index()

In [10]:
# Split every frame into a label vector ('block') and a feature table.
# ``pop`` returns the column and removes it from the frame in place, so each
# X* name still refers to the same (now label-free) object as before.
target1 = df1.pop('block')
X1 = df1
target2 = df2.pop('block')
X2 = df2
target = df.pop('block')
del df['index']  # left over from reset_index()
X = df

# The "+ integrated" frames hold two columns named 'block' (one per
# concatenated half), so ``pop`` yields a two-column frame; keep the first.
# ``.iloc`` / ``.values`` replace ``.ix`` / ``.as_matrix()``, which were
# removed in pandas 1.0.
target1i = df1i.pop('block').iloc[:, 0]
X1i = df1i.values
target2i = df2i.pop('block').iloc[:, 0]
X2i = df2i.values
targeti = dfi.pop('block').iloc[:, 0]
del dfi['index']  # left over from reset_index()
Xi = dfi.values

# Accelerometer-only tables.
target1_acc = df1_acc.pop('block')
X1_acc = df1_acc
target2_acc = df2_acc.pop('block')
X2_acc = df2_acc

In [11]:
# Integrated accelerometer tables: pull the 'block' labels out of each frame
# (removing the column in place) and use what is left as the feature table.
target1_vel = df1_vel.pop('block')
X1_vel = df1_vel
target2_vel = df2_vel.pop('block')
X2_vel = df2_vel

What we have:

X1 - first dataset

X2 - second dataset

X - both datasets 

X1i - first dataset + its integrated observations

X2i - second dataset + its integrated observations

Xi - both datasets + their integrated observations

Here we roughly estimate the parameters of the XGB and SVM classifiers. We also consider a dummy estimator, based on class frequencies, as a baseline.

In [128]:
def params_xgb():
    """Return an XGBClassifier with randomly drawn hyper-parameters.

    One value per hyper-parameter is drawn with ``np.random.choice`` from a
    small fixed list of candidates.
    """
    # A list of pairs (not a dict) keeps the order of the random draws fixed.
    candidates = [
        ('max_depth', [5, 10, 15, 20]),
        ('learning_rate', [0.01, 0.05, 0.1, 0.5, 1]),
        ('n_estimators', [10, 50, 100, 150, 200, 300]),
        ('subsample', [0.3, 0.6, 0.7, 0.8, 1]),
        ('colsample_bytree', [0.4, 0.6, 0.7, 0.8, 1]),
    ]
    drawn = {}
    for name, options in candidates:
        drawn[name] = np.random.choice(options, 1)[0]
    return XGBClassifier(**drawn)

In [None]:
# Random search over XGB hyper-parameters: draw a configuration, score it with
# 4-fold cross-validation and report it when the mean accuracy exceeds 0.6.
# The search is bounded so the cell terminates on "Restart & Run All"; raise
# n_trials for a longer search.
n_trials = 50
for _ in range(n_trials):
    xgbc = params_xgb()
    scores_xgb = cross_validation.cross_val_score(xgbc, Xi, targeti, cv=4)
    mean_score = np.mean(scores_xgb)
    if mean_score > 0.6:
        print(xgbc)
        print(mean_score)

In [None]:
# Exhaustive search over the SVM regularisation strength for a linear and an
# RBF kernel.
param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf']}]
# Pass a fresh estimator: no module-level ``svc`` is defined at this point,
# and the grid sets both ``C`` and ``kernel`` anyway.
grid_search1 = grid_search.GridSearchCV(SVC(), param_grid=param_grid)
grid_search1.fit(Xi, targeti)
print(grid_search1.grid_scores_)

In [13]:
def svm_xgb_dummy(X, y):
    """Score a linear SVM, a fixed-parameter XGB model and a dummy baseline.

    Each estimator is evaluated on ``(X, y)`` with 4-fold cross-validation.

    Returns
    -------
    tuple
        Mean cross-validation score of the SVM, of XGB and of the dummy
        classifier, in that order.
    """
    def mean_cv_score(estimator):
        fold_scores = cross_validation.cross_val_score(estimator, X, y, cv=4)
        return np.mean(fold_scores)

    baseline = DummyClassifier()
    linear_svm = SVC(kernel='linear')
    boosted = XGBClassifier(colsample_bytree=0.4, learning_rate=0.05, max_depth=10,
                            n_estimators=200, subsample=0.6)

    # Evaluation order (SVM, dummy, XGB) differs from the return order.
    svm_score = mean_cv_score(linear_svm)
    baseline_score = mean_cv_score(baseline)
    boosted_score = mean_cv_score(boosted)
    return svm_score, boosted_score, baseline_score

In [14]:
# Accumulator for the comparison table: one list per output column.
results = {
    'dataset': [],
    'SVM': [],
    'XGB': [],
    'Dummy': [],
}

In [15]:
# Every (label, features, targets) combination to evaluate, in the order the
# rows should appear in the results table.
evaluations = [
    ('data1', X1, target1),
    ('data2', X2, target2),
    ('data1+data2', X, target),
    ('data1+integrated', X1i, target1i),
    ('data2+integrated', X2i, target2i),
    ('data1+data2+integrated', Xi, targeti),
    ('data1 accelerometer only', X1_acc, target1_acc),
    ('data2 accelerometer only', X2_acc, target2_acc),
    ('data1 velocity only', X1_vel, target1_vel),
    ('data2 velocity only', X2_vel, target2_vel),
]

# Score the three classifiers on each dataset and append one row per dataset.
for dataset_name, features, labels in evaluations:
    s_svm, s_xgb, s_dum = svm_xgb_dummy(features, labels)
    results['dataset'].append(dataset_name)
    results['SVM'].append(s_svm)
    results['XGB'].append(s_xgb)
    results['Dummy'].append(s_dum)

In [16]:
# ``from_dict`` is a classmethod that returns a new frame; bind its result so
# ``results_df`` actually holds the table instead of staying empty.
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,Dummy,SVM,XGB,dataset
0,0.058201,0.8775,0.811234,data1
1,0.065368,0.912507,0.794024,data2
2,0.082476,0.74038,0.682392,data1+data2
3,0.081172,0.938828,0.751285,data1+integrated
4,0.08105,0.909472,0.789767,data2+integrated
5,0.081209,0.739604,0.701771,data1+data2+integrated
6,0.098201,0.869435,0.739506,data1 accelerometer only
7,0.057841,0.912507,0.808035,data2 accelerometer only
8,0.051885,0.874079,0.647512,data1 velocity only
9,0.072547,0.909472,0.805909,data2 velocity only


In [239]:
# Display the default configuration of the dummy baseline (its repr is the
# cell output).
DummyClassifier()

DummyClassifier(constant=None, random_state=None, strategy='stratified')