In [87]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import signal
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor)
from sklearn.svm import SVR
from sklearn.model_selection import (KFold, GridSearchCV)
from sklearn.preprocessing import (
    MinMaxScaler,
    Normalizer,
    RobustScaler,
    StandardScaler,
)

In [88]:
# applies low pass filtering
def lpFilter(exp, cutoff_hz=5, order=2, fs=1/0.01,
             features=('ax','ay','az','mx','my','mz','gx','gy','gz')):
    """Low-pass filter selected columns of ``exp`` in place.

    A digital Butterworth low-pass filter is designed as second-order
    sections and each listed column is overwritten with the output of
    ``signal.sosfilt`` (a single forward pass over the column).

    Parameters
    ----------
    exp : pandas.DataFrame
        Frame holding the columns named in ``features``. It is modified in
        place; a missing column raises ``KeyError``.
    cutoff_hz : float, default 5
        Cut-off frequency, expressed in the same units as ``fs``.
    order : int, default 2
        Order of the Butterworth filter.
    fs : float, default 1/0.01
        Sampling frequency handed to ``signal.butter``.
    features : iterable of str
        Columns to filter. Defaults to the nine accelerometer / magnetometer /
        gyroscope columns used throughout this notebook.

    Returns
    -------
    None
    """
    sos = signal.butter(order, cutoff_hz, btype='lowpass', output='sos', analog=False, fs=fs)

    for column in features:
        exp[column] = signal.sosfilt(sos, exp[column])

def getZeroCrossingRate(arr):
    """Return the share of neighbouring sample pairs whose signs differ.

    A pair counts only when the product of the two samples is strictly
    negative, so a pair that contains an exact zero is not counted. The
    count is divided by ``len(arr)`` and the result is rounded to two
    decimals by formatting it as text and parsing it back.
    """
    samples = np.array(arr)
    earlier, later = samples[:-1], samples[1:]
    sign_flips = np.sum(earlier * later < 0)
    rate = sign_flips / len(arr)
    return float(f"{rate:.2f}")
def getMeanCrossingRate(arr):
    """Return the zero-crossing rate of ``arr`` after subtracting its mean."""
    values = np.array(arr)
    centred = values - np.mean(arr)
    return getZeroCrossingRate(centred)


# raw experiment data, combine imu and video data and create time window splits
def augmentation(exp, expXY, window_s=1.0, overlap_s=0.25, epsilon=0.01):
    """Align IMU rows with video samples and summarise them per time window.

    Steps:
      1. ``yaw`` is re-based so its first sample is 0, and a synthetic time
         column is built by spreading the IMU rows evenly over
         ``expXY.t.max()``.
      2. For every row of ``expXY`` a forward-only scan picks the first IMU
         row whose time lies within ``epsilon`` of the video time (the scan
         stops at the last IMU row if none matches).
      3. The selected IMU rows are placed side by side with ``expXY`` and cut
         into windows of ``window_s`` that overlap by ``overlap_s``.
      4. For each window the mean, standard deviation and kurtosis of every
         IMU feature are stored, together with the change in ``x`` and ``y``
         between the first and last row of the window.

    Parameters
    ----------
    exp : pandas.DataFrame
        IMU samples with columns ax, ay, az, mx, my, mz, gx, gy, gz, yaw,
        pitch, roll. The caller's frame is left untouched (a copy is used).
        NOTE(review): the synthetic time column is derived from ``exp.index``,
        so a default 0..n-1 index is assumed - confirm against the CSV loader.
    expXY : pandas.DataFrame
        Video track with columns t, x, y. The side-by-side concat aligns on
        the index, so a default 0..n-1 index is assumed here as well.
    window_s : float, default 1.0
        Window length, in the units of ``expXY.t``.
    overlap_s : float, default 0.25
        Overlap between consecutive windows, in the same units.
    epsilon : float, default 0.01
        Largest accepted time difference when matching an IMU row to a video row.

    Returns
    -------
    pandas.DataFrame
        One row per window with columns ``avg_<f>``, ``std_<f>``, ``kurt_<f>``
        for every feature ``f``, followed by ``dx`` and ``dy``.

    Raises
    ------
    ValueError
        If the window step rounds to zero rows or less while at least one
        window would fit (the original loop never terminated in that case).
    """
    # Further statistics (var, rms, median, zero-/mean-crossing rate) were
    # disabled in the original notebook and are not produced here.
    stats = ['avg_', 'std_', 'kurt_']
    features = ['ax','ay','az','mx','my','mz','gx','gy','gz','yaw','pitch','roll']

    # Work on a copy so the caller's frame keeps its yaw values and gains no 't' column.
    exp = exp.copy()
    exp['yaw'] = exp['yaw'] - exp['yaw'].iloc[0]
    exp['t'] = exp.index * expXY.t.max()/len(exp)

    # Forward-only scan: the pointer never moves backwards, so each video row
    # resumes the search where the previous one stopped.
    imu_times = exp['t'].to_numpy()
    video_times = expXY['t'].to_numpy()
    last_imu_row = len(imu_times) - 1
    index = [0]
    pointer = 0
    for target in video_times[1:]:
        while pointer < last_imu_row and abs(imu_times[pointer] - target) > epsilon:
            pointer += 1
        index.append(pointer)

    newexp = pd.concat([exp.iloc[index].drop(columns=['t']).reset_index(drop=True), expXY], join='outer', axis=1)

    # Convert the window length and overlap into row counts.
    totalRows = len(newexp.index)
    rowdeltaT = newexp.t.max()/totalRows
    numRows = round(window_s/rowdeltaT)
    numRowsOverlap = round(overlap_s/rowdeltaT)
    step = numRows - numRowsOverlap
    if step <= 0 and numRows < totalRows:
        raise ValueError(
            "window step is %d rows (window=%d, overlap=%d); windows would never advance"
            % (step, numRows, numRowsOverlap)
        )

    # NOTE(review): the bound is strict, so a window that ends exactly on the
    # last row is not emitted - kept as in the original.
    windowStarts = range(0, totalRows - numRows, step) if step > 0 else []

    finalexp = {stat + name: [] for name in features for stat in stats}
    finalexp['dx'] = []
    finalexp['dy'] = []

    for start in windowStarts:
        data = newexp.iloc[start:start + numRows]
        for name in features:
            column = data[name]
            finalexp['avg_' + name].append(column.mean())
            finalexp['std_' + name].append(column.std())
            finalexp['kurt_' + name].append(column.kurt())
        firstRow = data.iloc[0]
        lastRow = data.iloc[-1]
        finalexp['dx'].append(lastRow.x - firstRow.x)
        finalexp['dy'].append(lastRow.y - firstRow.y)

    return pd.DataFrame(finalexp)

In [89]:
# loads all datasets and stores them in the exp dict, keyed by 1-based experiment number
exp = {}
numExp = 9
for i in range(numExp):
    expTemp = pd.read_csv('data/exp' + str(i+1) + '.csv')
    expXYTemp = pd.read_csv('data/exp' + str(i+1) + 'XY.csv')
    lpFilter(expTemp)  # filters the sensor columns of expTemp in place
    exp[i+1] = augmentation(expTemp,expXYTemp)

# creates finalexp
# NOTE(review): experiment 5 is loaded but left out of the combined frame;
# the reason is not recorded in this notebook - confirm it is intentional.
include = [exp[k] for k in range(1, numExp+1) if k != 5]
finalexp = pd.concat(include,join='inner').reset_index(drop=True)

# ten columns with the largest signed correlation to dx (position 0 is skipped:
# dx correlates perfectly with itself and therefore sorts first)
# NOTE(review): ranking is by signed value, so strongly negative correlations are never picked.
finalexp.drop(columns=['dy']).corr().dx.sort_values(ascending=False)[1:11]

avg_az     0.226646
avg_ay     0.121036
std_mx     0.096383
avg_gy     0.076980
std_gx     0.074002
std_my     0.067339
std_yaw    0.065085
std_gz     0.063159
avg_yaw    0.042978
std_mz     0.036426
Name: dx, dtype: float64

In [90]:
# prep data
# rank the columns by signed correlation with dx; dx itself sorts first, so keep positions 1..10
dx_corr = finalexp.drop(columns=['dy']).corr()['dx']
features = dx_corr.sort_values(ascending=False).index[1:11]
X = finalexp[features]
Y = finalexp['dx']

# splitter and scaler shared by the model cells below
num_splits = 10
kf = KFold(shuffle=True, random_state=42, n_splits=num_splits)
stdsclr = StandardScaler()

In [91]:
# decision tree regression
dt_params = {
    'max_features': [None, 1.0, 5, 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 7, 10, 15],
    'min_samples_leaf': [1, 5, 7, 10, 15, 20],
}
dregr = DecisionTreeRegressor(random_state=42)
d_clf = GridSearchCV(estimator=dregr, param_grid=dt_params, cv=num_splits)

# the search runs on the full feature matrix, standardised once up front
d_clf.fit(stdsclr.fit_transform(X), Y)
print(d_clf.best_params_)

# kfold score: rebuild the tree with the winning settings and re-fit the
# scaler on every training fold before scoring on the held-out fold
dregr = DecisionTreeRegressor(random_state=42, **d_clf.best_params_)
score = 0
for train_idx, test_idx in kf.split(X):
    fold_trainX = stdsclr.fit_transform(X.iloc[train_idx])
    fold_testX = stdsclr.transform(X.iloc[test_idx])
    dregr.fit(fold_trainX, Y.iloc[train_idx])
    score += dregr.score(fold_testX, Y.iloc[test_idx])

score /= num_splits
print(score)

{'max_features': None, 'min_samples_leaf': 15, 'min_samples_split': 2}
0.6875414103981742


In [92]:
# random forests regression
# The estimator handed to GridSearchCV is seeded (as the decision-tree cell
# already does) so that the selected parameters are reproducible across runs.
regr = RandomForestRegressor(random_state=42)
rf_params = {'max_features': [None, 1.0,5,'sqrt','log2'],'n_estimators': [5,10,20,30,50,100], 'min_samples_split': [2,5,7,10,15], 'min_samples_leaf': [1,5,7,10,15,20]}
r_clf = GridSearchCV(estimator=regr,param_grid=rf_params,cv=num_splits)

# the search runs on the full feature matrix, standardised once up front
r_clf.fit(stdsclr.fit_transform(X),Y)
print(r_clf.best_params_)

# kfold score: rebuild the forest with the winning settings and re-fit the
# scaler on every training fold before scoring on the held-out fold
score = 0
regr = RandomForestRegressor(random_state=42, **r_clf.best_params_)
for (train, test) in kf.split(X):
    trainX = stdsclr.fit_transform(X.iloc[train])
    trainY = Y.iloc[train]
    testX = stdsclr.transform(X.iloc[test])
    testY = Y.iloc[test]
    regr.fit(trainX, trainY)
    score += regr.score(testX,testY)

score /= num_splits
print(score)

{'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
0.7504549554817971


In [93]:
# gradient boosted tree regression
# The estimator handed to GridSearchCV is seeded (as the decision-tree cell
# already does) so that the selected parameters are reproducible across runs.
gregr = GradientBoostingRegressor(random_state=42)
gb_params = {'max_features': [None, 1.0,5,'sqrt','log2'],'n_estimators': [5,10,20,30,50,100], 'min_samples_split': [2,5,7,10,15], 'min_samples_leaf': [1,5,7,10,15,20]}
g_clf = GridSearchCV(estimator=gregr,param_grid=gb_params,cv=num_splits)

# the search runs on the full feature matrix, standardised once up front
g_clf.fit(stdsclr.fit_transform(X),Y)
print(g_clf.best_params_)

# kfold score: rebuild the model with the winning settings and re-fit the
# scaler on every training fold before scoring on the held-out fold
score = 0
gregr = GradientBoostingRegressor(random_state=42, **g_clf.best_params_)
for (train, test) in kf.split(X):
    trainX = stdsclr.fit_transform(X.iloc[train])
    trainY = Y.iloc[train]
    testX = stdsclr.transform(X.iloc[test])
    testY = Y.iloc[test]
    gregr.fit(trainX, trainY)
    score += gregr.score(testX,testY)

score /= num_splits
print(score)

{'max_features': None, 'min_samples_leaf': 7, 'min_samples_split': 15, 'n_estimators': 10}
0.5855171469102359


In [94]:
# try different preprocessing
minmaxsclr = MinMaxScaler()
normsclr = Normalizer()
rbsclr = RobustScaler(quantile_range=(25, 75))

sclrs = [minmaxsclr, normsclr, rbsclr]
sclr_labels = ['MinMax', 'Norm', 'Robust']

# re-score the three tuned regressors from the cells above under each scaler
tuned_models = {'Decision Tree': dregr, 'Random Forests': regr, 'Gradient Boosting': gregr}
scores = {model_name: [0, 0, 0] for model_name in tuned_models}
for slot, sclr in enumerate(sclrs):
    for (train, test) in kf.split(X):
        # the scaler is fitted on the training fold only
        trainX = sclr.fit_transform(X.iloc[train])
        testX = sclr.transform(X.iloc[test])
        trainY, testY = Y.iloc[train], Y.iloc[test]
        for model_name, model in tuned_models.items():
            model.fit(trainX, trainY)
            scores[model_name][slot] += model.score(testX, testY)
    for model_name in tuned_models:
        scores[model_name][slot] /= num_splits

for model_name, per_scaler in scores.items():
    print(model_name + " Model")
    for label, value in zip(sclr_labels, per_scaler):
        print(label + ':' + str(value))
    print()

Decision Tree Model
MinMax:0.6875414103981742
Norm:0.5389971241715183
Robust:0.6875414103981742

Random Forests Model
MinMax:0.7504413791309332
Norm:0.6959241003439114
Robust:0.7504561244674429

Gradient Boosting Model
MinMax:0.5855171469102359
Norm:0.49603133338614463
Robust:0.5855171469102359



In [97]:
# repeat for dy
# prep data
# rank the columns by signed correlation with dy; dy itself sorts first, so keep positions 1..10
dy_corr = finalexp.drop(columns=['dx']).corr().dy.sort_values(ascending=False)
print(dy_corr[1:11])
print()
features = dy_corr.index[1:11]
X = finalexp[features]
Y = finalexp.dy

num_splits = 10
kf = KFold(n_splits=num_splits, shuffle=True,random_state=42)
stdsclr = StandardScaler()


def _kfold_score(model, scaler, X, Y, kf):
    """Return the mean of ``model.score`` over the folds produced by ``kf``.

    The scaler is re-fitted on every training fold and only applied to the
    matching held-out fold. ``model`` is re-fitted on each fold.
    """
    total = 0
    for (train, test) in kf.split(X):
        trainX = scaler.fit_transform(X.iloc[train])
        testX = scaler.transform(X.iloc[test])
        model.fit(trainX, Y.iloc[train])
        total += model.score(testX, Y.iloc[test])
    return total / kf.get_n_splits()


def _tune_and_score(title, estimator_cls, param_grid):
    """Grid-search ``estimator_cls`` and print the title, best parameters and k-fold score.

    Both the searched estimator and the rebuilt one are seeded with
    ``random_state=42`` so the selected parameters and the score are
    reproducible. Returns ``(fitted GridSearchCV, tuned estimator)``.
    """
    print(title)
    search = GridSearchCV(estimator=estimator_cls(random_state=42),param_grid=param_grid,cv=num_splits)
    # the search runs on the full feature matrix, standardised once up front
    search.fit(stdsclr.fit_transform(X),Y)
    print(search.best_params_)

    tuned = estimator_cls(random_state=42, **search.best_params_)
    print(_kfold_score(tuned, stdsclr, X, Y, kf))
    print()
    return search, tuned


dt_params = {'max_features': [None, 1.0,5,'sqrt','log2'], 'min_samples_split': [2,5,7,10,15], 'min_samples_leaf': [1,5,7,10,15,20]}
rf_params = {'max_features': [None, 1.0,5,'sqrt','log2'],'n_estimators': [5,10,20,30,50,100], 'min_samples_split': [2,5,7,10,15], 'min_samples_leaf': [1,5,7,10,15,20]}
gb_params = {'max_features': [None, 1.0,5,'sqrt','log2'],'n_estimators': [5,10,20,30,50,100], 'min_samples_split': [2,5,7,10,15], 'min_samples_leaf': [1,5,7,10,15,20]}

# decision tree, random forests and gradient boosted tree regression
d_clf, dregr = _tune_and_score('DecisionTree', DecisionTreeRegressor, dt_params)
r_clf, regr = _tune_and_score('Random Forests', RandomForestRegressor, rf_params)
g_clf, gregr = _tune_and_score('Gradient Boosting', GradientBoostingRegressor, gb_params)

# try different preprocessing
print('Preprocessing effect')
minmaxsclr = MinMaxScaler()
normsclr = Normalizer()
rbsclr = RobustScaler(quantile_range=(25, 75))

sclrs = [minmaxsclr,normsclr,rbsclr]
sclr_labels = ['MinMax', 'Norm', 'Robust']

# re-score the three tuned regressors under each alternative scaler
tuned_models = {'Decision Tree': dregr, 'Random Forests': regr, 'Gradient Boosting': gregr}
scores = {
    model_name: [_kfold_score(model, sclr, X, Y, kf) for sclr in sclrs]
    for model_name, model in tuned_models.items()
}

for model_name, per_scaler in scores.items():
    print(model_name+" Model")
    for label, value in zip(sclr_labels, per_scaler):
        print(label + ':' + str(value))
    print()

avg_ay       0.225051
std_mx       0.189047
std_yaw      0.072882
avg_az       0.068624
kurt_mx      0.053175
avg_yaw      0.050325
kurt_az      0.050188
kurt_ay      0.034056
kurt_ax      0.026149
avg_pitch    0.020075
Name: dy, dtype: float64

DecisionTree
{'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 2}
0.6176472834637319

Random Forests
{'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
0.7293022334543902

Gradient Boosting
{'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 20}
0.6275386667750651

Preprocessing effect
Decision Tree Model
MinMax:0.6176472834637319
Norm:0.4698743361304413
Robust:0.617656191301047

Random Forests Model
MinMax:0.7290548151268974
Norm:0.6325581090088523
Robust:0.7291967292699522

Gradient Boosting Model
MinMax:0.6275386667750651
Norm:0.5467116595030923
Robust:0.6276933124068129



In [98]:
# number of rows in the current feature matrix X
len(X)

792