In [1]:
## Load modules
# scikit-learn: linear regression model plus the R^2 and MSE metrics used in the evaluation cell.
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import numpy as np
from numpy import genfromtxt
import matplotlib.pyplot as plt
import os
# Project-local helper module (not shown here); the notebook calls dh.loadData and dh.avgSubjectCond.
import dataHelp as dh
cwd = os.getcwd() # current working directory; passed to dh.loadData when loading the dataset

In [2]:
def splitSignals(signals, bin_size, data_labels, features, forces):
    """Return the inclusive column range and labels for a signal group.

    The feature matrix is treated as `features` consecutive groups of
    `bin_size` columns each, with the first `forces` groups holding the force
    signals and the remaining groups holding the EMG signals.

    Parameters
    ----------
    signals : str
        One of "EMG", "forces" or "all".
    bin_size : int
        Number of columns (bins) per feature.
    data_labels : sequence
        One label per feature, force labels first.
    features : int
        Total number of features (forces + EMG).
    forces : int
        Number of leading force features.

    Returns
    -------
    (start, stop, data_labels)
        `start` and `stop` are the first and last column index (both
        inclusive) of the selection; `data_labels` is the matching subset of
        the labels (unchanged for "all").

    Raises
    ------
    ValueError
        If `signals` is not one of the supported options.
    """
    if signals == "EMG":
        # EMG columns follow the force columns.
        start = forces*bin_size
        stop = features*bin_size-1
        data_labels = data_labels[forces:]
    elif signals == "forces":
        start = 0
        stop = forces*bin_size-1
        data_labels = data_labels[0:forces]
    elif signals == "all":
        start = 0
        stop = features*bin_size-1
    else:
        # Previously an unknown option crashed with UnboundLocalError at the return.
        raise ValueError(
            'signals must be "EMG", "forces" or "all", got ' + repr(signals))
    return start, stop, data_labels

In [6]:
##### Load data
# Dataset folder name; use a "conditions" dataset for conditions/subjcond simulations.
# Alternative dataset: "incline-load_conditions_30bins"
folder_name = "assisted_conditions_30bins"
data_type = "subjcond" # subjects, conditions, or subjcond
subject_test = True # set true to iteratively test on all subjects
signals = "all" # all, EMG, or forces

y_ind = 0 # which metabolic predictor to use, 0 is direct measured metabolic
norm = True # whether to normalize data
seed = 1 # to keep consistent results (overridden per run when subject_test is True)
train_size = 0.8 # percent of data for training set
conds_holdout = 2 # number of conditions to holdout and test for subjcond


# Dataset-specific layout: number of features, how many of them are forces,
# the per-run seeds, and one label per feature (force labels first).
if folder_name[0:8] == "assisted":
    dataset = "exo"
    num_conds = 9
    features = 22
    forces = 6
    seed_list = [2,10,3,5,25,1,9,7]
    all_data_labels = np.array(["Fx_R", "Fy_R", "Fz_R", "Fx_L", "Fy_L", "Fz_L",
               "MGAS_R", "LGAS_R", "MSOL_R", "LSOL_R", "TA_R", "VASM_R", "RF_R", "BF_R",
               "MGAS_L", "LGAS_L", "MSOL_L", "LSOL_L", "TA_L", "VASM_L", "RF_L", "BF_L"])

elif folder_name[0:12] == "incline-load":
    dataset = "incline_load"
    num_conds = 12
    features = 14
    forces = 6
    seed_list = [41,17,13,5,7,1,40,32,2,10,3,12,9]
    all_data_labels = np.array(["Fx_R", "Fy_R", "Fz_R", "Fx_L", "Fy_L", "Fz_L",
                                "SOL", "GAS", "TA", "MH", "BF", "VM", "VL", "RF"])

else:
    # Fail early instead of hitting a NameError on `features`/`seed_list` below.
    raise ValueError("Unrecognized dataset folder name: " + repr(folder_name))


# Test-set size and number of runs depend on the split type.
if data_type == "subjects" or data_type == "conditions" or data_type[0:8] == "subjcond":
    if subject_test:
        test_size = 1
        runs = len(seed_list) # one run per seed in the list
    else:
        test_size = 3
        runs = 1
else:
    test_size = 0.1
    runs = 1

if data_type == "subjcond":
    # The number of held-out conditions is passed to the loader inside the data_type string.
    data_type = data_type + " " + str(conds_holdout)

training_err = []
deving_err = []
testing_err = []

R2s = []
test_rmses = []
test_maes = []

pred_out = [] # for confusion mat
actual_out = [] # for confusion mat

for i in range(runs): # number of times iteratively testing for averaged results

    if subject_test:
        seed = seed_list[i]
    X_train, Y_train, X_dev, Y_dev, X_test, Y_test = dh.loadData(data_type, cwd, seed, y_ind, train_size, test_size, features, norm, folder_name, None, signals)

    # Selecting which signals to include:
    bin_size = int(X_train.shape[1]/features) # columns per feature (original note: should be 30)
    # Always slice from the full label set. Re-slicing the previous run's
    # `data_labels` would drop `forces` more labels on every run for "EMG".
    start, stop, data_labels = splitSignals(signals, bin_size, all_data_labels, features, forces)

    signal_rng = np.arange(start, stop + 1) # inclusive column range [start, stop]
    X_train = X_train[:,signal_rng]
    X_dev = X_dev[:,signal_rng]
    X_test = X_test[:,signal_rng]

    lm = LinearRegression() # define linear model
    lm.fit(X_train,Y_train) # train on the data

    train_pred = lm.predict(X_train)
    test_pred = lm.predict(X_test)
    dev_pred = lm.predict(X_dev)

    R2 = r2_score(Y_train, train_pred) # compute R squared on training data

    test_mae = np.mean(abs(test_pred - Y_test))
    test_rmse = np.sqrt(mean_squared_error(Y_test, test_pred))

    # Mean absolute error relative to the measured value (reported as a percent below).
    train_err = np.mean(abs((train_pred - Y_train)/Y_train))
    dev_err = np.mean(abs((dev_pred - Y_dev)/Y_dev))
    test_err = np.mean(abs((test_pred - Y_test)/Y_test))
    training_err.append(train_err)
    testing_err.append(test_err)
    deving_err.append(dev_err)
    R2s.append(R2)
    test_maes.append(test_mae)
    test_rmses.append(test_rmse)
    pred_avg, actual_avg = dh.avgSubjectCond(test_pred,Y_test)
    pred_out.append(pred_avg)
    actual_out.append(actual_avg)

# Metrics averaged over all runs.
print("Train error percent: ",np.mean(training_err)*100)
#print("Dev error percent: ",np.mean(deving_err)*100)
print("Test error percent: ",np.mean(testing_err)*100)
print("Test MAE: ",np.mean(test_maes))
print("Test RMSE: ",np.mean(test_rmses))
print("R2 for the train sets: ",np.mean(R2s))

Test subjects:  [0]
Test conditions (held out from train):  [8, 6]
Dev subjects:  [0]
Train subjects:  [4, 1, 6, 2, 3, 7, 5]
Test subjects:  [1]
Test conditions (held out from train):  [4, 0]
Dev subjects:  [1]
Train subjects:  [2, 3, 6, 7, 0, 4, 5]
Test subjects:  [2]
Test conditions (held out from train):  [8, 3]
Dev subjects:  [2]
Train subjects:  [5, 7, 4, 6, 3, 1, 0]
Test subjects:  [3]
Test conditions (held out from train):  [3, 6]
Dev subjects:  [3]
Train subjects:  [7, 2, 4, 1, 0, 5, 6]
Test subjects:  [4]
Test conditions (held out from train):  [4, 6]
Dev subjects:  [4]
Train subjects:  [3, 0, 1, 5, 7, 2, 6]
Test subjects:  [5]
Test conditions (held out from train):  [5, 8]
Dev subjects:  [5]
Train subjects:  [7, 2, 1, 6, 0, 4, 3]
Test subjects:  [6]
Test conditions (held out from train):  [5, 6]
Dev subjects:  [6]
Train subjects:  [7, 1, 2, 3, 0, 5, 4]
Test subjects:  [7]
Test conditions (held out from train):  [4, 6]
Dev subjects:  [7]
Train subjects:  [2, 5, 0, 6, 3, 1, 4]


In [4]:
print(X_train.shape)
# Analyzing coefficients for the linear model: list the predictors with the
# largest absolute weights from the most recently fitted model (`lm`).
coef_print = 10 # how many coef to print out
coef = lm.coef_ # coefficients for trained model
coef_abs = abs(coef)
coef_order = np.argsort(-coef_abs) # reverse order --> get largest weighted predictors first
# Slicing caps the loop at the number of available coefficients, so a model
# with fewer than `coef_print` inputs no longer raises IndexError.
for coef_idx in coef_order[:coef_print]:
    # Columns are grouped per feature in runs of `bin_size`; bins are reported 1-based.
    feature, bin_offset = divmod(coef_idx, bin_size)
    bin_num = bin_offset + 1
    print(data_labels[feature] + ' bin ' + str(bin_num))

(4040, 420)
Fz_L bin 10
Fz_R bin 28
MH bin 18
MH bin 19
Fz_R bin 26
Fz_L bin 12
Fz_R bin 23
MH bin 17
Fz_R bin 30
Fz_L bin 9
