In [1]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

from utilize.data import *
from utilize.transform import *
from utilize.feature_selection import *
from utilize.test import *

Found 60 users data.


In [2]:
# Load all the data 
X, y, M, user_index, feature_names, label_names = load_all_data()

In [3]:
# Only select body state label
target_label = ['LYING_DOWN', 'SITTING', 'FIX_walking', 'FIX_running', 'BICYCLING', 'OR_standing']

# Use the last 5 user's data as test set
test_uuid = list(range(56, 61))

# Fill the Nan with mean value and normalize all the data 
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler())
])

In [4]:
# Process the feature data and split the train test set

X_new, y_new, M_new = select_target_labels(X, y, M, target_label, label_names, drop_all_zero = False)
#y_new = np.sum(y_new * [1, 2, 3, 4, 5, 6], axis = 1)

X_new = pipeline.fit_transform(X_new, y_new)
X_train, y_train, M_train, X_test, y_test, M_test = split_by_users(X_new, y_new, M_new, test_uuid, user_index)

In [5]:
X.shape, y.shape, M.shape
X_new.shape, y_new.shape, M_new.shape
X_train.shape, X_test.shape

((356519, 225), (20827, 225))

In [6]:
core_sensor = ['raw_acc', 'proc_gyro', 'watch_acceleration', 'audio_naive', 'location', 'discrete']
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((356519, 225), (356519, 6), (20827, 225), (20827, 6))

In [7]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape


((356519, 225), (356519, 6), (20827, 225), (20827, 6))

## Random Forest Test

In [8]:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(class_weight = 'balanced', n_estimators = 10, min_samples_split = 10)

# model.fit(X_train, y_train)
# accuracy, sensitivity, specificity, BA = evaluate_model(model, X_test, y_test, report = True)

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
# rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# rf = RandomForestRegressor(n_estimators = 10, min_samples_split = 10)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(class_weight = 'balanced', n_estimators = 10, min_samples_split = 10)
# Train the model on training data
rf.fit(X_train, y_train);

In [9]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# # Calculate the absolute errors
# errors = abs(predictions - y_test)
# # Print out the mean absolute error (mae)
# print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

In [10]:
for i in range(len(predictions)):
    for j in range(len(predictions[0])):
        if(predictions[i][j] == y_test[i][j]):
            print(target_label[j])

LYING_DOWN
SITTING
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
SITTING
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
SITTING
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
SITTING
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
SITTING
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
SITTING
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
SITTING
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
SITTING
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
SITTING
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
FIX_walking
FIX_running
BICYCLING
OR_standing
LYING_DOWN
FIX_w

In [11]:
evaluate_model(rf, X_test, y_test, report = True)

accuaracy      sensitivity    specificity    BA             
0.899714       0.232129       0.943655       0.587892       


(0.899713512907924,
 0.23212898907306603,
 0.9436551167636348,
 0.5878920529183503)

In [12]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = rf.estimators_[5]
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = rf.estimators_[5]
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_names, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png')

ModuleNotFoundError: No module named 'pydot'

In [None]:
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('./tree.dot')
# Write graph to a png file
graph.write_png('tree.png')

In [None]:
# Limit depth of tree to 3 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3)
rf_small.fit(X_train, y_train)
# Extract the small tree
tree_small = rf_small.estimators_[5]
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_names, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');

In [None]:
(graph, ) = pydot.graph_from_dot_file('./small_tree.dot')
graph.write_png('small_tree.png');

In [None]:
prediction_small = rf_small.predict(X_test)
prediction_small

In [None]:
y_test.shape

In [None]:
y_pred = [[False]*6]*17957
for i in range(len(prediction_small)):
    y_pred_temp = [False]*6
    for j in range(len(prediction_small[0])):
        if (prediction_small[i][j] > 0.5):
            y_pred_temp[j] = True
        else:
            y_pred_temp[j] = False
    y_pred[i] = y_pred_temp

In [None]:
y_pred == y_test

In [None]:
np.sum(y_pred == y_test), len(y_test)*len(y_test[0])

In [None]:
np.sum(y_pred == y_test)/(len(y_test)*len(y_test[0]))