## User Identification From Walking Activity  - Try 1 vs All classifier

- Author: Ryan Inghilterra
- Date: 4.29.21

- Dataset: https://archive.ics.uci.edu/ml/datasets/User+Identification+From+Walking+Activity#

**Prompt:** imagine you are working at a company that develops personalized health and lifestyle recommendations for users based on their activity patterns, which are recorded via a wearable device. Before giving recommendations, your manager asks you to assess the viability of the recommendation system by first seeing if you can identify users using only their accelerometer data. In this assignment, you will design and implement an experiment to answer this question. 

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import os

import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py

from plotly.subplots import make_subplots

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    make_scorer,
    classification_report,
    accuracy_score,
    balanced_accuracy_score,
)

warnings.filterwarnings(action="once")
np.set_printoptions(suppress=True)
pd.set_option("display.float_format", lambda x: "%.5f" % x)
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

%matplotlib inline

In [2]:
date = dt.datetime.today().strftime("%m_%d_%Y")
print(date)

07_14_2021


train and test already split from previous notebook export

In [9]:
train_df = pd.read_csv("user_ident_class_train.csv")

In [10]:
train_df.describe()

Unnamed: 0,time_step,x_accel,y_accel,z_accel,user_id
count,104532.0,104532.0,104532.0,104532.0,104532.0
mean,130.04721,-1.71654,8.82758,0.49362,13.89742
std,117.01097,2.87894,2.8231,3.07268,5.9214
min,0.0,-19.572,-9.616,-14.982,1.0
25%,39.78,-3.2553,7.2733,-1.2258,9.0
50%,89.301,-1.3757,8.8941,-0.08172,16.0
75%,192.4725,-0.04086,10.379,1.4574,18.0
max,474.5,15.895,19.572,19.341,22.0


In [11]:
test_df = pd.read_csv("user_ident_class_test.csv")

In [12]:
test_df.describe()

Unnamed: 0,time_step,x_accel,y_accel,z_accel,user_id
count,44800.0,44800.0,44800.0,44800.0,44800.0
mean,315.0879,-1.5125,8.6336,0.70014,13.89701
std,192.94463,2.83375,2.64481,3.31144,5.92163
min,20.31,-17.734,-10.924,-12.558,1.0
25%,136.94,-2.833,6.8919,-1.2667,9.0
50%,269.725,-1.1441,8.6898,-0.04086,16.0
75%,498.2,-0.0,10.147,1.6889,18.0
max,676.68,19.314,19.423,19.341,22.0


In [13]:
print(train_df.shape, test_df.shape)

(104532, 5) (44800, 5)


### Feature Engineering and EDA

#### Time Step Difff

combine time_diff between each step, since small dataset and limited time, do basic for look per user

In [16]:
def time_step_calc_df(df):

    all_df_list = []
    for user_id in sorted(df.user_id.unique()):
        adf = df[df["user_id"] == user_id].reset_index(drop=True)
        # get previous time_step values
        shift_time_step = adf["time_step"].shift(1)
        # create new column with prev time step values
        adf["prev_time_step"] = shift_time_step
        # the first row will be null since no previous value, so fill 0
        adf["prev_time_step"] = adf["prev_time_step"].fillna(0)
        # calc diff
        adf["time_step_diff"] = adf["time_step"] - adf["prev_time_step"]
        all_df_list.append(adf)

    all_df = pd.concat(all_df_list)
    print(all_df.shape)
    return all_df

In [17]:
train_df = time_step_calc_df(train_df)
train_df.head(2)

(104532, 7)


Unnamed: 0,time_step,x_accel,y_accel,z_accel,user_id,prev_time_step,time_step_diff
0,0.0,0.69464,3.1735,7.5048,1,0.0,0.0
1,0.03064,0.14982,3.4868,9.2755,1,0.0,0.03064


In [18]:
test_df = time_step_calc_df(test_df)
test_df.head(2)

(44800, 7)


Unnamed: 0,time_step,x_accel,y_accel,z_accel,user_id,prev_time_step,time_step_diff
0,107.38,-4.985,10.188,-2.833,1,0.0,107.38
1,107.41,-3.9771,8.2812,-3.4459,1,107.38,0.03


#### Resultant Vector

Want to combine x,y,z acceleration together into single number, could be useful feature

In [19]:
def res_vector_calc_row(row):
    x = row["x_accel"]
    y = row["y_accel"]
    z = row["z_accel"]
    res = res_vector_calc(x, y, z)
    return res


def res_vector_calc(x, y, z):
    res = np.sqrt(x ** 2 + y ** 2 + z ** 2)
    return res

In [20]:
train_df["res_vector"] = train_df.apply(lambda row: res_vector_calc_row(row), axis=1)
test_df["res_vector"] = test_df.apply(lambda row: res_vector_calc_row(row), axis=1)

In [21]:
train_df.head(2)

Unnamed: 0,time_step,x_accel,y_accel,z_accel,user_id,prev_time_step,time_step_diff,res_vector
0,0.0,0.69464,3.1735,7.5048,1,0.0,0.0,8.17775
1,0.03064,0.14982,3.4868,9.2755,1,0.0,0.03064,9.91035


#### Window and Lag Feature Engineering

for modeling with this type of time series classification we need to transform our data into a format where each row represents a window. So each row window will have the summary statistics of the window as features (columns), and each lag feature as a column, which are precisely all the values within the window that window sequential order.

In [22]:
def get_window_feat_df(df, window_size):
    adf = df.copy()
    # lag should be (window size - 1) since window includes current
    lags = list(range(1, window_size))
    lag_cols = ["x_accel", "y_accel", "z_accel", "res_vector"]
    for lag_col in lag_cols:
        for lag in lags:
            adf = add_lag_cols(adf, lag, lag_col)

    window_cols = ["x_accel", "y_accel", "z_accel", "res_vector", "time_step_diff"]
    for window_col in window_cols:
        adf = add_roll_window_mean_std(adf, window_size, window_col)

    # remove nulls
    adf = adf.dropna()
    return adf

In [23]:
def add_lag_cols(df, lag_num, col):
    df_list = []
    for user_id in sorted(df.user_id.unique()):
        adf = df[df["user_id"] == user_id].reset_index(drop=True)
        # get previous time_step values
        lag_col = adf[col].shift(lag_num)
        # create new column
        lag_col_name = "{0}_lag{1}".format(col, lag_num)
        adf[lag_col_name] = lag_col
        df_list.append(adf)

    fdf = pd.concat(df_list)
    return fdf

In [24]:
def add_roll_window_mean_std(df, window_size, col):
    df_list = []
    for user_id in sorted(df.user_id.unique()):
        adf = df[df["user_id"] == user_id].reset_index(drop=True)
        window = adf[col].rolling(window=window_size)
        wmean_col = window.mean()
        wstd_col = window.std()
        mean_col_name = "{0}_rwindow{1}_mean".format(col, window_size)
        std_col_name = "{0}_rwindow{1}_std".format(col, window_size)
        adf[mean_col_name] = wmean_col
        adf[std_col_name] = wstd_col
        df_list.append(adf)

    fdf = pd.concat(df_list)
    return fdf

In [25]:
w2_train_df = get_window_feat_df(train_df, 2)
w5_train_df = get_window_feat_df(train_df, 5)
w10_train_df = get_window_feat_df(train_df, 10)
w25_train_df = get_window_feat_df(train_df, 25)

w2_test_df = get_window_feat_df(test_df, 2)
w5_test_df = get_window_feat_df(test_df, 5)
w10_test_df = get_window_feat_df(test_df, 10)
w25_test_df = get_window_feat_df(test_df, 25)

In [26]:
w50_train_df = get_window_feat_df(train_df, 50)
w50_test_df = get_window_feat_df(test_df, 50)

In [27]:
w2_test_df.head()

Unnamed: 0,time_step,x_accel,y_accel,z_accel,user_id,prev_time_step,time_step_diff,res_vector,x_accel_lag1,y_accel_lag1,z_accel_lag1,res_vector_lag1,x_accel_rwindow2_mean,x_accel_rwindow2_std,y_accel_rwindow2_mean,y_accel_rwindow2_std,z_accel_rwindow2_mean,z_accel_rwindow2_std,res_vector_rwindow2_mean,res_vector_rwindow2_std,time_step_diff_rwindow2_mean,time_step_diff_rwindow2_std
1,107.41,-3.9771,8.2812,-3.4459,1,107.38,0.03,9.81172,-4.985,10.188,-2.833,11.69066,-4.48105,0.71269,9.2346,1.34831,-3.13945,0.43339,10.75119,1.32861,53.705,75.90791
2,107.44,-5.8567,8.9213,-4.6309,1,107.41,0.03,11.63339,-3.9771,8.2812,-3.4459,9.81172,-4.9169,1.32908,8.60125,0.45262,-4.0384,0.83792,10.72255,1.28812,0.03,0.0
3,107.47,-7.7772,9.6568,-4.4811,1,107.44,0.03,13.18404,-5.8567,8.9213,-4.6309,11.63339,-6.81695,1.358,9.28905,0.52008,-4.556,0.10592,12.40871,1.09647,0.03,0.0
4,107.5,-6.851,8.1177,-3.4459,1,107.47,0.03,11.16725,-7.7772,9.6568,-4.4811,13.18404,-7.3141,0.65492,8.88725,1.08831,-3.9635,0.732,12.17564,1.42608,0.03,0.0
5,107.53,-2.8739,6.7012,-1.5255,1,107.5,0.03,7.44933,-6.851,8.1177,-3.4459,11.16725,-4.86245,2.81223,7.40945,1.00162,-2.4857,1.35793,9.30829,2.62897,0.03,0.0


In [28]:
w10_train_df.columns

Index(['time_step', 'x_accel', 'y_accel', 'z_accel', 'user_id',
       'prev_time_step', 'time_step_diff', 'res_vector', 'x_accel_lag1',
       'x_accel_lag2', 'x_accel_lag3', 'x_accel_lag4', 'x_accel_lag5',
       'x_accel_lag6', 'x_accel_lag7', 'x_accel_lag8', 'x_accel_lag9',
       'y_accel_lag1', 'y_accel_lag2', 'y_accel_lag3', 'y_accel_lag4',
       'y_accel_lag5', 'y_accel_lag6', 'y_accel_lag7', 'y_accel_lag8',
       'y_accel_lag9', 'z_accel_lag1', 'z_accel_lag2', 'z_accel_lag3',
       'z_accel_lag4', 'z_accel_lag5', 'z_accel_lag6', 'z_accel_lag7',
       'z_accel_lag8', 'z_accel_lag9', 'res_vector_lag1', 'res_vector_lag2',
       'res_vector_lag3', 'res_vector_lag4', 'res_vector_lag5',
       'res_vector_lag6', 'res_vector_lag7', 'res_vector_lag8',
       'res_vector_lag9', 'x_accel_rwindow10_mean', 'x_accel_rwindow10_std',
       'y_accel_rwindow10_mean', 'y_accel_rwindow10_std',
       'z_accel_rwindow10_mean', 'z_accel_rwindow10_std',
       'res_vector_rwindow10_mean', 're

### Modeling and Evaluation

Remove time_step and previous time_step, since we don't want our model to rely at all on those values.

For simple MVP, we can start with default parameters and randomforest classifier only.

Later on it would be easier to add in GridSearchCV and different models.

#### Basic Modeling

In [29]:
def train_classifier_evaluate(train_df, test_df, y_col, clf, test_split_perc=0.3):
    X_train = train_df.copy()
    del X_train["user_id"]
    del X_train["time_step"]
    del X_train["prev_time_step"]
    y_train = train_df[y_col]

    X_test = test_df.copy()
    del X_test["user_id"]
    del X_test["time_step"]
    del X_test["prev_time_step"]
    y_test = test_df[y_col]

    clf.fit(X_train, y_train)

    train_predictions = clf.predict(X_train)
    print("train accuracy: {0}".format(accuracy_score(y_train, train_predictions)))
    test_predictions = clf.predict(X_test)
    print("test accuracy: {0}".format(accuracy_score(y_test, test_predictions)))

    return clf, X_train, X_test, y_train, y_test

In [30]:
def plot_feature_importance(clf, feature_names):
    # from https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    importances = clf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    forest_importances = pd.Series(importances, index=feature_names)
    fig, ax = plt.subplots()
    forest_importances.plot.bar(yerr=std, ax=ax)
    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()

### One vs Rest classifier

In [31]:
from sklearn.multiclass import OneVsRestClassifier

w10_clf, w10_X_train, w10_X_test, w10_y_train, w10_y_test = train_classifier_evaluate(
    w10_train_df, w10_test_df, "user_id", OneVsRestClassifier(RandomForestClassifier(random_state=0))
)

train accuracy: 1.0
test accuracy: 0.5628447154836106


In [32]:
w10_clf, w10_X_train, w10_X_test, w10_y_train, w10_y_test = train_classifier_evaluate(
    w10_train_df, w10_test_df, "user_id", RandomForestClassifier(random_state=0)
)

train accuracy: 1.0
test accuracy: 0.5589211246132461


So one vs all did not help accuracy