In [1]:
import datetime
import os
import re
from collections import Counter
from datetime import datetime, timedelta

import numpy as np
from keras.preprocessing import sequence
import sys
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import umap
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
import sklearn

import sys
sys.path.append('../')
from ftw_model.extract_feature import computing_feature_wo

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
activity_mapping = {
                              "Cook_Breakfast": "Cook",
                              "Cook_Lunch": "Cook",
                              "Cook_Dinner": "Cook",
                              "Eat_Breakfast": "Eat",
                              "Eat_Lunch": "Eat",
                              "Eat_Dinner": "Eat",
                              "Morning_Meds": "Take_Medicine",
                              "Evening_Meds": "Take_Medicine",
                              "Wash_Breakfast_Dishes": "Wash_Dishes",
                              "Wash_Lunch_Dishes": "Wash_Dishes",
                              "Wash_Dinner_Dishes": "Wash_Dishes",
                              "Work_At_Table": "Work",
                              "Watch_TV": "Relax",
                              "Read": "Work",
                              "Entertain_Guests": "Relax",
                              "Sleep_Out_Of_Bed": "Relax",
                              "Step_Out": "Leave_Home",
                     }
FTWs = [720, 540, 360, 180, 60, 30, 15, 5, 3, 2, 1, 0, 0]
# FTWs = [0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144][::-1]
ftw_window = 10

In [3]:
def read_hh_dataset(dataset_path):

    ann_dataset = pd.read_csv(dataset_path, sep='\t')

    raw_columns = ['Date & Time', 'Sensor ID', 'Room-level', 'Sensor location', 'Message', 'Sensor Type']
    ann_columns = raw_columns + ['Activity']

    ann_dataset.columns = ann_columns
    ann_dataset['Activity'] = ann_dataset['Activity'].apply(lambda x: activity_mapping[x] if x in activity_mapping else x)

    ann_dataset['Date & Time'] = pd.to_datetime(ann_dataset['Date & Time'], format='%Y-%m-%d %H:%M:%S')
    start_time, end_time = ann_dataset['Date & Time'].min(), ann_dataset['Date & Time'].max()
    timeframed_dataset = ann_dataset.set_index(['Date & Time'])

    activity2id = {}
    count = 0
    for act in ann_dataset['Activity'].unique():
        if act != 'Other_Activity':
            activity2id[act] = count
            count += 1
    activity2id['Other_Activity'] = count
    
    return timeframed_dataset, start_time, end_time, activity2id
    
hh_dataset = [f'hh1{str(i) if i >=10 else "0"+str(i)}' for i in range(1, 31)]
file = hh_dataset[1]
timeframed_dataset, start_time, end_time, activity2id = read_hh_dataset(f'../hh_dataset/{file}/{file}.ann.txt')

In [7]:
def preprocess_features(features, method) -> np.ndarray:
    if method not in ['mean', 'mean_std', 'mean_with_weight', 'mean_std_max_min']:
        raise ValueError('Please double check the method parameter')

    preprocessed_features = np.array([])
    if method == 'mean':
        return features.mean(axis=1)
    elif method == 'mean_std':
        preprocess_features = np.array(features.mean(axis=1))
        print(preprocess_features.shape)
        preprocess_features = np.concatenate([preprocess_features, features.std(axis=1)], axis=1)
        print(preprocess_features.shape)
        return preprocess_features
    elif method == 'mean_with_weight':
        out = []
        for instance in features:
            out.append(np.sum([vector*((i+1)/10) for i, vector in enumerate(instance)], axis=0))
        return np.array(out)
    return np.array([])

### Model training

In [9]:
features = np.load('../hh_dataset/hh_npy/hh102_feature.npy')
activities = np.load('../hh_dataset/hh_npy/hh102_activity.npy')
# Splitting the data into training and validation
tsv = TimeSeriesSplit(n_splits=3)
processed_features = preprocess_features(features, 'mean_with_weight')
X = processed_features
Y = activities
for train, test in list(tsv.split(X, Y))[-1:]:
    X_train, Y_train, X_test, Y_test = X[train], Y[train], X[test], Y[test]

In [5]:
# Use simple model like random forest for classification
# Normalising the input
instance = X_train.shape[0]
X_train_reshaped = X_train.reshape((instance, -1))
instance = X_test.shape[0]
X_test_reshaped = X_test.reshape((instance, -1))

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_reshaped)
X_test = scaler.transform(X_test_reshaped)

### Multiclass-multioutput Classification by Random Forest

In [10]:
rfc = RandomForestClassifier(class_weight='balanced')
rfc.fit(X_train, Y_train)
y_pred = rfc.predict(X_test)
# classification_report(Y_test, y_pred)
pd.DataFrame(classification_report(Y_test, y_pred, output_dict=True)).rename(columns={str(i): act for i, act in enumerate(list(activity2id.keys())[:-1])}).T

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
Sleep,0.773585,0.691011,0.72997,178.0
Bed_Toilet_Transition,1.0,0.454545,0.625,11.0
Toilet,0.714286,0.069444,0.126582,72.0
Take_Medicine,1.0,0.157895,0.272727,19.0
Dress,1.0,0.044444,0.085106,45.0
Work,0.928571,0.213115,0.346667,122.0
Cook,1.0,0.175439,0.298507,57.0
Eat,0.7,0.194444,0.304348,36.0
Wash_Dishes,0.0,0.0,0.0,37.0
Relax,0.793893,0.675325,0.729825,154.0


### Multiclass-single Classification by Random Forest

In [7]:
activity_count = dict(Counter(timeframed_dataset['Activity']))
del activity_count['Other_Activity']
act_sum = sum([v for k, v in activity_count.items()])
activity_weight = {k: 1/(v/act_sum) for k, v in activity_count.items()}
activity_weight

{'Sleep': 16.05227839202403,
 'Bed_Toilet_Transition': 187.16095380029807,
 'Toilet': 19.14113702179546,
 'Take_Medicine': 114.7418912745546,
 'Dress': 17.846383402017903,
 'Work': 12.727779466909903,
 'Cook': 6.662687675738765,
 'Eat': 32.17240937620085,
 'Wash_Dishes': 13.367216604576901,
 'Relax': 7.444059156515811,
 'Personal_Hygiene': 5.585030685760029,
 'Bathe': 38.8627572334829,
 'Groom': 12.140267775146214,
 'Drink': 32.23848029777949,
 'Leave_Home': 58.83579292574374,
 'Enter_Home': 115.37436839687643,
 'Phone': 377.69924812030075}

In [18]:
ensemble_activities = activities.T
tsv = TimeSeriesSplit(n_splits=3)
processed_features = preprocess_features(features, 'mean_with_weight')

for activity, index in activity2id.items():
    if index == 17:
        break
    print(activity)
    activity_label = ensemble_activities[index]
    train, test = list(tsv.split(processed_features, activity_label))[-1]
    # print('X_train shape:', processed_features[train].shape)
    # print('y_train shape:', activity_label[train].shape)
    train_features, train_activities = processed_features[train], activity_label[train]
    dev_features, dev_activities = processed_features[test], activity_label[test]

    rfc = RandomForestClassifier(class_weight='balanced')
    rfc.fit(train_features, train_activities)
    y_pred = rfc.predict(dev_features)
    print(classification_report(dev_activities, y_pred))

Sleep
              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96       937
         1.0       0.81      0.71      0.76       178

    accuracy                           0.93      1115
   macro avg       0.88      0.84      0.86      1115
weighted avg       0.92      0.93      0.93      1115

Bed_Toilet_Transition
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1104
         1.0       1.00      0.64      0.78        11

    accuracy                           1.00      1115
   macro avg       1.00      0.82      0.89      1115
weighted avg       1.00      1.00      1.00      1115

Toilet
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97      1043
         1.0       0.85      0.15      0.26        72

    accuracy                           0.94      1115
   macro avg       0.90      0.58      0.61      1115
weighted avg       0.94      0.94      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### KNN

In [12]:
neigh = KNeighborsClassifier(n_neighbors=16)
neigh.fit(X_train, Y_train)
y_pred = neigh.predict(X_test).shape

(1115, 17)

In [14]:
pd.DataFrame(classification_report(Y_test, y_pred, output_dict=True)).rename(columns={str(i): act for i, act in enumerate(list(activity2id.keys())[:-1])}).T

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
Sleep,0.773585,0.691011,0.72997,178.0
Bed_Toilet_Transition,1.0,0.454545,0.625,11.0
Toilet,0.714286,0.069444,0.126582,72.0
Take_Medicine,1.0,0.157895,0.272727,19.0
Dress,1.0,0.044444,0.085106,45.0
Work,0.928571,0.213115,0.346667,122.0
Cook,1.0,0.175439,0.298507,57.0
Eat,0.7,0.194444,0.304348,36.0
Wash_Dishes,0.0,0.0,0.0,37.0
Relax,0.793893,0.675325,0.729825,154.0
