# Human Activity Recognition Using Smartphones: Feature Engineering Techniques and Stacking/Ensemble Models for Accurate Classification

This project aims to develop a model that accurately recognizes human activity based on data collected from smartphones. The dataset contains 563 different features and 8239 data points. In this project, we will be using different feature engineering techniques and classification models to accurately classify human activity into one of the six classes.

In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

import time
import os
import csv

from datetime import datetime

In [2]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split

from mrmr import mrmr_classif

# Configuration options

The notebook is configured to automatically run all the tuned models. All the configuration variables are defined below. By default, the notebook uses the Variance Thresholding feature selection technique.

To run using the mRMR technique, change `do_mrmr` to `True` and `do_var_thr` to `False`.


In [3]:
# --- Input / output locations -------------------------------------------
train_data = '../data/train_data.csv'      # labelled feature table
train_labels = '../data/train_labels.csv'  # labels joined onto the features
test_data = '../data/test_data.csv'        # unlabelled data to predict on
outfile = 'out_data_improved.csv'          # per-classifier metrics are appended here

# --- Pipeline switches ----------------------------------------------------
do_var_thr = True    # run variance_thresholder(variance)
do_mrmr = False      # run mrmr(best_n)
do_std_sca = False   # run standard_scaler()
do_pca = False       # run pc_analyzer(n_comp)

# --- Hyper-parameters used by the switches above --------------------------
variance = 0.07  # threshold handed to VarianceThreshold
best_n = 422     # K passed to mrmr_classif
n_comp = 100     # n_components for PCA

In [4]:
# Column shared by all three CSV files; it is used as the join key between
# features and labels and is dropped before modelling.
# NOTE(review): presumably the row index saved by DataFrame.to_csv - confirm.
common_col = 'Unnamed: 0'

df1 = pd.read_csv(train_data)
df2 = pd.read_csv(train_labels)

# The unlabelled test set never gets merged, so its key column can go now.
testdf = pd.read_csv(test_data)
testdf = testdf.drop(columns=[common_col])

print(f'df1 has shape {df1.shape} while df2 has shape {df2.shape}')
# Join the names instead of indexing columns[0]/[1] so the message cannot
# raise IndexError if the label file has a different number of columns.
print(f"df2 has {len(df2.columns)} columns, namely {' and '.join(map(str, df2.columns))}")

print(f"Merging df1 with df2 with '{common_col}' as the common column.")
# validate='one_to_one' makes pandas raise MergeError when the key is
# duplicated on either side, instead of silently multiplying rows.
df = pd.merge(df1, df2, on=common_col, validate='one_to_one')
print(f'The shape of the merged df is now {df.shape}')

FileNotFoundError: [Errno 2] No such file or directory: '../train_data.csv'

In [None]:
# The join key has served its purpose after the merge; remove it so it is
# not treated as a feature.
df = df.drop('Unnamed: 0', axis=1)
print(df.shape)
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# Count the rows per activity once and reuse the result for both the wedge
# labels and the wedge sizes.
activity_counts = df['Activity'].groupby(df['Activity']).count()
activity = activity_counts.index
activity_data = activity_counts.values

# One evenly spaced colour from the plasma colormap per activity class.
colors = plt.cm.plasma(np.linspace(0, 1, len(activity_data)))
plt.pie(activity_data, labels=activity, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.title("% of Different categories")
plt.show()

In [None]:
# Number of missing entries in every column of the merged frame.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Separate the target column from the feature columns.
y = df['Activity']
X = df.drop(columns=['Activity'])

## Preprocessing

We now perform variance thresholding on the data and remove features whose variance is below the threshold set by `variance` in the configuration cell (currently 0.07); the threshold was determined in a previous Python script in `trial1`.

In [None]:
def variance_thresholder(variance):
    """Drop low-variance columns from the global feature frame ``X``.

    A ``VarianceThreshold`` selector is fitted on ``X`` and its support mask
    is used to index the columns with ``.loc``, so ``X`` stays a labelled
    frame (the column names are needed later to subset ``testdf``).

    Args:
        variance: Value passed as ``threshold`` to ``VarianceThreshold``.

    Side effects:
        Rebinds the module-level ``X`` and prints its new shape.
    """
    global X

    selector = VarianceThreshold(threshold=variance).fit(X)
    X = X.loc[:, selector.get_support()]
    print(f'variance_thresholder() returned X with shape {X.shape}')

In [None]:
def mrmr(best_n):
    """Reduce the global feature frame ``X`` to the mRMR-selected columns.

    Calls ``mrmr_classif`` on the module-level ``X`` and ``y`` and keeps the
    columns of ``X`` that also appear in its result.

    Args:
        best_n: Passed as ``K`` to ``mrmr_classif``
            (presumably the number of features to select - confirm).

    Side effects:
        Rebinds the module-level ``X``.
    """
    global X

    chosen = mrmr_classif(X=X, y=y, K=best_n)
    kept_columns = X.columns.intersection(chosen)
    X = X[kept_columns]

In [None]:
def standard_scaler():
    """Standardize the train split, the test split and ``testdf``.

    The scaler is fitted on ``X_train`` only; the same fitted scaler is then
    applied to all three data sets.

    Side effects:
        Rebinds the module-level ``X_train``, ``X_test`` and ``testdf`` to the
        transformed outputs.
    """
    global X_train, X_test, testdf

    scaler = StandardScaler().fit(X_train)
    X_train, X_test, testdf = (
        scaler.transform(part) for part in (X_train, X_test, testdf)
    )

In [None]:
def pc_analyzer(n):
    """Project the train split, the test split and ``testdf`` with PCA.

    The PCA is fitted on ``X_train`` only and the fitted projection is then
    applied to the other data sets.

    Args:
        n: Passed as ``n_components`` to ``PCA``.

    Side effects:
        Rebinds the module-level ``X_train``, ``X_test`` and ``testdf`` to the
        projected outputs.
    """
    global X_train, X_test, testdf

    pca = PCA(n_components=n)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    # testdf is later fed to the fitted stacking model, so it must go through
    # the same projection as the training data; otherwise predict() receives
    # a different number of features than the model was trained on.
    testdf = pca.transform(testdf)

## Classification

In [None]:
def classify(clfname, clf):
    """Fit ``clf``, score it on both splits and record the results.

    The classifier is trained on the module-level ``X_train``/``y_train`` and
    used to predict both ``X_train`` and ``X_test``. A dict of the run's
    configuration and metrics is appended to the module-level
    ``clf_out_data`` list, and the test-split confusion matrix is printed.

    Args:
        clfname: Label stored under ``'clfname'`` and printed as progress.
        clf: Estimator exposing ``fit`` and ``predict``.

    Note:
        ``'time'`` covers fitting plus both predict calls, not fitting alone.
        The key order of the record matters: a later cell writes the values
        positionally under the ``perf_cols`` header.
    """
    started = time.time()

    print(clfname, end='\t')
    clf.fit(X_train, y_train)

    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    time_diff = time.time() - started

    performance = {
        'clfname': clfname,
        # A disabled preprocessing step is recorded as 0.
        'variance_threshold': variance if do_var_thr else 0,
        'mrmr': best_n if do_mrmr else 0,
        'standard_scaler': do_std_sca,
        'pca_n': n_comp if do_pca else 0,
        'time': time_diff,
        # Metrics on the data the model was trained on.
        'train_accuracy': accuracy_score(y_train, y_train_pred),
        'train_precision': precision_score(y_train, y_train_pred, average='macro'),
        'train_recall': recall_score(y_train, y_train_pred, average='macro'),
        'train_mcc': matthews_corrcoef(y_train, y_train_pred),
        'train_f1': f1_score(y_train, y_train_pred, average='macro'),
        # Metrics on the held-out split.
        'test_accuracy': accuracy_score(y_test, y_test_pred),
        'test_precision': precision_score(y_test, y_test_pred, average='macro'),
        'test_recall': recall_score(y_test, y_test_pred, average='macro'),
        'test_mcc': matthews_corrcoef(y_test, y_test_pred),
        'test_f1': f1_score(y_test, y_test_pred, average='macro'),
    }

    clf_out_data.append(performance)

    print(f'✓ ({time_diff} s)')

    print(confusion_matrix(y_test, y_test_pred))

## Build Classification models

In [None]:
# Wall-clock timer for the whole pipeline run.
all_start = time.time()

# Optional feature selection; both helpers rebind the global X.
if do_var_thr:
    variance_thresholder(variance)

if do_mrmr:
    mrmr(best_n)

# Keep only the surviving feature columns in the unlabelled test set.
selected_columns = list(X.columns.values)
testdf = testdf[selected_columns]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Optional transforms fitted on the training split.
if do_std_sca:
    standard_scaler()

if do_pca:
    pc_analyzer(n_comp)

# Tuned base learners.
knn = KNeighborsClassifier(n_neighbors=4, weights='distance')
svm_rbf = SVC(kernel='rbf', C=5, gamma=0.01, random_state=42)
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,
    max_features=100,
    ccp_alpha=0.001,
    random_state=42,
)
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='log_loss',
    max_depth=None,
    ccp_alpha=0.001,
    random_state=42,
)
mlp = MLPClassifier(
    activation='tanh',
    solver='adam',
    alpha=0.0001,
    max_iter=500,
    random_state=42,
)

# The stack combines every base learner except the decision tree, with a
# logistic-regression final estimator.
estimator_list = [('mlp', mlp), ('svm_rbf', svm_rbf), ('rf', rf), ('knn', knn)]

stack_model = StackingClassifier(
    estimators=estimator_list,
    final_estimator=LogisticRegression(solver='liblinear'),
)

# Parallel lists: clf_names[i] labels classifiers[i].
classifiers = [knn, svm_rbf, dt, rf, mlp, stack_model]
clf_names = ['knn', 'svm_rbf', 'dt', 'rf', 'mlp', 'stack']

# classify() appends one metrics record per model to this list.
clf_out_data = []

# Header row for the metrics CSV; same order as the keys classify() records.
perf_cols = ['clfname', 'variance_threshold', 'mrmr', 'standard_scaler', 'pca_n', 'time', 'train_accuracy', 'train_precision', 'train_recall', 'train_mcc', 'train_f1', 'test_accuracy', 'test_precision', 'test_recall', 'test_mcc', 'test_f1']

for model_name, model in zip(clf_names, classifiers):
    classify(model_name, model)

all_end = time.time()
total_time = all_end - all_start
print(f'Total time taken: {total_time}')

In [None]:
# Append this run's metrics to the results log. DictWriter places every value
# under its perf_cols header by key, so the rows stay aligned with the header
# even if the records' key order changes; unknown keys raise ValueError.
with open(outfile, mode='a', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=perf_cols)
    # Opening in append mode creates the file if needed; a size of zero means
    # nothing has been logged yet, so the header goes first.
    if os.stat(outfile).st_size == 0:
        writer.writeheader()
    writer.writerows(clf_out_data)

In [None]:
# Predict the unlabelled test set with the fitted stacking model and unpack
# the result into a plain Python list.
prediction = [*stack_model.predict(testdf)]

In [None]:
# Number of predictions produced for the unlabelled test set.
n_predictions = len(prediction)
print(n_predictions)

In [None]:
# Write one "index,label" row per prediction. The file is opened in write
# mode so that re-running the cell replaces the previous predictions instead
# of stacking a second copy (with restarted indices) underneath them.
with open('predicted.txt', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(enumerate(prediction))

mRMR Total time taken: 553.9091100692749

Without feature engineering Total time taken: 528.2790336608887

Variance Thresholding Total time taken: 494.20610761642456