# Human Activity Recognition

## Setting up the environment

In [None]:
import os
import re
import numpy as np
import pandas as pd
import math
import statistics as st
import itertools

from scipy import stats
from statsmodels import robust

import warnings
warnings.filterwarnings('ignore')

## Functions for Statistical Calculations

In [None]:
def correlation(data1, data2):
    """Return the Pearson correlation coefficient of two samples.

    When the coefficient is NaN, 0 is returned instead so that downstream
    feature tables never contain NaN correlations.
    """
    coefficient, _p_value = stats.pearsonr(data1, data2)
    return 0 if math.isnan(coefficient) else coefficient

def sma(x, y, z):
    """Return the mean, over all samples, of |x| + |y| + |z|.

    The three inputs are materialised as lists; the number of samples is
    taken from ``x``.
    """
    xs, ys, zs = list(x), list(y), list(z)
    total = 0
    for pos, x_value in enumerate(xs):
        total += abs(x_value) + abs(ys[pos]) + abs(zs[pos])
    return total / len(xs)

def calc_entropy(data):
    """Return the base-2 entropy of ``data`` as computed by ``scipy.stats.entropy``.

    A non-finite result (infinite or NaN) is replaced by the sentinel -1.
    """
    value = stats.entropy(data, base=2)
    if math.isfinite(value):
        return value
    return -1

def energy(data):
    """Return the mean of the squared values in ``data``."""
    squares = [value ** 2 for value in data]
    total = 0
    # Accumulate left to right, one term at a time.
    for term in squares:
        total += term
    return total / len(data)

def iqr(data):
    """Return the interquartile range (75th minus 25th percentile) of ``data``."""
    upper_quartile, lower_quartile = np.percentile(data, [75, 25])
    return np.subtract(upper_quartile, lower_quartile)

## Function for reading the Datasets

In [None]:
def read_dataset(dirName):
    """Load every ``.csv`` file in ``dirName`` as a cleaned raw dataset.

    For each file the column names are normalised (any ``(...)`` or ``[...]``
    group is removed, then every whitespace character becomes ``_``), columns
    whose name contains none of ``_X_``, ``_Y_``, ``_Z_`` are dropped, and
    duplicate rows are removed keeping the last occurrence.

    Args:
        dirName: directory that holds the raw CSV files.

    Returns:
        A tuple ``(raw_datasets, raw_datalabels, column_names)``:
        the cleaned DataFrames, the label of each one (the file name up to its
        first ``.``), and the cleaned column names of the last file read.
        ``column_names`` is an empty Index when no CSV file is found.
    """
    print("Reading Raw Data...")
    raw_datasets = []
    raw_datalabels = []
    # Defined up front so a directory without CSV files yields empty results
    # instead of an unbound-name error at the return statement.
    column_names = pd.Index([])

    # sorted() makes the dataset order independent of the file system.
    for filename in sorted(os.listdir(dirName)):
        if not filename.endswith(".csv"):
            continue

        raw = pd.read_csv(os.path.join(dirName, filename))

        # regex=True is required: without it recent pandas versions treat the
        # pattern as a literal string and leave the names untouched.
        cols = raw.columns
        cols = cols.str.replace(r'([\(\[]).*?([\)\]])', '', regex=True)
        cols = cols.str.replace(r'\s', '_', regex=True)
        raw.columns = cols

        # Keep only the per-axis sensor columns.
        to_drop = [
            col for col in raw.columns
            if not ("_X_" in col or "_Y_" in col or "_Z_" in col)
        ]
        raw = raw.drop(to_drop, axis=1)
        column_names = raw.columns

        raw = raw.drop_duplicates(keep='last')
        raw_datasets.append(raw)
        raw_datalabels.append(filename.split(".")[0])

    return raw_datasets, raw_datalabels, column_names

## Function for Processing Raw Data

In [None]:
def process_raw_data(raw_datasets, raw_datalabels, column_names, window_size=5):
    """Turn raw sensor recordings into one feature row per fixed-size window.

    Every dataset is cut into consecutive, non-overlapping windows of
    ``window_size`` rows; a trailing partial window is discarded.  For each
    window and each axis column the mean, MAD, max, min, standard deviation,
    energy, interquartile range and entropy are computed.  For each sensor
    (a group of X/Y/Z columns) the XY, YZ and ZX correlations and the ``sma``
    value are added, and the dataset's label is stored under ``"Activity"``.

    Args:
        raw_datasets: DataFrames that all contain the columns in ``column_names``.
        raw_datalabels: activity label of each dataset, in the same order.
        column_names: axis column names, expected in consecutive X, Y, Z
            triples whose names end in ``_X_``, ``_Y_`` and ``_Z_``.
        window_size: number of rows per window (default 5, at least 2 because
            the standard deviation and correlation need two samples).

    Returns:
        dict mapping each feature name (``"<column>~<statistic>"``,
        ``"<sensor>_XY_~correlation"``, ..., ``"<sensor>_XYZ_~sma"``) and
        ``"Activity"`` to a list with one entry per processed window.

    Raises:
        ValueError: if ``window_size`` is smaller than 2.
    """
    if window_size < 2:
        raise ValueError("window_size must be at least 2")

    print("Processing Data...")

    # Per-axis statistics; the insertion order fixes the output column order.
    axis_stats = {
        "mean": st.mean,
        "mad": lambda values: robust.mad(np.array(values)),
        "max": max,
        "min": min,
        "std": st.stdev,
        "energy": energy,
        "iqr": iqr,
        "entropy": calc_entropy,
    }

    # Sensor prefix = name of the X column of each triple without its "_X_" tail.
    sensor_prefixes = [
        column_names[pos][:-3] for pos in range(0, len(column_names), 3)
    ]

    datasets = dict()
    for col in column_names:
        for stat_name in axis_stats:
            datasets[col + "~" + stat_name] = []
    for prefix in sensor_prefixes:
        datasets[prefix + "_XY_~correlation"] = []
        datasets[prefix + "_YZ_~correlation"] = []
        datasets[prefix + "_ZX_~correlation"] = []
        datasets[prefix + "_XYZ_~sma"] = []
    datasets["Activity"] = []

    for label, raw_data in zip(raw_datalabels, raw_datasets):
        # The stop value admits every start whose window fits completely,
        # including the one that ends exactly on the last row.
        last_start = len(raw_data) - window_size
        for start in range(0, last_start + 1, window_size):
            window = raw_data.iloc[start:start + window_size]

            for prefix in sensor_prefixes:
                axis_cols = (prefix + "_X_", prefix + "_Y_", prefix + "_Z_")

                for stat_name, stat_func in axis_stats.items():
                    for axis_col in axis_cols:
                        datasets[axis_col + "~" + stat_name].append(
                            stat_func(window[axis_col])
                        )

                x_vals, y_vals, z_vals = (window[axis_col] for axis_col in axis_cols)
                datasets[prefix + "_XY_~correlation"].append(correlation(x_vals, y_vals))
                datasets[prefix + "_YZ_~correlation"].append(correlation(y_vals, z_vals))
                datasets[prefix + "_ZX_~correlation"].append(correlation(z_vals, x_vals))
                datasets[prefix + "_XYZ_~sma"].append(sma(x_vals, y_vals, z_vals))

            datasets["Activity"].append(label)
        print(label, " collected and processed")
    print("Done")
    return datasets

## Processing Our Version of the Original

In [None]:
# Read the original raw recordings, extract the windowed features and save them.
og_raw_dataset, og_raw_datalabels, og_column_names = read_dataset("OriginalRawDataSet")
original_dataset = process_raw_data(og_raw_dataset, og_raw_datalabels, og_column_names)
df = pd.DataFrame.from_dict(original_dataset, orient="columns")
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("Processed_DataSet", exist_ok=True)
df.to_csv("Processed_DataSet/OriginalProcessedData.csv",index=False)
print("Saved to OriginalProcessedData.csv")

## Processing Our New DataSet

In [None]:
# Read the new raw recordings, extract the windowed features and save them.
new_raw_dataset, new_raw_datalabels, new_column_names = read_dataset("NewRawDataSet")
new_dataset = process_raw_data(new_raw_dataset, new_raw_datalabels, new_column_names)
df = pd.DataFrame.from_dict(new_dataset, orient="columns")
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("Processed_DataSet", exist_ok=True)
df.to_csv("Processed_DataSet/NewProcessedData.csv",index=False)
print("Saved to NewProcessedData.csv")

## Function for Splitting up Data

In [None]:
def split_data(processed_file, test_size=0.3, random_state=None):
    """Load a processed feature file and split it into train and test sets.

    The file is read from the ``Processed_DataSet`` directory.  The number of
    missing values and of duplicated rows is printed for inspection only;
    neither is removed.

    Args:
        processed_file: file name inside ``Processed_DataSet``.
        test_size: share of the rows assigned to the test set (default 0.3).
        random_state: forwarded to ``train_test_split``; pass an int to get
            the same shuffled split on every run.  The default ``None`` keeps
            the split random.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where the ``y`` parts are the
        ``Activity`` column and the ``X`` parts are all remaining columns.
    """
    processed = pd.read_csv(os.path.join("Processed_DataSet", processed_file))

    print("No of NAN in processed: {}".format(processed.isnull().values.sum()))
    print("No of duplicates in processed: {}".format(sum(processed.duplicated())))

    train, test = train_test_split(
        processed, test_size=test_size, shuffle=True, random_state=random_state
    )

    y_train = train.Activity
    X_train = train.drop(['Activity'], axis=1)
    y_test = test.Activity
    X_test = test.drop(['Activity'], axis=1)

    return X_train, y_train, X_test, y_test

# Data Analysis

## Setting up environment

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

## Reading and Splitting Data

In [None]:
print("Obtaining Original Processed Data")
og_X_train, og_y_train, og_X_test, og_y_test = split_data("OriginalProcessedData.csv")
# Kept in sorted order: scikit-learn orders classes by sorted label value by
# default, so tick labels taken from this list line up with the matrix rows.
og_labels = sorted(['Sitting', 'Standing','Walking', 'WalkingDownStairs', 'WalkingUpStairs', 'Laying'])

print("\nObtaining New Processed Data")
new_X_train, new_y_train, new_X_test, new_y_test = split_data("NewProcessedData.csv")
# Sorted for the same reason as og_labels.
new_labels = sorted(['Cycling', 'Football', 'Swimming', 'Jogging', 'Pushups', 'JumpRope'])

## Apply t-sne on the data

In [None]:
# Performs t-SNE for several perplexity values and draws one plot per value

def perform_tsne(X_data, y_data, perplexities, markers, n_iter=1000):
    """Run t-SNE once per perplexity value and show a scatter plot of each result.

    Args:
        X_data: feature matrix passed to ``TSNE.fit_transform``.
        y_data: label of each row; used to colour the points.
        perplexities: iterable of perplexity values to try.
        markers: marker styles forwarded to ``seaborn.lmplot``.
        n_iter: maximum number of optimisation iterations for each run.
    """
    import inspect

    # scikit-learn renamed TSNE's ``n_iter`` argument to ``max_iter``; use
    # whichever name the installed version accepts so n_iter is really applied.
    tsne_parameters = inspect.signature(TSNE.__init__).parameters
    iter_keyword = "max_iter" if "max_iter" in tsne_parameters else "n_iter"

    for perplexity in perplexities:
        # perform t-sne
        print("\nPerforming tsne with perplexity {} and with {} iterations at max".format(perplexity, n_iter))
        tsne = TSNE(verbose=2, perplexity=perplexity, **{iter_keyword: n_iter})
        X_reduced = tsne.fit_transform(X_data)
        print('Done..')

        # prepare data for seaborn
        print("Creating plot for this t-sne visualization")
        df = pd.DataFrame({'x':X_reduced[:,0], 'y':X_reduced[:,1], 'label':y_data})

        # draw one scatter plot for this perplexity value
        sns.lmplot(data=df, x='x', y='y', hue='label', fit_reg=False, height=8, palette="Set1",markers=markers)
        plt.title("perplexity : {} and max_iter: {}".format(perplexity, n_iter))
        plt.show()
        print("Done")

## t-sne for Original DataSet

In [None]:
# t-SNE plots of the original training split, one per perplexity value.
perform_tsne(
    X_data=og_X_train,
    y_data=og_y_train,
    perplexities=[5, 10, 20],
    markers=['^', 'v', 'o', 's', '1', '2'],
)

## t-sne for New DataSet

In [None]:
# t-SNE plots of the new training split, one per perplexity value.
perform_tsne(
    X_data=new_X_train,
    y_data=new_y_train,
    perplexities=[5, 10, 20],
    markers=['^', 'v', 'o', 's', '1', '2'],
)

## Modelling Data

In [None]:
import matplotlib.colors as colors

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from datetime import datetime

### Setting up Confusion Matrix

In [None]:
# function to plot the confusion matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: square NumPy array of counts, rows = true class, columns = predicted.
        classes: tick labels, in the same order as the rows/columns of ``cm``.
        normalize: when true, each row is divided by its sum before plotting.
        title: plot title.
        cmap: matplotlib colour map used for the cells.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any true samples has a zero row sum; keep that row
        # at 0 instead of producing NaN from a 0/0 division.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype='float'),
            where=row_totals != 0,
        )

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so it stays readable.
    thresh = cm.max() / 2

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    # Lay out last so the axis labels are taken into account.
    plt.tight_layout()
### Performing Model

In [None]:
# generic function to run any model specified
def perform_model(model, X_train, y_train, X_test, y_test, class_labels, cm_normalize=True, print_cm=True, cm_map=plt.cm.Greens):
    """Fit ``model``, evaluate it on the test data and report the results.

    Prints training/prediction times, accuracy, the confusion matrix and the
    classification report, and shows a confusion-matrix plot.

    Args:
        model: estimator with ``fit`` and ``predict``; it is fitted in place.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        class_labels: class names; they fix the row/column order of the
            confusion matrix and label the plot axes.
        cm_normalize: plot the row-normalised matrix when true, raw counts otherwise.
        print_cm: also print the raw confusion matrix as text.
        cm_map: matplotlib colour map for the plot.

    Returns:
        dict with the keys ``training_time``, ``testing_time``, ``predicted``,
        ``accuracy``, ``confusion_matrix``, ``classification_report`` and
        ``model`` (the very object passed in, now fitted).
    """
    # to store results at various phases
    results = dict()

    # time at which model starts training
    train_start_time = datetime.now()
    print('training the model..')
    model.fit(X_train, y_train)
    print("Done\n\n")
    train_end_time = datetime.now()
    results['training_time'] = train_end_time - train_start_time
    print('training_time(HH:MM:SS.ms) - {}\n\n'.format(results['training_time']))

    # predict test data
    print('Predicting test data')
    test_start_time = datetime.now()
    y_pred = model.predict(X_test)
    test_end_time = datetime.now()
    print('Done\n\n')
    results['testing_time'] = test_end_time - test_start_time
    print('testing_time(HH:MM:SS.ms) - {}\n\n'.format(results['testing_time']))
    results['predicted'] = y_pred

    # calculate overall accuracy of the model
    accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    # store accuracy in results
    results['accuracy'] = accuracy
    print('-----------------------')
    print('|       Accuracy      |')
    print('-----------------------')
    print('\n      {}\n\n'.format(accuracy))

    # confusion matrix; labels= pins the row/column order to class_labels so
    # the tick labels drawn below describe the right rows and columns
    cm = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
    results['confusion_matrix'] = cm
    if print_cm:
        print('-----------------------')
        print('|   Confusion Matrix  |')
        print('-----------------------')
        print('\n {}'.format(cm))

    # plot confusion matrix
    plt.figure(figsize=(8,8))
    # positional argument: the keyword for this flag differs between matplotlib versions
    plt.grid(False)
    cm_title = 'Normalized Confusion Matrix' if cm_normalize else 'Confusion Matrix'
    plot_confusion_matrix(cm, classes=class_labels, normalize=cm_normalize, title=cm_title, cmap=cm_map)
    plt.show()

    # get classification report
    print('-----------------------------')
    print('|   Classification Report   |')
    print('-----------------------------')
    classification_report = metrics.classification_report(y_test, y_pred)

    # store report in results
    results['classification_report'] = classification_report
    print(classification_report)

    # add the trained model to the results
    results['model'] = model

    return results

### Determine Best Parameters from GridSearch

In [None]:
# function to print the gridsearch Attributes
def print_grid_search_attributes(model):
    """Print the best estimator, best parameters, number of CV splits and best score.

    ``model`` must expose ``best_estimator_``, ``best_params_``, ``n_splits_``
    and ``best_score_``.
    """
    # Estimator that gave the highest score among all candidates
    print('\n'.join([
        '-----------------------',
        '|    Best Estimator   |',
        '-----------------------',
        '\n\t{}\n'.format(model.best_estimator_),
    ]))

    # Parameters of that estimator
    print('\n'.join([
        '-----------------------',
        '|   Best Parameters   |',
        '-----------------------',
        '\tParameters of best estimator : \n\n\t{}\n'.format(model.best_params_),
    ]))

    # Number of cross validation splits
    print('\n'.join([
        '--------------------------------',
        '|  No of CrossValidation sets  |',
        '--------------------------------',
        '\n\tTotal number of cross validation sets: {}\n'.format(model.n_splits_),
    ]))

    # Average cross validated score of the best estimator
    print('\n'.join([
        '-----------------------',
        '|      Best Score     |',
        '-----------------------',
        '\n\tAverage Cross Validate scores of best estimator : \n\n\t{}\n'.format(model.best_score_),
    ]))

## Predicting Activities

### Importing Different Models

In [None]:
from sklearn import svm
from sklearn import neighbors
from sklearn import linear_model
from sklearn import tree

## Support Vector Machines

### Radial Basis Function

In [None]:
rbf_kernel = svm.SVC(kernel='rbf')
rbf_params = {'C':[55, 60, 65, 81, 90], 'gamma':['scale', 0.01, 0.03, 0.05, 0.07]}
# One search object per dataset: perform_model stores the object it was given,
# so a shared search would leave both result dicts holding the last fit.
rbf_svc_grid = GridSearchCV(rbf_kernel, param_grid=rbf_params, n_jobs=-1, verbose=1)
new_rbf_svc_grid = GridSearchCV(rbf_kernel, param_grid=rbf_params, n_jobs=-1, verbose=1)
print("Original DataSet")
og_rbf_svc_grid_results = perform_model(rbf_svc_grid, og_X_train, og_y_train, og_X_test, og_y_test,class_labels=og_labels)
print("\nNew DataSet")
new_rbf_svc_grid_results = perform_model(new_rbf_svc_grid, new_X_train, new_y_train, new_X_test, new_y_test,class_labels=new_labels)

In [None]:
# Report the RBF-SVM grid-search outcome for each dataset in turn.
for _heading, _results in (("Original DataSet", og_rbf_svc_grid_results),
                           ("\nNew DataSet", new_rbf_svc_grid_results)):
    print(_heading)
    print_grid_search_attributes(_results['model'])

### Polynomial

In [None]:
poly_kernel = svm.SVC(kernel='poly')
poly_params = {'C':[0.125, 0.5, 1, 2, 8, 16], 'degree': [1, 2, 3, 4, 5]}
# One search object per dataset: perform_model stores the object it was given,
# so a shared search would leave both result dicts holding the last fit.
poly_svc_grid = GridSearchCV(poly_kernel, param_grid=poly_params, n_jobs=-1, verbose=1)
new_poly_svc_grid = GridSearchCV(poly_kernel, param_grid=poly_params, n_jobs=-1, verbose=1)
print("Original DataSet")
og_poly_svc_grid_results = perform_model(poly_svc_grid, og_X_train, og_y_train, og_X_test, og_y_test,class_labels=og_labels)
print("\nNew DataSet")
new_poly_svc_grid_results = perform_model(new_poly_svc_grid, new_X_train, new_y_train, new_X_test, new_y_test,class_labels=new_labels)

In [None]:
# Report the polynomial-SVM grid-search outcome for each dataset in turn.
for _heading, _results in (("Original DataSet", og_poly_svc_grid_results),
                           ("\nNew DataSet", new_poly_svc_grid_results)):
    print(_heading)
    print_grid_search_attributes(_results['model'])

## K-Nearest Neighbours

In [None]:
knn_params = {'n_neighbors':[7, 9, 11, 13, 17], 'weights': ['uniform','distance'], 'algorithm': ['ball_tree', 'kd_tree','brute']}
knn_kernel = neighbors.KNeighborsClassifier()
# One search object per dataset: perform_model stores the object it was given,
# so a shared search would leave both result dicts holding the last fit.
knn_grid = GridSearchCV(knn_kernel, param_grid=knn_params, n_jobs=-1, verbose=1)
new_knn_grid = GridSearchCV(knn_kernel, param_grid=knn_params, n_jobs=-1, verbose=1)
print("Original DataSet")
og_knn_grid_results = perform_model(knn_grid, og_X_train, og_y_train, og_X_test, og_y_test,class_labels=og_labels)
print("\nNew DataSet")
new_knn_grid_results = perform_model(new_knn_grid, new_X_train, new_y_train, new_X_test, new_y_test,class_labels=new_labels)

In [None]:
# Report the k-NN grid-search outcome for each dataset in turn.
for _heading, _results in (("Original DataSet", og_knn_grid_results),
                           ("\nNew DataSet", new_knn_grid_results)):
    print(_heading)
    print_grid_search_attributes(_results['model'])

## Logistic Regression

In [None]:
reg_params = {'C':[0.001, 0.125, 0.5, 1, 8, 16, 50], 'class_weight':['balanced',None], 
              'solver':[ 'newton-cg', 'sag', 'saga', 'lbfgs'], 'multi_class':['ovr', 'multinomial']}
reg_kernel = linear_model.LogisticRegression()
# One search object per dataset: perform_model stores the object it was given,
# so a shared search would leave both result dicts holding the last fit.
reg_grid = GridSearchCV(reg_kernel, param_grid=reg_params, n_jobs=-1, verbose=1)
new_reg_grid = GridSearchCV(reg_kernel, param_grid=reg_params, n_jobs=-1, verbose=1)
print("Original DataSet")
og_reg_grid_results = perform_model(reg_grid, og_X_train, og_y_train, og_X_test, og_y_test,class_labels=og_labels)
print("\nNew DataSet")
new_reg_grid_results = perform_model(new_reg_grid, new_X_train, new_y_train, new_X_test, new_y_test,class_labels=new_labels)

In [None]:
# Report the logistic-regression grid-search outcome for each dataset in turn.
for _heading, _results in (("Original DataSet", og_reg_grid_results),
                           ("\nNew DataSet", new_reg_grid_results)):
    print(_heading)
    print_grid_search_attributes(_results['model'])

## Decision Trees

In [None]:
dt_params = {'max_depth':[3, 5, 7, 9], 'splitter': ['best','random'], 'criterion': ['gini', 'entropy'], 'class_weight':['balanced',None]}
dt_kernel = tree.DecisionTreeClassifier()
# One search object per dataset: perform_model stores the object it was given,
# so a shared search would leave both result dicts holding the last fit.
dt_grid = GridSearchCV(dt_kernel, param_grid=dt_params, n_jobs=-1, verbose=1)
new_dt_grid = GridSearchCV(dt_kernel, param_grid=dt_params, n_jobs=-1, verbose=1)
print("Original DataSet")
og_dt_grid_results = perform_model(dt_grid, og_X_train, og_y_train, og_X_test, og_y_test,class_labels=og_labels)
print("\nNew DataSet")
new_dt_grid_results = perform_model(new_dt_grid, new_X_train, new_y_train, new_X_test, new_y_test,class_labels=new_labels)

In [None]:
# Report the decision-tree grid-search outcome for each dataset in turn.
for _heading, _results in (("Original DataSet", og_dt_grid_results),
                           ("\nNew DataSet", new_dt_grid_results)):
    print(_heading)
    print_grid_search_attributes(_results['model'])