## Problem Statement
Using the attributes given, predict which activity of daily living (ADL) each test subject is performing (i.e. WALKING, WALKING_UPSTAIRS, WALKING_DOWNSTAIRS, SITTING, STANDING, LAYING)

In [1]:
library(readr)
library(dplyr)
library(glmnet)



Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-5



### Read Feature Names

In [2]:
activities = read.table("activity_labels.txt", sep = "", col.names = c("Id", "activity"))
activityLabels = as.character(activities$activity)
features = read.table("features.txt", sep = "", col.names = c("Id", "feature"))
attributeNames = features$feature

In [3]:
cat("Total no of features - ", length(attributeNames))

Total no of features -  561

In [4]:
attributeNames

In [18]:
# Load the training dataset
# Add training data column names from features

train <- tbl_df(read.table("./train/X_train.txt"))
colnames(train) <- attributeNames #features$feature.Variable


# Load the test dataset
# Add test data column names from features

test <- tbl_df(read.table("./test/X_test.txt"))
colnames(test) <- attributeNames  # features$feature.Variable
merged <- rbind(train, test)

In [None]:
# Add subject data, and activity data to the training dataset
train <- cbind(
        rename(tbl_df(read.table("./train/subject_train.txt")), 
               subject = V1),
        rename(tbl_df(read.table("./train/y_train.txt")),
               Activity = V1),
        Dataset.Partition = c("Training"),
        train)

# Add subject data, and activity data to the test dataset
test <- cbind(
    rename(tbl_df(read.table("./test/subject_test.txt")), 
           subject = V1),
    rename(tbl_df(read.table("./test/y_test.txt")),
           Activity = V1),
    Dataset.Partition = c("Test"),
    test)



In [29]:
head(merged)

tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,⋯,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,-0.9951121,-0.9831846,-0.923527,-0.9347238,⋯,-0.07432303,-0.2986764,-0.7103041,-0.11275434,0.030400372,-0.4647614,-0.01844588,-0.8412468,0.1799406,-0.05862692
0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,-0.9988072,-0.9749144,-0.9576862,-0.9430675,⋯,0.15807454,-0.5950509,-0.8614993,0.05347695,-0.007434566,-0.7326262,0.70351059,-0.8447876,0.1802889,-0.05431672
0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,-0.9965199,-0.9636684,-0.9774686,-0.9386916,⋯,0.41450281,-0.3907482,-0.7601037,-0.11855926,0.17789948,0.1006992,0.80852908,-0.8489335,0.1806373,-0.04911782
0.2791739,-0.02620065,-0.1232826,-0.9960915,-0.9834027,-0.9906751,-0.9970995,-0.9827498,-0.9893025,-0.9386916,⋯,0.40457253,-0.1172902,-0.4828445,-0.03678797,-0.012892494,0.640011,-0.48536645,-0.8486494,0.1819348,-0.04766318
0.2766288,-0.01656965,-0.1153619,-0.9981386,-0.9808173,-0.9904816,-0.9983211,-0.9796719,-0.9904411,-0.9424691,⋯,0.08775301,-0.3514709,-0.6992052,0.12332005,0.12254196,0.6935783,-0.61597061,-0.8478653,0.1851512,-0.04389225
0.2771988,-0.01009785,-0.1051373,-0.997335,-0.9904868,-0.99542,-0.9976274,-0.9902177,-0.9955489,-0.9424691,⋯,0.01995331,-0.5454101,-0.8446193,0.08263215,-0.14343901,0.2750408,-0.36822404,-0.8496316,0.1848225,-0.04212638


In [30]:
selected_measures <- grepl('-(mean|std)\\(',colnames(merged))
data_set <- subset(merged, select=selected_measures)
merged <- subset(merged, select=selected_measures)
dim(merged)

In [32]:
colnames(merged) <- gsub("mean", ".Mean.", colnames(merged))
colnames(merged) <- gsub("std", ".Std.", colnames(merged))
colnames(merged) <- gsub("^t", "Time.", colnames(merged))
colnames(merged) <- gsub("\\.t", ".Time.", colnames(merged))
colnames(merged) <- gsub("^f", "Frequency.", colnames(merged))
colnames(merged) <- gsub("\\.f", ".Frequency.", colnames(merged))
colnames(merged) <- gsub("\\(\\)", "", colnames(merged))
colnames(merged) <- gsub("-", "", colnames(merged))
colnames(merged) <- gsub("\\.\\.", ".", colnames(merged))
colnames(merged) <- gsub("\\.\\.", ".", colnames(merged))
colnames(merged) <- gsub("\\.$", "", colnames(merged))
colnames(merged) <- gsub("BodyBody", "Body.", colnames(merged))
colnames(merged) <- gsub("^angle\\.", "Angle.", colnames(merged))
colnames(merged) <- gsub("Gyro", ".Gyro", colnames(merged))
colnames(merged) <- gsub("Acc", ".Acc", colnames(merged))
colnames(merged) <- gsub("Jerk", ".Jerk", colnames(merged))
colnames(merged) <- gsub("Mag", ".Mag", colnames(merged))
colnames(merged) <- gsub("^", "MeanOf.", colnames(merged))
colnames(merged) <- gsub("(^|[\\.])([[:alpha:]])", "\\1\\U\\2", colnames(merged), perl=TRUE)


In [33]:
activities_train <- read.table("./train/y_train.txt")
activities_test <- read.table("./test/y_test.txt")
activities <- rbind(activities_train,activities_test)[,1]
labels <- c("WALKING", "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS",
            "SITTING", "STANDING", "LAYING")
activities <- labels[activities]
merged <- cbind(Activity = activities,merged)


In [34]:
subjects_train <- read.table("train/subject_train.txt")
subjects_test <- read.table("test/subject_test.txt")
subjects <- rbind(subjects_train,subjects_test)[,1]
merged <- cbind(Subject = subjects,merged)
merged[1:4,1:5]

Subject,Activity,MeanOf.Time.Body.Acc.Mean.X,MeanOf.Time.Body.Acc.Mean.Y,MeanOf.Time.Body.Acc.Mean.Z
1,STANDING,0.2885845,-0.02029417,-0.1329051
1,STANDING,0.2784188,-0.01641057,-0.1235202
1,STANDING,0.2796531,-0.01946716,-0.1134617
1,STANDING,0.2791739,-0.02620065,-0.1232826


In [35]:
library('dplyr')
average_data_set <- merged %>%
    group_by(Subject,Activity) %>%
    summarise_each(funs(mean))

In [36]:
average_data_set

Subject,Activity,MeanOf.Time.Body.Acc.Mean.X,MeanOf.Time.Body.Acc.Mean.Y,MeanOf.Time.Body.Acc.Mean.Z,MeanOf.Time.Body.Acc.Std.X,MeanOf.Time.Body.Acc.Std.Y,MeanOf.Time.Body.Acc.Std.Z,MeanOf.Time.Gravity.Acc.Mean.X,MeanOf.Time.Gravity.Acc.Mean.Y,⋯,MeanOf.Frequency.Body.Gyro.Std.Y,MeanOf.Frequency.Body.Gyro.Std.Z,MeanOf.Frequency.Body.Acc.Mag.Mean,MeanOf.Frequency.Body.Acc.Mag.Std,MeanOf.Frequency.Body..Acc.Jerk.Mag.Mean,MeanOf.Frequency.Body..Acc.Jerk.Mag.Std,MeanOf.Frequency.Body..Gyro.Mag.Mean,MeanOf.Frequency.Body..Gyro.Mag.Std,MeanOf.Frequency.Body..Gyro.Jerk.Mag.Mean,MeanOf.Frequency.Body..Gyro.Jerk.Mag.Std
1,LAYING,0.2215982,-0.040513953,-0.11320355,-0.92805647,-0.836827406,-0.82606140,-0.2488818,0.70554977,⋯,-0.95123205,-0.9165825,-0.86176765,-0.79830094,-0.933300361,-0.92180398,-0.8621902,-0.8243194,-0.9423669,-0.9326607
1,SITTING,0.2612376,-0.001308288,-0.10454418,-0.97722901,-0.922618642,-0.93958629,0.8315099,0.20441159,⋯,-0.96234504,-0.9439178,-0.94778292,-0.92844480,-0.985262127,-0.98160618,-0.9584356,-0.9321984,-0.9897975,-0.9870496
1,STANDING,0.2789176,-0.016137590,-0.11060182,-0.99575990,-0.973190056,-0.97977588,0.9429520,-0.27298383,⋯,-0.98710773,-0.9823453,-0.98535636,-0.98231380,-0.992542478,-0.99253600,-0.9846176,-0.9784661,-0.9948154,-0.9946711
1,WALKING,0.2773308,-0.017383819,-0.11114810,-0.28374026,0.114461337,-0.26002790,0.9352232,-0.28216502,⋯,-0.03350816,-0.4365622,-0.12862345,-0.39803259,-0.057119400,-0.10349240,-0.1992526,-0.3210180,-0.3193086,-0.3816019
1,WALKING_DOWNSTAIRS,0.2891883,-0.009918505,-0.10756619,0.03003534,-0.031935943,-0.23043421,0.9318744,-0.26661034,⋯,-0.18141473,-0.2384436,0.09658453,-0.18653030,0.026218495,-0.10405226,-0.1857203,-0.3983504,-0.2819634,-0.3919199
1,WALKING_UPSTAIRS,0.2554617,-0.023953149,-0.09730200,-0.35470803,-0.002320265,-0.01947924,0.8933511,-0.36215336,⋯,0.15153891,-0.5717078,-0.35239594,-0.41626010,-0.442652162,-0.53305985,-0.3259615,-0.1829855,-0.6346651,-0.6939305
2,LAYING,0.2813734,-0.018158740,-0.10724561,-0.97405946,-0.980277399,-0.98423330,-0.5097542,0.75253664,⋯,-0.98191062,-0.9631742,-0.97511020,-0.97512139,-0.985374115,-0.98456849,-0.9721130,-0.9610984,-0.9902487,-0.9894927
2,SITTING,0.2770874,-0.015687994,-0.10921827,-0.98682228,-0.950704499,-0.95982817,0.9404773,-0.10563002,⋯,-0.97735619,-0.9635227,-0.96127375,-0.95557560,-0.983874699,-0.98412419,-0.9718406,-0.9613857,-0.9898620,-0.9896329
2,STANDING,0.2779115,-0.018420827,-0.10590854,-0.98727189,-0.957304989,-0.94974185,0.8969286,-0.37006270,⋯,-0.97103605,-0.9697543,-0.96405217,-0.96051938,-0.977065296,-0.97516046,-0.9617759,-0.9567887,-0.9778498,-0.9777543
2,WALKING,0.2764266,-0.018594920,-0.10550036,-0.42364284,-0.078091253,-0.42525752,0.9130173,-0.34660709,⋯,-0.53304695,-0.5598566,-0.32428943,-0.57710521,-0.169064353,-0.16409197,-0.5307048,-0.6517928,-0.5832493,-0.5581046


In [None]:
require(plyr)
library(dplyr)

# A function that takes in a variable name and generates a description for the
# variable
generateVariableDescription <- function(x) {
    x <- as.character(x)
    desc <- ""
    
    if (x == "Activity")
        desc <- paste(desc, "The motion activity" , sep = "")
    
    if (x == "Subject.Id")
        desc <- paste(desc, "The subject identifier" , sep = "")
    
    if (grepl("\\.mean", x, ignore.case = TRUE)[1] == TRUE) 
        desc <- paste(desc, "The mean of " , sep = "")
    
    if (grepl("\\.std", x, ignore.case = TRUE)[1] == TRUE) 
        desc <- paste(desc, "The standard deviation of ", sep = "")
    
    if (grepl("frequency\\.", x, ignore.case = TRUE)[1] == TRUE) {
        desc <- paste(desc, "a Fourier transformed ", sep = "")
        if (grepl("mag\\.", x, ignore.case = TRUE)[1] == TRUE) {
            desc <- paste(desc, "resultant XYZ signal (Euclidean norm) ", sep = "")
        } else if (grepl("\\.x", x, ignore.case = TRUE)[1] == TRUE) {
            desc <- paste(desc, "X signal ", sep = "")
        } else if (grepl("\\.y", x, ignore.case = TRUE)[1] == TRUE) {
            desc <- paste(desc, "Y signal ", sep = "")
        } else if (grepl("\\.z", x, ignore.case = TRUE)[1] == TRUE) {
            desc <- paste(desc, "Z signal ", sep = "")
        }
    } else {
        if (grepl("mag\\.", x, ignore.case = TRUE)[1] == TRUE) {
            desc <- paste(desc, "the resultant XYZ ", sep = "")
        } else if (grepl("\\.x", x, ignore.case = TRUE)[1] == TRUE) {
            desc <- paste(desc, "the X ", sep = "")
        } else if (grepl("\\.y", x, ignore.case = TRUE)[1] == TRUE) {
            desc <- paste(desc, "the Y ", sep = "")
        } else if (grepl("\\.z", x, ignore.case = TRUE)[1] == TRUE) {
            desc <- paste(desc, "the Z ", sep = "")
        }
    }
    
    if (grepl("jerk\\.", x, ignore.case = TRUE)[1] == TRUE) {
        if (grepl("acc\\.", x, ignore.case = TRUE)[1] == TRUE)
            desc <- paste(desc, "jerk from an accelerometer signal ", sep = "")
        
        if (grepl("gyro\\.", x, ignore.case = TRUE)[1] == TRUE)
            desc <- paste(desc, "jerk from a gyroscope signal ", sep = "")
    }
    else {
        if (grepl("acc\\.", x, ignore.case = TRUE)[1] == TRUE)
            desc <- paste(desc, "acceleration from an accelerometer signal ", sep = "")
        
        if (grepl("gyro\\.", x, ignore.case = TRUE)[1] == TRUE)
            desc <- paste(desc, "angular velocity from a gyroscope signal ", sep = "")
    }
    
    if (grepl("\\.body", x, ignore.case = TRUE)[1] == TRUE) 
        desc <- paste(desc, "related to body motion", sep = "") 
    
    if (grepl("\\.gravity", x, ignore.case = TRUE)[1] == TRUE) 
        desc <- paste(desc, "related gravity", sep = "") 
    
    desc
}

# 1. Read the variable names from the tidy_summary.txt file
tidy_summary_variables <- read.table("tidy_summary.txt", header = TRUE) %>%
    names %>%
    data.frame %>%
    tbl_df %>%
    plyr::rename(c("." ="Variable"))

# 2. Generate a list of descriptions related to the variable names
descriptions <- data.frame(
    tapply(tidy_summary_variables$Variable, 
           tidy_summary_variables[1], 
           generateVariableDescription))

# 3. Rename the descriptions column to Description
colnames(descriptions)[1] <- "Description"

# 4. Add the row names of the descriptions as the variable name
descriptions <- cbind(Variable = rownames(descriptions), descriptions)

# 5. Output the variable names and descriptions to file
write.table(descriptions, "variable_descriptions.txt", row.names = FALSE)

### Read Train Data

In [None]:
train_X = read.table("./train/X_train.txt", sep="", col.names=attributeNames)
train_y <- read.table("./train/y_train.txt", sep="\n")
names(train_y) = "Activity"
#combining features and activity label
train_y$Activity = as.factor(train_y$Activity)
levels(train_y$Activity) = activityLabels
trainSubjects = read.table("./train/subject_train.txt", sep = "")
names(trainSubjects) = "subject"
trainSubjects$subject = as.factor(trainSubjects$subject)

train <- cbind(train_X, trainSubjects, train_y)
train_labels <- train$Activity

In [None]:
dim(train)

### Read Test Data

In [None]:
test_X = read.table("./test/X_test.txt", sep="", col.names=attributeNames)
test_y <- read.table("./test/y_test.txt", sep="\n")
names(test_y) = "Activity"

test_y$Activity = as.factor(test_y$Activity)
levels(test_y$Activity) = activityLabels
testSubjects = read.table("./test/subject_test.txt", sep = "")
names(testSubjects) = "subject"
testSubjects$subject = as.factor(testSubjects$subject)

#combined features and acitvity
test <- cbind(test_X, testSubjects, test_y)

In [None]:
dim(test)

In [None]:
head(train)

### Dataset Exploration

In [None]:
summary(train)

In [None]:
summary(train$subject)

In [None]:
summary(test)

In [None]:
summary(test$subject)

In [None]:
cat_var <- names(train)[which(sapply(train, is.character))]
num_var <- names(train)[which(sapply(train, is.numeric))]

In [None]:
cat(paste0("Total number of numeric variables: ", num_var))

In [None]:
train$Partition = "Train"
test$Partition = "Test"

library(ggplot2)
all = rbind(train,test)

all$Partition = as.factor(all$Partition)
qplot(data = all, x = subject, fill = Partition)

In [None]:
qplot(data = all , x = subject, fill = Activity)

Unique number of features

### Dataset Subset

In [None]:
# Create a calibration and training set from the main training set
set.seed(42)
idx = sample(c(TRUE, FALSE), nrow(train_X), replace = TRUE, prob = c(0.8, 0.2))

train = subset(train_X, idx)
train_labels  = subset(train_labels, idx)

cal = subset(train_X, !idx)
cal_labels = subset(train_labels, !idx)

### Lasso Feature Selection

With over 500 features, it's important to reduce the feature-set to avoid overfitting and to create a sensical model. The more features you have, the easier it is to create a model that trains well, but may not do so well with test data. To learn more about these problems, checkout the bias-variance trade-off and the curse of dimensionality.

Lasso selection works by reducing the coefficients of certain features towards (and exactly) 0. The higher the lambda, the faster this will happen. A sufficiently high lamba would set all features to 0, meaning only the coefficient will be used in the model (a simple mean, for example, in regression).

In [None]:
x <- model.matrix(train_labels ~ ., train)[,-1]

In [None]:
fit = glmnet(x, train_labels, family = "multinomial", type.multinomial = "grouped")