## Problem Statement
Using the given attributes, predict which activity of daily living (ADL) each test subject is performing: WALKING, WALKING_UPSTAIRS, WALKING_DOWNSTAIRS, SITTING, STANDING, or LAYING.

In [None]:
library(readr)
library(dplyr)
library(glmnet)


### Read Feature Names

In [None]:
# Lookup tables: both files are read as whitespace-delimited, two-column
# tables (an Id column followed by a name column).
activities <- read.table(
  file = "activity_labels.txt",
  sep = "",
  col.names = c("Id", "activity")
)
features <- read.table(
  file = "features.txt",
  sep = "",
  col.names = c("Id", "feature")
)

# Activity names as plain strings, and the feature names that later serve as
# column headers for the measurement tables.
activityLabels <- as.character(activities[["activity"]])
attributeNames <- features[["feature"]]

In [None]:
# Print how many feature names were read from features.txt.
cat("Total no of features - ", length(attributeNames))

In [None]:
# Display the full vector of feature names.
attributeNames

In [None]:
# Read the raw measurement tables for both partitions.
train <- tbl_df(read.table("./train/X_train.txt"))
test <- tbl_df(read.table("./test/X_test.txt"))

# Label the columns of each table with the feature names.
colnames(train) <- attributeNames
colnames(test) <- attributeNames

# Stack the training rows on top of the test rows.
merged <- rbind(train, test)

In [None]:
# Prepend subject, activity code and a partition tag to each measurement
# table. Resulting column order: subject, Activity, Dataset.Partition,
# followed by the measurement columns.
train <- cbind(
  read.table("./train/subject_train.txt") %>% tbl_df() %>% rename(subject = V1),
  read.table("./train/y_train.txt") %>% tbl_df() %>% rename(Activity = V1),
  Dataset.Partition = "Training",
  train
)

# Same layout for the test partition.
test <- cbind(
  read.table("./test/subject_test.txt") %>% tbl_df() %>% rename(subject = V1),
  read.table("./test/y_test.txt") %>% tbl_df() %>% rename(Activity = V1),
  Dataset.Partition = "Test",
  test
)



In [None]:
# Preview the first rows of the stacked train + test measurement table.
head(merged)

In [None]:
# Flag the columns whose name contains "-mean(" or "-std(" and keep only
# those; data_set and merged both hold the reduced table afterwards.
selected_measures <- grepl('-(mean|std)\\(', names(merged))
data_set <- subset(merged, select = selected_measures)
merged <- data_set
dim(merged)

In [None]:
# Turn raw feature names into dot-separated, capitalised labels, e.g.
#   "tBodyAcc-mean()-X"      -> "MeanOf.Time.Body.Acc.Mean.X"
#   "fBodyBodyGyroMag-std()" -> "MeanOf.Frequency.Body.Gyro.Mag.Std"
#
# Args:
#   nms: Character vector of raw feature names.
#
# Returns:
#   Character vector of the same length with the rewritten names.
tidy_measure_names <- function(nms) {
  # Ordered (pattern, replacement) pairs; later rules see the output of
  # earlier ones, so the order must not be changed.
  rules <- list(
    c("mean", ".Mean."),
    c("std", ".Std."),
    c("^t", "Time."),
    c("\\.t", ".Time."),
    c("^f", "Frequency."),
    c("\\.f", ".Frequency."),
    c("\\(\\)", ""),
    c("-", ""),
    c("BodyBody", "Body."),
    c("^angle\\.", "Angle."),
    c("Gyro", ".Gyro"),
    c("Acc", ".Acc"),
    c("Jerk", ".Jerk"),
    c("Mag", ".Mag")
  )
  for (rule in rules) {
    nms <- gsub(rule[1], rule[2], nms)
  }

  # Collapse runs of dots and drop a trailing dot only AFTER every rule that
  # inserts a dot has run; doing it earlier left names such as
  # "Frequency.Body..Gyro.Mag.Mean" with a doubled separator.
  nms <- gsub("\\.+", ".", nms)
  nms <- gsub("\\.$", "", nms)

  nms <- paste0("MeanOf.", nms)

  # Upper-case the first letter of every dot-separated component.
  gsub("(^|[\\.])([[:alpha:]])", "\\1\\U\\2", nms, perl = TRUE)
}

colnames(merged) <- tidy_measure_names(colnames(merged))


In [None]:
# Activity names, positioned so that code k maps to labels[k].
labels <- c("WALKING", "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS",
            "SITTING", "STANDING", "LAYING")

# Activity codes for both partitions, training rows first (the same order in
# which merged was stacked).
activities_train <- read.table("./train/y_train.txt")
activities_test <- read.table("./test/y_test.txt")
activities <- labels[c(activities_train[[1]], activities_test[[1]])]

# Put the activity name in front of the measurement columns.
merged <- cbind(Activity = activities, merged)


In [None]:
# Subject ids for both partitions, training rows first, placed in front of
# the existing columns of merged.
subjects_train <- read.table("train/subject_train.txt")
subjects_test <- read.table("test/subject_test.txt")
subjects <- c(subjects_train[[1]], subjects_test[[1]])
merged <- cbind(Subject = subjects, merged)

# Peek at the top-left corner of the result.
merged[1:4, 1:5]

In [None]:
library("dplyr")

# Average every measurement column within each (Subject, Activity) pair.
# summarise_all(mean) replaces the deprecated summarise_each(funs(mean)) and
# keeps the original column names; ungroup() drops the grouping that
# summarise() would otherwise leave on Subject.
average_data_set <- merged %>%
  group_by(Subject, Activity) %>%
  summarise_all(mean) %>%
  ungroup()

In [None]:
# Display the per-subject, per-activity averages.
average_data_set

In [None]:
# Save the averaged table as text without row names. The argument is spelled
# out as row.names rather than relying on partial matching of "row.name".
write.table(average_data_set, file = "tidy_data_set.txt", row.names = FALSE)

In [None]:
# NOTE(review): plyr is attached here after dplyr (already loaded earlier in
# this file), so verbs that exist in both packages resolve to plyr from this
# point on; calling library(dplyr) again does not move dplyr back in front.
# Confirm this ordering is intended. In the code below plyr is only
# referenced explicitly as plyr::rename.
require(plyr)
library(dplyr)

# Build a human-readable description for one variable name of the tidy data
# set.
#
# Args:
#   x: A single variable name (character or factor), for example "Activity",
#      "Subject" or "MeanOf.Time.Body.Acc.Mean.X".
#
# Returns:
#   A character string assembled from the recognised components of the name;
#   "" when nothing is recognised.
generateVariableDescription <- function(x) {
    x <- as.character(x)
    stopifnot(length(x) == 1L)

    # Case-insensitive test for a fragment of the variable name.
    has <- function(pattern) grepl(pattern, x, ignore.case = TRUE)

    desc <- ""

    if (x == "Activity") {
        desc <- paste0(desc, "The motion activity")
    }

    # The tidy data set names this column "Subject"; "Subject.Id" is still
    # accepted so existing callers keep working.
    if (x %in% c("Subject", "Subject.Id")) {
        desc <- paste0(desc, "The subject identifier")
    }

    if (has("\\.mean")) {
        desc <- paste0(desc, "The mean of ")
    }

    if (has("\\.std")) {
        desc <- paste0(desc, "The standard deviation of ")
    }

    # Signal component: magnitude takes precedence over a single axis.
    if (has("frequency\\.")) {
        desc <- paste0(desc, "a Fourier transformed ")
        if (has("mag\\.")) {
            desc <- paste0(desc, "resultant XYZ signal (Euclidean norm) ")
        } else if (has("\\.x")) {
            desc <- paste0(desc, "X signal ")
        } else if (has("\\.y")) {
            desc <- paste0(desc, "Y signal ")
        } else if (has("\\.z")) {
            desc <- paste0(desc, "Z signal ")
        }
    } else {
        if (has("mag\\.")) {
            desc <- paste0(desc, "the resultant XYZ ")
        } else if (has("\\.x")) {
            desc <- paste0(desc, "the X ")
        } else if (has("\\.y")) {
            desc <- paste0(desc, "the Y ")
        } else if (has("\\.z")) {
            desc <- paste0(desc, "the Z ")
        }
    }

    # Sensor and quantity: jerk versus the plain sensor reading.
    if (has("jerk\\.")) {
        if (has("acc\\.")) {
            desc <- paste0(desc, "jerk from an accelerometer signal ")
        }
        if (has("gyro\\.")) {
            desc <- paste0(desc, "jerk from a gyroscope signal ")
        }
    } else {
        if (has("acc\\.")) {
            desc <- paste0(desc, "acceleration from an accelerometer signal ")
        }
        if (has("gyro\\.")) {
            desc <- paste0(desc, "angular velocity from a gyroscope signal ")
        }
    }

    if (has("\\.body")) {
        desc <- paste0(desc, "related to body motion")
    }

    if (has("\\.gravity")) {
        desc <- paste0(desc, "related to gravity")
    }

    desc
}

# 1. Read the variable names from the tidy_data_set.txt file into a
#    one-column table named Variable
tidy_summary_variables <- read.table("tidy_data_set.txt", header = TRUE) %>%
    names() %>%
    data.frame() %>%
    tbl_df() %>%
    plyr::rename(c("." = "Variable"))

# 2. Generate a description for each distinct variable name
descriptions <- data.frame(
    tapply(tidy_summary_variables$Variable,
           tidy_summary_variables[1],
           generateVariableDescription))

# 3. Rename the descriptions column to Description
colnames(descriptions)[1] <- "Description"

# 4. Add the row names of the descriptions as the variable name
descriptions <- cbind(Variable = rownames(descriptions), descriptions)

# 5. Output the variable names and descriptions to file, then drop the
#    in-memory copy (the original call named a non-existent object,
#    `description`, so nothing was removed)
write.table(descriptions, "variable_descriptions.txt", row.names = FALSE)
rm(descriptions)

### Read Train Data

In [None]:
# Training measurements, with the feature names as column headers.
train_X <- read.table("./train/X_train.txt", sep = "", col.names = attributeNames)

# Training activity codes, one per line.
train_y <- read.table("./train/y_train.txt", sep = "\n")
names(train_y) <- "Activity"

# Map each numeric code to its label explicitly. Relabelling with
# as.factor() followed by levels<- assigns labels by position among the codes
# that happen to be present, which mislabels rows whenever a code is missing.
# Assumes code k corresponds to activityLabels[k] — the same assumption the
# positional relabelling made; the activities table cannot be used for the
# ids here because that name is reassigned earlier in this file.
train_y$Activity <- factor(
  train_y$Activity,
  levels = seq_along(activityLabels),
  labels = activityLabels
)

# Subject ids, stored as a factor.
trainSubjects <- read.table("./train/subject_train.txt", sep = "")
names(trainSubjects) <- "subject"
trainSubjects$subject <- as.factor(trainSubjects$subject)

# Combine features, subject and activity label; keep the labels separately.
train <- cbind(train_X, trainSubjects, train_y)
train_labels <- train$Activity

In [None]:
# Rows and columns of the combined training table.
dim(train)

### Read Test Data

In [None]:
# Test measurements, with the feature names as column headers.
test_X <- read.table("./test/X_test.txt", sep = "", col.names = attributeNames)

# Test activity codes, one per line.
test_y <- read.table("./test/y_test.txt", sep = "\n")
names(test_y) <- "Activity"

# Map each numeric code to its label explicitly. Relabelling with
# as.factor() followed by levels<- assigns labels by position among the codes
# that happen to be present, which mislabels rows whenever a code is missing.
# Assumes code k corresponds to activityLabels[k] — the same assumption the
# positional relabelling made.
test_y$Activity <- factor(
  test_y$Activity,
  levels = seq_along(activityLabels),
  labels = activityLabels
)

# Subject ids, stored as a factor.
testSubjects <- read.table("./test/subject_test.txt", sep = "")
names(testSubjects) <- "subject"
testSubjects$subject <- as.factor(testSubjects$subject)

# Combine features, subject and activity label.
test <- cbind(test_X, testSubjects, test_y)

In [None]:
# Rows and columns of the combined test table.
dim(test)

In [None]:
# Preview the first rows of the combined training table.
head(train)

### Dataset Exploration

In [None]:
# Per-column summary of the training table.
summary(train)

In [None]:
# Summary of the training subject column (converted to a factor when the
# training data was read).
summary(train$subject)

In [None]:
# Per-column summary of the test table.
summary(test)

In [None]:
# Summary of the test subject column (converted to a factor when the test
# data was read).
summary(test$subject)

In [None]:
# Names of the training columns stored as character and as numeric.
cat_var <- names(Filter(is.character, train))
num_var <- names(Filter(is.numeric, train))

In [None]:
# Report how many numeric variables there are. Pasting num_var itself
# repeated the message once per variable name instead of printing a count.
cat(paste0("Total number of numeric variables: ", length(num_var)))

In [None]:
library(ggplot2)

# Tag each table with its partition, then stack them into a single frame.
train[["Partition"]] <- "Train"
test[["Partition"]] <- "Test"
all <- rbind(train, test)
all[["Partition"]] <- as.factor(all[["Partition"]])

# Plot subject on the x axis, filled by partition.
qplot(x = subject, data = all, fill = Partition)

In [None]:
# Plot subject on the x axis, filled by activity.
qplot(data = all , x = subject, fill = Activity)

Unique number of features

### Dataset Subset

In [None]:
# Create a calibration and training set from the main training set
set.seed(42)
idx <- sample(c(TRUE, FALSE), nrow(train_X), replace = TRUE, prob = c(0.8, 0.2))

# Take both label subsets from the full, untouched label vector. Overwriting
# train_labels first and then indexing it with !idx (a mask as long as the
# full training set) produced misaligned, NA-padded calibration labels.
# Reading from train_y also keeps this cell safe to re-run.
full_labels <- train_y$Activity
stopifnot(length(full_labels) == nrow(train_X))

train <- subset(train_X, idx)
train_labels <- full_labels[idx]

cal <- subset(train_X, !idx)
cal_labels <- full_labels[!idx]

### Lasso Feature Selection

With over 500 features, it's important to reduce the feature set to avoid overfitting and to keep the model interpretable. The more features you have, the easier it is to build a model that fits the training data well but generalizes poorly to test data. To learn more about these problems, read up on the bias-variance trade-off and the curse of dimensionality.

Lasso selection works by shrinking the coefficients of certain features towards (and exactly to) 0. The higher the lambda, the more aggressively this happens. A sufficiently high lambda would set all feature coefficients to 0, meaning only the intercept will be used in the model (a simple mean, for example, in regression).

In [None]:
# Build the design matrix from every column of train, then drop its first
# column (the intercept column that model.matrix() adds for this formula).
x <- model.matrix(train_labels ~ ., data = train)
x <- x[, -1]

In [None]:
# Fit a multinomial glmnet model on the design matrix, using the "grouped"
# multinomial option.
fit <- glmnet(
  x = x,
  y = train_labels,
  family = "multinomial",
  type.multinomial = "grouped"
)