# Getting and Cleaning Data
## Course Project

### 1. The run_analysis.r script

In [1]:
# Import dplyr Package
library(dplyr) 


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [2]:
# URL of the zipped UCI HAR Dataset archive that the analysis downloads
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"

In [3]:
# Make sure the download directory exists before saving into it
if (!file.exists("data")) {
  dir.create("data")
}

## Download and unzip file
# One path is shared by the existence check, the download target and the
# unzip source, so the archive is fetched only when it is missing and the
# lookup does not depend on a case-insensitive file system.
zipPath <- "./data/Dataset.zip"
if (!file.exists(zipPath)) {
  # mode = "wb" writes the archive as binary so it is not corrupted on
  # platforms that would otherwise open the destination in text mode
  download.file(fileUrl, destfile = zipPath, mode = "wb")
}
# Extract every file from the archive into the working directory,
# overwriting anything left over from a previous run
unzip(zipPath, files = NULL, list = FALSE, overwrite = TRUE, exdir = ".")

In [4]:
# Read training data: feature measurements (X), activity ids (y) and
# subject ids. The label file is named "y_train.txt" (lowercase y) in the
# archive, which matters on case-sensitive file systems.
x_train <- read.table("./UCI HAR Dataset/train/X_train.txt")
y_train <- read.table("./UCI HAR Dataset/train/y_train.txt")
subject_train <- read.table("./UCI HAR Dataset/train/subject_train.txt")

In [5]:
# Read test data: feature measurements (X), activity ids (y) and
# subject ids. The label file is named "y_test.txt" (lowercase y) in the
# archive, which matters on case-sensitive file systems.
x_test <- read.table("./UCI HAR Dataset/test/X_test.txt")
y_test <- read.table("./UCI HAR Dataset/test/y_test.txt")
subject_test <- read.table("./UCI HAR Dataset/test/subject_test.txt")

In [6]:
# Load the feature list (index + name); as.is = TRUE keeps the names as
# plain character strings so they can be used as column names later
features <- read.table(
  file = "./UCI HAR Dataset/features.txt",
  as.is = TRUE
)
features

V1,V2
<int>,<chr>
1,tBodyAcc-mean()-X
2,tBodyAcc-mean()-Y
3,tBodyAcc-mean()-Z
4,tBodyAcc-std()-X
5,tBodyAcc-std()-Y
6,tBodyAcc-std()-Z
7,tBodyAcc-mad()-X
8,tBodyAcc-mad()-Y
9,tBodyAcc-mad()-Z
10,tBodyAcc-max()-X


In [8]:
# Load the activity lookup table and give its columns readable names
activity_labels <- read.table(file = "./UCI HAR Dataset/activity_labels.txt")
names(activity_labels) <- c("Id", "activityLabel")

### 2. Merge the training and test sets

In [9]:
# Stack the training rows on top of the test rows, separately for the
# subject ids, the measurements and the activity ids
subject_total <- rbind(subject_train, subject_test)
x_total <- rbind(x_train, x_test)
y_total <- rbind(y_train, y_test)

In [10]:
# Assemble one data frame: subject id first, then every feature column,
# then the activity id; feature columns take their names from features
MergedData <- cbind(subject_total, x_total, y_total)
names(MergedData) <- c("subject", features[[2]], "activity")
head(MergedData)

Unnamed: 0_level_0,subject,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,⋯,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",activity
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1,1,0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,-0.9951121,-0.9831846,-0.923527,⋯,-0.2986764,-0.7103041,-0.11275434,0.030400372,-0.4647614,-0.01844588,-0.8412468,0.1799406,-0.05862692,5
2,1,0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,-0.9988072,-0.9749144,-0.9576862,⋯,-0.5950509,-0.8614993,0.05347695,-0.007434566,-0.7326262,0.70351059,-0.8447876,0.1802889,-0.05431672,5
3,1,0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,-0.9965199,-0.9636684,-0.9774686,⋯,-0.3907482,-0.7601037,-0.11855926,0.17789948,0.1006992,0.80852908,-0.8489335,0.1806373,-0.04911782,5
4,1,0.2791739,-0.02620065,-0.1232826,-0.9960915,-0.9834027,-0.9906751,-0.9970995,-0.9827498,-0.9893025,⋯,-0.1172902,-0.4828445,-0.03678797,-0.012892494,0.640011,-0.48536645,-0.8486494,0.1819348,-0.04766318,5
5,1,0.2766288,-0.01656965,-0.1153619,-0.9981386,-0.9808173,-0.9904816,-0.9983211,-0.9796719,-0.9904411,⋯,-0.3514709,-0.6992052,0.12332005,0.12254196,0.6935783,-0.61597061,-0.8478653,0.1851512,-0.04389225,5
6,1,0.2771988,-0.01009785,-0.1051373,-0.997335,-0.9904868,-0.99542,-0.9976274,-0.9902177,-0.9955489,⋯,-0.5454101,-0.8446193,0.08263215,-0.14343901,0.2750408,-0.36822404,-0.8496316,0.1848225,-0.04212638,5


### 3. Extract the measurements on the mean and standard deviation for each measurement

In [11]:
# Keep only the id columns (subject, activity) and the feature columns
# whose name contains "mean" or "std"; the three logical masks are OR-ed
# into a single column selector
MergedData <- MergedData[
  ,
  grepl("subject|activity", colnames(MergedData)) |
    grepl("mean", colnames(MergedData)) |
    grepl("std", colnames(MergedData))
]
head(MergedData)

Unnamed: 0_level_0,subject,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tGravityAcc-mean()-X,tGravityAcc-mean()-Y,tGravityAcc-mean()-Z,⋯,fBodyBodyAccJerkMag-mean(),fBodyBodyAccJerkMag-std(),fBodyBodyAccJerkMag-meanFreq(),fBodyBodyGyroMag-mean(),fBodyBodyGyroMag-std(),fBodyBodyGyroMag-meanFreq(),fBodyBodyGyroJerkMag-mean(),fBodyBodyGyroJerkMag-std(),fBodyBodyGyroJerkMag-meanFreq(),activity
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1,1,0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,0.9633961,-0.1408397,0.11537494,⋯,-0.9937257,-0.993755,0.3469885,-0.9801349,-0.9613094,-0.1289889,-0.9919904,-0.9906975,-0.07432303,5
2,1,0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,0.9665611,-0.1415513,0.10937881,⋯,-0.9903355,-0.9919603,0.5320605,-0.9882956,-0.9833219,-0.2719585,-0.9958539,-0.9963995,0.15807454,5
3,1,0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,0.9668781,-0.1420098,0.10188392,⋯,-0.9892801,-0.9908667,0.660795,-0.9892548,-0.9860277,-0.2127279,-0.9950305,-0.9951274,0.41450281,5
4,1,0.2791739,-0.02620065,-0.1232826,-0.9960915,-0.9834027,-0.9906751,0.9676152,-0.1439765,0.09985014,⋯,-0.9927689,-0.9916998,0.6789213,-0.9894128,-0.9878358,-0.0356842,-0.9952207,-0.9952369,0.40457253,5
5,1,0.2766288,-0.01656965,-0.1153619,-0.9981386,-0.9808173,-0.9904816,0.9682244,-0.1487502,0.0944859,⋯,-0.9955228,-0.994389,0.5590577,-0.991433,-0.9890594,-0.273582,-0.9950928,-0.9954648,0.08775301,5
6,1,0.2771988,-0.01009785,-0.1051373,-0.997335,-0.9904868,-0.99542,0.9679482,-0.14821,0.09190972,⋯,-0.9947329,-0.9951562,0.2469096,-0.9905,-0.9858609,-0.2973291,-0.9951433,-0.9952387,0.01995331,5


### 4. Use descriptive activity names to name activities in the data set

In [12]:
# Turn the numeric activity codes into a factor whose labels are the
# activity names from the lookup table
MergedData$activity <- factor(
  x = MergedData$activity,
  levels = activity_labels[[1]],
  labels = activity_labels[[2]]
)

### 5. Appropriately label the data set with descriptive variable names

In [13]:
# Pull the current column names into a character vector so they can be
# edited before being written back
MergedDataCols <- names(MergedData)
View(MergedDataCols)

In [14]:
# Expand the abbreviated parts of each name, one substitution per step.
# The leading "f"/"t" prefixes are anchored with "^" so only the first
# character is affected.
MergedDataCols <- MergedDataCols %>%
  gsub("^f", "frequency", .) %>%
  gsub("^t", "time", .) %>%
  gsub("Acc", "Accelerometer", .) %>%
  gsub("Gyro", "Gyroscope", .) %>%
  gsub("Mag", "Magnitude", .) %>%
  gsub("Freq", "Frequency", .) %>%
  gsub("mean", "Mean", .) %>%
  gsub("std", "StandardDev", .)
View(MergedDataCols)

In [16]:
# Collapse the duplicated "BodyBody" found in some feature names
MergedDataCols <- gsub("BodyBody", "Body", MergedDataCols, fixed = TRUE)
# MergedData itself still carries the old names at this point; the cleaned
# names are only held in MergedDataCols
head(MergedData)

Unnamed: 0_level_0,subject,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tGravityAcc-mean()-X,tGravityAcc-mean()-Y,tGravityAcc-mean()-Z,⋯,fBodyBodyAccJerkMag-mean(),fBodyBodyAccJerkMag-std(),fBodyBodyAccJerkMag-meanFreq(),fBodyBodyGyroMag-mean(),fBodyBodyGyroMag-std(),fBodyBodyGyroMag-meanFreq(),fBodyBodyGyroJerkMag-mean(),fBodyBodyGyroJerkMag-std(),fBodyBodyGyroJerkMag-meanFreq(),activity
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
1,1,0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,0.9633961,-0.1408397,0.11537494,⋯,-0.9937257,-0.993755,0.3469885,-0.9801349,-0.9613094,-0.1289889,-0.9919904,-0.9906975,-0.07432303,STANDING
2,1,0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,0.9665611,-0.1415513,0.10937881,⋯,-0.9903355,-0.9919603,0.5320605,-0.9882956,-0.9833219,-0.2719585,-0.9958539,-0.9963995,0.15807454,STANDING
3,1,0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,0.9668781,-0.1420098,0.10188392,⋯,-0.9892801,-0.9908667,0.660795,-0.9892548,-0.9860277,-0.2127279,-0.9950305,-0.9951274,0.41450281,STANDING
4,1,0.2791739,-0.02620065,-0.1232826,-0.9960915,-0.9834027,-0.9906751,0.9676152,-0.1439765,0.09985014,⋯,-0.9927689,-0.9916998,0.6789213,-0.9894128,-0.9878358,-0.0356842,-0.9952207,-0.9952369,0.40457253,STANDING
5,1,0.2766288,-0.01656965,-0.1153619,-0.9981386,-0.9808173,-0.9904816,0.9682244,-0.1487502,0.0944859,⋯,-0.9955228,-0.994389,0.5590577,-0.991433,-0.9890594,-0.273582,-0.9950928,-0.9954648,0.08775301,STANDING
6,1,0.2771988,-0.01009785,-0.1051373,-0.997335,-0.9904868,-0.99542,0.9679482,-0.14821,0.09190972,⋯,-0.9947329,-0.9951562,0.2469096,-0.9905,-0.9858609,-0.2973291,-0.9951433,-0.9952387,0.01995331,STANDING


In [17]:
# Write the cleaned-up names back onto the data frame
names(MergedData) <- MergedDataCols
head(MergedData)

Unnamed: 0_level_0,subject,timeBodyAccelerometer-Mean()-X,timeBodyAccelerometer-Mean()-Y,timeBodyAccelerometer-Mean()-Z,timeBodyAccelerometer-StandardDev()-X,timeBodyAccelerometer-StandardDev()-Y,timeBodyAccelerometer-StandardDev()-Z,timeGravityAccelerometer-Mean()-X,timeGravityAccelerometer-Mean()-Y,timeGravityAccelerometer-Mean()-Z,⋯,frequencyBodyAccelerometerJerkMagnitude-Mean(),frequencyBodyAccelerometerJerkMagnitude-StandardDev(),frequencyBodyAccelerometerJerkMagnitude-MeanFrequency(),frequencyBodyGyroscopeMagnitude-Mean(),frequencyBodyGyroscopeMagnitude-StandardDev(),frequencyBodyGyroscopeMagnitude-MeanFrequency(),frequencyBodyGyroscopeJerkMagnitude-Mean(),frequencyBodyGyroscopeJerkMagnitude-StandardDev(),frequencyBodyGyroscopeJerkMagnitude-MeanFrequency(),activity
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
1,1,0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,0.9633961,-0.1408397,0.11537494,⋯,-0.9937257,-0.993755,0.3469885,-0.9801349,-0.9613094,-0.1289889,-0.9919904,-0.9906975,-0.07432303,STANDING
2,1,0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,0.9665611,-0.1415513,0.10937881,⋯,-0.9903355,-0.9919603,0.5320605,-0.9882956,-0.9833219,-0.2719585,-0.9958539,-0.9963995,0.15807454,STANDING
3,1,0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,0.9668781,-0.1420098,0.10188392,⋯,-0.9892801,-0.9908667,0.660795,-0.9892548,-0.9860277,-0.2127279,-0.9950305,-0.9951274,0.41450281,STANDING
4,1,0.2791739,-0.02620065,-0.1232826,-0.9960915,-0.9834027,-0.9906751,0.9676152,-0.1439765,0.09985014,⋯,-0.9927689,-0.9916998,0.6789213,-0.9894128,-0.9878358,-0.0356842,-0.9952207,-0.9952369,0.40457253,STANDING
5,1,0.2766288,-0.01656965,-0.1153619,-0.9981386,-0.9808173,-0.9904816,0.9682244,-0.1487502,0.0944859,⋯,-0.9955228,-0.994389,0.5590577,-0.991433,-0.9890594,-0.273582,-0.9950928,-0.9954648,0.08775301,STANDING
6,1,0.2771988,-0.01009785,-0.1051373,-0.997335,-0.9904868,-0.99542,0.9679482,-0.14821,0.09190972,⋯,-0.9947329,-0.9951562,0.2469096,-0.9905,-0.9858609,-0.2973291,-0.9951433,-0.9952387,0.01995331,STANDING


In [22]:
# Average every measurement for each subject/activity pair.
# across() replaces the deprecated summarize_each(); the named list keeps
# the "_mean" suffix on the output columns, and .groups = "drop" returns an
# ungrouped result instead of one that is still grouped by subject.
ActivityMeans <- MergedData %>%
  group_by(subject, activity) %>%
  summarise(across(everything(), list(mean = mean)), .groups = "drop")
head(ActivityMeans)

subject,activity,timeBodyAccelerometer-Mean()-X_mean,timeBodyAccelerometer-Mean()-Y_mean,timeBodyAccelerometer-Mean()-Z_mean,timeBodyAccelerometer-StandardDev()-X_mean,timeBodyAccelerometer-StandardDev()-Y_mean,timeBodyAccelerometer-StandardDev()-Z_mean,timeGravityAccelerometer-Mean()-X_mean,timeGravityAccelerometer-Mean()-Y_mean,⋯,frequencyBodyAccelerometerMagnitude-MeanFrequency()_mean,frequencyBodyAccelerometerJerkMagnitude-Mean()_mean,frequencyBodyAccelerometerJerkMagnitude-StandardDev()_mean,frequencyBodyAccelerometerJerkMagnitude-MeanFrequency()_mean,frequencyBodyGyroscopeMagnitude-Mean()_mean,frequencyBodyGyroscopeMagnitude-StandardDev()_mean,frequencyBodyGyroscopeMagnitude-MeanFrequency()_mean,frequencyBodyGyroscopeJerkMagnitude-Mean()_mean,frequencyBodyGyroscopeJerkMagnitude-StandardDev()_mean,frequencyBodyGyroscopeJerkMagnitude-MeanFrequency()_mean
<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,WALKING,0.2773308,-0.017383819,-0.1111481,-0.28374026,0.114461337,-0.2600279,0.9352232,-0.282165,⋯,0.19064372,-0.0571194,-0.1034924,0.09382218,-0.1992526,-0.321018,0.2688443675,-0.3193086,-0.3816019,0.1906634
1,WALKING_UPSTAIRS,0.2554617,-0.023953149,-0.097302,-0.35470803,-0.002320265,-0.01947924,0.8933511,-0.3621534,⋯,-0.09774335,-0.44265216,-0.5330599,0.08535241,-0.3259615,-0.1829855,-0.2193033761,-0.6346651,-0.6939305,0.1142773
1,WALKING_DOWNSTAIRS,0.2891883,-0.009918505,-0.1075662,0.03003534,-0.031935943,-0.23043421,0.9318744,-0.2666103,⋯,0.11918714,0.02621849,-0.1040523,0.07649155,-0.1857203,-0.3983504,0.3496138955,-0.2819634,-0.3919199,0.1900007
1,SITTING,0.2612376,-0.001308288,-0.1045442,-0.97722901,-0.922618642,-0.93958629,0.8315099,0.2044116,⋯,0.23665501,-0.98526213,-0.9816062,0.3518522,-0.9584356,-0.9321984,-0.0002621867,-0.9897975,-0.9870496,0.1847759
1,STANDING,0.2789176,-0.01613759,-0.1106018,-0.9957599,-0.973190056,-0.97977588,0.942952,-0.2729838,⋯,0.28455529,-0.99254248,-0.992536,0.4222201,-0.9846176,-0.9784661,-0.0286057725,-0.9948154,-0.9946711,0.3344987
1,LAYING,0.2215982,-0.040513953,-0.1132036,-0.92805647,-0.836827406,-0.8260614,-0.2488818,0.7055498,⋯,0.08640856,-0.93330036,-0.921804,0.26639115,-0.8621902,-0.8243194,-0.1397750127,-0.9423669,-0.9326607,0.1764859


In [19]:
# Save the per-subject, per-activity averages as plain text, without
# quoting strings and without a row-name column
write.table(
  x = ActivityMeans,
  file = "tidy_data.txt",
  quote = FALSE,
  row.names = FALSE
)