# ExperienceSampler/Data-Organization-R-Option/Splice, Prepare, and Clean Google Data.R
####Load All Necessary Libraries#### | |
library(tools) | |
#install.packages("tidyr"); install.packages("stringr"); install.packages("plyr"); | |
library("tidyr") | |
library("stringr") | |
library("plyr") | |
###Get list of files to process
# `pattern` is a regular expression, not a shell glob: "\\.csv$" matches only
# file names that end in a literal ".csv" (the old "*.csv" also matched names
# such as "mycsvnotes.txt").
rawfiles <- list.files(path = "", pattern = "\\.csv$") # this is the directory to your folder of raw Google data
####FUNCTION TO CLEAN AND SPLICE THE DATA####
###Google data is comma-separated values###
# clean.data: clean and splice every raw Google data file named in `fileList`.
#
# For each file the function
#   1. reads the CSV without a header and names its two columns
#      "variable.string" and "values",
#   2. drops exact duplicate rows,
#   3. splits the variable string on "_" into at most 9 pieces: unique key,
#      variable name, year, month, date, hour, minute, second, millisecond,
#   4. left-pads the timestamp pieces with 0s and joins them into a single
#      "timestamp" column (pieces separated by "_"),
#   5. removes rows whose unique key is one of the non-data keys listed below,
#   6. sorts the remaining rows by timestamp and then by unique key, and
#   7. writes the result to "<pid>.csv", where pid is the input file name
#      without its extension.
#
# Args:
#   fileList: character vector of raw data file names, resolved against the
#             raw-data directory set with setwd() below.
#
# Returns:
#   NULL, invisibly. The function is called for its side effects (it changes
#   the working directory and writes one CSV per input file).
clean.data <- function (fileList){
  # Unique keys that mark non-data rows rather than real data
  non.data.keys <- c("pause", "notification",
                     "weekdayWakeHour", "weekdayWakeMinute",
                     "weekendWakeHour", "weekendWakeMinute",
                     "weekdayDinnerHour", "weekdayDinnerMinute",
                     "weekendDinnerHour", "weekendDinnerMinute",
                     "snoozed", "uniqueKey",
                     "notification1", "notification2",
                     "notification3", "notification4",
                     "notification5", "notification6",
                     "notification7", "notification8",
                     "")
  # seq_along() iterates zero times for an empty fileList (1:length() would not)
  for (i in seq_along(fileList)){
    # NOTE(review): the working directory is changed twice per iteration, so
    # both directories should be absolute paths.
    setwd("") # this is the directory to your folder of raw Google data
    data <- read.csv(fileList[i], header = FALSE)
    pid <- file_path_sans_ext(fileList[i])
    colnames(data) <- c("variable.string", "values")
    ####Remove Duplicate Rows from Dataset####
    #Sometimes ExperienceSampler will write the same data more than once
    #because the participant will try to send the data more than once.
    data.clean <- unique(data)
    row.names(data.clean) <- NULL
    ####Splice the Data####
    # str_split_fixed() always returns exactly 9 columns and pads short rows
    # with "" instead of recycling their pieces.
    spliced.key <- str_split_fixed(as.character(data.clean$variable.string), "_", n = 9)
    spliced.variable.key.merged <- data.frame(spliced.key, data.clean$values,
                                              stringsAsFactors = FALSE)
    colnames(spliced.variable.key.merged) <- c("unique.key", "variable.name", "year", "month", "date", "hour", "minute", "second", "millisecond", "values")
    ####Convert Timestamps####
    #Timestamps must have leading 0s so they are sorted correctly.
    ###Add Leading 0s to Timestamps###
    #months, dates, hours, minutes, and seconds are padded to 2 characters
    for (part in c("month", "date", "hour", "minute", "second")){
      spliced.variable.key.merged[[part]] <- str_pad(spliced.variable.key.merged[[part]], 2, side = "left", pad = "0")
    }
    #milliseconds are padded to 3 characters
    spliced.variable.key.merged$millisecond <- str_pad(spliced.variable.key.merged$millisecond, 3, side = "left", pad = "0")
    #merge columns together for updated date column
    spliced.data <- unite(spliced.variable.key.merged, "timestamp", year, month, date, hour, minute, second, millisecond, sep = "_", remove = TRUE)
    colnames(spliced.data) <- c("unique.key", "variable.name", "timestamp", "values")
    ####Remove All Non-Data Lines####
    #Remove any line of data that does not have real data
    # %in% never returns NA, so a missing key cannot produce an all-NA row
    remove.non.data <- spliced.data[!(spliced.data$unique.key %in% non.data.keys), ]
    row.names(remove.non.data) <- NULL
    ####Sort Data By Timestamp and then Unique Key####
    # NOTE(review): an earlier comment described this as "unique key and then
    # timestamp"; the code has always ordered by timestamp first. Confirm
    # which order is intended.
    sorted.data <- remove.non.data[order(remove.non.data$timestamp, remove.non.data$unique.key), ]
    row.names(sorted.data) <- NULL
    ####Write Spliced Data to File####
    ###Write this data to a separate folder so you can tell the spliced data apart from the raw data
    ###We recommend creating a new folder that includes "Spliced Data Files" in its name
    setwd("") # this is the directory to your folder of spliced Google data
    write.csv(sorted.data, paste0(pid, ".csv"), row.names = FALSE)
  }
  invisible(NULL)
}
####CLEAN AND SPLICE THE DATA####
clean.data(rawfiles)
####Make Merged File of ALL Spliced Data####
setwd("") # this is the directory to your folder of spliced Google data
# `pattern` is a regular expression, not a shell glob: "\\.csv$" matches only
# file names that end in a literal ".csv".
spliced.files <- list.files(pattern = "\\.csv$", full.names = FALSE)
# Read each spliced file, keep its first four columns, and tag every row with
# an id taken from the file name (without extension).
all.spliced.data <- lapply(spliced.files, function(.file){
  dat <- read.csv(.file, header = TRUE)
  dat <- dat[, 1:4]
  # NOTE(review): as.numeric() yields NA (with a warning) for file names that
  # are not purely numeric - confirm participant ids are always numbers.
  dat$id <- as.numeric(file_path_sans_ext(.file))
  dat # return the dataframe
})
# Stack the per-file data frames into one data frame
spliced.dataframe <- do.call(rbind, all.spliced.data)
setwd("") #We recommend writing your merged spliced data file in a different folder than your spliced data folder
##If you write your merged spliced data file to your spliced data folder and you have to rebuild your files, you will have duplicate entries
##because the original merged spliced data file will be merged into your new merged spliced data file
write.csv(spliced.dataframe, "all.spliced.data.csv", row.names = FALSE)