In [2]:
# First, install and load the packages required for analyzing the dataset
install.packages("tidyverse")
install.packages("janitor")
install.packages("lubridate")
library(tidyverse)
library(janitor)
library(lubridate)

In [6]:
# Import the CSV tables we are going to work with.
# Each call reads one file from the Kaggle input folder into a base data.frame.

daily_activity <- read.csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv"
)
daily_sleep <- read.csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv"
)
hourly_intensities <- read.csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/hourlyIntensities_merged.csv"
)
heartrate_seconds <- read.csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/heartrate_seconds_merged.csv"
)
weight_log <- read.csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/weightLogInfo_merged.csv"
)



### All this data is provided by the company itself so we are considering it as good data.

### Now our first task is to prepare the data: check whether it is sufficient, and review its other aspects.

In [7]:
# 1. This is second-party data: it came from Fitbit and was acquired by the
#    Bellabeat company.
# Show the first rows of every imported table.
daily_activity %>% head()
daily_sleep %>% head()
hourly_intensities %>% head()
heartrate_seconds %>% head()
weight_log %>% head()


In [8]:
# Take a glimpse at each table (column names, types and first values).
daily_activity %>% glimpse()
daily_sleep %>% glimpse()
hourly_intensities %>% glimpse()
heartrate_seconds %>% glimpse()
weight_log %>% glimpse()

### All datasets are structured, continuous and in long data form.
## Now we will prepare the datasets as desired, and then proceed to the further steps

In [9]:
# Prepare the daily_activity table.
head(daily_activity)

# Step 1: normalise the column names with janitor::clean_names().
activities <- clean_names(daily_activity)

# Step 2: parse the month/day/year strings into dates and derive the weekday
# name from the parsed date.
activities <- mutate(
  activities,
  activity_date = mdy(activity_date),
  day_week = weekdays(activity_date)
)

# Step 3: use a shorter name for the date column.
activities <- rename(activities, date = activity_date)


# Rearrange the columns by position for easier access: columns 15 and 16 are
# moved right after the first two, followed by columns 3 to 14.
# NOTE(review): positional indexing assumes the table has exactly 16 columns
# at this point - confirm against the CSV schema.

activities <- activities[, c(1:2, 15:16, 3:14)]
head(activities)


In [10]:
# Prepare the daily_sleep table.

head(daily_sleep)

sleep <- daily_sleep %>%
  # Split SleepDay on spaces and keep the first two pieces as `date` and
  # `time`. `extra = "drop"` states explicitly that any further piece
  # (presumably an AM/PM marker - confirm against the CSV) is discarded,
  # instead of relying on the default that discards it with a warning.
  separate(
    col = SleepDay,
    into = c("date", "time"),
    sep = " ",
    extra = "drop"
  ) %>%
  # Parse the month/day/year string and derive the weekday name from it.
  mutate(date = mdy(date), day_week = weekdays(date)) %>%
  # Time spent in bed without being asleep.
  mutate(time_awaken_in_bed = TotalTimeInBed - TotalMinutesAsleep) %>%
  clean_names()
# Rearrange the columns by position; the columns at positions 3 and 4 are
# not selected and are therefore dropped.
sleep <- sleep[, c(1, 2, 7, 5, 6, 8)]
head(sleep)

In [37]:
# Prepare the hourly_intensities table.
head(hourly_intensities)

# Break ActivityHour on spaces into three text columns: date, time, meridiem.
hourly <- separate(
  hourly_intensities,
  col = ActivityHour,
  into = c("date", "time", "meridiem"),
  sep = " "
)
# Turn the month/day/year string into a proper date.
hourly <- mutate(hourly, date = mdy(date))
head(hourly)

## Now Let's process the data to analyze it in further steps.

In [12]:
# Count fully duplicated rows in every table.
# duplicated() flags each row that repeats an earlier row; sum() counts them.

sum(duplicated(activities))

sum(duplicated(sleep))

sum(duplicated(hourly))

sum(duplicated(heartrate_seconds))

sum(duplicated(weight_log))

#### There are no Duplicates in activities table
#### There are three duplicates in sleep 
#### There are 1818 duplicates in hourly table
#### There are no Duplicates in heart-rate seconds table
#### There are no Duplicates in weight_log table

### Now we check the duplicates and get rid of them

In [13]:
# Show the duplicated rows of `sleep`, and the first few duplicated rows of
# `hourly`.
subset(sleep, duplicated(sleep))
head(subset(hourly, duplicated(hourly)))

In [39]:
# Remove duplicates: keep one copy of every distinct row.

sleep_v1 <- distinct(sleep)

hourly_v1 <- distinct(hourly)

# Verify that no duplicated rows remain in the new versions.
sum(duplicated(sleep_v1))

sum(duplicated(hourly_v1))

#### We have no duplicates in new versions of table

## Now take out insights from the data based on usage trends of the device

In [63]:
# Find trends in the usage of the band: how many distinct users supplied data
# for each tracked feature.
activities_users <- unique(activities$id)
# Use the de-duplicated sleep table (sleep_v1). The per-user summary
# `sleep_v2` is only created in a later cell, so referencing it here fails
# when the notebook is run from top to bottom.
sleep_users <- unique(sleep_v1$id)
# These two tables were not passed through clean_names(), hence the
# capitalised `Id` column.
heartrate_users <- unique(heartrate_seconds$Id)
Weight_users <- unique(weight_log$Id)

print(Weight_users)

length(activities_users)
length(sleep_users)
length(heartrate_users)
length(Weight_users)

# One row per feature with the number of distinct users who used it.
usage <- c(
  "activities_tracker", "sleep_monitor",
  "heartrate_monitor", "weight_track"
)
no_of_users <- c(
  length(activities_users), length(sleep_users),
  length(heartrate_users), length(Weight_users)
)

users_df <- data.frame(usage, no_of_users)
# Size of the rendered plot in the notebook output.
options(repr.plot.width = 12, repr.plot.height = 8)
ggplot(data = users_df) +
  geom_col(mapping = aes(x = usage, y = no_of_users, fill = usage)) +
  theme(
    axis.text.x = element_text(angle = 30),
    text = element_text(size = 20)
  )


In [65]:
# Usage trend across the week: mean of total_steps for each day of the week.
# The weekday order is defined once and reused for sorting and for the x axis.
# NOTE(review): day_week comes from weekdays(), which follows the session
# locale; the English names below assume an English locale - confirm.
level_order <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)
Weekly_usage <- activities %>%
  group_by(day_week) %>%
  # Plain summarise() replaces the deprecated summarise_at(..., funs(...)).
  summarise(total_steps = mean(total_steps, na.rm = TRUE)) %>%
  arrange(factor(day_week, levels = level_order))
View(Weekly_usage)
# Size of the rendered plot in the notebook output.
options(repr.plot.width = 12, repr.plot.height = 8)
ggplot(data = Weekly_usage) +
  geom_col(
    mapping = aes(
      # `levels` is spelled out instead of relying on partial matching.
      x = factor(day_week, levels = level_order),
      y = total_steps,
      fill = day_week
    )
  ) +
  theme(
    axis.text.x = element_text(angle = 45),
    text = element_text(size = 20)
  ) +
  # The bars show a mean, so the title and axis labels say so.
  labs(
    title = "Average daily steps of users by day of week",
    x = "Day of week",
    y = "Average total steps",
    caption = "Data is from Kaggle fitbit dataset"
  )

### By this visualization we can see that people are less active on Sunday which is considered as weekend.

In [73]:
# Are people getting enough sleep? Compare each user's average time in bed
# and average time asleep with the recommended amount of sleep.
head(sleep_v1)
# Per-user means. Plain summarise() replaces the deprecated
# summarise_at(..., funs(...)); the former argument-less arrange() was a
# no-op and has been removed.
sleep_v2 <- sleep_v1 %>%
  group_by(id) %>%
  summarise(
    total_minutes_asleep = mean(total_minutes_asleep, na.rm = TRUE),
    total_time_in_bed = mean(total_time_in_bed, na.rm = TRUE)
  )
View(sleep_v2)
# Size of the rendered plot in the notebook output.
options(repr.plot.width = 12, repr.plot.height = 6)
ggplot(data = sleep_v2) +
  geom_line(
    mapping = aes(x = id, y = total_minutes_asleep),
    color = "blue", size = 1
  ) +
  geom_line(
    mapping = aes(x = id, y = total_time_in_bed),
    color = "green", size = 1
  ) +
  # Reference line at 420 (7 hours when the values are minutes), used here as
  # the recommended sleep threshold. geom_hline spans the whole panel, so no
  # hard-coded maximum user id is needed and the line is drawn only once.
  geom_hline(yintercept = 420, color = "red", size = 1) +
  theme(text = element_text(size = 15)) +
  labs(
    title = "Sleeping pattern of users",
    x = "User Ids",
    y = "Total time",
    caption = "Data is from Kaggle fitbit dataset"
  )
  

### In this visualization we can see the bedtime is more than actual sleep for every user.
## Now find some insights from hourly_intensities data

In [77]:
# Now find some insights from the hourly_intensities data.
glimpse(hourly_v1)

# Convert the clock-time strings to POSIXct so they are plotted on a time axis.
# NOTE(review): the strings carry no date, so as.POSIXct() fills in the
# current date. They are parsed with %H even though an AM/PM marker was split
# into a separate column - confirm this is intended.
hourly_v1$time <- as.POSIXct(hourly_v1$time, format = "%H:%M:%S")

# NOTE(review): arrange() without arguments does not reorder anything, and
# the grouping is not followed by a summary - confirm whether an aggregation
# was intended here.
hourly_v2 <- arrange(group_by(hourly_v1, time))
head(hourly_v2)

# Size of the rendered plot in the notebook output.
options(repr.plot.width = 12, repr.plot.height = 8)

# Bars of AverageIntensity over time, one panel per meridiem value.
ggplot(hourly_v2, aes(x = time, y = AverageIntensity)) +
  geom_col() +
  theme(text = element_text(size = 18)) +
  labs(
    title = "Hourly activities of users",
    x = "Time of activities",
    caption = "Data is from Kaggle fitbit dataset"
  ) +
  facet_wrap(~meridiem)
