In [2]:
#loaded libraries and installed the "janitor" package for further use. 

install.packages("janitor")
library(janitor)
library(tidyverse)
library(repr)
library(readxl)
library(dplyr)
library(RColorBrewer)
display.brewer.all()
options(repr.matrix.max.rows = 6)

“unable to access index for repository https://cran.r-project.org/src/contrib:
  cannot open URL 'https://cran.r-project.org/src/contrib/PACKAGES'”
“package ‘janitor’ is not available for this version of R

A version of this package for your version of R might be available elsewhere,
see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages”


ERROR: Error in library(janitor): there is no package called ‘janitor’


In [3]:
# Retrived Sleep Health and Lifestyle Dataset from Kaggle --> downloaded the data on GitHub and using the GitHub pathway to extract the URL and read the data in Jupyter.

url <- "https://raw.githubusercontent.com/ptank1/dsci-100-2023w1-group-30/main/Sleep_health_and_lifestyle_dataset.csv"
sleep_data <- read_csv(url)
head(sleep_data)

sleep_data



ERROR: Error in read_csv(url): could not find function "read_csv"


In [4]:
# Added underscore (_) to replace the spaces within the column names. This will ensure that the tidyverse functions are being used properly. 

sleep_data <- sleep_data |> clean_names()

ERROR: Error in clean_names(sleep_data): could not find function "clean_names"


In [None]:
# We will be primarily focusing on factors like sleep_duration, physical_activity_level (and gender --> which will be the secondary exploratory analysis), and how these factors influence the chnages in quality_of_sleep in the subjects. 
# Using the select function, we will select the columns that we are primarily interested in (person_id, gender, sleep_duration, quality_of_sleep, physical_activity). 
# Created a new object: sleep_data_select

sleep_data_select <- sleep_data |>
select(person_id, gender, sleep_duration, quality_of_sleep, physical_activity_level) 



sleep_data_select

In [None]:
# Further functions will be correlated to tidying up (mutatung and arranging) the data in a way where the quality_of_sleep column can be used as a character variable instead of a numerical variable, and arranging the data in an ascending order based on the quality_of_sleep.
# Created a new object: sleep_data_tidy


sleep_data_tidy <- sleep_data_select |>
mutate(quality_of_sleep = as.character(quality_of_sleep)) |>
arrange(quality_of_sleep)

sleep_data_tidy

In [None]:
# glimpse() function lets is view the portion of the data with respect to each variable within the single line.  

glimpse(sleep_data_tidy)

In [None]:
# distinct() function lets us view the unique attributes that are present in a column.  
# We want to see what unique values are present in the quality_of_sleep column.

sleep_data_tidy |> distinct(quality_of_sleep)

In [None]:
# Using the mutate as_factor() function lets us convert the selected categorical variable (quality_of_sleep) into the factors.
# The factors can be further used to modify the numerical categories into the different type of categories like poor, good, and excellent using the fct_recode() function.
# We want to modify the numerical ranges into a poor, good, and excellent categories within the quality_of_sleep column as follows: (4 and 5) = "Poor", (6 and 7) = "Good", and (8 and 9) = "Excellent".
# Object name will remain the same: sleep_data_tid

sleep_data_tidy <- sleep_data_tidy |>

  mutate(quality_of_sleep = as_factor(quality_of_sleep)) |>
  mutate(quality_of_sleep = fct_recode(quality_of_sleep, "Poor" = "4", "Poor" = "5", "Good" = "6", "Good" = "7", "Excellent" = "8", "Excellent" = "9"))



sleep_data_tidy

In [None]:
# We want to use the distinct() function again to see whether the numerical categorical values have changed into poor, good, and excellent within the quality_of_sleep. 

sleep_data_tidy |> distinct(quality_of_sleep)


In [5]:
# Now we will examine the sleep_data_tidy dataset and use following functions like n, group_by(), and summarize() function to explore stats of the observations we have in the quality_of_sleep column.
# We will use the goroup_by() function to select quality_of_sleep column, and the n() function within summzarize will allow us to examine the counts of observations within each category (poor, good, and excellent). 
# Ultimately, we will have a table with the counts of observations within poor, good, and excellent categories in quality_of_sleep column, along with the percentage of observations, and mean/average of the observations.


num_obs <- nrow(sleep_data_tidy)
sleep_data_tidy|>
  group_by(quality_of_sleep) |>
  summarize(
    count = n(),
    percentage = n() / num_obs * 100, mean = n())

num_obs

ERROR: Error in nrow(sleep_data_tidy): object 'sleep_data_tidy' not found


In [None]:
# Finally, we will create a visualization, a scatter plot (using geom_point), to explore the relationship of physical_activity_level versus sleep_duration with respect to the quality_of_sleep data.
# Created a new object: sleep_duration_plot

sleep_duration_plot <- sleep_data_tidy |>
  ggplot(aes(x = sleep_duration, y = physical_activity_level, color = quality_of_sleep)) +
  geom_point(alpha = 0.6) +
  labs(x = "Sleep Duration (hours)",
       y = "Physical Activity Level (minutes/day)",
       color = "Quality of Sleep",
      caption = "Scatterplot examining the relationship between Physical Activity Level (minutes/day) over Sleep Duration (hours) with respect to the Quality of Sleep categories. There is a positive upward trend in distribution, where the quality of sleep is excellent as the sleep duration and physical activtiy levels increases. The quality of sleep mostly remains poor in the bottom of the graph when the physical activity levels and the hours of sleep duration are low.") +
  ggtitle("Distribution of Physical Activity Level (minutes/day) versus Sleep Duration (hours) with respect to the Quality of Sleep") +
  theme(text = element_text(size = 12))
       
sleep_duration_plot

In [None]:
# At last, we will create an explarotary visualization, a bar graph (using geom_bar), to explore how the quality of sleep is portrayed within different gender groups (male or female).
# This is just a curious, secondary explrotary analysis.
# Created a new object: gender_plot

gender_plot <- sleep_data_tidy |>
  ggplot(aes(x = gender, fill = quality_of_sleep)) +
  geom_bar(position = 'fill') +
  labs(x = "Gender",
       y = "Proportion of population with varied sleep quality",
       fill = "Quality of Sleep",
      caption = "Bar plot examining the proportional distribution of the sleep quality categories among male and female genders. A greater proportion of female subjects experience excellent quality of sleep compared to the male population.") +
ggtitle("Proportional distribution of Male and Female Subjects with respect to their Quality of Sleep pattern")
  theme(text = element_text(size = 12))
       
gender_plot