In [None]:
library(dplyr)
# ggplot2 has to be attached here: theme_set() and theme_minimal() below are
# ggplot2 functions and would not be found with only dplyr loaded.
library(ggplot2)

# Suppress warnings for the rest of the session.
# NOTE(review): this is global and also hides genuine problems raised by
# later cells; consider scoping it more narrowly.
options(warn = -1)

# Display options: number of significant digits printed and console width
options(digits = 2)
options(width = 120)

# Make theme_minimal with an 18-point base font the default ggplot2 theme
theme_set(theme_minimal(base_size = 18))

# List every file found (recursively) under the Spotify data directory,
# which is resolved relative to the current working directory
input_dir <- file.path(getwd(), "..", "..", "data-files", "spotify_data")
file_list <- list.files(input_dir, recursive = TRUE, full.names = TRUE)
print(file_list)

In [None]:
library(readr)

# Build the path to data.csv relative to the current working directory,
# then parse it with readr
data_file_path <- file.path(
  getwd(), "..", "..", "data-files", "spotify_data", "data.csv"
)
df <- read_csv(file = data_file_path)

# Preview the top of the table as a quick sanity check
head(df)

In [74]:
# Lookup tables keyed by the numeric code rendered as a string:
# key code -> note name, mode code -> "Major"/"Minor"
map_key <- setNames(
  c("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"),
  as.character(0:11)
)
map_mode <- setNames(c("Major", "Minor"), c("1", "0"))

# Recode both columns by indexing the lookup tables with the stringified
# codes; a code missing from a table yields NA
df[["mode"]] <- map_mode[as.character(df[["mode"]])]
df[["key"]] <- map_key[as.character(df[["key"]])]

In [75]:
# A track counts as a duplicate when both its artists and its name repeat
check_dups <- df[, c("artists", "name"), drop = FALSE]

# Row positions of every repeat after the first occurrence
is_repeat <- duplicated(check_dups)
dups <- which(is_repeat)

In [None]:
# Print the row count before dropping duplicates
cat("Before dropping duplicates:", nrow(df), "rows\n")

# Keep the first occurrence of each (artists, name) pair.
# A logical mask is used instead of negative indices because `df[-idx, ]`
# returns zero rows when `idx` is empty. The mask is also computed from the
# current `df`, so re-running this cell is a no-op rather than removing
# unrelated rows through stale indices.
df <- df[!duplicated(df[, c("artists", "name")]), ]

# Print the row count after dropping duplicates
cat("After dropping duplicates:", nrow(df), "rows\n")

In [None]:
# Rescale the duration column from milliseconds to minutes (60000 ms each)
df[["duration_ms"]] <- df[["duration_ms"]] / 60000

# Give the column a name matching its new unit, keeping its position
names(df)[names(df) == "duration_ms"] <- "duration_min"

# Preview the result
head(df)

In [None]:
# Remove "release_date" and "id" by keeping every other column; names that
# are already absent are simply ignored
df <- df[setdiff(names(df), c("release_date", "id"))]

# Preview the remaining columns
head(df)

In [79]:
library(ggplot2)
library(gridExtra)

#' Draw a histogram and a boxplot of one numeric column side by side.
#'
#' The histogram uses 50 equal-width bins spanning the column's range and is
#' overlaid with dashed vertical lines (plus text labels) at the mean and the
#' median. The two panels are laid out and drawn with gridExtra::grid.arrange.
#'
#' @param df A data frame containing the column to plot.
#' @param col Name of a numeric column of `df`, as a single string.
#' @return The arranged grob returned by grid.arrange, invisibly.
fun_subplots_ggplot <- function(df, col) {
    stopifnot(is.character(col), length(col) == 1L, col %in% names(df))
    values <- df[[col]]
    stopifnot(is.numeric(values))

    mean_val <- mean(values, na.rm = TRUE)
    median_val <- median(values, na.rm = TRUE)

    # 50 bins across the observed range; fall back to a width of 1 when the
    # range is zero or undefined (constant or all-NA column), since a
    # non-positive bin width is not usable.
    value_range <- range(values, na.rm = TRUE)
    bin_width <- (value_range[2] - value_range[1]) / 50
    if (!is.finite(bin_width) || bin_width <= 0) {
        bin_width <- 1
    }

    # Histogram with the mean/median markers. The markers must be added here,
    # to the ggplot object: the value returned by grid.arrange is an arranged
    # grob, and layers added to it never reach the histogram.
    # `size` is kept for the line width so older ggplot2 releases still render
    # it; on ggplot2 >= 3.4 the preferred argument name is `linewidth`.
    histogram_plot <- ggplot(df, aes(x = .data[[col]])) +
        geom_histogram(binwidth = bin_width,
                       fill = "#EBA0AC", color = "black") +
        geom_vline(xintercept = mean_val, color = "#A6E3A1",
                   linetype = "dashed", size = 1) +
        geom_vline(xintercept = median_val, color = "#CBA6F7",
                   linetype = "dashed", size = 1) +
        annotate("text", x = mean_val, y = 0,
                 label = paste("Mean =", round(mean_val, 2)),
                 vjust = -1, color = "#A6E3A1") +
        annotate("text", x = median_val, y = 0,
                 label = paste("Median =", round(median_val, 2)),
                 vjust = -1, color = "#CBA6F7") +
        labs(title = paste("Histogram of", col), x = col, y = "Count") +
        theme_minimal()

    # Boxplot of the same column (local name avoids masking graphics::boxplot)
    box_plot <- ggplot(df, aes(y = .data[[col]])) +
        geom_boxplot(fill = "#F9E2AF") +
        labs(title = paste("Boxplot of", col), y = col) +
        theme_minimal()

    # grid.arrange draws the two panels itself, so no print() call is needed
    combined_plot <- grid.arrange(histogram_plot, box_plot, ncol = 2)

    invisible(combined_plot)
}

In [None]:
# Numeric track features to visualise, in plotting order
music_vals <- c(
  "valence", "acousticness", "danceability", "duration_min",
  "energy", "instrumentalness", "liveness", "loudness",
  "popularity", "speechiness", "tempo"
)

# One histogram + boxplot figure per feature
for (feature in music_vals) {
  fun_subplots_ggplot(df = df, col = feature)
}