In [2]:
# Install every package this notebook depends on, one call per package
invisible(lapply(c("rvest", "tidyverse", "lubridate"), install.packages))

Installing package into ‘/srv/rlibs’
(as ‘lib’ is unspecified)

Installing package into ‘/srv/rlibs’
(as ‘lib’ is unspecified)

Installing package into ‘/srv/rlibs’
(as ‘lib’ is unspecified)



In [3]:
# Loading Libraries
library(rvest)
library(tidyverse)
library(lubridate)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.6     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.1.1     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()         masks stats::filter()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ dplyr::lag()            masks stats::lag()


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




In [8]:
# Download the chart page and parse its HTML
url <- "https://rateyourmusic.com/charts/top/album/all-time/"
webpage <- url %>% read_html()

In [13]:
# Extract each album field from the parsed page via its CSS selector.
# Every pipeline below reads from `webpage` and yields one vector whose
# elements follow the order in which the nodes appear in the page.

# Album titles
album_names <- webpage %>%
  html_nodes(".page_charts_section_charts_item_title") %>%
  html_text() %>%
  str_trim() # strip leading/trailing whitespace

# Primary credited artists
artists <- webpage %>%
  html_nodes(".page_charts_section_charts_item_credited_links_primary") %>%
  html_text() %>%
  str_trim() # strip leading/trailing whitespace

# Average ratings
ratings <- webpage %>%
  html_nodes(".page_charts_section_charts_item_details_average_num") %>%
  html_text(trim = TRUE) %>%
  as.numeric() # convert to numeric
# Keep only the odd-positioned matches (1st, 3rd, ...). A logical mask is used
# instead of seq(1, length(ratings), 2) because seq() raises an error when the
# selector matches nothing, whereas the mask simply yields an empty vector.
# NOTE(review): presumably the selector matches each rating twice - confirm.
ratings <- ratings[seq_along(ratings) %% 2 == 1]

# Release dates, stored as "dd-mm-yyyy" strings
release_date <- webpage %>%
  html_nodes(".page_charts_section_charts_item_date span:first-child") %>%
  html_text() %>%
  str_trim() %>% # strip leading/trailing whitespace
  parse_date_time(orders = c("dmY", "my")) %>% # day-month-year, else month-year
  format("%d-%m-%Y") # format as dd-mm-yyyy

# Primary genres: one comma-separated string per album
genres <- webpage %>%
  html_nodes(".page_charts_section_charts_item_genres_primary") %>%
  # map_chr() applies the function to each genre container and returns a
  # character vector with exactly one string per container.
  map_chr(function(genre_node) {
    genre_node %>%
      html_nodes("a") %>%
      html_text() %>%
      paste(collapse = ", ") # join all genre links of this album
  })

# Number of ratings
num_ratings <- webpage %>%
  html_nodes(".page_charts_section_charts_item_details_ratings .full") %>%
  html_text() %>%
  str_trim() %>% # strip leading/trailing whitespace
  str_remove_all(",") %>% # drop thousands separators
  as.numeric() # convert to numeric

# NOTE(review): this selector targets the "reviews" element, so the values
# look like review counts rather than views; the variable name is kept
# because later cells refer to it.
num_views <- webpage %>%
  html_nodes(".page_charts_section_charts_item_details_reviews .full") %>%
  html_text() %>%
  str_trim() %>% # strip leading/trailing whitespace
  str_remove_all(",") %>% # drop thousands separators
  as.numeric() # convert to numeric

# Descriptors: newlines become ", " and repeated whitespace is collapsed
descriptors <- webpage %>%
  html_nodes(".page_charts_section_charts_item_genre_descriptors") %>%
  html_text(trim = TRUE) %>%
  str_replace_all("\n", ", ") %>%
  str_squish()


In [17]:
# Collect the scraped vectors under their final column names
scraped_fields <- list(
  Album = album_names,
  Artist = artists,
  Rating = ratings,
  ReleaseDate = release_date,
  Genres = genres,
  Descriptors = descriptors,
  NumberOfRatings = num_ratings,
  NumberOfViews = num_views
)

# data.frame() silently recycles a shorter vector when its length divides the
# longest one, which would pair albums with the wrong values. Stop with a
# message listing every field's length instead.
field_lengths <- lengths(scraped_fields)
if (length(unique(field_lengths)) > 1) {
  stop(
    "Scraped fields have differing lengths: ",
    paste(names(field_lengths), field_lengths, sep = " = ", collapse = ", "),
    call. = FALSE
  )
}

# Transform into a data frame
df <- data.frame(scraped_fields)

# Add a ranking column (row position) and move it to the front
df <- df %>%
  mutate(Ranking = row_number()) %>%
  select(Ranking, everything())
head(df)


Unnamed: 0_level_0,Ranking,Album,Artist,Rating,ReleaseDate,Genres,Descriptors,NumberOfRatings,NumberOfViews
Unnamed: 0_level_1,<int>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>
1,1,To Pimp a Butterfly,Kendrick Lamar,4.36,15-03-2015,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap","political, conscious, concept album, poetic, introspective, urban, protest, eclectic",69640,597
2,2,OK Computer,Radiohead,4.27,16-06-1997,"Alternative Rock, Art Rock","melancholic, anxious, alienation, futuristic, existential, lonely, atmospheric, cold",94735,1713
3,3,Wish You Were Here,Pink Floyd,4.32,12-09-1975,"Progressive Rock, Art Rock","melancholic, atmospheric, progressive, concept album, serious, longing, introspective, alienation",65282,997
4,4,Madvillainy,Madvillain,4.33,23-03-2004,Abstract Hip Hop,"sampling, playful, abstract, humorous, cryptic, mysterious, eclectic, surreal",54310,469
5,5,In the Court of the Crimson King,King Crimson,4.31,10-10-1969,"Progressive Rock, Art Rock","fantasy, epic, progressive, complex, poetic, surreal, philosophical, melancholic",59998,939
6,6,In Rainbows,Radiohead,4.29,10-10-2007,"Art Rock, Alternative Rock","lush, melancholic, introspective, bittersweet, atmospheric, mellow, warm, ethereal",68581,872


In [18]:
#' Look up the rating of an album by its exact title
#'
#' @param album_name Album title to search for; compared with `==` against
#'   the `Album` column, so the match is exact and case-sensitive.
#' @param data Data frame with `Album` and `Rating` columns. Defaults to the
#'   `df` object visible where the function was defined, which keeps existing
#'   one-argument calls working.
#' @return A single character string: either the rating message or a
#'   "No rating found" message. When several rows share the title, their
#'   ratings are listed comma-separated in one message.
get_album_rating <- function(album_name, data = df) {
  # which() discards NA comparisons, so rows with a missing title can never
  # match and inject an NA rating into the result.
  matches <- which(data$Album == album_name)

  # Guard clause: nothing matched
  if (length(matches) == 0) {
    return(paste("No rating found for", album_name))
  }

  # Collapse so duplicate titles still produce exactly one message
  rating <- paste(data$Rating[matches], collapse = ", ")
  paste("The rating for", album_name, "is", rating)
}

# Example: look up one album from the chart by its title
get_album_rating(album_name = "In Rainbows")