In [4]:
import datetime
import json
import re
import sys

from bs4 import BeautifulSoup
import requests

In [5]:
# css selector constants
_CHART_NAME_SELECTOR = 'meta[name="twitter:title"]'
_DATE_ELEMENT_SELECTOR = "button.chart-detail-header__date-selector-button"
_PREVIOUS_DATE_SELECTOR = "span.fa-chevron-left"
_NEXT_DATE_SELECTOR = "span.fa-chevron-right"
_ENTRY_LIST_SELECTOR = "div.chart-list-item"
_ENTRY_TITLE_ATTR = "data-title"
_ENTRY_ARTIST_ATTR = "data-artist"
_ENTRY_IMAGE_SELECTOR = "img.chart-list-item__image"
_ENTRY_RANK_ATTR = "data-rank"

# constants for the getPositionRowValue helper function
_ROW_SELECTOR_FORMAT = "div.chart-list-item__%s"
_PEAK_POS_FORMAT = "weeks-at-one"
_LAST_POS_FORMAT = "last-week"
_WEEKS_ON_CHART_FORMAT = "weeks-on-chart"

In [6]:
class ChartEntry:

    def __init__(self, title, artist, image, peakPos, lastPos, weeks, rank, isNew):
        self.title = title
        self.artist = artist
        self.image = image
        self.peakPos = peakPos
        self.lastPos = lastPos
        self.weeks = weeks
        self.rank = rank
        self.isNew = isNew

    def __repr__(self):
        return "{}.{}(title={!r}, artist={!r})".format(
            self.__class__.__module__, self.__class__.__name__, self.title, self.artist
        )

    def __str__(self):
        """Returns a string of the form 'TITLE by ARTIST'.
        """
        if self.title:
            s = u"'%s' by %s" % (self.title, self.artist)
        else:
            s = u"%s" % self.artist

        if sys.version_info.major < 3:
            return s.encode(getattr(sys.stdout, "encoding", "") or "utf8")
        else:
            return s

    def json(self):
        """Returns the entry as a JSON string.
        This is useful for caching.
        """
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)


In [7]:
class ChartData:
    """Represents a particular Billboard chart for a particular date.
    Attributes:
        name: The chart name, as a string.
        title: The human-readable chart name, as a string.
        date: The date of the chart.
        previousDate: The date of the previous chart, as a string in YYYY-MM-DD
            format, or None if this information was not available.
        entries: A list of ChartEntry objects, ordered by position on the chart
            (highest first).
    """

    def __init__(self, name, date=None, fetch=True, timeout=25):
        """Constructs a new ChartData instance.
        Args:
            name: The chart name, e.g. 'hot-100' or 'pop-songs'.
            date: The chart date, as a string in YYYY-MM-DD format.
                By default, the latest chart is fetched.
                If the argument is not a date on which a chart was published,
                Billboard automatically rounds dates up to the nearest date on
                which a chart was published.
                If this argument is invalid, no exception will be raised;
                instead, the chart will contain no entries.
            fetch: A boolean indicating whether to fetch the chart data from
                Billboard.com immediately (at instantiation time).
                If False, the chart data can be populated at a later time
                using the fetchEntries() method.
            timeout: The number of seconds to wait for a server response.
                If None, no timeout is applied.
        """
        self.name = name

        if date is not None:
            if not re.match("\d{4}-\d{2}-\d{2}", str(date)):
                raise ValueError("Date argument is not in YYYY-MM-DD format")
            try:
                datetime.datetime(*(int(x) for x in str(date).split("-")))
            except:
                raise ValueError("Date argument is invalid")

        self.date = date
        self.title = ""
        self.previousDate = None

        self._timeout = timeout

        self.entries = []
        if fetch:
            self.fetchEntries()

    def __repr__(self):
        return "{}.{}({!r}, date={!r})".format(
            self.__class__.__module__, self.__class__.__name__, self.name, self.date
        )

    def __str__(self):
        """Returns the chart as a human-readable string (typically multi-line).
        """
        if not self.date:
            s = "%s chart (current)" % self.name
        else:
            s = "%s chart from %s" % (self.name, self.date)
        s += "\n" + "-" * len(s)
        for n, entry in enumerate(self.entries):
            s += "\n%s. %s" % (entry.rank, str(entry))
        return s

    def __getitem__(self, key):
        """Returns the (key + 1)-th chart entry; i.e., chart[0] refers to the
        top entry on the chart.
        """
        return self.entries[key]

    def __len__(self):
        """Returns the number of entries in the chart.
        A length of zero may indicated a failed/bad request.
        """
        return len(self.entries)

    def json(self):
        """Returns the entry as a JSON string.
        This is useful for caching.
        """
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)

    def _parseOldStylePage(self, soup):
        dateElement = soup.select_one(_DATE_ELEMENT_SELECTOR)
        if dateElement:
            dateText = dateElement.text.strip()
            curDate = datetime.datetime.strptime(dateText, "%B %d, %Y")
            if self.date and curDate < datetime.datetime.strptime(
                str(self.date), "%Y-%m-%d"
            ):
                # For dates that come after the date of a given chart's latest issue, Billboard.com returns a valid webpage
                # containing no chart data but displaying the date of the chart's latest issue.
                raise ValueError("Date argument is after the date of the latest issue")
            self.date = curDate.strftime("%Y-%m-%d")

        prevWeek = soup.select_one(_PREVIOUS_DATE_SELECTOR)
        nextWeek = soup.select_one(_NEXT_DATE_SELECTOR)
        if prevWeek and prevWeek.parent.get("href"):
            self.previousDate = prevWeek.parent.get("href").split("/")[-1]
        if nextWeek and nextWeek.parent.get("href"):
            self.nextDate = nextWeek.parent.get("href").split("/")[-1]

        for entrySoup in soup.select(_ENTRY_LIST_SELECTOR):
            try:
                title = entrySoup[_ENTRY_TITLE_ATTR].strip()
            except:
                message = "Failed to parse title"
                raise BillboardParseException(message)

            try:
                artist = entrySoup[_ENTRY_ARTIST_ATTR].strip() or ""
            except:
                message = "Failed to parse artist"
                raise BillboardParseException(message)

            if artist == "":
                title, artist = artist, title

            try:
                imageSoup = entrySoup.select_one(_ENTRY_IMAGE_SELECTOR)
                if imageSoup.has_attr("data-src"):
                    image = imageSoup["data-src"]
                else:
                    image = imageSoup["src"]
            except:
                message = "Failed to parse image"
                raise BillboardParseException(message)

            try:
                rank = int(entrySoup[_ENTRY_RANK_ATTR].strip())
            except:
                message = "Failed to parse rank"
                raise BillboardParseException(message)

            def getPositionRowValue(rowName, ifNoValue=None):
                try:
                    selector = _ROW_SELECTOR_FORMAT % rowName
                    selected = entrySoup.select(selector)
                    # We get the first element of selected because there are two
                    # elements matching _LAST_POS_FORMAT and we want the first
                    # one (the second is the position two weeks previous)
                    if (
                        not selected
                        or selected[0].string is None
                        or selected[0].string == "-"
                    ):
                        return ifNoValue
                    else:
                        return int(selected[0].string.strip())
                except:
                    message = "Failed to parse row value: %s" % rowName
                    raise BillboardParseException(message)

            if self.date:
                peakPos = getPositionRowValue(_PEAK_POS_FORMAT)
                lastPos = getPositionRowValue(_LAST_POS_FORMAT, ifNoValue=0)
                weeks = getPositionRowValue(_WEEKS_ON_CHART_FORMAT, ifNoValue=1)
                isNew = True if weeks == 1 else False
            else:
                peakPos = lastPos = weeks = None
                isNew = False

            entry = ChartEntry(
                title, artist, image, peakPos, lastPos, weeks, rank, isNew
            )
            self.entries.append(entry)

    def _parseNewStylePage(self, soup):
        dateElement = soup.select_one("button.date-selector__button.button--link")
        if dateElement:
            dateText = dateElement.text.strip()
            curDate = datetime.datetime.strptime(dateText, "%B %d, %Y")
            if self.date and curDate < datetime.datetime.strptime(
                str(self.date), "%Y-%m-%d"
            ):
                # For dates that come after the date of a given chart's latest issue, Billboard.com returns a valid webpage
                # containing no chart data but displaying the date of the chart's latest issue.
                raise ValueError("Date argument is after the date of the latest issue")
            self.date = curDate.strftime("%Y-%m-%d")

        self.previousDate = soup.select_one("#charts")["data-previous-chart-date"]
        self.nextDate = soup.select_one("#charts")["data-chart-next-date"]

        for entrySoup in soup.select("li.chart-list__element"):

            def getEntryAttr(selector):
                return entrySoup.select_one(selector).text.strip()

            try:
                title = getEntryAttr("span.chart-element__information__song")
            except:
                message = "Failed to parse title"
                raise BillboardParseException(message)

            try:
                artist = getEntryAttr("span.chart-element__information__artist") or ""
            except:
                message = "Failed to parse artist"
                raise BillboardParseException(message)

            if artist == "":
                title, artist = artist, title

            # TODO: Parse the image
            image = None

            try:
                rank = int(getEntryAttr("span.chart-element__rank__number"))
            except:
                message = "Failed to parse rank"
                raise BillboardParseException(message)

            def getMeta(attribute, ifNoValue=None):
                try:
                    selected = entrySoup.select_one(
                        "span.chart-element__meta.text--%s" % attribute
                    )
                    if (
                        not selected
                        or selected.string is None
                        or selected.string == "-"
                    ):
                        return ifNoValue
                    else:
                        return int(selected.string.strip())
                except:
                    message = "Failed to parse metadata value: %s" % attribute
                    raise BillboardParseException(message)

            if self.date:
                peakPos = getMeta("peak")
                lastPos = getMeta("last", ifNoValue=0)
                weeks = getMeta("week", ifNoValue=1)
                isNew = True if weeks == 1 else False
            else:
                peakPos = lastPos = weeks = None
                isNew = False

            entry = ChartEntry(
                title, artist, image, peakPos, lastPos, weeks, rank, isNew
            )
            self.entries.append(entry)

    def _parsePage(self, soup):
        chartTitleElement = soup.select_one(_CHART_NAME_SELECTOR)
        if chartTitleElement:
            self.title = chartTitleElement.get("content", "").split("|")[0].strip()

        if soup.select("table"):
            self._parseOldStylePage(soup)
        else:
            self._parseNewStylePage(soup)

    def fetchEntries(self):
        """GETs the corresponding chart data from Billboard.com, then parses
        the data using BeautifulSoup.
        """
        if not self.date:
            # Fetch latest chart
            url = "http://www.billboard.com/charts/%s" % (self.name)
        else:
            url = "http://www.billboard.com/charts/%s/%s" % (self.name, self.date)

        req = requests.get(url, timeout=self._timeout)
        if req.status_code == 404:
            message = "Chart not found (perhaps the name is misspelled?)"
            raise BillboardNotFoundException(message)
        req.raise_for_status()

        soup = BeautifulSoup(req.text, "html.parser")
        self._parsePage(soup)



In [8]:
def charts():
    """Gets a list of all Billboard charts from Billboard.com.
    """
    req = requests.get("https://www.billboard.com/charts", timeout=25)
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    return [
        link["href"].split("/")[-1]
        for link in soup.findAll("a", {"class": "chart-panel__link"})
    ]

In [18]:
chart.previousDate

'2019-10-05'

In [26]:
import time
temp = []
chart = ChartData('hot-100')
while chart.previousDate != '2019-09-05':
    chart = ChartData('hot-100', chart.previousDate)
    temp.append(chart)
    time.sleep(1)

HTTPError: 429 Client Error: Too Many Requests for url: https://www.billboard.com/charts/hot-100/2019-06-01

In [30]:
temp[0]

__main__.ChartData('hot-100', date='2019-10-05')

In [34]:
dir(temp[0])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_parseNewStylePage',
 '_parseOldStylePage',
 '_parsePage',
 '_timeout',
 'date',
 'entries',
 'fetchEntries',
 'json',
 'name',
 'nextDate',
 'previousDate',
 'title']

In [44]:
ttemp = []
for i in temp:
    for idx,j in enumerate(i):
        ttemp.append(i.entries[idx].artist)

In [45]:
ttemp

['Lizzo',
 'Shawn Mendes & Camila Cabello',
 'Lewis Capaldi',
 'Lil Tecca',
 'Chris Brown Featuring Drake',
 'Lil Nas X',
 'Billie Eilish',
 'Post Malone',
 'Post Malone Featuring Young Thug',
 'Lil Nas X Featuring Billy Ray Cyrus',
 'Ed Sheeran & Justin Bieber',
 'Khalid',
 'Post Malone & Swae Lee',
 'Jonas Brothers',
 'Ed Sheeran Featuring Khalid',
 'Drake Featuring Rick Ross',
 'DaBaby',
 'DaBaby',
 'Ariana Grande & Social House',
 'Megan Thee Stallion, Nicki Minaj & Ty Dolla $ign',
 'Lil Baby & DaBaby',
 'Maroon 5',
 'Taylor Swift',
 'Saweetie',
 'Post Malone Featuring Ozzy Osbourne & Travis Scott',
 'Sam Smith',
 'Sam Smith & Normani',
 'Jonas Brothers',
 'Wale Featuring Jeremih',
 'Blanco Brown',
 'Shawn Mendes',
 'SHAED',
 'Post Malone',
 'Halsey',
 'Young Thug Featuring Gunna',
 'Post Malone Featuring DaBaby',
 'NLE Choppa',
 'YNW Melly & 9lokknine',
 'Chris Lane',
 'Young Thug, J. Cole & Travis Scott',
 'Megan Thee Stallion Featuring DaBaby',
 'Polo G Featuring Lil Tjay',
 'Li