In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Map each artist label to its kworb.net Spotify "songs" page. Every URL has
# the same shape and differs only in the artist id, so the mapping is built
# from (label, id) pairs; insertion order matches the order listed here.
urls_with_names = {
    artist: f'https://kworb.net/spotify/artist/{spotify_id}_songs.html'
    for artist, spotify_id in (
        ('ArcticMonkeys', '7Ln80lUS6He07XvHI8qqHH'),
        ('BTS', '3Nrfpe0tUJi4K4DXYWgMUX'),
        ('Coldplay', '4gzpq5DPGxSnKTe4SA8HAU'),
        ('ImagineDragons', '53XhwfbYqKCa1cC15pYq2q'),
        ('LinkinPark', '6XyY86QOPPrYVGvF9ch6wz'),
        ('Maroon5', '04gDigrS5kc9YWfZHwBETP'),
        ('OneDirection', '4AK6F7OLvEQ5QYCBNiQWHq'),
        ('OneRepublic', '5Pwc4xIPtQLFEnJriah9YJ'),
        ('Queen', '1dfeR4HaWDbWqFHLkxsg1d'),
        ('RedHotChiliPeppers', '0L8ExT028jH3ddEcZwqJJ5'),
        ('TheBeatles', '3WrFJ7ztbogyGnTHbHJFl2'),
        ('TheChainsmokers', '69GGBxA162lTqCwzJG5jLp'),
        ('TwentyOnePilots', '3YQKmKGau1PzlVlkL1iodx'),
    )
}

# Filled in by the scraping loop: artist label -> dict of scraped DataFrames.
data_frames = {}

In [5]:
def _cell_texts(row):
    """Return the stripped text of every ``<td>`` cell in a table row."""
    return [cell.text.strip() for cell in row.find_all('td')]


def scrape_data(url, timeout=30):
    """Download a page and turn its first two HTML tables into DataFrames.

    The first table is read as five text columns (``Song``, ``Total Streams``,
    ``As Lead Streams``, ``Solo Streams``, ``As Feature Streams``) and the
    second as three (``Song Title``, ``Streams``, ``Daily``). The first
    ``<tr>`` of each table is treated as a header and skipped. All values are
    kept as the stripped cell text; no numeric conversion is done here.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled server cannot hang the
        notebook forever. Defaults to 30 seconds.

    Returns
    -------
    tuple
        ``(df_songs, df_additional)`` on success. ``(None, None)`` when the
        response status is not 200 or the page has fewer than two tables, so
        callers can always unpack two values.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged from ``requests.get`` (connection errors,
        timeouts, ...).
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve the website")
        return None, None

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')

    # Every failure path returns a 2-tuple: callers unpack the result, and a
    # bare ``None`` would raise TypeError at the call site.
    if len(tables) < 1:
        print("No song data found on the page")
        return None, None
    if len(tables) < 2:
        print("No additional data found on the page")
        return None, None

    # First table: five columns per row.
    song_records = []
    for row in tables[0].find_all('tr')[1:]:  # skip the header row
        cells = _cell_texts(row)
        # Rows without the five expected <td> cells (e.g. an extra header or
        # spacer row) are skipped instead of raising IndexError.
        if len(cells) >= 5:
            song_records.append(cells[:5])
    df_songs = pd.DataFrame(
        song_records,
        columns=[
            'Song',
            'Total Streams',
            'As Lead Streams',
            'Solo Streams',
            'As Feature Streams',
        ],
    )

    # Second table: title, streams and an optional daily figure.
    additional_records = []
    for row in tables[1].find_all('tr')[1:]:  # skip the header row
        cells = _cell_texts(row)
        if len(cells) < 2:
            continue  # not a data row
        # Empty string when the daily figure is not available.
        daily = cells[2] if len(cells) > 2 else ''
        additional_records.append([cells[0], cells[1], daily])
    df_additional = pd.DataFrame(
        additional_records,
        columns=['Song Title', 'Streams', 'Daily'],
    )

    return df_songs, df_additional

In [6]:
# Fetch every configured artist page and store both resulting tables under the
# artist's label in `data_frames`.
for name, url in urls_with_names.items():
    print(f"Scraping data for {name}...")
    df_songs, df_additional = scrape_data(url)
    data_frames[name] = dict(songs=df_songs, additional=df_additional)

Scraping data for ArcticMonkeys...
Scraping data for BTS...
Scraping data for Coldplay...
Scraping data for ImagineDragons...
Scraping data for LinkinPark...
Scraping data for Maroon5...
Scraping data for OneDirection...
Scraping data for OneRepublic...
Scraping data for Queen...
Scraping data for RedHotChiliPeppers...
Scraping data for TheBeatles...
Scraping data for TheChainsmokers...
Scraping data for TwentyOnePilots...


In [8]:
def print_website_data(data_frames, website_name):
    """Print both stored tables for one entry of ``data_frames``.

    ``data_frames`` maps a name to a dict with the keys ``'songs'`` and
    ``'additional'``. If ``website_name`` is not a key, a "not found" message
    is printed instead. Nothing is returned.
    """
    if website_name not in data_frames:
        print(f"Website '{website_name}' not found in the data frames.")
        return

    entry = data_frames[website_name]
    print(f"\nData for {website_name}:")
    # Each section is a heading line followed by the stored object.
    for heading, key in (("Artist Info:", 'songs'), ("\nSong Data:", 'additional')):
        print(heading)
        print(entry[key])

# Show the two tables stored under the 'BTS' key of `data_frames`.
print_website_data(data_frames, 'BTS')



Data for BTS:
Artist Info:
      Song   Total Streams As Lead Streams    Solo Streams As Feature Streams
0  Streams  38,001,558,151  33,896,000,146  30,717,337,774      4,105,558,005
1    Daily      13,646,366      11,675,980      10,879,067          1,970,386
2   Tracks             272             249             232                 23

Song Data:
                                            Song Title        Streams    Daily
0                                             Dynamite  1,808,091,577  586,403
1                                               Butter  1,219,205,644  333,709
2                                        * My Universe  1,205,927,133  629,453
3                          Boy With Luv (Feat. Halsey)  1,129,232,727  296,553
4            * Left and Right (Feat. Jung Kook of BTS)    886,381,679  579,799
..                                                 ...            ...      ...
267                        Let Me Know - Japanese Ver.      2,275,228      678
268  * Bad Decis