# Webscraping from Wikipedia using Beautiful Soup

In [47]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

## Request the Best Selling Albuns Wikipedia page from it's URL

In [48]:
url = "https://en.wikipedia.org/wiki/List_of_best-selling_albums"

In [49]:
# use requests.get() method with the provided url
# assign the response to a object

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://en.wikipedia.org/"
}

response = requests.get(url, headers=headers, timeout=20)

In [50]:
# Use BeautifulSoup() to create a BeautifulSoup object from a response text content
soup = BeautifulSoup(response.text, 'html.parser')

print(response.status_code)

200


In [51]:
# Use soup.title attribute
print("Title of the page:", soup.title.text)

Title of the page: List of best-selling albums - Wikipedia


## Extract column/variable names from the HTML table header

In [52]:
# Use the find_all function in the BeautifulSoup object, with element type `table`
# Assign the result to a list called `html_tables`
html_tables = soup.find_all('table')
print("Number of tables found:", len(html_tables))

Number of tables found: 12


In [53]:
# Finding the subtitles
subtitles = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
for subtitle in subtitles:
    print("Subtitle:", subtitle.text)

Subtitle: Contents
Subtitle: List of best-selling albums
Subtitle: Legend
Subtitle: 40 million copies or more
Subtitle: 30–39 million copies
Subtitle: 20–29 million copies
Subtitle: Timeline of the best-selling albums
Subtitle: Best-selling album by year worldwide
Subtitle: See also
Subtitle: Notes
Subtitle: References


In [54]:
# Finding the table names (if it exists)
for table in html_tables:
        table_names = table.caption.text if table.caption else "Not found"
        print("Table name:", table_names)

Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found
Table name: Timeline of the highest-selling album record

Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found


In [55]:
# Extracting '40 million copies or more' table (second one, inspecting the page)
forty_mi_table = html_tables[1]

In [56]:
if forty_mi_table:
        # Finding the table header (th) elements within the table
        header_cells = forty_mi_table.find_all('th')
        
        # Extracting text from header cells to get column names
        column_names = [cell.text.strip() for cell in header_cells]
        
        # Printing the extracted column names
        print("Column Names:", column_names)

Column Names: ['Artist', 'Album', 'Released', 'Genre', 'Total certified copies(from available markets)*', 'Reported sales*', 'Ref.']


## Create a data frame by parsing the launch HTML tables

In [57]:
 if forty_mi_table:
        # Extracting table data cells (td) from the table
        table_rows = forty_mi_table.find_all('tr')
        
        # Initializing an empty list to store table data
        table_data = []
        
        # Looping through each table row
        for row in table_rows:
            # Extracting table data cells (td) from the row
            row_data = [cell.text.strip() for cell in row.find_all('td')]
            if row_data:
                # Adding non-empty rows to the table data list
                table_data.append(row_data)
        
        # Creating a DataFrame from the table data
        df_40 = pd.DataFrame(table_data, columns=column_names)

In [58]:
df_40

Unnamed: 0,Artist,Album,Released,Genre,Total certified copies(from available markets)*,Reported sales*,Ref.
0,Michael Jackson,Thriller,1982,"Pop, post-disco, funk, rock","51.3\nUS: 34 million[8]\nJPN: 100,000[9]\nUK: ...",70,[30][31][32]
1,AC/DC,Back in Black,1980,Hard rock,31.2\nUS: 27 million[8]\nUK: 1 million[10]\nGE...,50,[40]
2,Whitney Houston / Various artists,The Bodyguard,1992,"R&B, soul, pop, soundtrack",29.7\nUS: 19 million[8]\nJPN: 2 million[41][42...,45,[46][47]
3,Pink Floyd,The Dark Side of the Moon,1973,Progressive rock,25.6\nUS: 15 million[8]\nUK: 4.8 million[10]\n...,45,[49]
4,Eagles,Their Greatest Hits (1971–1975),1976,"Country rock, soft rock, folk rock","41.2\nUS: 38 million[8]\nUK: 600,000[10]\nCAN:...",44,[50]
5,Eagles,Hotel California,1976,Soft rock,31.8\nUS: 26 million[8]\nUK: 1.8 million[10]\n...,42,[52]
6,Shania Twain,Come On Over,1997,"Country, pop","30.7\nUS: 20 million[8]\nJPN: 100,000[9]\nUK: ...",40,[55][56]
7,Fleetwood Mac,Rumours,1977,Soft rock,30.3\nUS: 21 million[8]\nUK: 4.5 million[10]\n...,40,[60][61]
8,Meat Loaf,Bat Out of Hell,1977,"Hard rock, glam rock, progressive rock",22\nUS: 14 million[8]\nUK: 3.3 million[10]\nGE...,40,[63]
9,Bee Gees / Various artists,Saturday Night Fever,1977,Disco,22.1\nUS: 16 million[8]\nUK: 2.1 million[10]\n...,40,[65][66]


### Extract complimentary tables, create the dataframes and join them

In [59]:
# Extracting '30 - 39 million copies' table (third one, inspecting the page)
thirty_mi_table = html_tables[2]

In [60]:
 if thirty_mi_table:
        # Extracting table data cells (td) from the table
        table_rows = thirty_mi_table.find_all('tr')
        
        # Initializing an empty list to store table data
        table_data = []
        
        # Looping through each table row
        for row in table_rows:
            # Extracting table data cells (td) from the row
            row_data = [cell.text.strip() for cell in row.find_all('td')]
            if row_data:
                # Adding non-empty rows to the table data list
                table_data.append(row_data)
        
        # Creating a DataFrame from the table data
        df_30 = pd.DataFrame(table_data, columns=column_names)

In [61]:
# Extracting '20 - 29 million copies' table (fourth one, inspecting the page)
twenty_mi_table = html_tables[3]

</tbody></table>


In [62]:
if twenty_mi_table:
        # Extracting table data cells (td) from the table
        table_rows = twenty_mi_table.find_all('tr')
        
        # Initializing an empty list to store table data
        table_data = []
        
        # Looping through each table row
        for row in table_rows:
            # Extracting table data cells (td) from the row
            row_data = [cell.text.strip() for cell in row.find_all('td')]
            if row_data:
                # Adding non-empty rows to the table data list
                table_data.append(row_data)
        
        # Creating a DataFrame from the table data
        df_20 = pd.DataFrame(table_data, columns=column_names)

### Join dataframes, perform some data cleaning and save into a file

In [63]:
# Vertically concatenate (stack) the DataFrames
df = pd.concat([df_40, df_30, df_20], axis=0)
df

Unnamed: 0,Artist,Album,Released,Genre,Total certified copies(from available markets)*,Reported sales*,Ref.
0,Michael Jackson,Thriller,1982,"Pop, post-disco, funk, rock","51.3\nUS: 34 million[8]\nJPN: 100,000[9]\nUK: ...",70,[30][31][32]
1,AC/DC,Back in Black,1980,Hard rock,31.2\nUS: 27 million[8]\nUK: 1 million[10]\nGE...,50,[40]
2,Whitney Houston / Various artists,The Bodyguard,1992,"R&B, soul, pop, soundtrack",29.7\nUS: 19 million[8]\nJPN: 2 million[41][42...,45,[46][47]
3,Pink Floyd,The Dark Side of the Moon,1973,Progressive rock,25.6\nUS: 15 million[8]\nUK: 4.8 million[10]\n...,45,[49]
4,Eagles,Their Greatest Hits (1971–1975),1976,"Country rock, soft rock, folk rock","41.2\nUS: 38 million[8]\nUK: 600,000[10]\nCAN:...",44,[50]
...,...,...,...,...,...,...,...
43,Lionel Richie,Can't Slow Down,1983,"Pop, R&B, soul","12.3\nUS: 10 million[8]\nUK: 900,000[10]\nGER:...",20,[212]
44,Celine Dion,The Colour of My Love,1993,Pop,"11.1\nUS: 6 million[8]\nJPN: 600,000[9]\nUK: 1...",20,[213]
45,Pink Floyd,Wish You Were Here,1975,Progressive rock,10.9\nUS: 6 million[8]\nGER: 1.5 million[11]\n...,20,[214][215]
46,Andrea Bocelli,Romanza,1997,Operatic pop,"10.1\nUS: 3 million[8]\nUK: 300,000[10]\nGER: ...",20,[216]


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79 entries, 0 to 47
Data columns (total 7 columns):
 #   Column                                           Non-Null Count  Dtype 
---  ------                                           --------------  ----- 
 0   Artist                                           79 non-null     object
 1   Album                                            79 non-null     object
 2   Released                                         79 non-null     object
 3   Genre                                            79 non-null     object
 4   Total certified copies(from available markets)*  79 non-null     object
 5   Reported sales*                                  79 non-null     object
 6   Ref.                                             79 non-null     object
dtypes: object(7)
memory usage: 4.9+ KB


In [65]:
# Droping unnecessary column
df.drop('Ref.', axis=1, inplace=True)

In [69]:
# Changing columns name
df = df.rename(columns={'Total certified copies(from available markets)*': 'Total Certified Copies', 
                        'Reported sales*': 'Reported Sales'})

In [70]:
# Cleaning string data
df['Total Certified Copies'] = df['Total Certified Copies'].str.split('\n').str[0]
df['Reported Sales'] = df['Reported Sales'].str[:2]

In [72]:
# Converting columns to float and datetime
df['Total Certified Copies'] = df['Total Certified Copies'].astype(float)
df['Reported Sales'] = df['Reported Sales'].astype(float)
df['Released'] = pd.to_datetime(df['Released']).dt.year

In [74]:
df.head(10)

Unnamed: 0,Artist,Album,Released,Genre,Total Certified Copies,Reported Sales
0,Michael Jackson,Thriller,1982,"Pop, post-disco, funk, rock",51.3,70.0
1,AC/DC,Back in Black,1980,Hard rock,31.2,50.0
2,Whitney Houston / Various artists,The Bodyguard,1992,"R&B, soul, pop, soundtrack",29.7,45.0
3,Pink Floyd,The Dark Side of the Moon,1973,Progressive rock,25.6,45.0
4,Eagles,Their Greatest Hits (1971–1975),1976,"Country rock, soft rock, folk rock",41.2,44.0
5,Eagles,Hotel California,1976,Soft rock,31.8,42.0
6,Shania Twain,Come On Over,1997,"Country, pop",30.7,40.0
7,Fleetwood Mac,Rumours,1977,Soft rock,30.3,40.0
8,Meat Loaf,Bat Out of Hell,1977,"Hard rock, glam rock, progressive rock",22.0,40.0
9,Bee Gees / Various artists,Saturday Night Fever,1977,Disco,22.1,40.0


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79 entries, 0 to 47
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Artist                  79 non-null     object 
 1   Album                   79 non-null     object 
 2   Released                79 non-null     int32  
 3   Genre                   79 non-null     object 
 4   Total Certified Copies  79 non-null     float64
 5   Reported Sales          79 non-null     float64
dtypes: float64(2), int32(1), object(3)
memory usage: 4.0+ KB


In [76]:
# Saving as a .csv file
df.to_csv('top_albums_web_scraped.csv', index=False)