# Webscraping from Wikipedia

In [200]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

## Request the Best Selling Albuns Wikipedia page from it's URL

In [201]:
url = "https://en.wikipedia.org/wiki/List_of_best-selling_albums"

In [202]:
# use requests.get() method with the provided url
# assign the response to a object
response = requests.get(url)

In [203]:
# Use BeautifulSoup() to create a BeautifulSoup object from a response text content
soup = BeautifulSoup(response.text, 'html.parser')

In [204]:
# Use soup.title attribute
print("Title of the page:", soup.title.text)

Title of the page: List of best-selling albums - Wikipedia


## Extract column/variable names from the HTML table header

In [205]:
# Use the find_all function in the BeautifulSoup object, with element type `table`
# Assign the result to a list called `html_tables`
html_tables = soup.find_all('table')
print("Number of tables found:", len(html_tables))

Number of tables found: 12


In [206]:
# Finding the subtitles
subtitles = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
for subtitle in subtitles:
    print("Subtitle:", subtitle.text)

Subtitle: Contents
Subtitle: List of best-selling albums
Subtitle: Legend
Subtitle: 40 million copies or more
Subtitle: 30–39 million copies
Subtitle: 20–29 million copies
Subtitle: Timeline of the best-selling albums
Subtitle: Best-selling album by year worldwide
Subtitle: See also
Subtitle: Notes
Subtitle: References


In [207]:
# Finding the table names (if it exists)
for table in html_tables:
        table_names = table.caption.text if table.caption else "Not found"
        print("Table name:", table_names)

Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found
Table name: Timeline of the highest-selling album record

Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found
Table name: Not found


In [208]:
# Print the '40 million copies or more' table and check its content
forty_mi_table = html_tables[1]
print(forty_mi_table)

<table class="wikitable sortable" style="width:100%; text-align: center">
<tbody><tr style="background:#e8e8e8;">
<th scope="col">Artist
</th>
<th scope="col">Album
</th>
<th scope="col">Released
</th>
<th scope="col">Genre
</th>
<th scope="col"><a href="/wiki/Music_recording_certification" title="Music recording certification">Total certified copies</a><br/><small><span class="nowrap">(<a href="/wiki/List_of_music_recording_certifications" title="List of music recording certifications">from available markets</a>)</span></small>*
</th>
<th scope="col">Claimed sales*
</th>
<th class="unsortable" scope="col">Ref(s)
</th></tr>
<tr style="background:#FFFFFF;">
<td><span data-sort-value="Jackson, Michael"><span class="vcard"><span class="fn"><a href="/wiki/Michael_Jackson" title="Michael Jackson">Michael Jackson</a></span></span></span>
</td>
<td><i><a href="/wiki/Thriller_(album)" title="Thriller (album)">Thriller</a></i>
</td>
<td>1982
</td>
<td><a href="/wiki/Pop_music" title="Pop music"

In [209]:
if forty_mi_table:
        # Finding the table header (th) elements within the table
        header_cells = forty_mi_table.find_all('th')
        
        # Extracting text from header cells to get column names
        column_names = [cell.text.strip() for cell in header_cells]
        
        # Printing the extracted column names
        print("Column Names:", column_names)

Column Names: ['Artist', 'Album', 'Released', 'Genre', 'Total certified copies(from available markets)*', 'Claimed sales*', 'Ref(s)']


## Create a data frame by parsing the launch HTML tables

In [210]:
 if forty_mi_table:
        # Extracting table data cells (td) from the table
        table_rows = forty_mi_table.find_all('tr')
        
        # Initializing an empty list to store table data
        table_data = []
        
        # Looping through each table row
        for row in table_rows:
            # Extracting table data cells (td) from the row
            row_data = [cell.text.strip() for cell in row.find_all('td')]
            if row_data:
                # Adding non-empty rows to the table data list
                table_data.append(row_data)
        
        # Creating a DataFrame from the table data
        df_40 = pd.DataFrame(table_data, columns=column_names)

In [211]:
df_40

Unnamed: 0,Artist,Album,Released,Genre,Total certified copies(from available markets)*,Claimed sales*,Ref(s)
0,Michael Jackson,Thriller,1982,"Pop, post-disco, funk, rock","51.2\nUS: 34 million[8]\nJPN: 100,000[9]\nUK: ...",70,[4][5]
1,AC/DC,Back in Black,1980,Hard rock,30.1\nUS: 25 million[8]\nUK: 1 million[10]\nGE...,50,[36]
2,Whitney Houston / various artists,The Bodyguard,1992,"R&B, soul, pop, soundtrack",28.7\nUS: 18 million[8]\nJPN: 2 million[37][38...,45,[43][44]
3,Pink Floyd,The Dark Side of the Moon,1973,Progressive rock,24.8\nUS: 15 million[8]\nUK: 4.5 million[10]\n...,45,[46]
4,Eagles,Their Greatest Hits (1971–1975),1976,"Country rock, soft rock, folk rock","41.2\nUS: 38 million[8]\nUK: 600,000[10]\nCAN:...",44,[47]
5,Meat Loaf,Bat Out of Hell,1977,"Hard rock, glam rock, progressive rock",22\nUS: 14 million[8]\nUK: 3.3 million[10]\nGE...,43,[49]
6,Eagles,Hotel California,1976,Soft rock,31.8\nUS: 26 million[8]\nUK: 1.8 million[10]\n...,42,[51]
7,Shania Twain,Come On Over,1997,"Country, pop","30.4\nUS: 20 million[8]\nJPN: 100,000 [9]\nUK:...",40,[54][55]
8,Fleetwood Mac,Rumours,1977,Soft rock,30.3\nUS: 21 million[8]\nUK: 4.5 million[10]\n...,40,[59][60]
9,Bee Gees / Various artists,Saturday Night Fever,1977,Disco,22.1\nUS: 16 million[8]\nUK: 2.1 million[10]\n...,40,[62][63]


### Extract complimentary tables, create the dataframes and join them

In [212]:
# Print the '30 - 39 million copies' table and check its content
thirty_mi_table = html_tables[2]
print(thirty_mi_table)

<table class="wikitable sortable" style="width:100%; text-align: center">
<tbody><tr style="background:#e8e8e8;">
<th scope="col">Artist
</th>
<th scope="col">Album
</th>
<th scope="col">Released
</th>
<th scope="col">Genre
</th>
<th scope="col"><a class="mw-redirect" href="/wiki/Music_recording_sales_certification" title="Music recording sales certification">Total certified copies</a><br/><small><span class="nowrap">(<a href="/wiki/List_of_music_recording_certifications" title="List of music recording certifications">from available markets</a>)</span></small>*
</th>
<th scope="col">Claimed sales*
</th>
<th class="unsortable" scope="col">Ref(s)
</th></tr>
<tr style="background:#FFFFFF;">
<td><a href="/wiki/Led_Zeppelin" title="Led Zeppelin">Led Zeppelin</a>
</td>
<td><i><a href="/wiki/Led_Zeppelin_IV" title="Led Zeppelin IV">Led Zeppelin IV</a></i>
</td>
<td>1971
</td>
<td><a href="/wiki/Hard_rock" title="Hard rock">Hard rock</a>, <a href="/wiki/Heavy_metal_music" title="Heavy metal mu

In [213]:
 if thirty_mi_table:
        # Extracting table data cells (td) from the table
        table_rows = thirty_mi_table.find_all('tr')
        
        # Initializing an empty list to store table data
        table_data = []
        
        # Looping through each table row
        for row in table_rows:
            # Extracting table data cells (td) from the row
            row_data = [cell.text.strip() for cell in row.find_all('td')]
            if row_data:
                # Adding non-empty rows to the table data list
                table_data.append(row_data)
        
        # Creating a DataFrame from the table data
        df_30 = pd.DataFrame(table_data, columns=column_names)

In [214]:
# Print the '20 - 29 million copies' table and check its content
twenty_mi_table = html_tables[3]
print(twenty_mi_table)

<table class="wikitable sortable" style="width:100%; text-align: center">
<tbody><tr>
<th scope="col">Artist
</th>
<th scope="col">Album
</th>
<th scope="col">Released
</th>
<th scope="col">Genre
</th>
<th scope="col"><a href="/wiki/Music_recording_certification" title="Music recording certification">Total certified copies</a><br/><small><span class="nowrap">(<a href="/wiki/List_of_music_recording_certifications" title="List of music recording certifications">from available markets</a>)</span></small>*
</th>
<th scope="col">Claimed sales*
</th>
<th class="unsortable" scope="col">Ref(s)
</th></tr>
<tr style="background:#FFFFFF;">
<td><span data-sort-value="Jones, Norah"><span class="vcard"><span class="fn"><a href="/wiki/Norah_Jones" title="Norah Jones">Norah Jones</a></span></span></span>
</td>
<td><i><a href="/wiki/Come_Away_with_Me" title="Come Away with Me">Come Away with Me</a></i>
</td>
<td>2002
</td>
<td><a href="/wiki/Jazz" title="Jazz">Jazz</a>
</td>
<td><span data-sort-value="

</tbody></table>


In [215]:
if twenty_mi_table:
        # Extracting table data cells (td) from the table
        table_rows = twenty_mi_table.find_all('tr')
        
        # Initializing an empty list to store table data
        table_data = []
        
        # Looping through each table row
        for row in table_rows:
            # Extracting table data cells (td) from the row
            row_data = [cell.text.strip() for cell in row.find_all('td')]
            if row_data:
                # Adding non-empty rows to the table data list
                table_data.append(row_data)
        
        # Creating a DataFrame from the table data
        df_20 = pd.DataFrame(table_data, columns=column_names)

### Join dataframes, perform some data cleaning and save into a file

In [216]:
# Vertically concatenate (stack) the DataFrames
df = pd.concat([df_40, df_30, df_20], axis=0)
df

Unnamed: 0,Artist,Album,Released,Genre,Total certified copies(from available markets)*,Claimed sales*,Ref(s)
0,Michael Jackson,Thriller,1982,"Pop, post-disco, funk, rock","51.2\nUS: 34 million[8]\nJPN: 100,000[9]\nUK: ...",70,[4][5]
1,AC/DC,Back in Black,1980,Hard rock,30.1\nUS: 25 million[8]\nUK: 1 million[10]\nGE...,50,[36]
2,Whitney Houston / various artists,The Bodyguard,1992,"R&B, soul, pop, soundtrack",28.7\nUS: 18 million[8]\nJPN: 2 million[37][38...,45,[43][44]
3,Pink Floyd,The Dark Side of the Moon,1973,Progressive rock,24.8\nUS: 15 million[8]\nUK: 4.5 million[10]\n...,45,[46]
4,Eagles,Their Greatest Hits (1971–1975),1976,"Country rock, soft rock, folk rock","41.2\nUS: 38 million[8]\nUK: 600,000[10]\nCAN:...",44,[47]
...,...,...,...,...,...,...,...
42,Oasis,(What's the Story) Morning Glory?,1995,"Britpop, rock","11.6\nUS: 4 million[8]\nJPN: 200,000[9]\nUK: 4...",22,[206][207]
43,Celine Dion,The Colour of My Love,1993,Pop,"11.1\nUS: 6 million[8]\nJPN: 600,000[9]\nUK: 1...",20,[208]
44,Elton John,Goodbye Yellow Brick Road,1973,"rock, pop rock, glam rock",9.6\nUS: 8 million[8]\nUK: 1.4 million[10]\nAU...,20,[211]
45,Pink Floyd,Wish You Were Here,1975,"Progressive rock, art rock, experimental rock","9.2\nUS: 6 million[8]\nUK: 600,000[10]\nGER: 5...",20,[212]


In [217]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78 entries, 0 to 46
Data columns (total 7 columns):
 #   Column                                           Non-Null Count  Dtype 
---  ------                                           --------------  ----- 
 0   Artist                                           78 non-null     object
 1   Album                                            78 non-null     object
 2   Released                                         78 non-null     object
 3   Genre                                            78 non-null     object
 4   Total certified copies(from available markets)*  78 non-null     object
 5   Claimed sales*                                   78 non-null     object
 6   Ref(s)                                           78 non-null     object
dtypes: object(7)
memory usage: 4.9+ KB


In [218]:
# Droping unnecessary column
df.drop('Ref(s)', axis=1, inplace=True)

In [219]:
# Changing columns name
df = df.rename(columns={'Total certified copies(from available markets)*': 'Total certified copies', 
                        'Claimed sales*': 'Claimed sales'})

In [220]:
# Cleaning string data
df['Total certified copies'] = df['Total certified copies'].str.split('\n').str[0]
df['Claimed sales'] = df['Claimed sales'].str[:2]

In [222]:
# Converting columns to float and datetime
df['Total certified copies'] = df['Total certified copies'].astype(float)
df['Claimed sales'] = df['Claimed sales'].astype(float)
df['Released'] = pd.to_datetime(df['Released']).dt.year

In [223]:
df

Unnamed: 0,Artist,Album,Released,Genre,Total certified copies,Claimed sales
0,Michael Jackson,Thriller,1982,"Pop, post-disco, funk, rock",51.2,70.0
1,AC/DC,Back in Black,1980,Hard rock,30.1,50.0
2,Whitney Houston / various artists,The Bodyguard,1992,"R&B, soul, pop, soundtrack",28.7,45.0
3,Pink Floyd,The Dark Side of the Moon,1973,Progressive rock,24.8,45.0
4,Eagles,Their Greatest Hits (1971–1975),1976,"Country rock, soft rock, folk rock",41.2,44.0
...,...,...,...,...,...,...
42,Oasis,(What's the Story) Morning Glory?,1995,"Britpop, rock",11.6,22.0
43,Celine Dion,The Colour of My Love,1993,Pop,11.1,20.0
44,Elton John,Goodbye Yellow Brick Road,1973,"rock, pop rock, glam rock",9.6,20.0
45,Pink Floyd,Wish You Were Here,1975,"Progressive rock, art rock, experimental rock",9.2,20.0


In [224]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78 entries, 0 to 46
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Artist                  78 non-null     object 
 1   Album                   78 non-null     object 
 2   Released                78 non-null     int64  
 3   Genre                   78 non-null     object 
 4   Total certified copies  78 non-null     float64
 5   Claimed sales           78 non-null     float64
dtypes: float64(2), int64(1), object(3)
memory usage: 4.3+ KB


In [225]:
# Saving as a .csv file
df.to_csv('top_albums_web_scraped.csv', index=False)