In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import datetime


In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
html = requests.get('https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1961',cookies={ 'sessionid': '123..'},headers=headers).text
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify()[0:1000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Billboard Hot 100 top-ten singles in 1961 - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XgWMcgpAADoAAHkTRyYAAAEU","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_Billboard_Hot_100_top-ten_singles_in_1961","wgTitle":"List of Billboard Hot 100 top-ten singles in 1961","wgCurRevisionId":923651416,"wgRevisionId":923651416,"wgArticleId":41788798,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":

In [3]:
my_table = soup.find('table',{'class':'wikitable sortable'})
type(my_table)

NoneType

In [5]:
Titles = []
tableheaderincluded = True

# The following code mines the first table found on the Wikipedia page and extracts only table elements that do not correspond to a month and date. We don't care about the exact month and day that the song achieved Top 10 glory or their peek date. Furthermore, the way Wikipedia's tables are formatted, there are occasionally multiple songs that were all entered on the same day, and that date is listed only once, which creates problems for our script later down the road.

try:
    tds = my_table.findAll('td')
except AttributeError:
    my_table = soup.find('table',{'class':'wikitable'})
    tableheaderincluded = False
    tds = my_table.findAll('td')

for td in tds:
    try:
        d = datetime.datetime.strptime(td.get_text()[:-1], "%B %d") # Check if the current string is a Month Day format.
    except ValueError:
        try:
            d = datetime.datetime.strptime(td.get_text()[:td.get_text().rfind('(')-1], "%B %d") # Catch a rare case where a song from the previous year charted again.
        except ValueError:
            try:
                d = datetime.datetime.strptime(td.get_text()[:td.get_text().rfind('[')-1], "%B %d") # Catch a rare case where a song's top ten entry date has an annotation.
            except ValueError:
                if td.get_text()[0] != '[': # Drop annotations column.
                    Titles.append(td.get_text()[:-1]) # Drop the newline character present at the end of every single entry.



In [6]:
print(Titles)

['"Wonderland by Night"', 'Bert Kaempfert', '1', '10', '"Exodus"', 'Ferrante & Teicher', '2', '11', '"Corrina, Corinna"', 'Ray Peterson', '9', '5', '"Angel Baby"', 'Rosie and the Originals', '5', '7', '"Will You Love Me Tomorrow"', 'The Shirelles', '1', '7', '"Rubber Ball"', 'Bobby Vee', '6', '5', '"Calcutta"', 'Lawrence Welk', '1', '9', '"Shop Around"', 'The Miracles', '2', '6', '"Calendar Girl"', 'Neil Sedaka', '4', '5', '"Emotions"', 'Brenda Lee', '7', '4', '"My Empty Arms"', 'Jackie Wilson', '9', '1', '"Pony Time"', 'Chubby Checker', '1', '8', '"There\'s a Moon Out Tonight"', 'The Capris', '3', '4', '"Dedicated to the One I Love"', 'The Shirelles', '3', '9', '"Wheels"', 'The String-a-Longs', '3', '6', '"Surrender"', 'Elvis Presley', '1', '8', '"Don\'t Worry"', 'Marty Robbins', '3', '7', '"Where the Boys Are"', 'Connie Francis', '4', '5', '"Ebony Eyes"', 'The Everly Brothers', '8', '4', '"Baby Sittin\' Boogie"', 'Buzz Clifford', '6', '3', '"Spanish Harlem"', 'Ben E. King', '10', '1'

In [7]:
nptitles = np.asarray(Titles).copy()

In [8]:
len(nptitles)

420

In [9]:
numsongs = int(np.prod(nptitles.shape)/4) # Find out how many songs are in this list.
reshaped = nptitles.reshape(numsongs,4) # Reshape the array based on the number of songs and the number of columns.

In [36]:
songdata = pd.DataFrame(reshaped)
songdata.columns = ['Title','Artist','Peak','Number of weeks in top ten']
songdata['Year'] = [2019] * numsongs
songdata['Title'] = [i[1:i.rfind('\"')] for i in songdata['Title']] # Drops quotation marks and annotations in Title.
cleanweeks = []
for i in songdata['Number of weeks in top ten']: # This code searches the number of weeks in top ten column for asterisks and eliminates them. This is only a 
    try:                                         # problem in the current year.
        cleanweeks.append(int(i[:i.rindex('*')]))
    except ValueError:
        cleanweeks.append(int(i))

songdata['Number of weeks in top ten'] = cleanweeks
del songdata['Peak'] # This will not be relevant to our analysis.

In [38]:
songdata

Unnamed: 0,Title,Artist,Number of weeks in top ten,Year
0,Poor Little Fool,Ricky Nelson,6,2019
1,Patricia,Pérez Prado,6,2019
2,Splish Splash,Bobby Darin,3,2019
3,Hard Headed Woman,Elvis Presley,2,2019
4,When,Kalin Twins,5,2019
5,Rebel 'Rouser,Duane Eddy,3,2019
6,Yakety Yak,The Coasters,1,2019
7,My True Love,Jack Scott,6,2019
8,Willie and the Hand Jive,The Johnny Otis Show,2,2019
9,Fever,Peggy Lee,3,2019


In [37]:
songdata.to_csv(path_or_buf='data/1958top10songs.csv',index=False) # Export our sample data table.