# Scraping Pitchfork's "The 200 Best Songs of the 2010s" List

In [1]:
import requests as re
import pandas as pd
from bs4 import BeautifulSoup

## Scraping the HTML

In [3]:
URL = "https://pitchfork.com/features/lists-and-guides/the-200-best-songs-of-the-2010s/"
# `re` is the `requests` alias from the import cell (not the stdlib regex module).
# requests never times out by default, so bound the wait explicitly.
page = re.get(URL, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the article.
page.raise_for_status()
response = page.text
document = BeautifulSoup(response, 'html.parser')

### Artist Name, Song Name, Release Year

In [4]:
# Text of every <h2> in the page, in document order.
songs = pd.Series(
    [heading.text for heading in document.find_all('h2')],
    name='artist_song_year',
)

### Rank

In [5]:
# Text of every <h3> in the page, in document order.
ranks = pd.Series(
    [heading.text for heading in document.find_all('h3')],
    name='rank',
)

### Label

In [6]:
# Whitespace-stripped text of every <span class="caption__text">.
labels = pd.Series(
    [caption.text.strip() for caption in document.find_all('span', class_='caption__text')],
    name='label',
)

### Images

In [7]:
# Restrict the search to the article body so that images outside it
# (e.g. the logo) are not collected.
article_body = document.find('div', class_='article__chunks')
# Keep the raw `srcset` attribute of each responsive image.
images = pd.Series(
    [img['srcset'] for img in article_body.find_all('img', class_='responsive-image__image')],
    name='image_string',
)

## Consolidation & Cleaning

In [8]:
# Combine the four scraped series positionally, one row per list entry.
# NOTE(review): zip() stops at the shortest series, so a length mismatch between
# the scraped series is truncated silently rather than reported.
raw_df = pd.DataFrame(
    list(zip(songs, ranks, labels, images)),
    columns=[series.name for series in (songs, ranks, labels, images)],
)

In [9]:
# Preview the first five scraped rows.
raw_df.head(n=5)

Unnamed: 0,artist_song_year,rank,label,image_string
0,Avicii: “Levels” (2011),200.0,Universal,https://media.pitchfork.com/photos/5d6fdbf4b34...
1,Stormzy: “Big for Your Boots” (2017),199.0,#Merky,https://media.pitchfork.com/photos/5d6fdb02240...
2,dvsn: “The Line” (2016),198.0,Self-released,https://media.pitchfork.com/photos/5d6fdc66b6c...
3,Icona Pop: “I Love It” [ft. Charli XCX] (2012),197.0,Record Company Ten/Big Beat Records,https://media.pitchfork.com/photos/5d6fd8b0613...
4,John Maus: “Believer” (2011),196.0,Ribbon Music/Upset the Rhythm,https://media.pitchfork.com/photos/5d6fdd8bb34...


### Parse the rank as a number

In [10]:
# Remove the '.' characters from the scraped rank text, then cast to int.
# regex=False is explicit because '.' must be matched literally; as a regex it
# would match every character, and the default for `regex` differs between
# pandas versions.
raw_df['rank'] = raw_df['rank'].str.replace('.', '', regex=False).astype(int)

In [11]:
# Use the numeric rank as the row index.
indexed_df = raw_df.set_index(keys='rank')

### Extracting the artist, song, and year information from the scraped titles.

In [12]:
# Pattern for titles of the form `Artist: Song (201x)`.
# The artist group is non-greedy so that it stops at the FIRST ": "; a greedy
# match would absorb part of any song title that itself contains ": ".
# The song group stays greedy so it runs up to the final " (201x)" suffix.
regexp = r'(?P<artist>.*?): (?P<song>.*) \((?P<year>201\d)\)'

In [13]:
# One column per named group in `regexp` (artist, song, year), indexed by rank.
indexed_extracted_df = indexed_df['artist_song_year'].str.extract(regexp, expand=True)

In [14]:
# Attach the extracted columns to the scraped frame via the shared rank index,
# then discard the raw combined title column.
df = (
    indexed_extracted_df
    .merge(indexed_df, left_index=True, right_index=True)
    .drop(columns="artist_song_year")
)

### Generalizing the hosted image link

In [15]:
# Build a width-agnostic image URL template from the srcset string:
#   1. keep only the part before the first " <width>w, " separator;
#   2. replace the "w_<digits>" token with a "{}" placeholder.
# regex=True is required: without it pandas >= 2.0 treats the pattern as a
# literal string and nothing is replaced. `.str[0]` also passes missing values
# through instead of raising as the lambda would.
df['img_link'] = (
    df.image_string
    .str.split(r" \d{3,4}w, ", n=1, expand=False)
    .str[0]
    .str.replace(r"w_\d{3,4}", "{}", regex=True)
)

In [16]:
# The raw srcset string is no longer needed once `img_link` exists.
df = df.drop(columns="image_string")

In [17]:
# Normalise curly single quotes to a plain apostrophe.
# The pattern is a character class, so it must be applied as a regex; the
# default for `regex` is False on pandas >= 2.0, which would make this a no-op.
df['song'] = df.song.str.replace(r"[‘’]", "'", regex=True)

In [18]:
# Remove every character that is not in the whitelist below (letters, digits,
# a few punctuation marks, 'é', quotes and spaces).
# regex=True is explicit because the pattern is a negated character class and
# pandas >= 2.0 would otherwise search for it as a literal string.
df['song'] = df.song.str.replace(r"[^a-zA-Z0-9\.\[\]\-\&$é'‘’ ]", "", regex=True)

### Identifying the featured artists on songs

In [20]:
# Capture whatever sits inside a "[ft. ...]" tag in the song title;
# titles without such a tag get a missing value.
df['features'] = df['song'].str.extract(r'\[ft\. (?P<features>.*)\]')['features']
# Flag the rows where a featured artist was found.
df['has_feature'] = df['features'].notna()

In [21]:
# Preview the cleaned frame.
df.head(n=5)

Unnamed: 0_level_0,artist,song,year,label,img_link,features,has_feature
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200,Avicii,Levels,2011,Universal,https://media.pitchfork.com/photos/5d6fdbf4b34...,,False
199,Stormzy,Big for Your Boots,2017,#Merky,https://media.pitchfork.com/photos/5d6fdb02240...,,False
198,dvsn,The Line,2016,Self-released,https://media.pitchfork.com/photos/5d6fdc66b6c...,,False
197,Icona Pop,I Love It [ft. Charli XCX],2012,Record Company Ten/Big Beat Records,https://media.pitchfork.com/photos/5d6fd8b0613...,Charli XCX,True
196,John Maus,Believer,2011,Ribbon Music/Upset the Rhythm,https://media.pitchfork.com/photos/5d6fdd8bb34...,,False


In [22]:
# Persist the cleaned list; the rank index is written as the first column.
df.to_csv(path_or_buf="../site/src/data/p4k_top200_2010.csv", index=True)

### Extracting the labels for each song

In [23]:
# Split the "/"-separated label string into one column per label (0, 1, 2, ...)
# and attach those columns to the identifying song columns via the rank index.
label_df = df[['artist', 'song', 'year']].merge(
    df['label'].str.split("/", expand=True),
    left_index=True,
    right_index=True,
)

In [24]:
# Reshape the wide label columns into one row per (song, label) pair.
# The label columns are taken from the frame itself rather than hard-coding
# [0, 1, 2], so a different number of split columns neither raises nor is
# silently ignored.
x = label_df.melt(
    id_vars=['artist', 'song', 'year'],
    value_vars=[col for col in label_df.columns if col not in ('artist', 'song', 'year')],
)
# Drop the padding rows, discard the positional 'variable' column and rename
# 'value' to 'label'. `columns=` is required: a bare mapping passed to rename()
# targets the index labels, so the column would keep the name 'value'.
singleton_label_df = x.dropna().drop('variable', axis=1).rename(columns={'value': 'label'})

In [25]:
# Persist the wide (one column per label) frame, keeping the rank index.
# NOTE(review): `singleton_label_df` is computed above but never written;
# confirm the wide `label_df` is the frame intended for this file.
label_df.to_csv(path_or_buf="../site/src/data/top200_labels.csv", index=True)

In [28]:
# Preview the wide label frame.
label_df.head(n=5)

Unnamed: 0_level_0,artist,song,year,0,1,2
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200,Avicii,Levels,2011,Universal,,
199,Stormzy,Big for Your Boots,2017,#Merky,,
198,dvsn,The Line,2016,Self-released,,
197,Icona Pop,I Love It [ft. Charli XCX],2012,Record Company Ten,Big Beat Records,
196,John Maus,Believer,2011,Ribbon Music,Upset the Rhythm,


In [29]:
# Ten most frequent first-listed labels among ranks 1-100.
label_df.loc[label_df.index <= 100, 0].value_counts().head(10)

XL               6
Columbia         6
4AD              5
Interscope       5
Self-released    4
G.O.O.D.         3
Def Jam          3
Cash Money       3
Merge            3
Jagjaguwar       2
Name: 0, dtype: int64

In [30]:
# Frequency of third-listed labels among ranks 1-100.
label_df.loc[label_df.index <= 100, 2].value_counts()

Republic    2
Top Dawg    2
DFA         1
Mute        1
Name: 2, dtype: int64

In [31]:
# Set of every distinct label across the three split columns.
# Missing values are dropped first: the split pads shorter rows, and without
# dropna() that padding marker would end up in the set as if it were a label.
label_list = set().union(*(label_df[col].dropna().unique() for col in (0, 1, 2)))


In [32]:
# One row of per-label counts for each of the three label positions.
t = pd.DataFrame([label_df[position].value_counts() for position in (0, 1, 2)])


In [33]:
# Total count per label across all positions, largest first.
st = (
    t.fillna(value=0)
    .sum(axis=0)
    .sort_values(ascending=False)
)

In [34]:
# Labels credited on more than two songs.
st.loc[st > 2]

Columbia          12.0
Interscope        12.0
XL                11.0
Self-released      9.0
RCA                9.0
Republic           9.0
Def Jam            8.0
Cash Money         8.0
Epic               7.0
Atlantic           6.0
4AD                6.0
Young Turks        5.0
Matador            5.0
Young Money        5.0
Top Dawg           4.0
Drag City          3.0
Island Def Jam     3.0
Sony               3.0
G.O.O.D.           3.0
Merge              3.0
Aftermath          3.0
Jagjaguwar         3.0
Konichiwa          3.0
dtype: float64

In [35]:
# Number of entries in the per-label totals.
st.size

120

In [36]:
# Preview the wide label frame again.
label_df.head(n=5)

Unnamed: 0_level_0,artist,song,year,0,1,2
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200,Avicii,Levels,2011,Universal,,
199,Stormzy,Big for Your Boots,2017,#Merky,,
198,dvsn,The Line,2016,Self-released,,
197,Icona Pop,I Love It [ft. Charli XCX],2012,Record Company Ten,Big Beat Records,
196,John Maus,Believer,2011,Ribbon Music,Upset the Rhythm,


### Counting artist appearances

In [38]:
# Lead artist of every entry where "Drake" appears in the artist or the
# featured-artist text.
# na=False makes rows with a missing value an explicit False, instead of
# leaving NaN in the mask and relying on `|` to coerce it.
df.artist[
    df.artist.str.contains("Drake", regex=False, na=False)
    | df.features.str.contains("Drake", regex=False, na=False)
]

rank
173       DJ Khaled
138           Drake
118    Travis Scott
109           Drake
65            Drake
21            Drake
15          Rihanna
Name: artist, dtype: object

In [39]:
# Distinct lead artists, in order of first appearance in `df`.
artists = df['artist'].drop_duplicates()
# Single-column frame with a fresh 0..n-1 index (the rank index is discarded).
artist_df = artists.to_frame().reset_index(drop=True)

In [69]:
# Per artist: number of entries where they are the lead artist OR their name
# occurs in the featured-artist text.
# na=False turns songs without a feature into an explicit False in the mask.
# Summing the boolean mask counts matches without building a filtered Series.
# NOTE(review): contains(..., regex=False) is a plain substring test, so a name
# that occurs inside a longer featured-artist name is counted as well.
appearances = artist_df.artist.apply(
    lambda a: (
        df['artist'].isin([a])
        | df.features.str.contains(a, regex=False, na=False)
    ).sum()
)
# Per artist: number of entries where they are the lead artist only.
appearances_no_ft = artist_df.artist.apply(lambda a: df.artist.isin([a]).sum())


In [70]:
# Add the lead-or-featured count as a column.
artist_df = artist_df.assign(appearances=appearances)

In [70]:
# Add the lead-only count as a column.
artist_df = artist_df.assign(appearances_no_ft=appearances_no_ft)

In [71]:
# Persist the per-artist counts without the positional index.
artist_df.to_csv(path_or_buf="../site/src/data/artist_counts.csv", index=False)

In [72]:
# Names of the ten artists with the most appearances.
artist_df.nlargest(10, 'appearances')['artist'].tolist()

['Drake',
 'Beyoncé',
 'Frank Ocean',
 'Nicki Minaj',
 'Kanye West',
 'Rihanna',
 'Future',
 'Charli XCX',
 'Rick Ross',
 'Robyn']

In [82]:
# Appearance counts of those same ten artists.
artist_df.nlargest(10, 'appearances')['appearances'].tolist()

[7, 5, 4, 4, 4, 4, 3, 3, 3, 3]

In [1]:
# Reload the saved per-artist counts from disk.
# Note: this rebinds `df`, which held the cleaned song list in earlier cells.
import pandas as pd
df = pd.read_csv(filepath_or_buffer="../site/src/data/artist_counts.csv")

In [18]:
# Artists with more than one appearance, most frequent first.
# The filter is evaluated on the already-sorted frame through a callable, so the
# mask shares its index; a mask built from the unsorted `df` makes pandas warn
# that the "Boolean Series key will be reindexed". Rows and order are unchanged.
df.sort_values(by='appearances', ascending=False).loc[lambda ranked: ranked['appearances'] > 1]

  """Entry point for launching an IPython kernel.


Unnamed: 0,artist,appearances,appearances_no_ft
60,Drake,7,4
18,Beyoncé,5,4
44,Kanye West,4,4
56,Rihanna,4,4
25,Nicki Minaj,4,3
16,Frank Ocean,4,4
55,Rick Ross,3,1
63,Robyn,3,3
12,Future,3,3
53,Charli XCX,3,2
