# Pitchfork Album Review Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
import numpy as np

!pip install eventlet

import eventlet
from eventlet.green.urllib.request import urlopen



## Step 1 - Album Review URL Scraping

In [None]:
# ALBUM REVIEW URL SCRAPING - using pooling

# Build the paginated listing URLs (pages 1 through 3).
# Widen the range to capture more listing pages; the highest valid page
# number grows as new reviews are published (reviews go back to 1999).
urls = [
    f"https://pitchfork.com/reviews/albums/?page={page_number}"
    for page_number in range(1, 4)
]

# fetch url function
def fetch(url):
    """Download one review-listing page and return the review links found on it.

    Parameters
    ----------
    url : str
        Address of an album-review listing page.

    Returns
    -------
    list of str
        One absolute URL ("https://pitchfork.com" + href) for every
        ``div.review > a`` anchor that has an ``href`` attribute.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the server stops responding within the
        timeout.
    """
    print("opening", url)
    # Without a timeout a single stalled connection blocks the scrape forever.
    html = requests.get(url, timeout=30).content
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = bs(html, "html.parser")
    # Anchors without an href would make the concatenation raise TypeError,
    # so they are skipped. The result is not called `urls`, to avoid
    # shadowing the notebook-level `urls` list.
    review_links = [
        "https://pitchfork.com" + anchor["href"]
        for anchor in soup.select("div.review > a")
        if anchor.get("href") is not None
    ]
    return review_links

# Map `fetch` over the listing pages through a green-thread pool (size 100)
# and collect the per-page link lists.
pool = eventlet.GreenPool(100)
urls_map_pool = pool.imap(fetch, urls)
lst_urls = [page_links for page_links in urls_map_pool]

# Collapse the list of per-page lists into one flat list of review links.
flat_list = [link for page_links in lst_urls for link in page_links]

# Single-column DataFrame holding the links.
df = pd.DataFrame(data=flat_list)

# Write the links to CSV (index included), then read the file back to check it.
df.to_csv(path_or_buf='album_review_urls.csv')
pd.read_csv('album_review_urls.csv')

## Step 2 - Extraction Helpers That Return NaN for Missing Values

In [2]:
# REPLACE NULL VALUES FUNCTIONS

# CSS selector passed to rating_extract
rating_key = "div.score-circle > span.score"

# album_rating function
def rating_extract(soup, key):
    """Return the text of the first element matching ``key``.

    Parameters
    ----------
    soup : object exposing ``select(key)``
        Parsed review page.
    key : str
        CSS selector to look up.

    Returns
    -------
    str or float
        ``.text`` of the first match, or ``np.nan`` when nothing matches.
    """
    matches = soup.select(key)
    return np.nan if matches == [] else matches[0].text

# CSS selector passed to album_name_extract
album_name_key = "h1"

# album_name function
def album_name_extract(soup, key):
    """Return the text of the first element matching ``key``.

    Parameters
    ----------
    soup : object exposing ``select(key)``
        Parsed review page.
    key : str
        CSS selector to look up.

    Returns
    -------
    str or float
        ``.text`` of the first match, or ``np.nan`` when nothing matches.
    """
    found = soup.select(key)
    if found != []:
        return found[0].text
    # Nothing matched the selector: report the value as missing.
    return np.nan

# CSS selector passed to artist_name_extract
artist_name_key = "h2"

# artist_name function
def artist_name_extract(soup, key):
    """Return the text of the first element matching ``key``.

    The text is obtained with ``get_text(separator="</a>")``, so when the
    element holds several text fragments (e.g. several artists) they come
    back joined by the literal string ``</a>``.

    Parameters
    ----------
    soup : object exposing ``select(key)``
        Parsed review page.
    key : str
        CSS selector to look up.

    Returns
    -------
    str or float
        Joined text of the first match, or ``np.nan`` when nothing matches.
    """
    hits = soup.select(key)
    nothing_found = hits == []
    if nothing_found:
        return np.nan
    first_hit = hits[0]
    return first_hit.get_text(separator="</a>")

# CSS selector passed to year_extract
year_key = "span.single-album-tombstone__meta-year"

# album_year function
def year_extract(soup, key):
    """Return the first run of digits in the first element matching ``key``.

    Parameters
    ----------
    soup : object exposing ``select(key)``
        Parsed review page.
    key : str
        CSS selector to look up.

    Returns
    -------
    str or float
        The first ``\\d+`` match in the element's ``.text`` (as a string),
        or ``np.nan`` when no element matches or its text has no digits.
    """
    matches = soup.select(key)
    # Check for an empty result before indexing; indexing an empty match
    # list would raise IndexError instead of yielding a missing value.
    if not matches:
        return np.nan
    digits = re.search(r"\d+", matches[0].text)
    if digits is None:
        return np.nan
    return digits.group()

# CSS selector passed to label_extract
label_key = "li.labels-list__item"

# record_label function
def label_extract(soup, key):
    """Return the text of the first element matching ``key``.

    Parameters
    ----------
    soup : object exposing ``select(key)``
        Parsed review page.
    key : str
        CSS selector to look up.

    Returns
    -------
    str or float
        ``.text`` of the first match, or ``np.nan`` when nothing matches.
    """
    candidates = soup.select(key)
    result = np.nan
    if not (candidates == []):
        result = candidates[0].text
    return result

## Step 3 - Import Collected Album Review URLs for Scraping

In [3]:
# url list pull from csv

# Load the saved review links and discard the "Unnamed: 0" column.
album_review_urls_df = pd.read_csv('album_review_urls.csv').drop(columns="Unnamed: 0")
# Column "0" holds the links; expose them as a plain list under both names.
urls = album_review_urls_lst = list(album_review_urls_df["0"])

urls

['https://pitchfork.com/reviews/albums/rosie-lowe-duval-timothy-son/',
 'https://pitchfork.com/reviews/albums/jon-hopkins-music-for-psychedelic-therapy/',
 'https://pitchfork.com/reviews/albums/irreversible-entanglements-open-the-gates/',
 'https://pitchfork.com/reviews/albums/amine-twopointfive/',
 'https://pitchfork.com/reviews/albums/idles-crawler/',
 'https://pitchfork.com/reviews/albums/speedy-ortiz-the-death-of-speedy-ortiz-and-cop-kickerforever/',
 'https://pitchfork.com/reviews/albums/makthaverskan-for-allting/',
 'https://pitchfork.com/reviews/albums/nation-of-language-a-way-forward/',
 'https://pitchfork.com/reviews/albums/silk-sonic-an-evening-with-silk-sonic/',
 'https://pitchfork.com/reviews/albums/lukah-why-look-up-gods-in-the-mirror/',
 'https://pitchfork.com/reviews/albums/mmm-on-the-edge/',
 'https://pitchfork.com/reviews/albums/lee-ranaldo-in-virus-times-ep/',
 'https://pitchfork.com/reviews/albums/taylor-swift-red-taylors-version/',
 'https://pitchfork.com/reviews/al

## Step 4 (Old, superseded by Step 5) - Develop & Run Functions for Each Data Value Scraped

In [4]:
# *fetch album ratings function

# album ratings function
def fetch_ratings(url):
    """Download one album review page and return its rating.

    Parameters
    ----------
    url : str
        Address of the review page.

    Returns
    -------
    str or float
        Whatever ``rating_extract`` yields for ``rating_key``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the server stops responding within the
        timeout.
    """
    print("opening", url)
    # Without a timeout a single stalled connection blocks the scrape forever.
    html = requests.get(url, timeout=30).content
    # Explicit parser: keeps the parse independent of which optional parsers
    # are installed and avoids bs4's "no parser specified" warning.
    soup = bs(html, "html.parser")
    return rating_extract(soup, rating_key)

# Map fetch_ratings over every review URL through a green-thread pool,
# then collect the mapped results into a list.
pool = eventlet.GreenPool(25000)
ratings_map_pool = pool.imap(fetch_ratings, urls)
lst_ratings = [rating for rating in ratings_map_pool]

lst_ratings

opening https://pitchfork.com/reviews/albums/model-home-both-feet-en-th-infinite/
opening https://pitchfork.com/reviews/albums/diana-ross-thank-you/
opening https://pitchfork.com/reviews/albums/xeno-and-oaklander-video/
opening https://pitchfork.com/reviews/albums/mick-jenkins-elephant-in-the-room/
opening https://pitchfork.com/reviews/albums/dijon-absolutely/
opening https://pitchfork.com/reviews/albums/hyd-hyd-ep/
opening https://pitchfork.com/reviews/albums/shubh-saran-inglish/
opening https://pitchfork.com/reviews/albums/mortiferum-preserved-in-torment/
opening https://pitchfork.com/reviews/albums/white-town-women-in-technology/
opening https://pitchfork.com/reviews/albums/rem-new-adventures-in-hi-fi-25th-anniversary-edition/
opening https://pitchfork.com/reviews/albums/radiohead-kid-a-mnesia/
opening https://pitchfork.com/reviews/albums/abba-voyage/
opening https://pitchfork.com/reviews/albums/darius-jones-raw-demoon-alchemy-a-lone-operation/
opening https://pitchfork.com/reviews/

KeyboardInterrupt: 

In [None]:
# *fetch album names function

# album names function
def fetch_album_names(url):
    """Download one album review page and return its album name.

    Parameters
    ----------
    url : str
        Address of the review page.

    Returns
    -------
    str or float
        Whatever ``album_name_extract`` yields for ``album_name_key``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the server stops responding within the
        timeout.
    """
    print("opening", url)
    # Without a timeout a single stalled connection blocks the scrape forever.
    html = requests.get(url, timeout=30).content
    # Explicit parser: keeps the parse independent of which optional parsers
    # are installed and avoids bs4's "no parser specified" warning.
    soup = bs(html, "html.parser")
    return album_name_extract(soup, album_name_key)

# Map fetch_album_names over every review URL through a green-thread pool,
# then collect the mapped results into a list.
pool = eventlet.GreenPool(25000)
album_names_map_pool = pool.imap(fetch_album_names, urls)
lst_album_names = [name for name in album_names_map_pool]

lst_album_names

In [None]:
# *fetch artist names function

# artist names function
def fetch_artist_names(url):
    """Download one album review page and return its artist name(s).

    Parameters
    ----------
    url : str
        Address of the review page.

    Returns
    -------
    str or float
        Whatever ``artist_name_extract`` yields for ``artist_name_key``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the server stops responding within the
        timeout.
    """
    print("opening", url)
    # Without a timeout a single stalled connection blocks the scrape forever.
    html = requests.get(url, timeout=30).content
    # Explicit parser: keeps the parse independent of which optional parsers
    # are installed and avoids bs4's "no parser specified" warning.
    soup = bs(html, "html.parser")
    return artist_name_extract(soup, artist_name_key)

# Map fetch_artist_names over every review URL through a green-thread pool,
# then collect the mapped results into a list.
pool = eventlet.GreenPool(25000)
artist_names_map_pool = pool.imap(fetch_artist_names, urls)
lst_artist_names = [artist for artist in artist_names_map_pool]

lst_artist_names

In [None]:
# *fetch album years function

# album years function
def fetch_album_years(url):
    """Download one album review page and return its release year.

    Parameters
    ----------
    url : str
        Address of the review page.

    Returns
    -------
    str or float
        Whatever ``year_extract`` yields for ``year_key``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the server stops responding within the
        timeout.
    """
    print("opening", url)
    # Without a timeout a single stalled connection blocks the scrape forever.
    html = requests.get(url, timeout=30).content
    # Explicit parser: keeps the parse independent of which optional parsers
    # are installed and avoids bs4's "no parser specified" warning.
    soup = bs(html, "html.parser")
    return year_extract(soup, year_key)

# Map fetch_album_years over every review URL through a green-thread pool,
# then collect the mapped results into a list.
pool = eventlet.GreenPool(25000)
album_years_map_pool = pool.imap(fetch_album_years, urls)
lst_album_years = [year for year in album_years_map_pool]

lst_album_years

In [None]:
# *fetch record labels function

# record labels function
def fetch_record_labels(url):
    """Download one album review page and return its record label.

    Parameters
    ----------
    url : str
        Address of the review page.

    Returns
    -------
    str or float
        Whatever ``label_extract`` yields for ``label_key``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the server stops responding within the
        timeout.
    """
    print("opening", url)
    # Without a timeout a single stalled connection blocks the scrape forever.
    html = requests.get(url, timeout=30).content
    # Explicit parser: keeps the parse independent of which optional parsers
    # are installed and avoids bs4's "no parser specified" warning.
    soup = bs(html, "html.parser")
    return label_extract(soup, label_key)

# Map fetch_record_labels over every review URL through a green-thread pool,
# then collect the mapped results into a list.
pool = eventlet.GreenPool(25000)
record_labels_map_pool = pool.imap(fetch_record_labels, urls)
lst_record_labels = [label for label in record_labels_map_pool]

lst_record_labels

## Step 5 - Combine & Run ALL Functions for Each Data Value Scraped

In [4]:
# *fetch ALL values at same time / combine all functions together

def fetch_all(url):
    """Download one album review page once and extract four values from it.

    Parameters
    ----------
    url : str
        Address of the review page.

    Returns
    -------
    list
        ``[record_label, artist_name, album_name, album_rating]``, each the
        result of the matching ``*_extract`` helper. The release year is not
        part of the result.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the server stops responding within the
        timeout.
    """
    print("opening", url)
    # Without a timeout a single stalled connection blocks the scrape forever.
    html = requests.get(url, timeout=30).content
    # Explicit parser: keeps the parse independent of which optional parsers
    # are installed and avoids bs4's "no parser specified" warning.
    soup = bs(html, "html.parser")
    record_labels = label_extract(soup, label_key)
    artist_names = artist_name_extract(soup, artist_name_key)
    album_names = album_name_extract(soup, album_name_key)
    album_ratings = rating_extract(soup, rating_key)
    # Order matters: downstream the positions become DataFrame columns 0-3.
    return [record_labels, artist_names, album_names, album_ratings]


In [5]:
# Map fetch_all over every review URL through a green-thread pool,
# then collect the per-review value lists into one list.
pool = eventlet.GreenPool(25000)
all_map_pool = pool.imap(fetch_all, urls)
lst_all = [review_values for review_values in all_map_pool]

lst_all

opening https://pitchfork.com/reviews/albums/rosie-lowe-duval-timothy-son/
opening https://pitchfork.com/reviews/albums/jon-hopkins-music-for-psychedelic-therapy/
opening https://pitchfork.com/reviews/albums/irreversible-entanglements-open-the-gates/
opening https://pitchfork.com/reviews/albums/amine-twopointfive/
opening https://pitchfork.com/reviews/albums/idles-crawler/
opening https://pitchfork.com/reviews/albums/speedy-ortiz-the-death-of-speedy-ortiz-and-cop-kickerforever/
opening https://pitchfork.com/reviews/albums/makthaverskan-for-allting/
opening https://pitchfork.com/reviews/albums/nation-of-language-a-way-forward/
opening https://pitchfork.com/reviews/albums/silk-sonic-an-evening-with-silk-sonic/
opening https://pitchfork.com/reviews/albums/lukah-why-look-up-gods-in-the-mirror/
opening https://pitchfork.com/reviews/albums/mmm-on-the-edge/
opening https://pitchfork.com/reviews/albums/lee-ranaldo-in-virus-times-ep/
opening https://pitchfork.com/reviews/albums/taylor-swift-red

[['Carrying Colour', 'Rosie Lowe</a>Duval Timothy', 'Son', '8.0'],
 ['Domino', 'Jon Hopkins', 'Music for Psychedelic Therapy', '7.4'],
 ['Don Giovanni', 'Irreversible Entanglements', 'Open the Gates', '7.8'],
 ['CLBN ', 'Aminé', 'TwoPointFive', '7.0'],
 ['Partisan', 'IDLES', 'Crawler', '7.0'],
 ['Carpark',
  'Speedy Ortiz',
  'The Death of Speedy Ortiz & Cop Kicker...Forever',
  '7.4'],
 ['Run for Cover', 'Makthaverskan', 'För Allting', '7.6'],
 ['PIAS', 'Nation of Language', 'A Way Forward', '7.0'],
 ['Atlantic',
  'Bruno Mars</a>Anderson .Paak</a>Silk Sonic',
  'An Evening With Silk Sonic',
  '6.4'],
 ['FXCK RXP', 'Lukah', 'Why Look Up, God’s in the Mirror', '7.6'],
 ['self-released', 'MMM', 'On the Edge', '7.4'],
 ['Mute', 'Lee Ranaldo', 'In Virus Times EP', '7.0'],
 ['Republic', 'Taylor Swift', 'Red (Taylor’s Version)', '8.5'],
 ['Rhymesayers', 'Aesop Rock</a>Blockhead', 'Garbology', '7.8'],
 ['Sargent House', 'Emma Ruth Rundle', 'Engine of Hell', '7.2'],
 ['Spinster', 'Doran', 'Do

## Step 6 - Create Dataframe From Lists and Export to CSV

In [6]:
# Build one DataFrame row per scraped review from lst_all, then display it.

final_pitchfork_df = pd.DataFrame.from_records(data=lst_all)

final_pitchfork_df

Unnamed: 0,0,1,2,3
0,Carrying Colour,Rosie Lowe</a>Duval Timothy,Son,8.0
1,Domino,Jon Hopkins,Music for Psychedelic Therapy,7.4
2,Don Giovanni,Irreversible Entanglements,Open the Gates,7.8
3,CLBN,Aminé,TwoPointFive,7.0
4,Partisan,IDLES,Crawler,7.0
...,...,...,...,...
5721,Pan,Steven Warwick,Nadir,7.6
5722,A Country Called Earth,Yasiin Bey,December 99th,3.5
5723,Babygrande,Smoke DZA</a>Pete Rock,Don’t Smoke Rock,7.4
5724,Soul Jazz,Various Artists,Punk45: Les Punks: The French Connection (The ...,6.6


In [7]:
# Export the scraped review table to CSV (index column included).
final_pitchfork_df.to_csv("final_pf_df_jan17_fwd.csv")

In [9]:
# Read the exported CSV back to check what was written.
pd.read_csv(filepath_or_buffer="final_pf_df_jan17_fwd.csv")

Unnamed: 0.1,Unnamed: 0,0,1,2,3
0,0,Carrying Colour,Rosie Lowe</a>Duval Timothy,Son,8.0
1,1,Domino,Jon Hopkins,Music for Psychedelic Therapy,7.4
2,2,Don Giovanni,Irreversible Entanglements,Open the Gates,7.8
3,3,CLBN,Aminé,TwoPointFive,7.0
4,4,Partisan,IDLES,Crawler,7.0
...,...,...,...,...,...
5721,5721,Pan,Steven Warwick,Nadir,7.6
5722,5722,A Country Called Earth,Yasiin Bey,December 99th,3.5
5723,5723,Babygrande,Smoke DZA</a>Pete Rock,Don’t Smoke Rock,7.4
5724,5724,Soul Jazz,Various Artists,Punk45: Les Punks: The French Connection (The ...,6.6
