In [168]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import json
import math
import random
from pathlib import Path
from datetime import datetime

In [3]:
# Request headers used for the HTTP calls in this notebook. The User-Agent
# carries the project name, a contact address and the installed `requests`
# version.
headers = {'User-Agent': f'ds6021-project/0.0 (jhb7ek@virginia.edu) python-requests/{requests.__version__}'}
# Bare last expression: display the resulting dict as the cell output.
headers

{'User-Agent': 'ds6021-project/0.0 (jhb7ek@virginia.edu) python-requests/2.32.5'}

In [165]:
# Read the JSON file of review URLs into `urls`, then show the first entry
# as a quick sanity check.
urls = json.loads(Path("album_urls.json").read_text())
urls[0]

'https://www.pitchfork.com/reviews/albums/jeff-tweedy-twilight-override/'

#### Raw parse

In [None]:
# Exploratory parse of a single review page. Nothing below is guarded: a
# missing element raises immediately.
url_test = 'https://pitchfork.com/reviews/albums/bibio-ribbons/'
# Without an explicit timeout, requests waits indefinitely on a stalled
# connection.
r = requests.get(url_test, headers=headers, timeout=30)
# Stop on an HTTP error status instead of parsing an error page.
r.raise_for_status()
mysoup1 = BeautifulSoup(r.text, 'html.parser')

# Info-slice key tags, looked up once and shared by the label and review-date
# steps below.
info_keys = mysoup1.find_all('p', 'InfoSliceKey-gsmHBp')

## Album name
album = mysoup1.find('h1', attrs={'data-testid': 'ContentHeaderHed'}).string
## Artist
artist = mysoup1.find('div', 'SplitScreenContentHeaderArtist-fyEeJx').string
## Score
score = float(mysoup1.find('p', 'Rating-iQoWYo').string)
## Genre
genre = mysoup1.find_all('p', 'InfoSliceValue-yycxB')[0].string
## Label
# NOTE(review): positional lookup - assumes the second key tag is the label
# and the third is the review date; confirm against the page layout.
label = info_keys[1].find_next_sibling('p').string
## Reviewer
reviewer = mysoup1.find('span', attrs={'data-testid': "BylineName"}).find('a').string
## Album year
year = int(mysoup1.find('time', attrs={'data-testid': "SplitScreenContentHeaderReleaseYear"}).string)
## Review date
review_date = info_keys[2].find_next_sibling('p').string
review_date = datetime.strptime(review_date, "%B %d, %Y")
## Review length
# Word count of the `reviewBody` field in the page's JSON-LD script tag.
length = len(json.loads(mysoup1.find('script', type="application/ld+json").string)['reviewBody'].split())

#### Safe parse function

In [157]:
def parse_pitchfork_review(html):
    """Extract structured fields from the HTML of one Pitchfork album review.

    Every field is parsed on a best-effort basis: when an element is missing
    or its text cannot be converted, that field is set to ``None`` instead of
    raising, so one malformed field does not discard the whole review.

    Parameters
    ----------
    html : str
        Markup of a single review page.

    Returns
    -------
    dict
        Keys ``artist``, ``album``, ``genre``, ``label`` and ``reviewer``
        (str or None), ``score`` (float or None), ``year`` (int or None),
        ``review_date`` (datetime or None) and ``length`` (word count of the
        JSON-LD ``reviewBody``, int or None).
    """
    mysoup = BeautifulSoup(html, "html.parser")

    # Helper: safely extract element text
    def safe_text(selector, class_name=None, index=0, attr=None):
        try:
            if attr:
                el = mysoup.find(selector, attrs=attr)
            elif class_name:
                el = mysoup.find_all(selector, class_name)[index]
            else:
                el = mysoup.find_all(selector)[index]
            return el.get_text(strip=True)
        except (AttributeError, IndexError):
            # `find` returned None, or fewer than index + 1 elements matched.
            return None

    # Helper: text of the <p> that follows the info-slice key whose text is `key`
    def info_value(key):
        for key_tag in mysoup.find_all('p', 'InfoSliceKey-gsmHBp'):
            if key_tag.get_text(strip=True) == key:
                value_tag = key_tag.find_next_sibling('p')
                # A key without a following <p> yields None rather than raising.
                return value_tag.get_text(strip=True) if value_tag is not None else None
        return None

    # Artist & Album
    album = safe_text('h1', attr={'data-testid': 'ContentHeaderHed'})
    artist = safe_text('div', 'SplitScreenContentHeaderArtist-fyEeJx')

    # Score (float); rating text that is not a number becomes None
    score_text = safe_text('p', 'Rating-iQoWYo')
    try:
        score = float(score_text) if score_text else None
    except ValueError:
        score = None

    # Genre (first info-slice value)
    genre = safe_text('p', 'InfoSliceValue-yycxB', 0)

    # Label (find by "Label:" key)
    label = info_value("Label:")

    # Reviewer (text of the link inside the byline span)
    byline = mysoup.find('span', attrs={'data-testid': "BylineName"})
    byline_link = byline.find('a') if byline is not None else None
    reviewer = byline_link.get_text(strip=True) if byline_link is not None else None

    # Year (int)
    year = safe_text('time', attr={'data-testid': "SplitScreenContentHeaderReleaseYear"})
    year = int(year) if year and year.isdigit() else None

    # Review date (datetime), found by the "Reviewed:" key
    review_date = None
    review_date_str = info_value("Reviewed:")
    if review_date_str:
        try:
            review_date = datetime.strptime(review_date_str, "%B %d, %Y")
        except ValueError:
            # Text is not in "Month day, Year" form.
            review_date = None

    # Review length (word count of the JSON-LD reviewBody)
    try:
        ld_json = mysoup.find('script', type="application/ld+json")
        data = json.loads(ld_json.string)
        review_length = len(data['reviewBody'].split())
    except (AttributeError, KeyError, TypeError, ValueError):
        # No script tag, empty or invalid JSON, or no string `reviewBody`.
        review_length = None

    # Build structured output
    review_data = {
        "artist": artist,
        "album": album,
        "score": score,
        "genre": genre,
        "label": label,
        "reviewer": reviewer,
        "year": year,
        "review_date": review_date,
        "length": review_length,
    }

    return review_data

In [173]:
# Settings read by fetch_html and the scraping loop, plus the two lists the
# loop appends its results to.
# Passed as `timeout` to every session.get call.
TIMEOUT = 25
# Maximum number of attempts fetch_html makes per URL.
RETRIES = 3
BACKOFF = 2.0   # seconds; base delay that fetch_html scales per retry attempt
# The scraping loop writes a checkpoint CSV every BATCH_SIZE URLs.
BATCH_SIZE = 500
# One dict per successfully parsed review.
rows = []
# One {"url", "reason"} dict per URL that could not be fetched or parsed.
failures = []

In [170]:
def fetch_html(session: requests.Session, url):
    """Fetch ``url`` through ``session`` and return the response body.

    Makes up to ``RETRIES`` attempts. An attempt fails when the request raises
    ``requests.RequestException`` or the status code is outside 200-299.
    Between attempts the function sleeps ``BACKOFF * 2 ** (attempt - 1)``
    seconds (exponential backoff). It does not sleep after the last attempt,
    because nothing follows it.

    Parameters
    ----------
    session : requests.Session
        Session used to issue the GET request.
    url : str
        Address to fetch.

    Returns
    -------
    str or None
        ``resp.text`` of the first 2xx response, or ``None`` when every
        attempt failed.
    """
    for attempt in range(1, RETRIES + 1):
        try:
            resp = session.get(url, headers=headers, timeout=TIMEOUT)
            if 200 <= resp.status_code < 300:
                return resp.text
            # Non-2xx responses fall through and are retried like network errors.
        except requests.RequestException:
            # Network-level failure (timeout, connection error, ...): retry.
            pass
        # Only wait when another attempt is coming.
        if attempt < RETRIES:
            time.sleep(BACKOFF * 2 ** (attempt - 1))
    return None

In [None]:
# Create the output directory up front: the periodic checkpoints inside the
# loop write into it long before the final save.
Path("out").mkdir(exist_ok=True)

# Start from empty accumulators so that re-running this cell does not append
# duplicates to the results of an earlier run.
rows = []
failures = []

with requests.Session() as s:
    for i, url in enumerate(urls, 1):
        html = fetch_html(s, url)
        if not html:
            failures.append({"url": url, "reason": "fetch_failed"})
        else:
            try:
                data = parse_pitchfork_review(html)
                data["url"] = url  # keep the source URL
                rows.append(data)
            except Exception as e:
                # Record the failure and keep going; one bad page must not
                # abort the whole crawl.
                failures.append({"url": url, "reason": f"parse_error: {e}"})

        # polite tiny pause to reduce load (tune as needed)
        if i % 25 == 0:
            time.sleep(0.5)

        # Checkpoint every BATCH_SIZE urls. This runs for failed URLs too, so
        # a failure landing on a batch boundary no longer skips the checkpoint.
        if i % BATCH_SIZE == 0:
            df_batch = pd.DataFrame(rows)
            df_batch.to_csv(f"out/pitchfork_reviews_batch_{i}.csv", index=False)
            pd.DataFrame(failures).to_csv(f"out/pitchfork_failures_batch_{i}.csv", index=False)
            print(f"Checkpoint: processed {i} URLs ({len(rows)} parsed, {len(failures)} failed).")

df = pd.DataFrame(rows)

# Save outputs
df.to_csv("out/pitchfork_reviews.csv", index=False)
pd.DataFrame(failures).to_csv("out/pitchfork_failures.csv", index=False)
print(f"Done. {len(df)} succeeded, {len(failures)} failed.")

In [176]:
# Reload the scraped reviews from disk. `parse_dates` turns `review_date` back
# into datetimes; without it the CSV round-trip leaves the column as plain
# strings.
df = pd.read_csv('out/pitchfork_reviews.csv', parse_dates=['review_date'])
df.head()

Unnamed: 0,artist,album,score,genre,label,reviewer,year,review_date,length,url
0,Jeff Tweedy,Twilight Override,8.0,Rock,dBpm,Elizabeth Nelson,2025.0,2025-09-27,1262,https://www.pitchfork.com/reviews/albums/jeff-...
1,Geese,Getting Killed,9.0,Rock,Partisan / Play It Again Sam,Sam Sodomsky,2025.0,2025-09-26,827,https://www.pitchfork.com/reviews/albums/geese...
2,Piotr Kurek,Songs and Bodies,7.8,Experimental,Unsound,Philip Sherburne,2025.0,2025-09-26,602,https://www.pitchfork.com/reviews/albums/piotr...
3,Paul St. Hilaire,w/ the Producers,7.1,Electronic,Kynant / N.E.W.S.,Daniel Bromfield,2025.0,2025-09-26,556,https://www.pitchfork.com/reviews/albums/paul-...
4,Ø,Sysivalo,8.3,Electronic,Sähkö,Philip Sherburne,2025.0,2025-09-25,1170,https://www.pitchfork.com/reviews/albums/o-sys...


In [177]:
# Show the last rows of the reloaded frame as the cell output.
df.tail()

Unnamed: 0,artist,album,score,genre,label,reviewer,year,review_date,length,url
8921,,,9.0,Global,,Michael J. Agovino,,2017-03-07,1111,https://www.pitchfork.com/reviews/albums/22908...
8922,Elliott Smith,Either/Or: Expanded Edition,10.0,Rock,Kill Rock Stars,Matt LeMay,2017.0,2017-03-07,713,https://www.pitchfork.com/reviews/albums/22947...
8923,WHY?,Moh Lhean,7.7,Rap / Rock,Joyful Noise,Ian Cohen,2017.0,2017-03-07,720,https://www.pitchfork.com/reviews/albums/22945...
8924,DJ Seinfeld,Sunrise EP,6.9,Electronic,Endotherm,Jesse Weiss,2017.0,2017-03-06,642,https://www.pitchfork.com/reviews/albums/22937...
8925,The Magnetic Fields,50 Song Memoir,7.4,Rock,Nonesuch,Sam Sodomsky,2017.0,2017-03-06,916,https://www.pitchfork.com/reviews/albums/22930...


In [182]:
import plotly.express as px

In [194]:
# Keep rows whose album year is after 2016; rows with a missing year compare
# False and drop out as well.
# NOTE(review): the reason for the 2016 cutoff is not stated here - confirm.
df2 = df.loc[df['year'] > 2016]
# Title and axis labels let the figure stand on its own when skimmed.
px.box(
    df2,
    x='year',
    y='score',
    title='Review score by album year',
    labels={'year': 'Album year', 'score': 'Score'},
)

In [191]:
# Score against review word count. Title and axis labels let the figure
# stand on its own when skimmed.
px.scatter(
    df,
    x='length',
    y='score',
    title='Review score vs. review length',
    labels={'length': 'Review length (words)', 'score': 'Score'},
)