In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# URL of Taylor Swift's reviews
url = "https://pitchfork.com/artists/28495-taylor-swift/review/"



In [2]:
# Fetch the webpage
response = requests.get(url)
if response.status_code == 200:
    print("Page fetched successfully!")
else:
    print(f"Failed to fetch the page. Status code: {response.status_code}")
    exit()

# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')

Page fetched successfully!


In [6]:
# Extract review items
reviews = soup.find_all('a', class_="SummaryItemHedLink-civMjp PNQqc summary-item-tracking__hed-link summary-item__hed-link")
data = []

for review in reviews:
    # Extract title and link
    title_tag = review.find('h3', class_="SummaryItemHedBase-hiFYpQ jwYeiM summary-item__hed")
    link = review['href']
    
    # Some titles might be in <em> tags; handle gracefully
    title = title_tag.text if title_tag else "No Title"
    full_link = f"https://pitchfork.com{link}"

    data.append({"Title": title, "Link": full_link})

# Save to DataFrame
df = pd.DataFrame(data)

# Export to CSV
df.to_csv("taylor_swift_reviews.csv", index=False)

print("Scraping complete! Data saved to 'taylor_swift_reviews.csv'.")

Scraping complete! Data saved to 'taylor_swift_reviews.csv'.


In [4]:
# Load the CSV with review links
input_csv = "taylor_swift_reviews.csv"  # Replace with your actual file name
reviews_df = pd.read_csv(input_csv)

# Ensure the "Link" and "Title" columns exist
if "Link" not in reviews_df.columns or "Title" not in reviews_df.columns:
    print("The CSV file must contain 'Link' and 'Title' columns!")
    exit()

# Extract the titles and links
review_titles = reviews_df["Title"].tolist()
review_links = reviews_df["Link"].tolist()

# List to store the scraped data
review_data = []

In [5]:
# Scrape each review
for idx, link in enumerate(review_links):
    try:
        response = requests.get(link)
        if response.status_code != 200:
            print(f"Failed to fetch {link}: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Title from the CSV
        title = review_titles[idx]

        # Extract the author
        author_tag = soup.find('a', class_='byline__name-link')
        author = author_tag.text.strip() if author_tag else "No Author"

        # Extract the date (look for sibling after "Reviewed:")
        date_key = soup.find('p', text="Reviewed:")
        date_tag = date_key.find_next_sibling('p') if date_key else None
        date = date_tag.text.strip() if date_tag else "No Date"

        # Extract the score
        score_tag = soup.find('div', class_='ScoreCircle-jAxRuP')
        score = score_tag.find('p').text.strip() if score_tag else "No Score"

        # Extract the review text from multiple containers
        review_text = ""

        # Primary container
        primary_container = soup.find('div', class_='body__inner-container')
        if primary_container:
            paragraphs = primary_container.find_all('p')
            for p in paragraphs:
                # Handle anchor tags in the paragraph
                for a in p.find_all('a'):
                    a.insert_before(' ')  # Add space before the <a> tag
                    a.insert_after(' ')   # Add space after the <a> tag
                    a.unwrap()  # Unwrap the anchor tag to get its text content
                review_text += p.get_text(strip=True) + " "

        # Additional container(s)
        additional_containers = soup.find_all('div', class_='BodyWrapper-kufPGa')  # Adjust based on class
        for container in additional_containers:
            paragraphs = container.find_all('p')
            for p in paragraphs:
                # Handle anchor tags in the paragraph
                for a in p.find_all('a'):
                    a.insert_before(' ')  # Add space before the <a> tag
                    a.insert_after(' ')   # Add space after the <a> tag
                    a.unwrap()  # Unwrap the anchor tag to get its text content
                review_text += p.get_text(strip=True) + " "
                
        # Append data to the list
        review_data.append({
            "Title": title,
            "Author": author,
            "Date": date,
            "Score": score,
            "Text": review_text,
            "Link": link
        })

        print(f"Successfully scraped: {title}")

        # Avoid hitting the server too quickly
        time.sleep(1)

    except Exception as e:
        print(f"Error scraping {link}: {e}")
        continue

  date_key = soup.find('p', text="Reviewed:")


Successfully scraped: The Tortured Poets Department / The Anthology
Successfully scraped: 1989 (Taylor’s Version)
Successfully scraped: Speak Now (Taylor’s Version)
Successfully scraped: Midnights
Successfully scraped: Red (Taylor’s Version)
Successfully scraped: Fearless (Taylor’s Version)
Successfully scraped: Evermore
Successfully scraped: Folklore
Successfully scraped: Lover
Successfully scraped: Red
Successfully scraped: Fearless
Successfully scraped: Speak Now
Successfully scraped: Taylor Swift
Successfully scraped: 1989
Successfully scraped: Reputation


In [6]:
# Save the scraped data to a new CSV file
output_csv = "taylor_swift_review_details.csv"
df = pd.DataFrame(review_data)
df.to_csv(output_csv, index=False)

print(f"Scraping complete! Data saved to '{output_csv}'.")

Scraping complete! Data saved to 'taylor_swift_review_details.csv'.
