In [13]:

import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root from which every index-page URL below is derived.
BASE_URL = "https://www.thrashermagazine.com"

# Index pages to crawl: the bare tag landing page followed by the explicitly
# numbered pages 1 through 5.
# NOTE(review): the landing page and page/1/ may serve the same listing --
# confirm against the site; downstream de-duplication would hide the overlap.
interviews_index = f"{BASE_URL}/tag/interviews/"
page_urls = [
    interviews_index,
    *(f"{interviews_index}page/{page_number}/" for page_number in range(1, 6)),
]

print("Libraries imported and base URLs set.")
print("Pages to scrape:", page_urls)


Libraries imported and base URLs set.
Pages to scrape: ['https://www.thrashermagazine.com/tag/interviews/', 'https://www.thrashermagazine.com/tag/interviews/page/1/', 'https://www.thrashermagazine.com/tag/interviews/page/2/', 'https://www.thrashermagazine.com/tag/interviews/page/3/', 'https://www.thrashermagazine.com/tag/interviews/page/4/', 'https://www.thrashermagazine.com/tag/interviews/page/5/']


In [14]:

# Walk every index page in `page_urls` and collect the article URLs it lists.
# Result: `interview_links`, a list of absolute URLs in first-seen order with
# no duplicates. Pages that fail to download or lack the expected markup are
# reported and skipped rather than aborting the crawl.
interview_links = []
seen_links = set()  # constant-time duplicate check; the list preserves order

for page_number, url in enumerate(page_urls):
    if page_number:
        # Pause before every request after the first so the delay applies even
        # when the previous page was skipped through one of the `continue`s.
        time.sleep(1)

    print(f"Processing index page: {url}")
    try:
        # Without a timeout a single stalled connection blocks the cell forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"  Failed to fetch {url}: {exc}")
        continue

    if response.status_code != 200:
        print(f"  Failed to fetch {url} with status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    post_list = soup.find('ul', class_='post-list')
    if post_list is None:
        print("  No post list found on the page.")
        continue

    for li in post_list.find_all('li', class_='post-list-item'):
        a_tag = li.find('a', class_='post-title-link')
        raw_href = a_tag.get('href') if a_tag is not None else None
        if not raw_href:
            # No title link, or a missing/empty href: nothing to record.
            continue

        # urljoin resolves the href against the page it came from, so absolute,
        # root-relative, path-relative and protocol-relative links all work.
        href = urljoin(url, raw_href)
        if href not in seen_links:
            seen_links.add(href)
            interview_links.append(href)

print(f"Total interview links found: {len(interview_links)}")
print(interview_links[:5])


Processing index page: https://www.thrashermagazine.com/tag/interviews/
Processing index page: https://www.thrashermagazine.com/tag/interviews/page/1/
Processing index page: https://www.thrashermagazine.com/tag/interviews/page/2/
Processing index page: https://www.thrashermagazine.com/tag/interviews/page/3/
Processing index page: https://www.thrashermagazine.com/tag/interviews/page/4/
Processing index page: https://www.thrashermagazine.com/tag/interviews/page/5/
Total interview links found: 60
['https://www.thrashermagazine.com/articles/the-follow-up-alexey-krasniy/', 'https://www.thrashermagazine.com/articles/chris-russell-breaks-out-interview/', 'https://www.thrashermagazine.com/articles/the-follow-up-adam-davies/', 'https://www.thrashermagazine.com/articles/vitoria-mendonca-on-heelflips-homies-and-going-pro/', 'https://www.thrashermagazine.com/articles/spanky-five-greats/']


In [15]:

# Download each article in `interview_links` and keep the plain text of its
# body. Result: `interviews`, a list of strings (one per article that yielded
# text). Failures are reported and skipped so one bad page does not end the run.
interviews = []

for idx, url in enumerate(interview_links):
    print(f"Scraping interview {idx+1}/{len(interview_links)}: {url}")
    try:
        # Without a timeout a single stalled connection blocks the cell forever.
        page_response = requests.get(url, timeout=10)
        if page_response.status_code != 200:
            print(f"  Skipping {url} due to status code: {page_response.status_code}")
            continue

        page_soup = BeautifulSoup(page_response.text, 'html.parser')

        # The text lives in div.body-text nested inside div.article-text.
        article_div = page_soup.find('div', class_='article-text')
        body_div = article_div.find('div', class_='body-text') if article_div is not None else None

        interview_text = (
            body_div.get_text(separator='\n', strip=True) if body_div is not None else ""
        )
        if interview_text:
            interviews.append(interview_text)
        else:
            # Covers both a missing container and a container with no text, so
            # blank entries never reach the output or inflate the final count.
            print(f"  Interview content not found in {url}")

    except Exception as e:
        # Deliberately broad: this is a best-effort crawl, and any single-page
        # failure (network or parsing) should be logged and skipped.
        print(f"Error scraping {url}: {e}")
    finally:
        # Runs on success, on `continue` and on error alike, so a streak of
        # failing pages cannot turn into a burst of back-to-back requests.
        time.sleep(1)

print(f"Successfully scraped {len(interviews)} interviews out of {len(interview_links)} links.")


Scraping interview 1/60: https://www.thrashermagazine.com/articles/the-follow-up-alexey-krasniy/
Scraping interview 2/60: https://www.thrashermagazine.com/articles/chris-russell-breaks-out-interview/
Scraping interview 3/60: https://www.thrashermagazine.com/articles/the-follow-up-adam-davies/
Scraping interview 4/60: https://www.thrashermagazine.com/articles/vitoria-mendonca-on-heelflips-homies-and-going-pro/
Scraping interview 5/60: https://www.thrashermagazine.com/articles/spanky-five-greats/
Scraping interview 6/60: https://www.thrashermagazine.com/articles/kevin-baekkel-s-gangreen-interview/
Scraping interview 7/60: https://www.thrashermagazine.com/articles/kevin-baekkel-interview/
Scraping interview 8/60: https://www.thrashermagazine.com/articles/sebo-walker-interview/
Scraping interview 9/60: https://www.thrashermagazine.com/articles/windsor-james-interview/
Scraping interview 10/60: https://www.thrashermagazine.com/articles/justin-figgy-figueroa-interview/
Scraping interview 11/

In [16]:

# Write all scraped interviews into one UTF-8 text file. Each interview is
# followed by a "---" divider line surrounded by blank lines (including the
# last one). An existing file with the same name is overwritten.
output_filename = "thrasher_interviews.txt"
separator = "\n\n---\n\n"

with open(output_filename, mode='w', encoding='utf-8') as out_file:
    out_file.writelines(text + separator for text in interviews)

print(f"Interviews saved to {output_filename}.")


Interviews saved to thrasher_interviews.txt.
