In [2]:
import requests
from bs4 import BeautifulSoup
import time


def get_page_soup(url, timeout=15):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot hang the notebook.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend,
        or ``None`` when the request raises or the status code is not 200.
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }
    print(f"Fetching URL: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report network-level failures the same way as a bad status code so
        # callers, which already handle None, can carry on with other pages.
        print(f"Failed to retrieve page: {url} ({exc})")
        return None
    if response.status_code != 200:
        print(f"Failed to retrieve page: {url} (status code {response.status_code})")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def get_article_links_from_page(url):
    """Collect the title-link URLs of every article card on a listing page.

    Args:
        url: Address of the listing page to fetch via ``get_page_soup``.

    Returns:
        A list of ``href`` values, in document order, taken from the
        ``h3.elementor-post__title a`` anchor inside each
        ``article.elementor-post`` element. Empty when the page could not
        be fetched or no matching anchors carry an ``href``.
    """
    soup = get_page_soup(url)
    if not soup:
        return []

    # One (possibly missing) title anchor per article card.
    title_anchors = (
        post.select_one("h3.elementor-post__title a")
        for post in soup.select("article.elementor-post")
    )
    # Skip cards without a title anchor, then anchors without an href.
    hrefs = (anchor.get("href") for anchor in title_anchors if anchor)
    return [href for href in hrefs if href]

def get_all_article_links(start_url, max_pages=None):
    """Walk the paginated listing and gather every unique article link.

    Page 1 is ``start_url`` itself; later pages are requested as
    ``<start_url>/page/<n>/``. Pagination stops at the first page that
    contributes no link that has not been seen already (which includes an
    empty or failed page), or after ``max_pages`` pages when a cap is given.

    Args:
        start_url: URL of the first listing page.
        max_pages: Optional upper bound on the number of pages to request.
            ``None`` (the default) means no cap.

    Returns:
        A list of unique links in the order they were first encountered, so
        repeated runs over the same listing yield the same ordering.
    """
    base_url = start_url.rstrip('/')
    seen = set()
    all_links = []
    page_num = 1
    while max_pages is None or page_num <= max_pages:
        page_url = start_url if page_num == 1 else f'{base_url}/page/{page_num}/'
        print(f"Processing page: {page_url}")
        links = get_article_links_from_page(page_url)
        # dict.fromkeys drops duplicates within the page while keeping order;
        # the `seen` filter drops links already collected from earlier pages.
        new_links = [link for link in dict.fromkeys(links) if link not in seen]
        if not new_links:
            # Also covers a server that answers out-of-range page numbers
            # with a page we already processed, which would otherwise make
            # this loop run forever.
            print("No more articles found, stopping pagination.")
            break
        seen.update(new_links)
        all_links.extend(new_links)
        page_num += 1
        time.sleep(1)  # pause between listing requests
    return all_links


def scrape_article_text(url):
    """Return the plain text of an article's main content area.

    Args:
        url: Address of the article page to fetch via ``get_page_soup``.

    Returns:
        The text of the ``div.elementor-widget-theme-post-content`` element
        with all ``<figure>`` elements removed, pieces joined by newlines and
        stripped of surrounding whitespace. An empty string when the page
        could not be fetched or the content element is missing.
    """
    page = get_page_soup(url)
    if not page:
        return ""

    body = page.select_one("div.elementor-widget-theme-post-content")
    if not body:
        print("Main content not found for:", url)
        return ""

    # Remove figures from the tree before extracting text so their contents
    # do not end up in the result.
    for figure in body.find_all("figure"):
        figure.decompose()

    return body.get_text(separator="\n", strip=True)


# --- Scrape every interview and store the combined text in one file ---

start_url = "https://skatejawn.com/category/print-content/interview/"
output_filename = "skatejawn_interviews.txt"

# Discover the article URLs by walking the paginated listing.
article_links = get_all_article_links(start_url)
print(f"Total interviews found: {len(article_links)}")

# Fetch each article in turn, keeping only those that yielded text.
all_articles_text = []
for link in article_links:
    print(f"Scraping interview: {link}")
    article_text = scrape_article_text(link)
    if not article_text:
        print("No content extracted for:", link)
    else:
        all_articles_text.append(article_text)
    time.sleep(1)  # pause between article requests

# Articles are separated by a blank line, an 80-character rule of '=' and
# another blank line.
combined_text = ("\n\n" + ("=" * 80) + "\n\n").join(all_articles_text)

with open(output_filename, "w", encoding="utf-8") as f:
    f.write(combined_text)

print(f"Scraping complete. All interview texts have been saved to '{output_filename}'.")


Processing page: https://skatejawn.com/category/print-content/interview/
Fetching URL: https://skatejawn.com/category/print-content/interview/
Processing page: https://skatejawn.com/category/print-content/interview/page/2/
Fetching URL: https://skatejawn.com/category/print-content/interview/page/2/
Processing page: https://skatejawn.com/category/print-content/interview/page/3/
Fetching URL: https://skatejawn.com/category/print-content/interview/page/3/
Processing page: https://skatejawn.com/category/print-content/interview/page/4/
Fetching URL: https://skatejawn.com/category/print-content/interview/page/4/
Processing page: https://skatejawn.com/category/print-content/interview/page/5/
Fetching URL: https://skatejawn.com/category/print-content/interview/page/5/
Processing page: https://skatejawn.com/category/print-content/interview/page/6/
Fetching URL: https://skatejawn.com/category/print-content/interview/page/6/
Processing page: https://skatejawn.com/category/print-content/interview/

In [4]:
import re

def clean_text(text):
    """Normalise a piece of text for downstream processing.

    The text is lowercased, anything that looks like an HTML tag
    (``<...>``) is deleted, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # (pattern, replacement) pairs applied in order: tags first, then
    # whitespace, so gaps left by removed tags are collapsed too.
    substitutions = (
        (r'<[^>]+>', ''),
        (r'\s+', ' '),
    )
    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# --- Clean the scraped interview dump one line at a time ---

input_file = "skatejawn_interviews.txt"
output_file = "skate_jawn_cleaned.txt"

# Stays 0 when the input file has no lines; otherwise the loop below leaves
# it at the number of lines read.
total_lines_processed = 0

with open(input_file, "r", encoding="utf-8") as infile:
    with open(output_file, "w", encoding="utf-8") as outfile:
        for total_lines_processed, line in enumerate(infile, start=1):
            cleaned_line = clean_text(line)
            if not cleaned_line:
                # Lines that are empty after cleaning are not written out.
                continue
            outfile.write(cleaned_line + "\n")

print(f"Total lines processed: {total_lines_processed}")
print(f"Cleaned data saved to: {output_file}")


Total lines processed: 11398
Cleaned data saved to: skate_jawn_cleaned.txt
