In [15]:
import requests
from bs4 import BeautifulSoup
import time


def get_page_soup(url):
    print(f"Fetching URL: {url}")
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve page: {url} (status code {response.status_code})")
        return None
    return BeautifulSoup(response.text, 'html.parser')

def get_article_links_from_page(url):
    soup = get_page_soup(url)
    if not soup:
        return []
    links = []
    for article in soup.select("div.infinite.col-detail article"):
        a = article.select_one("a.title-link")
        if a:
            href = a.get("href")
            if href:
                links.append(href)
    return links

def get_all_article_links(start_url):

    all_links = []
    page_url = start_url
    while page_url:
        print(f"Processing page: {page_url}")
        links = get_article_links_from_page(page_url)
        if not links:
            print("No articles found on this page. Ending pagination.")
            break
        all_links.extend(links)
        soup = get_page_soup(page_url)
        next_link = soup.select_one("div#pagination a.next")
        if next_link:
            page_url = next_link.get("href")
            time.sleep(1)
        else:
            print("No next page found.")
            break
    unique_links = list(set(all_links))
    return unique_links

start_url = "https://www.freeskatemag.com/category/articles/"
article_links = get_all_article_links(start_url)
print(f"Total articles found: {len(article_links)}")



def scrape_article_text(url):
    
    soup = get_page_soup(url)
    if not soup:
        return ""
    
    article = soup.select_one("div.col-detail.article-single article")
    if not article:
        # Fallback: try selecting the article element more broadly.
        article = soup.find("article")
        if not article:
            print("Main article not found for:", url)
            return ""
    
    main_section = article.find("section")
    if not main_section:
        print("No content section found for:", url)
        return ""
    
    for unwanted in main_section.find_all(class_=["tags", "share"]):
        unwanted.decompose()
    
    text_content = main_section.get_text(separator="\n", strip=True)
    return text_content

output_filename = "articles.txt"
all_articles_text = []

for link in article_links:
    print(f"Scraping article: {link}")
    article_text = scrape_article_text(link)
    if article_text:
        all_articles_text.append(article_text)
    else:
        print("No content extracted for:", link)
    time.sleep(1)

combined_text = ("\n\n" + ("=" * 80) + "\n\n").join(all_articles_text)

with open(output_filename, "w", encoding="utf-8") as f:
    f.write(combined_text)

print(f"Scraping complete. All article texts have been saved to '{output_filename}'.")


Processing page: https://www.freeskatemag.com/category/articles/
Fetching URL: https://www.freeskatemag.com/category/articles/
Fetching URL: https://www.freeskatemag.com/category/articles/
Processing page: https://www.freeskatemag.com/category/articles/page/2/
Fetching URL: https://www.freeskatemag.com/category/articles/page/2/
Fetching URL: https://www.freeskatemag.com/category/articles/page/2/
Processing page: https://www.freeskatemag.com/category/articles/page/3/
Fetching URL: https://www.freeskatemag.com/category/articles/page/3/
Fetching URL: https://www.freeskatemag.com/category/articles/page/3/
Processing page: https://www.freeskatemag.com/category/articles/page/4/
Fetching URL: https://www.freeskatemag.com/category/articles/page/4/
Fetching URL: https://www.freeskatemag.com/category/articles/page/4/
Processing page: https://www.freeskatemag.com/category/articles/page/5/
Fetching URL: https://www.freeskatemag.com/category/articles/page/5/
Fetching URL: https://www.freeskatemag.c

In [20]:
i re

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r'<[^>]+>', '', text)
    
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()


In [22]:
input_file = "thrasher_raw.txt"      # Your raw Thrasher data file
output_file = "thrasher_cleaned.txt"  # File where the cleaned data will be saved

total_lines_processed = 0

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        total_lines_processed += 1
        
        cleaned_line = clean_text(line)
        
        if cleaned_line:
            outfile.write(cleaned_line + "\n")

print(f"Total lines processed: {total_lines_processed}")
print(f"Cleaned data saved to: {output_file}")


Total lines processed: 4380
Cleaned data saved to: thrasher_cleaned.txt


In [23]:
input_file = "freeskate_raw.txt"      
output_file = "freeskate_cleaned.txt"  

total_lines_processed = 0

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        total_lines_processed += 1
        
        cleaned_line = clean_text(line)
        
        if cleaned_line:
            outfile.write(cleaned_line + "\n")

print(f"Total lines processed: {total_lines_processed}")
print(f"Cleaned data saved to: {output_file}")


Total lines processed: 15747
Cleaned data saved to: freeskate_cleaned.txt
