In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd

In [None]:
# Upper bound on the number of listing pages the scraper walks through.
how_many_pages_to_scrape = 10

# Output folder (kept with its trailing slash because the file name is appended
# to it later). Despite the "csv" in the name, the notebook saves a pickle here.
where_to_save_csv = (
    "/Users/rodrigocarrillo/Documents/Natural Language Processing Projects/"
    "Rotafono Scrape/01_Data_Text/"
)

In [None]:
_ROTAFONO_BASE_URL = "https://rotafono.pe"
_CASES_PATH_PREFIX = "/casos/"
_ARTICLE_COLUMNS = ['title', 'url', 'page']


def _find_case_links(soup):
    """Return (absolute_url, link_text) pairs for every case link in a parsed page."""
    found = []
    for anchor in soup.find_all('a', href=True):
        href = anchor.get('href', '')
        # Case links look like /casos/<district>-...-<id>/ ; the district is
        # not necessarily "lima" (e.g. /casos/carabayllo-..., /casos/chorrillos-...).
        if not (href.startswith(_CASES_PATH_PREFIX) and href.endswith('/')):
            continue
        slug = href[len(_CASES_PATH_PREFIX):-1]
        # Requiring a hyphen drops the bare /casos/ index and other generic links.
        if '-' not in slug:
            continue
        found.append((_ROTAFONO_BASE_URL + href, anchor.get_text(strip=True)))
    return found


def _longest_title_by_url(case_links):
    """Collapse (url, title) pairs to one title per URL, keeping the longest one."""
    titles = {}
    for case_url, title in case_links:
        # Re-assigning an existing key keeps the URL at its first-seen position.
        if case_url not in titles or len(title) > len(titles[case_url]):
            titles[case_url] = title
    return titles


def _has_next_page_link(soup, page_num):
    """Tell whether the page links to listing page ``page_num + 1`` for Lima."""
    wanted = {f'page={page_num + 1}', 'departamento=lima'}
    for anchor in soup.find_all('a', href=True):
        query = anchor.get('href', '').partition('?')[2].partition('#')[0]
        # Compare whole query parameters so that page=2 does not match page=20.
        if wanted.issubset(query.split('&')):
            return True
    return False


def scrape_rotafono_selenium(max_pages=None):
    """
    Scrape rotafono news using Selenium for JavaScript-rendered content.

    Walks the Lima listing pages (?page=N&departamento=lima) in a headless
    Chrome session, starting at page 1, and collects the case links found on
    each page.

    Parameters
    ----------
    max_pages : int or None
        Stop after this many listing pages. ``None`` (or 0) means keep going
        until no link to the next page is found.

    Returns
    -------
    pandas.DataFrame
        One row per unique article URL with the columns ``title``, ``url`` and
        ``page`` (the listing page on which the URL was first seen). The
        columns are present even when nothing was scraped.

    Notes
    -----
    Scraping stops early (returning what was collected so far) when a page
    fails to load its links in time, when a page has no case links, or when
    there is no link to the next page. The browser is always closed.
    """
    # Local import: the notebook's import cell does not bring these names in.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    chrome_options = Options()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(options=chrome_options)
    articles = []
    seen_urls = set()  # URLs already collected on earlier pages

    try:
        page_num = 1

        while True:
            # Build URL with correct parameter structure: ?page=X&departamento=lima
            page_url = f"https://rotafono.pe/casos/?page={page_num}&departamento=lima"

            print(f"\n{'='*60}")
            print(f"Scraping page {page_num}: {page_url}")
            print('='*60)

            driver.get(page_url)

            # Wait for the JavaScript-rendered links to appear.
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[href*='/casos/']"))
                )
            except TimeoutException:
                print("Timeout waiting for content to load")
                break
            except WebDriverException as exc:
                # Keep the best-effort behaviour for browser failures, but do
                # not swallow KeyboardInterrupt or mislabel them as timeouts.
                print(f"Browser error while waiting for page {page_num}: {exc}")
                break

            # Extra settle time after the first link shows up.
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            case_links = _find_case_links(soup)
            if not case_links:
                print("No article links found on this page")
                break

            print(f"Found {len(case_links)} links on page {page_num}")

            page_articles = []
            for case_url, title in _longest_title_by_url(case_links).items():
                # Skip links whose text is empty or too short to be a headline.
                if len(title) < 5:
                    continue
                # Skip articles already collected from an earlier listing page.
                if case_url in seen_urls:
                    continue
                seen_urls.add(case_url)
                page_articles.append({
                    'title': title,
                    'url': case_url,
                    'page': page_num
                })

            articles.extend(page_articles)
            print(f"Added {len(page_articles)} unique articles to results")

            for i, article in enumerate(page_articles, 1):
                print(f"  {i}. {article['title'][:70]}...")

            # Stop if max_pages reached
            if max_pages and page_num >= max_pages:
                print(f"\nReached maximum pages limit ({max_pages})")
                break

            # Stop if no next page
            if not _has_next_page_link(soup, page_num):
                print("\nNo more pages found")
                break

            page_num += 1
            time.sleep(5)  # Be respectful to the server

    finally:
        driver.quit()

    # Fixed column list so an empty result still has the expected schema.
    df = pd.DataFrame(articles, columns=_ARTICLE_COLUMNS)
    return df

In [None]:
# Run the listing scraper, then print a short summary of what it collected.
print("Starting Selenium-based scraper...")
print("This may take a few minutes...")
articles_df = scrape_rotafono_selenium(max_pages=how_many_pages_to_scrape)

print("\n" + "=" * 60)
print("SCRAPING COMPLETE")
print("=" * 60)
print("Total articles scraped: " + str(len(articles_df)))
print("\nFirst 15 articles:")
print(articles_df.head(15).to_string())

In [None]:
def _element_text(element, default):
    """Return the stripped text of a parsed element, or ``default`` when it is missing."""
    return element.get_text(strip=True) if element else default


def _collapse_blank_lines(text):
    """Strip every line, drop the empty ones and separate the rest with a blank line."""
    return '\n\n'.join(line.strip() for line in text.split('\n') if line.strip())


def _card_text_parts(card):
    """Collect the text of the paragraphs and headings inside the article card."""
    parts = []
    for elem in card.find_all(['p', 'h2', 'h3', 'h4']):
        classes = elem.get('class', [])
        text = elem.get_text(strip=True)
        is_qa = 'question' in classes or 'answer' in classes
        is_heading = elem.name in ('h2', 'h3', 'h4')
        if is_qa or is_heading:
            # Q&A lines and headings are kept whatever their length.
            parts.append(text)
        elif 'figcaption' not in str(classes) and len(text) > 3:
            # Regular paragraphs: skip caption-classed ones and tiny fragments.
            parts.append(text)
    return parts


def scrape_article_content(article_url, timeout=10):
    """
    Scrape the full content of a specific article.

    Parameters
    ----------
    article_url : str
        Absolute URL of the article page.
    timeout : int or float
        Seconds passed to ``requests.get`` as its timeout (default 10).

    Returns
    -------
    dict
        Keys ``title``, ``url``, ``category``, ``publish_date``, ``content``
        and ``full_content`` (the last two hold the same text). Missing page
        elements yield ``"N/A"`` for the title, ``""`` for category and date,
        and ``"Content not found"`` for the text.

        This function does not raise on failure: any exception is caught and
        reported as a dict whose ``title`` is ``"Error"`` and whose
        ``content`` / ``full_content`` carry the exception message.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    try:
        response = requests.get(article_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        title_text = _element_text(soup.find('h1', class_='title'), "N/A")
        category = _element_text(soup.find('h2', class_='category'), "")
        publish_date = _element_text(soup.find('time', class_='date-published'), "")

        content_text = ""
        main_case = soup.find('div', class_='container-main-case')

        if main_case:
            # Preferred source: the paragraphs/headings of the inner card.
            card = main_case.find('div', class_='card')
            if card:
                content_text = _collapse_blank_lines("\n\n".join(_card_text_parts(card)))

            # Fallback: all text of the container. A newline separator keeps
            # neighbouring text nodes from being glued into a single word.
            if not content_text:
                content_text = _collapse_blank_lines(
                    main_case.get_text(separator='\n', strip=True)
                )

        if not content_text:
            content_text = "Content not found"

        return {
            'title': title_text,
            'url': article_url,
            'category': category,
            'publish_date': publish_date,
            'content': content_text,
            'full_content': content_text
        }
    except Exception as e:
        # Best effort: one bad article must not abort a whole scraping run.
        return {
            'title': "Error",
            'url': article_url,
            'category': "",
            'publish_date': "",
            'content': str(e),
            'full_content': str(e)
        }

In [None]:
# Scrape content for all URLs in articles_df
def scrape_and_extract(row):
    """
    Fetch one article and return the fields that get attached to articles_df.

    Parameters
    ----------
    row : pandas.Series
        A row of articles_df; only its ``url`` entry is read.

    Returns
    -------
    pandas.Series
        ``category``, ``publish_date`` and ``article_content`` taken from the
        dict returned by ``scrape_article_content``.
    """
    article_content_raw = scrape_article_content(row['url'])
    return pd.Series({
        'category': article_content_raw['category'],
        'publish_date': article_content_raw['publish_date'],
        'article_content': article_content_raw['content']
    })

_content_columns = ['category', 'publish_date', 'article_content']

if articles_df.empty:
    # With no rows, DataFrame.apply does not produce the three result columns
    # and the multi-column assignment fails with a length mismatch, so the
    # empty columns are added directly.
    for _column_name in _content_columns:
        articles_df[_column_name] = pd.Series(dtype='object')
else:
    articles_df[_content_columns] = articles_df.apply(scrape_and_extract, axis=1)

In [None]:
# Preview the first five rows of articles_df.
articles_df.head(n=5)

In [None]:
import datetime
from pathlib import Path

# Save the scraped articles as a date-stamped pickle inside the output folder.
today_str = datetime.date.today().strftime("%Y%m%d")

# Path joining works whether or not where_to_save_csv ends with a slash.
output_dir = Path(where_to_save_csv)
# Create the folder up front so a missing directory does not fail the save.
output_dir.mkdir(parents=True, exist_ok=True)

output_path = output_dir / f"rotafono_articles_scraped_{today_str}.pkl"
articles_df.to_pickle(output_path)