In [1]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from urllib.parse import unquote, urlparse, parse_qs
import time
import pandas as pd

In [None]:
def find_latest_file_simple(folder_path, pattern = 'rotafono_articles_scraped_'):
    """
    Locate the most recent pickle file in a folder by filename ordering.

    Parameters:
    -----------
    folder_path : str - Directory to scan (non-recursive)
    pattern : str - Filename prefix that candidate files must start with

    Returns:
    --------
    str or None - Joined path of the lexicographically greatest '.pkl' file
    whose name starts with `pattern`, or None when no file matches.
    """
    matching_names = (
        name
        for name in os.listdir(folder_path)
        if name.endswith('.pkl') and name.startswith(pattern)
    )
    # The greatest name is taken as the newest file; this relies on the
    # filenames embedding a YYYYMMDD date, which sorts chronologically as text.
    newest_name = max(matching_names, default=None)
    if newest_name is None:
        return None
    return os.path.join(folder_path, newest_name)

'/Users/rodrigocarrillo/Documents/Natural Language Processing Projects/Rotafono Scrape/01_Data_Text/rotafono_articles_scraped_20251227.pkl'

In [4]:
# Folder that holds the scraped-article pickles. This is an absolute local path:
# adjust it when running the notebook on another machine.
DATA_TEXT_DIR = "/Users/rodrigocarrillo/Documents/Natural Language Processing Projects/Rotafono Scrape/01_Data_Text/"

# NOTE: despite its name, `df_with_scraped_data` holds the *path* of the latest
# pickle file (or None when nothing matches), not a DataFrame.
df_with_scraped_data = find_latest_file_simple(DATA_TEXT_DIR)
if df_with_scraped_data is None:
    # Fail with a clear message instead of passing None to pd.read_pickle.
    raise FileNotFoundError(
        f"No 'rotafono_articles_scraped_*.pkl' file found in {DATA_TEXT_DIR}"
    )

# SECURITY: unpickling can execute arbitrary code; only load pickles that were
# produced by this project.
articles_df = pd.read_pickle(df_with_scraped_data)

# Preview only the first rows rather than rendering the whole frame.
articles_df.head(10)

Unnamed: 0,title,url,page,category,publish_date,article_content
0,San Luis: joven pide ayuda para encontrar a su...,https://rotafono.pe/casos/lima-san-luis-san-lu...,1,Mascotas,Publicado el 17-12-25,Mascotas\n\n¿Qué pasó?\n\nRotafono de RPP | Be...
1,Carabayllo: reportan acumulación de basura en ...,https://rotafono.pe/casos/lima-carabayllo-cara...,1,Servicios públicos,Publicado el 16-12-25,Servicios públicos\n\nHay varias bolsas de bas...
2,Villa El Salvador: joven reporta que su paloma...,https://rotafono.pe/casos/lima-villa-el-salvad...,1,Mascotas,Publicado el 15-12-25,Mascotas\n\nBlondy es una paloma de raza blond...
3,Villa María del Triunfo: adulto mayor de 84 añ...,https://rotafono.pe/casos/lima-villa-maria-del...,1,Servicios a la comunidad,Publicado el 15-12-25,Servicios a la comunidad\n\nEl señor tiene var...
4,Carabayllo: reportan acumulación de basura en ...,https://rotafono.pe/casos/lima-carabayllo-cara...,1,Servicios públicos,Publicado el 14-12-25,Servicios públicos\n\nHay bolsas de basura tir...
5,Chorrillos: reportan acumulación de basura en ...,https://rotafono.pe/casos/chorrillos-reportan-...,2,Municipal y regional,Publicado el 10-12-25,Municipal y regional\n\n¿Qué pasó?\n\nRotafono...
6,Hospital Cayetano Heredia: adulta mayor de 85 ...,https://rotafono.pe/casos/lima-san-martin-de-p...,2,Servicios públicos,Publicado el 14-11-25,Servicios públicos\n\nEl Hospital Cayetano Her...
7,El Agustino: vecinos de Villa Hermosa exigen c...,https://rotafono.pe/casos/lima-el-agustino-vec...,2,Servicios públicos,Publicado el 14-11-25,Servicios públicos\n\n¿Qué pasó?\n\nRotafono d...
8,San Martín de Porres: vecinos reportan que un ...,https://rotafono.pe/casos/lima-san-martin-de-p...,2,Servicios públicos,Publicado el 04-11-25,Servicios públicos\n\nEl señor no sabe a que e...
9,Carabayllo: vecino advierte peligro de desbord...,https://rotafono.pe/casos/carabayllo-vecino-ad...,2,Emergencias,Publicado el 04-11-25,Emergencias\n\n¿Qué pasó?\n\nRotafono de RPP |...


In [None]:
# Files at or below this size are treated as failed downloads (error pages, empty bodies).
_MIN_AUDIO_BYTES = 1000


def _extract_audio_items(article_url, headers):
    """Fetch an article page and return its Radio GRPP audio embeds as a list of dicts.

    Each dict has the keys 'title', 'type' and 'iframe_src'. Raises an HTTP
    error (via ``raise_for_status``) when the article page cannot be fetched,
    so an error page is never mistaken for "no audio".
    """
    # Imported here (not via __import__) so the dependency is explicit and the
    # rest of the function still works in environments where bs4 is unused.
    from bs4 import BeautifulSoup

    response = requests.get(article_url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    audio_items = []
    radio_iframes = soup.find_all('iframe', src=lambda x: x and 'eaudioplayer.radio-grpp.io' in x)
    for iframe in radio_iframes:
        src = iframe.get('src', '')
        if '?' not in src:
            continue
        params = parse_qs(urlparse(src).query)
        audio_id = params.get('id', [''])[0]
        audio_title = unquote(params.get('title', [''])[0])
        # Embeds without an id are skipped.
        if audio_id:
            audio_items.append({
                'title': audio_title or 'Audio',
                'type': 'audio',
                'iframe_src': src
            })
    return audio_items


def _unique_output_path(output_dir, media_title):
    """Build a filesystem-safe '.mp3' path in output_dir that does not exist yet."""
    safe_filename = ''.join(
        c if c.isalnum() or c in ' -_áéíóúñ' else '_'
        for c in media_title
    )[:80].strip('_')

    if not safe_filename:
        safe_filename = 'audio'

    output_filename = os.path.join(output_dir, f"{safe_filename}.mp3")

    # Avoid overwriting an existing file: append _1, _2, ... until the name is free.
    if os.path.exists(output_filename):
        base, ext = os.path.splitext(output_filename)
        counter = 1
        while os.path.exists(f"{base}_{counter}{ext}"):
            counter += 1
        output_filename = f"{base}_{counter}{ext}"
    return output_filename


def _resolve_audio_url(iframe_src):
    """Load the player iframe in headless Chrome and return the <audio> element's src."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(iframe_src)
        audio_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'audio'))
        )
        return audio_element.get_attribute('src')
    finally:
        # Always release the browser, even when the wait times out.
        driver.quit()


def _download_to_file(audio_url, output_filename, headers):
    """Stream audio_url into output_filename; return True when a plausible file was saved.

    A partial file left by a failed transfer, or a file not larger than
    _MIN_AUDIO_BYTES, is deleted so the output folder only contains files that
    the returned dataframe references.
    """
    try:
        # The context manager closes the streamed connection when done.
        with requests.get(audio_url, headers=headers, timeout=60, stream=True) as response:
            response.raise_for_status()
            with open(output_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
    except Exception:
        if os.path.exists(output_filename):
            os.remove(output_filename)
        raise

    if os.path.getsize(output_filename) > _MIN_AUDIO_BYTES:
        return True
    os.remove(output_filename)
    return False


def download_all_article_audios(articles_df, output_dir = "/Users/rodrigocarrillo/Documents/Natural Language Processing Projects/Rotafono Scrape/02_Data_Audio/", delay_seconds = 10):
    """
    Download the first audio clip of every article and record its filename.

    This function:
    1. Fetches each article page and collects its Radio GRPP audio iframes
       (only audio embeds are handled, **not** videos).
    2. Uses Selenium (headless Chrome) to load the first iframe and read the
       <audio> element's src URL.
    3. Streams that URL to an .mp3 file in output_dir.
    4. Names the file after the embed title, sanitised and de-duplicated.
    5. Returns a copy of the dataframe with an 'audio_filename' column.

    Any exception raised while processing one article is printed (truncated)
    and recorded as None; processing then continues with the next article.

    Parameters:
    -----------
    articles_df : DataFrame - Articles with 'title' and 'url' columns
    output_dir : str - Directory to save audio files (created if missing;
        defaults to the project's 02_Data_Audio folder)
    delay_seconds : float - Pause before each article request, to be polite
        to the server (default: 10)

    Returns:
    --------
    DataFrame - Copy of articles_df with an 'audio_filename' column holding the
    saved file's basename, or None where no audio was downloaded. The audio
    files (.mp3) are saved in output_dir.
    """

    os.makedirs(output_dir, exist_ok=True)
    audio_filenames = []
    total = len(articles_df)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    print(f"DOWNLOADING AUDIO FROM {total} ARTICLES")
    print("="*80 + "\n")

    # Count by position, not by index label: the frame's index may be
    # non-contiguous or non-numeric (e.g. after filtering).
    for position, (_, row) in enumerate(articles_df.iterrows(), start=1):
        article_url = row['url']
        article_title = row['title']

        print(f"[{position}/{total}] {str(article_title)[:60]}...")
        audio_filename = None

        time.sleep(delay_seconds)  # Be polite with server requests

        try:
            audio_items = _extract_audio_items(article_url, headers)
            if not audio_items:
                print(f"      ✗ No audio found")
            else:
                # Only the first audio embed of the article is downloaded.
                audio = audio_items[0]
                output_filename = _unique_output_path(output_dir, audio['title'])
                audio_url = _resolve_audio_url(audio['iframe_src'])

                if not audio_url:
                    print(f"      ✗ Could not extract audio URL")
                elif _download_to_file(audio_url, output_filename, headers):
                    filename = os.path.basename(output_filename)
                    size_mb = os.path.getsize(output_filename) / (1024*1024)
                    print(f"      ✓ {filename} ({size_mb:.2f} MB)")
                    audio_filename = filename
                else:
                    print(f"      ✗ Download failed")

        except Exception as e:
            # Best effort: one failing article must not abort the whole run.
            print(f"      ✗ Error: {str(e)[:60]}")

        # Exactly one entry per article keeps the list aligned with the frame.
        audio_filenames.append(audio_filename)

    # Add audio filenames to a copy so the caller's dataframe is not mutated
    df = articles_df.copy()
    df['audio_filename'] = audio_filenames

    # Print summary
    print(f"\n{'='*80}")
    print("DOWNLOAD COMPLETE")
    print(f"{'='*80}")
    print(f"Total articles: {len(df)}")
    print(f"Audio files downloaded: {df['audio_filename'].notna().sum()}")
    print(f"No audio available: {df['audio_filename'].isna().sum()}")

    return df

In [6]:
# Run the download for every article in articles_df.
# NOTE: this rebinds `articles_df` to the returned copy, which carries the extra
# 'audio_filename' column. Re-running this cell downloads everything again and
# saves the repeated files under suffixed names (e.g. '<title>_1.mp3'), so run
# it once per fresh kernel. The function pauses before each article request,
# so the cell is slow for large frames.
articles_df = download_all_article_audios(articles_df)

DOWNLOADING AUDIO FROM 50 ARTICLES

[1/50] San Luis: joven pide ayuda para encontrar a su perrita llama...
      ✓ La dueña de Bella dio detalles de cómo se perdió su perrita.mp3 (1.11 MB)
[2/50] Carabayllo: reportan acumulación de basura en distintas aven...
      ✓ El vecino contó detalles de la situación que se vive en Carabayllo.mp3 (0.93 MB)
[3/50] Villa El Salvador: joven reporta que su paloma llamada Blond...
      ✓ Su dueña pidió apoyo para encontrar a su mascota.mp3 (1.35 MB)
[4/50] Villa María del Triunfo: adulto mayor de 84 años con diabete...
      ✓ Su hijo narró detalles de cómo desapareció su padre.mp3 (2.12 MB)
[5/50] Carabayllo: reportan acumulación de basura en la urbanizació...
      ✓ La vecina contó que la municipalidad no se ha pronunciado al respecto.mp3 (1.33 MB)
[6/50] Chorrillos: reportan acumulación de basura en la urbanizació...
      ✗ Could not extract audio URL
[7/50] Hospital Cayetano Heredia: adulta mayor de 85 años se someti...
      ✓ El familiar de 