In [1]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Home page of the site; it is requested below and also serves as the base
# for building the section URLs.
url = 'https://www.opinion.com.bo/'

In [3]:
# Download the home page and collect the section links from the main menu.
# links_secciones is defined up front so later cells still run (with an empty
# list) when the page cannot be retrieved.
links_secciones = []
opinion = requests.get(url, timeout=30)  # never wait forever on a stalled server
if opinion.status_code == 200:
    s = BeautifulSoup(opinion.text, 'lxml')
    menu = s.find('div', attrs={'class': 'main-menu-border'})
    secciones = menu.find_all('li') if menu is not None else []
    # urljoin copes with relative and absolute hrefs alike; the set removes
    # duplicates and sorting makes the order reproducible between runs.
    links_secciones = sorted({
        urljoin(url, seccion.a.get('href'))
        for seccion in secciones
        if seccion.a is not None and seccion.a.get('href')
    })
    print(links_secciones)
else:
    print('Error: problemas con la pagina')


['https://www.opinion.com.bo/video/', 'https://www.opinion.com.bo/blog/section/cobocitos', 'https://www.opinion.com.bo/blog/section/ciencia', 'https://www.opinion.com.bo/blog/section/escena-del-crimen', 'https://www.opinion.com.bo/tag/club-motor', 'https://www.opinion.com.bo/blog/section/revista-asi', 'https://www.opinion.com.bo/tag/tramitologia', 'https://www.opinion.com.bo/blog/section/avisos-necrologicos', 'https://www.opinion.com.bo/blog/section/policial', 'https://www.opinion.com.bo/blog/section/cochabamba', 'https://www.opinion.com.bo/album/', 'https://www.opinion.com.bo/blog/section/deportes', 'https://www.opinion.com.bo/blog/section/cultura', 'https://www.opinion.com.bo/blog/section/tecnologia', 'https://www.opinion.com.bo/blog/section/ramona', 'https://www.opinion.com.bo/blog/section/salud', 'https://www.opinion.com.bo/blog/section/catar', 'https://www.opinion.com.bo/blog/section/virales', 'https://www.opinion.com.bo/blog/section/buenanoche', 'https://www.opinion.com.bo/opinio

In [None]:
def obtener_articulos(soup):
    """
    Return the URLs of the articles listed on a section page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a section page.

    Returns
    -------
    list of str
        Absolute URL of the first link inside every ``<article>`` found in the
        ``archive-contents`` container. Articles without a link are skipped.
        The list is empty when the container is not present, in which case a
        message naming the page (when it can be determined) is printed.
    """
    base_url = 'https://www.opinion.com.bo'
    list_notes_links = []

    contenedor = soup.find('div', attrs={'class': 'archive-contents'})
    if contenedor is None:
        # The og:url meta tag is only used to name the failing page; it can be
        # missing as well, so it must not be dereferenced blindly.
        meta_url = soup.find('meta', attrs={'property': 'og:url'})
        origen = meta_url.get('content') if meta_url is not None else 'pagina desconocida'
        print('No se pudo obtener los link de los articulos en:', origen)
        return list_notes_links

    for article in contenedor.find_all('article'):
        enlace = article.a
        href = enlace.get('href') if enlace is not None else None
        if not href:
            continue
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # (with or without a leading slash) against the site root.
        list_notes_links.append(urljoin(base_url, href))

    return list_notes_links

In [None]:
# Matches the "datePublished" entry of a JSON-LD block regardless of the
# whitespace around the colon, and captures the quoted value.
_PATRON_FECHA = re.compile(r'"datePublished"\s*:\s*"([^"]+)"')


def _extraer_fecha(s_nota):
    """Return the first ``datePublished`` value found in the JSON-LD scripts, or None."""
    scripts = s_nota.find_all('script', attrs={'type': 'application/ld+json'})
    for script in scripts:
        coincidencia = _PATRON_FECHA.search(script.text or '')
        if coincidencia:
            return coincidencia.group(1)
    if scripts:
        # JSON-LD was present but carried no publication date.
        print('Error extrayendo fecha:')
        print('No se pudo encontrar la fecha')
    return None


def _extraer_texto(s_nota, url):
    """Return the article body as one string, or None when it cannot be found."""
    contenido = s_nota.find('div', attrs={'class': 'body'})
    parrafos = contenido.find_all('p') if contenido is not None else []
    if len(parrafos) > 1:
        # Each paragraph is followed by a single space; non-breaking spaces
        # are replaced by regular ones.
        return ''.join(parrafo.text + ' ' for parrafo in parrafos).replace('\xa0', ' ')
    if len(parrafos) == 1:
        # With a single paragraph the whole container text is used, with its
        # line breaks collapsed.
        return contenido.text.replace('\n\n', ' ').replace('\n', '')
    print(f'Contenido no encontrado en: {url}')
    return None


def obtener_info(s_nota, url):
    """
    Extract the fields of interest from a parsed article page.

    Parameters
    ----------
    s_nota : BeautifulSoup
        Parsed HTML of the article.
    url : str
        URL of the article; only used in diagnostic messages.

    Returns
    -------
    dict
        Keys ``seccion``, ``titulo``, ``fecha``, ``resumen``, ``autor`` and
        ``texto``. A field that cannot be found is set to ``None``; the
        function does not raise because of missing elements.
    """
    info_dict = {}

    # Section: taken from the data-category attribute of <body>.
    body = s_nota.find('body')
    info_dict['seccion'] = (body.get('data-category') or None) if body is not None else None

    # Title
    titulo = s_nota.find('h2', attrs={'class': 'title'})
    info_dict['titulo'] = titulo.text if titulo is not None else None

    # Publication date, read from the JSON-LD metadata.
    info_dict['fecha'] = _extraer_fecha(s_nota)

    # Summary. The last three characters are dropped, as the original code
    # did; presumably trailing filler in the page markup -- TODO confirm.
    resumen = s_nota.find('div', attrs={'class': 'summary'})
    info_dict['resumen'] = resumen.text[:-3] if resumen is not None else None

    # Author: prefer the linked name, fall back to the plain text of the span.
    # The first and last characters are dropped, as the original code did;
    # presumably surrounding whitespace -- TODO confirm.
    autor = s_nota.find('span', attrs={'class': 'author-name'})
    if autor is not None:
        fuente = autor.a if autor.a is not None else autor
        info_dict['autor'] = fuente.text[1:-1]
    else:
        info_dict['autor'] = None

    # Body text
    info_dict['texto'] = _extraer_texto(s_nota, url)

    return info_dict

In [None]:
def scrapear_nota(url, timeout=30):
    """
    Download one article and return its extracted fields.

    Parameters
    ----------
    url : str
        URL of the article.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    dict or None
        The dictionary produced by ``obtener_info`` plus a ``url`` key, or
        ``None`` when the request fails or the status code is not 200.
    """
    try:
        nota = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print('Error scrapeando la url: ', url)
        print(e)
        return None

    if nota.status_code != 200:
        print(f'Error obteniendo la nota: {url}')
        print(f'Status Code: {nota.status_code}')
        return None

    s_nota = BeautifulSoup(nota.text, 'lxml')
    info_dict = obtener_info(s_nota, url)
    info_dict['url'] = url

    return info_dict

In [None]:
# Visit every section page and gather the article URLs it lists.
notas = []
for link in links_secciones:
    try:
        r = requests.get(link, timeout=30)  # do not hang on a stalled server
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'lxml')
            notas.extend(obtener_articulos(soup))
        else:
            print('No se pudo obtener la sección', link)
    except Exception as e:
        # Best effort: one failing section must not abort the whole run, but
        # the cause is shown so it can be diagnosed.
        print('No se pudo obtener la seccion', link)
        print(e)

# The same article can be listed in several sections; keep the first
# occurrence only so it is not scraped (and stored) more than once.
notas = list(dict.fromkeys(notas))

print(f'Recoleccion de notas terminada: {len(notas)} notas obtenidas')

In [None]:
# Show only a sample of the collected URLs; dumping the whole list floods the
# notebook output.
notas[:10]

In [None]:
# Scrape every collected article. scrapear_nota returns None when an article
# cannot be retrieved; those results are skipped so that `data` only holds
# dictionaries and can be turned into a DataFrame.
data = []
for i, nota in enumerate(notas, start=1):
    print(f'Scrapeando nota {i}/{len(notas)}')
    info = scrapear_nota(nota)
    if info is not None:
        data.append(info)

In [None]:
# Number of scraping results collected in `data`.
len(data)

In [None]:
# Build a DataFrame from the list of scraping results.
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Save the result. index=False keeps the automatically generated row index
# from being written as an extra unnamed column.
df.to_csv('notas_opinion.csv', index=False)