In [37]:
# == importar bibliotecas == 

from bs4 import BeautifulSoup as bs
import unicodedata as ucd
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import os
from itertools import chain 

In [40]:
# == settando variáveis == 

JORNAL = 'PL'
LABEL = 'direita' 
URL = "https://partidoliberal.org.br/noticias-partido-liberal/"
NUM_PAG = 568
class_1 = 'col-md-8'
class_2 = 'post__content'

In [10]:
# == obter html == 

def get_soup(url):
    response = requests.get(url)
    html = response.content
    soup = bs(html, 'lxml')
    return soup

In [12]:
# == obter dataframe do texto == 

def return_df(url, text):
    return pd.DataFrame({
        'jornal': JORNAL,
        'url': url,
        'artigo': text,
        'label': LABEL
    }, index=[1])

In [26]:
# == lista de URLs ==

list_url = []

In [27]:
# == coletar os sites de artigos == 

def get_links(pag_num):
    cur_list = []
    soup = get_soup(f'{URL}page/{pag_num}')
    for noticia in soup.find_all("div", class_=class_1):
        for artigo in noticia.find_all("a", href=True):
            cur_list.append(artigo.get('href'))
    return cur_list
    
list_url = Parallel(n_jobs=os.cpu_count())\
                      (delayed(get_links)\
                      (n)\
                      for n in tqdm(range(NUM_PAG)))



  0%|          | 0/568 [00:00<?, ?it/s][A[A

  2%|▏         | 12/568 [00:00<00:04, 119.50it/s][A[A

  4%|▍         | 24/568 [00:02<00:37, 14.64it/s] [A[A

  6%|▋         | 36/568 [00:04<00:55,  9.51it/s][A[A

  8%|▊         | 48/568 [00:07<01:10,  7.34it/s][A[A

 11%|█         | 60/568 [00:10<01:31,  5.54it/s][A[A

 13%|█▎        | 72/568 [00:13<01:32,  5.35it/s][A[A

 15%|█▍        | 84/568 [00:15<01:32,  5.25it/s][A[A

 17%|█▋        | 96/568 [00:19<01:43,  4.57it/s][A[A

 19%|█▉        | 108/568 [00:21<01:37,  4.72it/s][A[A

 21%|██        | 120/568 [00:23<01:26,  5.15it/s][A[A

 23%|██▎       | 132/568 [00:25<01:23,  5.23it/s][A[A

 25%|██▌       | 144/568 [00:27<01:18,  5.38it/s][A[A

 27%|██▋       | 156/568 [00:30<01:21,  5.08it/s][A[A

 30%|██▉       | 168/568 [00:32<01:20,  4.95it/s][A[A

 32%|███▏      | 180/568 [00:35<01:19,  4.90it/s][A[A

 34%|███▍      | 192/568 [00:37<01:10,  5.34it/s][A[A

 36%|███▌      | 204/568 [00:39<01:11,  5.11i

In [36]:
# == remover url duplicado == 

list_url = list(chain.from_iterable(list_url))
list_url = list(set(list_url))

In [61]:
# == remover links que não são notícias == 

list_url = [
    url for url in list_url if 'https://partidoliberal.org.br/noticias-partido-liberal/page/' not in url
]

3402

In [73]:
# == obter conteúdo da notícia == 

def get_artigo(url):
    soup = get_soup(url)
    list_text = []
    for div in soup.find_all("div", class_=class_2):
        for p in div.find_all("p"):
            list_text.append(ucd.normalize("NFKD", p.text))
        return return_df(url, " ".join(list_text))
    
res = Parallel(n_jobs=os.cpu_count())\
              (delayed(get_artigo)\
              (url)\
              for url in tqdm(list_url))
scrap = pd.concat(res)




  0%|          | 0/3402 [00:00<?, ?it/s][A[A[A


  0%|          | 12/3402 [00:00<00:28, 117.96it/s][A[A[A


  1%|          | 24/3402 [00:02<03:10, 17.73it/s] [A[A[A


  1%|          | 36/3402 [00:03<03:58, 14.14it/s][A[A[A


  1%|▏         | 48/3402 [00:04<04:57, 11.27it/s][A[A[A


  2%|▏         | 60/3402 [00:06<06:00,  9.28it/s][A[A[A


  2%|▏         | 72/3402 [00:08<06:34,  8.45it/s][A[A[A


  2%|▏         | 84/3402 [00:09<06:37,  8.34it/s][A[A[A


  3%|▎         | 96/3402 [00:11<06:37,  8.32it/s][A[A[A


  3%|▎         | 108/3402 [00:13<07:10,  7.64it/s][A[A[A


  4%|▎         | 120/3402 [00:14<07:17,  7.50it/s][A[A[A


  4%|▍         | 132/3402 [00:16<07:05,  7.68it/s][A[A[A


  4%|▍         | 144/3402 [00:17<07:00,  7.74it/s][A[A[A


  5%|▍         | 156/3402 [00:19<06:54,  7.84it/s][A[A[A


  5%|▍         | 168/3402 [00:20<06:43,  8.02it/s][A[A[A


  5%|▌         | 180/3402 [00:22<06:16,  8.56it/s][A[A[A


  6%|▌         | 192/

 46%|████▌     | 1572/3402 [03:26<03:38,  8.38it/s][A[A[A


 47%|████▋     | 1584/3402 [03:28<03:52,  7.80it/s][A[A[A


 47%|████▋     | 1596/3402 [03:30<03:51,  7.81it/s][A[A[A


 47%|████▋     | 1608/3402 [03:31<03:47,  7.89it/s][A[A[A


 48%|████▊     | 1620/3402 [03:33<03:41,  8.03it/s][A[A[A


 48%|████▊     | 1632/3402 [03:35<04:06,  7.19it/s][A[A[A


 48%|████▊     | 1644/3402 [03:36<04:02,  7.24it/s][A[A[A


 49%|████▊     | 1656/3402 [03:38<03:55,  7.40it/s][A[A[A


 49%|████▉     | 1668/3402 [03:39<03:51,  7.50it/s][A[A[A


 49%|████▉     | 1680/3402 [03:41<03:52,  7.40it/s][A[A[A


 50%|████▉     | 1692/3402 [03:43<03:44,  7.61it/s][A[A[A


 50%|█████     | 1704/3402 [03:44<03:40,  7.69it/s][A[A[A


 50%|█████     | 1716/3402 [03:46<03:42,  7.59it/s][A[A[A


 51%|█████     | 1728/3402 [03:47<03:33,  7.85it/s][A[A[A


 51%|█████     | 1740/3402 [03:49<03:33,  7.77it/s][A[A[A


 51%|█████▏    | 1752/3402 [03:50<03:35,  7.67it/s][A

 92%|█████████▏| 3132/3402 [06:52<00:36,  7.44it/s][A[A[A


 92%|█████████▏| 3144/3402 [06:54<00:33,  7.66it/s][A[A[A


 93%|█████████▎| 3156/3402 [06:55<00:33,  7.34it/s][A[A[A


 93%|█████████▎| 3168/3402 [06:57<00:29,  7.83it/s][A[A[A


 93%|█████████▎| 3180/3402 [06:58<00:29,  7.54it/s][A[A[A


 94%|█████████▍| 3192/3402 [07:00<00:27,  7.70it/s][A[A[A


 94%|█████████▍| 3204/3402 [07:01<00:25,  7.68it/s][A[A[A


 95%|█████████▍| 3216/3402 [07:03<00:23,  7.83it/s][A[A[A


 95%|█████████▍| 3228/3402 [07:05<00:23,  7.32it/s][A[A[A


 95%|█████████▌| 3240/3402 [07:07<00:22,  7.11it/s][A[A[A


 96%|█████████▌| 3252/3402 [07:08<00:20,  7.34it/s][A[A[A


 96%|█████████▌| 3264/3402 [07:10<00:20,  6.73it/s][A[A[A


 96%|█████████▋| 3276/3402 [07:12<00:17,  7.07it/s][A[A[A


 97%|█████████▋| 3288/3402 [07:13<00:15,  7.14it/s][A[A[A


 97%|█████████▋| 3300/3402 [07:15<00:14,  6.94it/s][A[A[A


 97%|█████████▋| 3312/3402 [07:16<00:11,  7.53it/s][A

In [75]:
# == salvar o scrapping == 

scrap.to_parquet('../dataset/scrapping/PL.parquet')

In [76]:
scrap.head(10)

Unnamed: 0,jornal,url,artigo,label
1,PL,https://partidoliberal.org.br/aprovado-projeto...,Rio Grande do Norte – A Assembleia Legislativa...,direita
1,PL,https://partidoliberal.org.br/deputado-federal...,"O Projeto prevê reclusão de 3 a 6 anos, alé...",direita
1,PL,https://partidoliberal.org.br/medeiros-lamenta...,Brasília – O deputado federal José Medeiros ...,direita
1,PL,https://partidoliberal.org.br/o-presidente-bol...,"Brasília – O líder do PL na Câmara, deputad...",direita
1,PL,https://partidoliberal.org.br/senador-wellingt...,Brasília – A duplicação da BR-364 entre Ron...,direita
1,PL,https://partidoliberal.org.br/projeto-de-carlo...,Brasília – Aguarda apreciação em Plenário ...,direita
1,PL,https://partidoliberal.org.br/comissao-aprova-...,BrasÃ­lia – A ComissÃ£o de SeguranÃ§a PÃob...,direita
1,PL,https://partidoliberal.org.br/jair-bolsonaro-s...,Brasília – O dia 11 de agosto tornou-se a dat...,direita
1,PL,https://partidoliberal.org.br/michelle-bolsona...,"A cerimônia foi realizada neste sábado (6), ...",direita
1,PL,https://partidoliberal.org.br/bolsonaro-destac...,Brasília – O presidente Jair Bolsonaro (PL) p...,direita
