
Revert to scraperwiki.scrape instead of requests
slow-mo committed Jan 12, 2022
1 parent 6fa13df commit 93bde37
Showing 1 changed file with 11 additions and 15 deletions.
scraper.py (26 changes: 11 additions & 15 deletions)
@@ -1,23 +1,19 @@
 # -*- coding: utf-8 -*-
-import requests
+import scraperwiki
+from bs4 import BeautifulSoup as bs
+import re
 
 import os
 # morph.io requires this db filename, but scraperwiki doesn't nicely
-# expose a way to alter this. So we'll fiddle our environment ourselves
-# before our pipeline modules load.
+# expose a way to alter this.
 os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'
 
-# Trying to fix ssl.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:748)
-import certifi
-import ssl
-import scraperwiki
-from bs4 import BeautifulSoup as bs
-import re
 
 USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
 
 def parse_lettera(url):
-    r = requests.get(url, headers={'user-agent': USER_AGENT})
-    soup = bs(r.text, 'html.parser')
+    html = scraperwiki.scrape(url, None, USER_AGENT)
+    soup = bs(html, 'html.parser')
     anchors = soup.select('div.post-content ul li a')
     for idx, item in enumerate(anchors):
         record = {
@@ -30,8 +26,8 @@ def parse_lettera(url):
     scraperwiki.sqlite.save(unique_keys=['id'], data=record, table_name='autori')
 
 def parse_scheda_autore(author_id, url):
-    r = requests.get(url, headers={'user-agent': USER_AGENT})
-    soup = bs(r.text, 'html.parser')
+    html = scraperwiki.scrape(url, None, USER_AGENT)
+    soup = bs(html, 'html.parser')
 
     anchors = soup.select('span.ll_autore_elenco_opera_titolo a')
     for idx, item in enumerate(anchors):
@@ -55,8 +51,8 @@ def parse_scheda_autore(author_id, url):
     scraperwiki.sqlite.save(unique_keys=['id'], data=record, table_name='autori')
 
 def parse_scheda_opera(book_id, author_id, url):
-    r = requests.get(url, headers={'user-agent': USER_AGENT})
-    soup = bs(r.text, 'html.parser')
+    html = scraperwiki.scrape(url, None, USER_AGENT)
+    soup = bs(html, 'html.parser')
 
     for idx, item in enumerate(soup.select('div.post-content div.ll_opera_riga:-soup-contains("Scarica gratis") ~ a')):
         imgs = item.select('img')
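For context, a minimal sketch of the fetch-and-save pattern this commit reverts to, assuming the scraperwiki Python library used on morph.io; the URL and record values below are placeholders for illustration, not values taken from the scraper.

# -*- coding: utf-8 -*-
import os
import scraperwiki
from bs4 import BeautifulSoup as bs

# morph.io expects results in data.sqlite, hence the override in scraper.py.
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'

USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'

url = 'https://example.org/autori-a/'  # placeholder URL, for illustration only

# scraperwiki.scrape(url, params=None, user_agent=None) returns the fetched
# page body directly, so no response object (r.text) is needed as with
# requests.get, and no separate SSL workaround imports.
html = scraperwiki.scrape(url, None, USER_AGENT)
soup = bs(html, 'html.parser')

record = {'id': 'a-0', 'name': 'Example Author'}  # hypothetical record shape
scraperwiki.sqlite.save(unique_keys=['id'], data=record, table_name='autori')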
