
Revert to scraperwiki.scrape instead of requests
slow-mo committed Jan 12, 2022
1 parent 6fa13df commit 93bde37
Showing 1 changed file with 11 additions and 15 deletions.
scraper.py (26 changes: 11 additions & 15 deletions)
@@ -1,23 +1,19 @@
 # -*- coding: utf-8 -*-
-import requests
+import scraperwiki
+from bs4 import BeautifulSoup as bs
+import re
 
 import os
 # morph.io requires this db filename, but scraperwiki doesn't nicely
-# expose a way to alter this. So we'll fiddle our environment ourselves
-# before our pipeline modules load.
+# expose a way to alter this.
 os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'
 
-# Trying to fix ssl.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:748)
-import certifi
-import ssl
-import scraperwiki
-from bs4 import BeautifulSoup as bs
-import re
 
 USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
 
 def parse_lettera(url):
-    r = requests.get(url, headers={'user-agent': USER_AGENT})
-    soup = bs(r.text, 'html.parser')
+    html = scraperwiki.scrape(url, None, USER_AGENT)
+    soup = bs(html, 'html.parser')
     anchors = soup.select('div.post-content ul li a')
     for idx, item in enumerate(anchors):
         record = {
@@ -30,8 +26,8 @@ def parse_lettera(url):
     scraperwiki.sqlite.save(unique_keys=['id'], data=record, table_name='autori')
 
 def parse_scheda_autore(author_id, url):
-    r = requests.get(url, headers={'user-agent': USER_AGENT})
-    soup = bs(r.text, 'html.parser')
+    html = scraperwiki.scrape(url, None, USER_AGENT)
+    soup = bs(html, 'html.parser')
 
     anchors = soup.select('span.ll_autore_elenco_opera_titolo a')
     for idx, item in enumerate(anchors):
@@ -55,8 +51,8 @@ def parse_scheda_autore(author_id, url):
     scraperwiki.sqlite.save(unique_keys=['id'], data=record, table_name='autori')
 
 def parse_scheda_opera(book_id, author_id, url):
-    r = requests.get(url, headers={'user-agent': USER_AGENT})
-    soup = bs(r.text, 'html.parser')
+    html = scraperwiki.scrape(url, None, USER_AGENT)
+    soup = bs(html, 'html.parser')
 
     for idx, item in enumerate(soup.select('div.post-content div.ll_opera_riga:-soup-contains("Scarica gratis") ~ a')):
         imgs = item.select('img')
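For context, a minimal sketch of the fetch-and-save pattern this commit reverts to, assuming the scraperwiki Python library used on morph.io; the URL and record values below are placeholders for illustration, not values taken from the scraper.

# -*- coding: utf-8 -*-
import os
import scraperwiki
from bs4 import BeautifulSoup as bs

# morph.io expects results in data.sqlite, hence the override in scraper.py.
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'

USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'

url = 'https://example.org/autori-a/'  # placeholder URL, for illustration only

# scraperwiki.scrape(url, params=None, user_agent=None) returns the fetched
# page body directly, so no response object (r.text) is needed as with
# requests.get, and no separate SSL workaround imports.
html = scraperwiki.scrape(url, None, USER_AGENT)
soup = bs(html, 'html.parser')

record = {'id': 'a-0', 'name': 'Example Author'}  # hypothetical record shape
scraperwiki.sqlite.save(unique_keys=['id'], data=record, table_name='autori')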
