scraperwiki.sqlite.save_var workaround

slow-mo · Jan 25, 2022 · e274cb1 · e274cb1
1 parent 6b22965
commit e274cb1
Showing 1 changed file with 32 additions and 22 deletions.
diff --git a/scraper.py b/scraper.py
@@ -16,6 +16,7 @@
 from time import sleep
 from functools import lru_cache
 from datetime import datetime
+from sqlalchemy.exc import OperationalError
 
 # Workaround for:
 # urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:748)>
@@ -45,8 +46,8 @@ def log(*args):
     servisse fare qualcosa di più complesso in futuro.
     """
     print(datetime.now(), *args)
-
-
+    
+    
 def soupify(url, bs_parser='html.parser'):
     """
     Scarica un url e restituisce un oggetto BeautifulSoup.
@@ -93,14 +94,17 @@ def scrape_url(url):
     l = len(url.replace('https://www.liberliber.it/', '').rstrip('/').split('/'))
     # Lettera es. URL:
     # https://www.liberliber.it/online/autori/autori-a/
+    #                              1     2       3
     if l == 3:
         scrape_letter(url)
     # Autore es. URL:
     # https://www.liberliber.it/online/autori/autori-a/antonio-abati/
+    #                              1     2       3           4 
     elif l == 4:
         scrape_author(url)
     # Opera es. URL:
     # https://www.liberliber.it/online/autori/autori-a/antonio-abati/delle-frascherie-di-antonio-abati-fasci-tre/
+    #                              1     2       3           4                   5
     elif l == 5:
         scrape_book(url)
     else:
@@ -129,7 +133,7 @@ def scrape_author(url):
     # ll_autore_elenco_musica
     anchors = soup.select('li.ll_autore_elenco_libro span.ll_autore_elenco_opera_titolo a')
     # Se non ci sono ci fermiamo qui...
-    if anchors is None:
+    if len(anchors) == 0:
         log('No books for this author found at', url)
         return
     # ...altrimenti raccoglie i dati sull'autore
@@ -176,9 +180,16 @@ def scrape_book(url):
     """
     # Es. URL https://www.liberliber.it/online/autori/autori-a/antonio-abati/delle-frascherie-di-antonio-abati-fasci-tre/
     soup = soupify(url)
+
+    # Prima verifichiamo che ci siano libri associati all'opera
+    anchors = soup.select('div.post-content div.ll_opera_riga:-soup-contains("Scarica gratis") ~ a')
+    # Se non ci sono non andiamo oltre...
+    if len(anchors) == 0:
+        log('No books at:', url)
+        return
+
+    # ... altrimenti raccogliamo le informazioni sull'opera
     id = id_from_soup(soup)
-
-    # Prima raccogliamo le informazioni sull'opera
     record = {'id' : id, 'url' : url}
 
     headers = [u'titolo',
@@ -235,7 +246,7 @@ def scrape_book(url):
     scraperwiki.sqlite.save(unique_keys=['id'], data=record, table_name='opere')
 
     # Poi i file associati all'opera
-    for idx, item in enumerate(soup.select('div.post-content div.ll_opera_riga:-soup-contains("Scarica gratis") ~ a')):
+    for idx, item in enumerate(anchors):
         img = item.find('img')        
         record = {
             'id' : f'{id}-{idx}',
@@ -302,7 +313,17 @@ def parse_feed(url='https://www.liberliber.it/online/feed/'):
     essi. Salva nella variabile last_pubDate del database l'id dell'ultimo post
     analizzato. Da utilizzare per l'aggiornamento quotidiano del database.
     """
-    last_postId = scraperwiki.sqlite.get_var('last_pubDate') or 0 # Se non c'è var o db restituisce None
+    # scraperwiki.sqlite.save_var doesn't seem to work
+    # last_postId = scraperwiki.sqlite.get_var('lastpost')
+    # so...
+    try:
+        q = scraperwiki.sqlite.select('value from myvar where name is "lastpost"')
+    except OperationalError:
+        last_postId = 0
+    else:
+        last_postId = q[0]['value']
+        log('last_postId is', last_postId)
+
     tmp_postId = last_postId
     soup = soupify(url, 'lxml')
 
@@ -318,21 +339,10 @@ def parse_feed(url='https://www.liberliber.it/online/feed/'):
                 tmp_postId = postId
 
     if tmp_postId > last_postId:
-        scraperwiki.sqlite.save_var('last_pubDate', tmp_postId)
-    # for i in soup.find_all('p', class_='ll_dl'):
-        # link = i.find('a')['href']
-        # 
-
-    # for i in items:
-        # record = {
-            # 'id' : i.find('post-id').get_text(),
-            # 'title' : i.find('title').get_text(),
-            # 'link' : i.find('guid').get_text(),
-            # 'date' : i.find('pubdate').get_text(),
-            # 'description' : i.find('description').get_text(),
-            # 'content' : i.find('content:encoded').get_text()
-        # }
-        # scraperwiki.sql.save(unique_keys=['id'], data=record, table_name='feed')
+        # scraperwiki.sqlite.save_var doesn't seem to work
+        # scraperwiki.sqlite.save_var('last_pubDate', tmp_postId)
+        # so...
+        scraperwiki.sql.save(unique_keys=['name'], data={'name' : 'lastpost', 'value' : tmp_postId}, table_name='myvar')
 
 
 def main():