Skip to content

Commit

Permalink
-soup-contains and typos
Browse files Browse the repository at this point in the history
  • Loading branch information
slow-mo committed Jan 22, 2022
1 parent ddd048d commit 83d2000
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions scraper.py
Expand Up @@ -89,7 +89,7 @@ def scrape_author(url):

# Raccoglie le opere dell'autore presenti nella pagina
# ed esegue scrape_book() su ognuna di esse
anchors = soup.select('span.MORPH_autore_elenco_opera_titolo a')
anchors = soup.select('span.ll_autore_elenco_opera_titolo a')
for i in anchors:
scrape_book(i['href'])

Expand All @@ -110,7 +110,7 @@ def scrape_author_data(soup):
record = {'id' : id, 'url' : url}
headers = ['autore', 'ordinamento', 'elenco']
for item in headers:
divs = soup.select('div.MORPH_metadati_etichetta:contains("' + item + ':") + div.MORPH_metadati_dato')
divs = soup.select('div.ll_metadati_etichetta:-soup-contains("' + item + ':") + div.ll_metadati_dato')
if divs: record[item] = divs[0].get_text()

# Non processiamo le schede autore vuote Es.
Expand Down Expand Up @@ -156,7 +156,7 @@ def scrape_book(url):
]

for i in headers:
for j in soup.select(u'div.MORPH_metadati_etichetta:contains("' + i + u':")'):
for j in soup.select(u'div.ll_metadati_etichetta:-soup-contains("' + i + u':")'):
if j.get_text() == 'soggetto BISAC:':
for l in filter(None, re.split('([A-Z, ]+ / .*?[a-z](?=[A-Z]))', j.next_sibling.get_text())):
make_bisac(id, l)
Expand All @@ -178,7 +178,7 @@ def scrape_book(url):
scraperwiki.sqlite.save(unique_keys=['id'], data=record, table_name='opere')

# Poi i file associati all'opera
for idx, item in enumerate(soup.select('div.post-content div.MORPH_opera_riga:contains("Scarica gratis") ~ a')):
for idx, item in enumerate(soup.select('div.post-content div.ll_opera_riga:-soup-contains("Scarica gratis") ~ a')):
img = item.find('img')
record = {
'id' : f'{id}-{idx}',
Expand All @@ -189,7 +189,7 @@ def scrape_book(url):
scraperwiki.sqlite.save(unique_keys=['id'], data=record, table_name='file')

# File musicali
for idx, item in enumerate(soup.select('ul.MORPH_musica_elenco_mp3 li a, ul.MORPH_musica_elenco_ogg li a')):
for idx, item in enumerate(soup.select('ul.ll_musica_elenco_mp3 li a, ul.ll_musica_elenco_ogg li a')):
record = {
'id' : f'{id}-m{idx}',
'opera_id' : id,
Expand Down Expand Up @@ -242,7 +242,7 @@ def parse_feed(url='https://www.liberliber.it/online/feed/'):
for i in items:
postId = int(i.find('post-id').get_text())
if postId > last_postId:
link = i.find('p', class_='MORPH_dl').find('a')['href']
link = i.find('p', class_='ll_dl').find('a')['href']
print("Found", link)
scrape_book(link)

Expand Down

0 comments on commit 83d2000

Please sign in to comment.