In [1]:
import codecs
import gzip
import json
import os
import re
import sys
import time
from contextlib import ExitStack
from itertools import chain
from multiprocessing.dummy import Pool, Queue, Lock
from queue import Empty
from typing import Generator, Dict, Any

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from tqdm import tqdm

## Этап 1 - получаем ссылки на книги

In [2]:
def get_book_urls(num_pages, timeout=30):
    """Collect product-page URLs from the first ``num_pages`` catalogue pages.

    Args:
        num_pages: Number of catalogue listing pages to request, starting
            from the first one.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises a timeout error.

    Returns:
        A list of absolute product URLs in the order they were encountered.
        Duplicates are not removed here.
    """
    shop_link = 'https://shop.relod.ru'
    collection_path = '/catalog-products/4577/?sort=PROPERTY_RATING&order=desc'
    book_class = 'col-xs-12 col-sm-4 col-md-4 col-lg-6 col-xml-4 col-xmll-3 col-xl-3'
    link_class = 'bxr-font-color bxr-font-color-hover'

    urls = []
    for page in range(1, num_pages + 1):
        # The first page is the bare collection URL; later pages add PAGEN_1.
        page_suffix = f'&PAGEN_1={page}' if page > 1 else ''
        r = requests.get(shop_link + collection_path + page_suffix, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        for book in soup.find_all("div", {"class": book_class}):
            anchor = book.find('a', {"class": link_class})
            # A tile without the expected link is skipped instead of raising
            # TypeError on ``None['href']``.
            href = anchor.get('href') if anchor is not None else None
            if href:
                urls.append(shop_link + href)
    return urls

In [3]:
# Crawl the first 250 catalogue listing pages and keep every product link found.
book_links = get_book_urls(250)

In [4]:
# Total vs. distinct link counts: a difference means the crawl returned duplicates.
len(book_links), len(set(book_links))

(5000, 4994)

In [5]:
# Drop duplicate links while keeping the crawl order: dict keys are unique and
# preserve insertion order, whereas set() yields an arbitrary order for strings.
book_links = list(dict.fromkeys(book_links))

In [6]:
# Sanity check: after de-duplication every link must be unique.
assert len(book_links) == len(set(book_links))

## Этап 2 - сохраняем информацию о книгах

In [7]:
def get_page(url, log_lock, n_attempts=5, t_sleep=1, timeout=30):
    """Download ``url`` and return the response body, or None on failure.

    Connection-level errors (``requests.exceptions.RequestException``) are
    retried up to ``n_attempts`` times with ``t_sleep`` seconds between tries.
    An HTTP error status or any unexpected exception is logged to ``log.txt``
    and ends the download immediately.

    Args:
        url: Address of the page to fetch.
        log_lock: Lock held while writing to ``log.txt``.
        n_attempts: Maximum number of download attempts.
        t_sleep: Pause in seconds between failed attempts.
        timeout: Seconds to wait for a response on each attempt.

    Returns:
        The response text, or None when the page could not be downloaded.
    """
    def write_log(message):
        # Append so earlier entries survive; the context managers release the
        # lock and close the file even if writing fails.
        with log_lock, open('log.txt', 'a', encoding='utf-8') as f:
            f.write(message)

    last_error = None
    for attempt in range(n_attempts):
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as err:
            last_error = err
            # No point in sleeping after the final attempt.
            if attempt + 1 < n_attempts:
                time.sleep(t_sleep)
            continue
        except Exception as err:
            write_log(f"Cant download page '{url}':\n\t{err!r}\n")
            return None

        # Checked outside the retry block: HTTPError is itself a
        # RequestException and must not be mistaken for a connection failure.
        try:
            r.raise_for_status()
        except requests.exceptions.HTTPError as err:
            write_log(f"Cant download page '{url}':\n\t{err}\n")
            return None
        return r.text

    write_log(f"Cant download page '{url}' after {n_attempts} attempts:\n\t{last_error!r}\n")
    return None

In [8]:
def _append_log(log_lock, message):
    """Append ``message`` to log.txt while holding ``log_lock``."""
    with log_lock, open('log.txt', 'a', encoding='utf-8') as f:
        f.write(message)


def _fetch_discounted_price(book_link, log_lock):
    """Open the page in Firefox, click the discount checkbox and return the price shown, or None."""
    driver = None
    try:
        driver = webdriver.Firefox()
        driver.get(book_link)
        # NOTE(review): find_element_by_class_name is the Selenium 3 style API;
        # confirm the installed Selenium version still provides it.
        driver.find_element_by_class_name('ptv-icon-checkbox').click()
        new_price = driver.find_element_by_class_name('js-ptv-price')
        return float(new_price.text.split(' ')[0])
    except WebDriverException as err:
        _append_log(log_lock, "Selenium exception:\n\t" + str(err) + "\n")
    except Exception as err:
        # Best effort: a malformed price must not abort the whole scrape,
        # but it should leave a trace instead of vanishing silently.
        _append_log(log_lock, f"Cant read discounted price for '{book_link}':\n\t{err!r}\n")
    finally:
        # quit() also runs on the error paths, and only when the browser was
        # actually started, so no Firefox instance is left behind.
        if driver is not None:
            driver.quit()
    return None


def parse_page(book_link, log_lock):
    """Download one product page and extract its fields into a dict.

    Args:
        book_link: Absolute URL of the product page.
        log_lock: Lock guarding writes to ``log.txt``.

    Returns:
        A dict that always contains ``'url'``. Every other key (tags, images,
        title, rating, table rows, availability, price, discounted price,
        description) is present only when the matching element was found.
        When the page cannot be downloaded or lacks the expected layout, only
        ``'url'`` is returned.
    """
    result = {'url': book_link}
    text = get_page(book_link, log_lock)
    if not text:
        return result

    soup = BeautifulSoup(text, 'html.parser')
    clouds = soup.find_all('div', {'class': 'bxr-cloud-all'})
    if len(clouds) < 5:
        # The product block is taken from the fifth 'bxr-cloud-all' container;
        # without it there is nothing to parse.
        _append_log(log_lock, f"Unexpected page layout '{book_link}'\n")
        return result
    base = clouds[4]

    # tags and illustrations
    book_info = base.find('div', {'class': 'bxr-element-slider'})
    if book_info is not None:
        tags = list(filter(None, book_info.text.split('\n')))
        if tags:
            result['Метки'] = tags
        imgs = book_info.find_all('img')
        if imgs:
            result['Иллюстрации'] = [
                'https:' + img['data-src'] for img in imgs if img.has_attr('data-src')
            ]

    # title
    title = base.find('h1')
    if title is not None:
        result['Название'] = title.text

    # rating, number of votes and the rows of the properties table
    detail_cols = base.find_all('div', {'class': 'bxr-detail-col'})
    if len(detail_cols) > 1:
        add_book_info = detail_cols[1]

        rating = add_book_info.find('div', {'class': 'bxr-rating-detail'})
        if rating is not None:
            all_meta = rating.find_all('meta')
            if all_meta:
                result['Оценка'] = all_meta[0]['content']
            if len(all_meta) > 1:
                result['Число голосов'] = all_meta[1]['content']

        table = add_book_info.find('table')
        if table is not None:
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if len(cells) != 2:
                    continue
                field = cells[0].text
                value = cells[1].text.strip('\n')
                # Only the first line of the publisher cell is kept.
                result[field] = value.split('\n')[0] if field == 'Издатель' else value

    # availability (an in-stock marker overrides an out-of-stock one)
    out_shop = base.find('div', {'class': 'bxr-outstock-wrap'})
    if out_shop is not None:
        result['Наличие'] = out_shop.text
    in_shop = base.find('div', {'class': 'bxr-instock-wrap'})
    if in_shop is not None:
        result['Наличие'] = in_shop.text

    # price
    price_info = base.find('div', {'class': 'bxr-right-col-detail bxr-detail-col'})
    if price_info is not None:
        all_meta = price_info.find_all('meta')
        if all_meta:
            result['Цена'] = float(all_meta[0]['content'])

    # discounted price: read through a real browser after clicking the checkbox
    if base.find('label', {'class': 'ptv-label'}) is not None:
        discounted = _fetch_discounted_price(book_link, log_lock)
        if discounted is not None:
            result['Цена (скидка)'] = discounted

    # description (the first character of the block text is dropped)
    desc = base.find('div', {'class': 'bxr-detail-tab-content'})
    if desc is not None:
        result['Описание'] = desc.text[1:]

    return result

In [9]:
def parse_page_wrapper(i):
    """Worker body: drain the shared URL queue into ``data/part_<i>.jsonl.gz``.

    Reads the notebook-level globals ``queue``, ``f_lock``, ``pb_lock`` and
    ``pbar``. Each parsed record is written as one JSON line.

    Args:
        i: Worker index, used to name this worker's output file.
    """
    # A fresh run may not have the output directory yet.
    os.makedirs('data', exist_ok=True)
    part_path = 'data/part_{:05d}.jsonl.gz'.format(i)

    with gzip.open(part_path, mode='wt', encoding='utf-8', newline='\n') as f_json:
        while True:
            # get_nowait() instead of empty() + get(): another worker may take
            # the last URL between the two calls, and a blocking get() would
            # then wait forever.
            try:
                url = queue.get_nowait()
            except Empty:
                break

            record = parse_page(url, f_lock)
            f_json.write(json.dumps(record, ensure_ascii=False) + '\n')

            # the progress counter must be updated atomically
            with pb_lock:
                pbar.update(1)

In [10]:
# Shared work queue: parse_page_wrapper pulls book URLs from it until it is empty.
queue = Queue() 
for url in book_links:
    queue.put(url)

In [11]:
# One worker thread per output part file. The merge step below reads 8 part
# files, so keep this value in sync with it.
N_WORKERS = 8

with Pool(processes=N_WORKERS) as pool, tqdm(total=queue.qsize()) as pbar:
    pb_lock = pbar.get_lock()   # guards progress-bar updates
    f_lock = Lock()             # guards writes to log.txt
    # Each task receives its worker index, which names its part file; the
    # count comes from N_WORKERS rather than the private pool._processes.
    pool.map(parse_page_wrapper, range(N_WORKERS))

100%|██████████| 4994/4994 [42:52<00:00,  1.94it/s]  


## Этап 3 - объединяем файлы ##

In [12]:
def records_reader(dirname: str, n_parts: int = 8) -> Generator[Dict[str, Any], None, None]:
    """Yield every JSON record stored in the gzip part files of ``dirname``.

    Args:
        dirname: Directory containing ``part_00000.jsonl.gz`` and onwards.
        n_parts: Number of consecutive part files to read, starting at index 0.

    Yields:
        One decoded JSON object per non-empty line, part by part.
    """
    for i in range(n_parts):
        part_path = dirname + '/part_{:05d}.jsonl.gz'.format(i)
        # Text mode decodes the UTF-8 stream and splits it into lines.
        with gzip.open(part_path, mode='rt', encoding='utf-8') as f:
            for line in f:
                # Skip blank lines rather than failing in json.loads.
                if line.strip():
                    yield json.loads(line)

In [13]:
# Build one table from all part files; keys missing from a record become NaN.
df = pd.DataFrame(records_reader('data'))
# index=False keeps the positional row index out of the CSV file.
df.to_csv('hw_3.csv', index=False)

In [14]:
# Show the last rows as a quick visual check of the merged table.
df.tail()

Unnamed: 0,url,Иллюстрации,Название,ISBN,Издатель,Автор,Язык,Возрастные ограничения,Рекомендованный возраст,Формат,...,Цена (скидка),Читательская аудитория,Серия,Оценка,Число голосов,Рейтинг,Носитель,Тип продукта,Издание,Обзор/Ролик
4989,https://shop.relod.ru/catalog-products/to_davy...,[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,To Davy Jones Below,9781849015196,C & R Crime,Dunn Carola,English,16+,16+,Paperback,...,,,,,,,,,,
4990,https://shop.relod.ru/catalog-products/the_lon...,[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,The Long Sword,9781409137511,Orion,Cameron Christian,English,16+,16+,Paperback,...,,,,,,,,,,
4991,https://shop.relod.ru/catalog-products/the_wom...,[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,The Woman in Black: Book 3,9781447225676,Pan Macmillan,Kerry Wilkinson,English,16+,16+,Paperback,...,,,,,,,,,,
4992,https://shop.relod.ru/catalog-products/thrones...,[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,"Thrones, Dominations",9781444792959,Hodder & Stoughton,"Sayers Dorothy, ...",English,16+,16+,Paperback,...,,,,,,,,,,
4993,https://shop.relod.ru/catalog-products/the_sam...,[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,The Samurai Inheritance,9780552167932,Transworld Publishers,James Douglas,English,16+,16+,Paperback,...,,,,,,,,,,
