In [1]:
import requests
import functools
from bs4 import BeautifulSoup
from lxml import html
import re
from multiprocessing import Pool, Lock, Value
from time import sleep
import csv

In [2]:
def get_page(url, n_attempts=5, t_sleep=1, **kwargs):
    """Fetch ``url`` with retries and return the response, or ``None`` on failure.

    Args:
        url: Address requested with ``requests.get``.
        n_attempts: Maximum number of requests to make.
        t_sleep: Seconds to wait between two failed attempts.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``requests.get`` (for example ``params``, ``headers``, ``timeout``).

    Returns:
        The first response whose status code is 200, or ``None`` when every
        attempt either returned another status or raised a ``requests``
        exception.
    """
    for attempt in range(n_attempts):
        try:
            # kwargs must be unpacked: passed positionally the dict would be
            # interpreted as the ``params`` query string.
            page = requests.get(url, **kwargs)
        except requests.RequestException:
            # Network-level failures count as a failed attempt so callers can
            # rely on the "None means failure" contract.
            page = None
        if page is not None and page.status_code == 200:
            return page
        # Waiting after the final attempt would only delay the None result.
        if attempt < n_attempts - 1:
            sleep(t_sleep)
    return None
    
def log_authors(func):
    """Decorator that records each call of ``func`` in ``log_authors.txt``.

    For every call two lines are appended to the log: the wrapped function's
    name with the ``author_id`` argument, and ``Number of books:`` followed by
    ``len()`` of the returned value.

    Args:
        func: Callable taking a single ``author_id`` argument and returning a
            sized collection.

    Returns:
        The wrapping function; it returns whatever ``func`` returns.
    """
    @functools.wraps(func)
    def wrapper(author_id):
        # Open the log per call instead of once at decoration time, so the
        # handle is always closed and the entries are flushed to disk even if
        # ``func`` raises.
        with open('log_authors.txt', 'a') as f_log:
            print(func.__name__, author_id, file=f_log)
            result = func(author_id)
            print('Number of books:', len(result), file=f_log)
        return result
    return wrapper

@log_authors
def parse_author(author_id):
    """Collect the book links from every listing page of one author.

    Listing pages are requested one by one, starting at page 1, until a page
    yields no book links or cannot be downloaded.

    Args:
        author_id: Identifier inserted into the
            ``https://www.respublica.ru/authors/<id>?page=<n>`` URL.

    Returns:
        list[str]: Absolute book URLs gathered so far (possibly empty).
    """
    books = []
    page_number = 1

    while True:
        page = get_page('https://www.respublica.ru/authors/{0}?page={1}'.format(author_id, page_number))
        if page is None:
            print('Cannot get the page. author_id:', author_id, 'page_mumber:', page_number)
            return books

        soup = BeautifulSoup(page.text, 'html.parser')

        listing_products = soup.find('div', {'class': "rd-page-listing__products"})
        if listing_products is None:
            # find() returns None when the container is absent; treat such a
            # page as the end of the listing instead of raising AttributeError.
            break
        items = listing_products.find_all('div', {'class': "rd-listing-product-item__title"})

        page_books = []
        for item in items:
            link = item.find('a')
            # Skip titles without a usable link rather than failing the run.
            if link is not None and link.get('href'):
                page_books.append('https://www.respublica.ru{}'.format(link.get('href')))

        if not page_books:
            # A page without books marks the end of the pagination.
            break

        books += page_books
        page_number += 1

    return books

In [3]:
# Author identifiers that the next cell feeds to parse_author, one at a time.
author_ids = [
    15753, 19769, 15844, 19768, 20948, 43102, 19073,
    19071, 20416, 17675, 19076, 20700, 28169, 19805,
    20010, 16471, 17717, 19806, 43162, 20935, 16340,
    20282, 19075, 15764, 16297, 20591, 26822, 19072,
    19725, 20542, 17896, 19070, 20487, 19727, 20540,
]

In [4]:
# Gather the book links of every author into a single flat list.
book_hrefs = []
for id_ in author_ids:
    book_hrefs.extend(parse_author(id_))
# Trailing expression: the notebook displays the total number of links.
len(book_hrefs)

2453

In [5]:
# Save the collected links, one URL per row.
# newline='' lets the csv module control line endings itself (otherwise extra
# blank lines can appear on platforms that translate '\n').
with open('book_hrefs.csv', 'w', newline='', encoding='utf-8') as f_csv:
    csv_writer = csv.writer(f_csv)
    # Each row must be a sequence of fields: a bare string would be iterated
    # character by character, producing one column per character of the URL.
    csv_writer.writerows([href] for href in book_hrefs)

In [6]:
def parse_book(href):
    """Scrape one book page into a flat dictionary ("card").

    Args:
        href: Absolute URL of the book page.

    Returns:
        dict: Always contains ``'URL'``. Every other key is added only when
        the corresponding element is present on the page. If the page cannot
        be downloaded only ``'URL'`` is returned; if parsing fails part-way,
        the fields collected up to that point are returned.
    """
    card = dict()
    # URL
    card['URL'] = href

    page = get_page(href)
    if page is None:
        print('Error occured requesting', href)
        return card
    soup = BeautifulSoup(page.text, 'html.parser')

    try:
        # Article number (SKU)
        item = soup.find('div', {'class': 'rd-page-product__article'})
        if item is not None:
            item = item.find(itemprop='sku')
            if item is not None:
                card['ID'] = item.text

        # Title
        item = soup.find('h1', {'class': 'rd-page-product__title'})
        if item is not None:
            card['Название'] = item.text

        # Author
        item = soup.find('div', {'class': 'rd-page-product__underline'})
        if item is not None:
            card['Автор'] = item.text

        # Preview
        item = soup.find('div', {'class': 'pages-view'})
        if item is not None:
            link = item.find('a')
            if link is not None:
                card['Превью'] = 'https://www.respublica.ru{}'.format(
                                link.get('href'))

        # Image
        item = soup.find('img', {'class': 'rd-page-product__img'})
        if item is not None:
            card['Изображение'] = 'https://www.respublica.ru{}'.format(
                            item.get('src'))

        # Description
        item = soup.find('div', {'class': 'rd-page-product__desc-body'})
        if item is not None:
            card['Описание'] = item.text

        # Price
        item = soup.find('div', {'class': 'rd-page-product__price'})
        if item is not None:
            num = item.find('span', {'class': 'num'})
            if num is not None:
                card['Цена'] = float(num.text.replace(' ', ''))

        # Price (old)
        item = soup.find('div', {'class': 'rd-page-product__price-old'})
        if item is not None:
            prev = item.find('span', {'class': 'prev'})
            if prev is not None:
                card['Цена (старая)'] = float(prev.text.split()[0])

        # In stock: a buttons block must exist and its first link must not be
        # the "notify me on arrival" button. A buttons block without any link
        # is reported as not in stock instead of aborting the whole card.
        item = soup.find('div', {'class': 'rd-page-product__buttons'})
        link = item.find('a') if item is not None else None
        card['В наличии'] = link is not None and link.text != 'Сообщить о поступлении'

        # Category
        item = soup.find('div', {'class': 'rd-page-breadcrumbs rd-page-product__breadcrumbs'})
        if item is not None:
            breadcrumps = item.find_all('span', {'class': 'rd-page-breadcrumbs-item'})
            names = (br.find(itemprop='name') for br in breadcrumps)
            # Assigned inside the guard: without breadcrumbs there is nothing
            # to store (the variable would not even be defined).
            card['Категория'] = '; '.join(name.text for name in names if name is not None)

        # Number of reviews
        # Number of ratings
        # Rating
        item = soup.find('div', {'class': 'rd-rating-stars'})
        if item is not None:
            review_count = soup.find('meta', itemprop='reviewCount')
            rating_count = soup.find('meta', itemprop='ratingCount')
            rating_value = soup.find('meta', itemprop='ratingValue')

            if review_count is not None:
                card['Число отзывов'] = int(review_count.get('content'))
            else:
                card['Число отзывов'] = 0

            if rating_count is not None:
                card['Число оценок'] = int(rating_count.get('content'))
            else:
                card['Число оценок'] = 0

            # Guard on the element that is actually dereferenced.
            if rating_value is not None:
                card['Оценка'] = float(rating_value.get('content'))

        # Parameter table: every name/value pair becomes its own key
        item = soup.find('div', {'class': 'rd-page-product__desc-params'})
        if item is not None:
            params = item.find_all('p', {'class': 'rd-page-product__desc-param'})
            for param in params:
                name = param.find(itemprop='name')
                value = param.find(itemprop='value')
                if name is not None and value is not None:
                    card[name.text] = value.text
    except Exception as exc:
        # Best effort: keep what was parsed, but report the cause. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the run.
        print('Error occured parsing', href, repr(exc))

    return card

In [8]:
# multiprocessing primitives used by parse_book_wrapper for progress reporting:
# a lock and an integer counter that starts at zero.
mutex = Lock()
n_processed = Value('i', 0)

def parse_book_wrapper(href):
    """Run parse_book on ``href`` and update the progress counter.

    While holding ``mutex`` the counter is incremented, and a progress line is
    printed (overwriting the current line via ``\\r``) each time the count
    reaches a multiple of ten.

    Returns:
        Whatever parse_book returned for ``href``.
    """
    card = parse_book(href)
    with mutex:
        n_processed.value += 1
        done = n_processed.value
        if done % 10 == 0:
            print(f"\r{done} objects are processed...", end='', flush=True)
    return card

# Parse all book pages with a pool of ten worker processes.
with Pool(processes=10) as pool:
    result = pool.map(parse_book_wrapper, book_hrefs)

# Cards do not share one key set (optional fields, per-book parameters), so the
# header is the union of all keys in first-seen order.
fieldnames = list(dict.fromkeys(key for card in result for key in card))

# csv.writer would iterate each dict and write only its keys; DictWriter writes
# the values and leaves missing fields empty. newline='' is what the csv module
# expects, and utf-8 keeps the Cyrillic text independent of the locale.
with open('hw_3.csv', 'w', newline='', encoding='utf-8') as f_csv:
    csv_writer = csv.DictWriter(f_csv, fieldnames=fieldnames)
    csv_writer.writeheader()
    csv_writer.writerows(result)


2450 objects are processed...