**Категорія - "Крупная и встраиваемая бытовая техника"<br/>**
https://rozetka.com.ua/big-builtin-bt/c4628180/

In [1]:
import requests, re, json, pdb
from bs4 import BeautifulSoup
from urllib.parse import unquote, urljoin
from time import sleep
from csv import DictWriter
from progressbar import ProgressBar

### Collect links to individual products

In [2]:
base_url = 'https://rozetka.com.ua/big-builtin-bt/c4628180/'

# Download the landing page of the category and parse it once.
base_r = requests.get(base_url)
base_soup = BeautifulSoup(base_r.text, 'lxml')

# One record per link of the left-hand category menu.
# 'filter' is joined onto every menu href: per the original author's note this
# makes the site return the full product list of the category, even when the
# menu link itself leads to another sub-catalogue.
category_links = [
    dict(catname=anchor.text.strip(),
         link=urljoin(anchor.get('href'), 'filter'))
    for anchor in base_soup.select('#menu_categories_left a.m-cat-l-i-title-link')
]

In [3]:
def get_product_info(pr, catname):
    """Build the CSV record for one product of a listing page.

    Parameters
    ----------
    pr : object exposing ``find_previous(name)`` and ``get(attr)``
        The element that carries the ``data-count`` and ``href`` attributes
        (in this notebook: the ``<a>`` tag wrapping the rating stars).
    catname : str
        Category name stored unchanged in the record.

    Returns
    -------
    dict
        Keys ``price`` (int), ``total_reviews``, ``reviews_link``,
        ``product_id`` (digits captured from the link, as a string) and
        ``category``.

    Raises
    ------
    ValueError
        If the price JSON cannot be located or decoded, if the price is not
        an integer, or if no product id can be found in the reviews link.
    """
    # The price is read from a URL-quoted JSON literal assigned to
    # `pricerawjson` inside the element returned by find_previous('script').
    price_match = re.search(r"pricerawjson = '(.+)'", str(pr.find_previous('script')))
    if not price_match:
        raise ValueError(
            f"no 'pricerawjson' assignment found before product link {pr.get('href')!r}")

    try:
        price = json.loads(unquote(price_match.group(1)))['price']
        total_reviews = pr.get('data-count')
        reviews_link = pr.get('href')
        # Raw string so that \d is a regex escape rather than a (deprecated)
        # string escape; the pattern itself is unchanged.
        product_id = re.search(r'/p?(\d+)', reviews_link).group(1)
    except Exception as e:
        # Fail loudly instead of dropping into the debugger and then running
        # on with undefined names.
        raise ValueError(f'cannot parse product info: {e!r}') from e

    if not isinstance(price, int):
        raise ValueError(f'expected an integer price, got {price!r}')

    return {
        'price': price,
        'total_reviews': total_reviews,
        'reviews_link': reviews_link,
        'product_id': product_id,
        'category': catname,
    }

In [4]:
def _product_rows_on_page(soup, catname):
    """Return one product-info dict per rating-stars element found in the goods block of `soup`."""
    return [get_product_info(stars.find_parent('a'), catname)
            for stars in soup.select('#catalog_goods_block span.g-rating-stars')]


def get_product_links(cat, writer):
    """Scrape every listing page of a category and write one row per product.

    Parameters
    ----------
    cat : dict
        Must contain ``'link'`` (URL of the category listing) and
        ``'catname'`` (name stored with every product of the category).
    writer : object with a ``writerow(dict)`` method
        Receives one dict per product, as produced by ``get_product_info``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the text of the last paginator link is not an integer.
    """
    r1 = requests.get(cat['link'], allow_redirects=True)
    soup1 = BeautifulSoup(r1.text, 'lxml')
    page_links = soup1.select('.paginator-catalog a')

    # No paginator but a table of sub-catalogues: recurse into each of them,
    # keeping the name of the top-level category.
    if not page_links and soup1.select('div.pab-table'):
        for subcat in soup1.select('.pab-table .pab-h3 a'):
            get_product_links({'link': urljoin(subcat.get('href'), 'filter'),
                               'catname': cat['catname']},
                              writer)
        return

    if page_links:
        # The last paginator link is expected to carry the number of pages.
        try:
            last_page = int(page_links[-1].text)
        except ValueError as e:
            raise ValueError(
                f"cannot read page count for {cat['link']!r} "
                f"from paginator text {page_links[-1].text!r}") from e
    else:
        # single-page category
        last_page = 1

    products = _product_rows_on_page(soup1, cat['catname'])

    for page in range(2, last_page + 1):
        rp = requests.get(urljoin(cat['link'], f'page={page}'), allow_redirects=True)
        products += _product_rows_on_page(BeautifulSoup(rp.text, 'lxml'), cat['catname'])
        # short pause between page requests
        sleep(0.1)

    for pr in products:
        writer.writerow(pr)

In [5]:
colnames = ['product_id',
            'reviews_link',
            'category',
            'price',
            'total_reviews',]

# `with` closes the file even if scraping fails half-way.
# newline='' is what the csv module asks for on files it writes to, and the
# encoding is explicit because the category names contain Cyrillic text.
with open('review_links.csv', 'w', newline='', encoding='utf-8') as f:
    writer = DictWriter(f, fieldnames=colnames)
    writer.writeheader()

    # One progress-bar tick per top-level category.
    bar = ProgressBar()
    for cat in bar(category_links):
        get_product_links(cat, writer)

100% (22 of 22) |#########################| Elapsed Time: 0:03:17 Time: 0:03:17


\>1300 products have at least one "self-annotated" review, i.e. a review that carries a star rating.

### Scrape the reviews themselves

In [7]:
import pandas as pd

In [8]:
# Load the product table written by the link-collection step
# (columns: product_id, reviews_link, category, price, total_reviews).
df = pd.read_csv('review_links.csv')

In [55]:
# Take the row labelled 5 as a plain dict to inspect what one product record looks like.
# Note: despite its name, `reviews_link` holds the whole record, not only the URL.
reviews_link = df.loc[5, ].to_dict()
reviews_link

{'category': 'Холодильники',
 'price': 16499,
 'product_id': 13991066,
 'reviews_link': 'https://bt.rozetka.com.ua/samsung_rb37j5100sa_ua/p13991066/comments/',
 'total_reviews': 92}

In [56]:
def parse_reviews(rv):
    """Flatten one review element into a dict with keys 'id', 'text' and 'ranking'.

    'id' is the element's ``name`` attribute, 'ranking' is the ``content``
    attribute of the first rating-stars span, and 'text' is the stripped text
    of every '.pp-review-text-i' element joined by newlines, with
    non-breaking spaces removed (not replaced by a regular space).

    Raises IndexError when the element has no rating-stars span.
    """
    review_id = rv.get('name')
    stars = rv.select('span.sprite.g-rating-stars-i')[0]

    paragraphs = []
    for node in rv.select('.pp-review-text-i'):
        paragraphs.append(node.get_text().strip())
    body = '\n'.join(paragraphs)

    return {
        'id': review_id,
        'text': body.replace('\xa0', ''),
        'ranking': stars.get('content'),
    }

In [57]:
def scrape_reviews(soup):
    """Parse every review on a page that carries a product ranking.

    Looks at each 'article.pp-review-i' element of `soup`, skips those
    without a rating-stars span and returns the list of dicts produced by
    ``parse_reviews`` for the rest, in page order.
    """
    parsed = []
    for article in soup.select('article.pp-review-i'):
        # has product ranking
        if len(article.select('span.sprite.g-rating-stars-i')) > 0:
            parsed.append(parse_reviews(article))
    return parsed

In [71]:
def get_reviews(product):
    """Download all review pages of one product and return its ranked reviews.

    Parameters
    ----------
    product : dict
        Must contain ``'reviews_link'`` (URL of the first review page),
        ``'product_id'``, ``'category'`` and ``'price'``.

    Returns
    -------
    pandas.DataFrame
        One row per review returned by ``scrape_reviews``, with the product's
        ``product_id``, ``category`` and ``price`` added as columns.

    Raises
    ------
    ValueError
        If the text of the last paginator link is not an integer.
    """
    r = requests.get(product['reviews_link'])
    soup = BeautifulSoup(r.text, 'lxml')
    page_links = soup.select('.paginator-catalog-l-link')

    if page_links:
        # The last paginator link is expected to carry the number of pages.
        try:
            last_page = int(page_links[-1].text)
        except ValueError as e:
            raise ValueError(
                f"cannot read page count for {product['reviews_link']!r} "
                f"from paginator text {page_links[-1].text!r}") from e
    else:
        # all reviews fit on a single page
        last_page = 1

    reviews = scrape_reviews(soup)

    for page in range(2, last_page + 1):
        rp = requests.get(urljoin(product['reviews_link'], f'page={page}'), allow_redirects=True)
        reviews += scrape_reviews(BeautifulSoup(rp.text, 'lxml'))
        # short pause between page requests
        sleep(0.1)

    reviews = pd.DataFrame(reviews)
    # Attach the product-level fields to every review row.
    reviews['product_id'] = product['product_id']
    reviews['category'] = product['category']
    reviews['price'] = product['price']
    return reviews

In [72]:
# Collect the per-product frames first and concatenate once at the end:
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration.
review_frames = []

bar = ProgressBar()
for product in bar(df.to_dict(orient='records')):
    review_frames.append(get_reviews(product))

# pd.concat raises on an empty list, so keep the previous "no products" result (None).
allreviews = pd.concat(review_frames) if review_frames else None


100% (1360 of 1360) |#####################| Elapsed Time: 0:39:55 Time: 0:39:55


### Language Detection

In [76]:
from polyglot.detect import Detector
# based on cld2

In [None]:
# Tag every review with the language code reported by polyglot's Detector
# (quiet=True is passed through to the detector unchanged).
allreviews['lang'] = allreviews['text'].apply(
    lambda review_text: Detector(review_text, quiet=True).language.code
)
# Keep only the reviews whose detected language code is 'uk'.
allreviews = allreviews[allreviews['lang'] == 'uk']

In [94]:
# Save the filtered reviews as a tab-separated file, without the DataFrame index.
allreviews.to_csv('reviews.tsv', sep='\t', index=False)