In [1]:
import bs4
import pandas as pd
import pickle
import random
import requests
from time import sleep
from tqdm import tqdm 

In [2]:
main_url = 'https://vseotzyvy.ru/'
# requests applies no timeout by default, so set one explicitly to keep this
# cell from hanging forever on a stalled connection, and fail fast on an HTTP
# error instead of silently parsing an error page (which would leave the
# category list empty further down).
req = requests.get(main_url, timeout=30)
req.raise_for_status()

In [3]:
# Parse the main page and collect every <div> whose class attribute is
# 'home_sub_cat clearfix' (presumably one block of category links each --
# TODO confirm against the page markup).
parser = bs4.BeautifulSoup(req.text, 'lxml')
category_div_filter = {'class': 'home_sub_cat clearfix'}
all_categories = parser.find_all('div', attrs=category_div_filter)

Here we collect the links to all categories (the last link in each category block is skipped).

In [4]:
# Gather the href of every <a> inside each category block, leaving out the
# last link of each block (presumably a non-category link such as
# "show all" -- TODO confirm).
hrefs = set()
for columns in all_categories:
    cur_categories = columns.select('a')
    hrefs.update(link.get('href') for link in cur_categories[:-1])

In [5]:
# The category pages are requested (page by page, with a delay) by the crawl
# in the following cell, so nothing has to be fetched here: requesting every
# category only to discard the response just adds unthrottled load on the
# site. Display the number of collected category links as a sanity check.
len(hrefs)
    

Next, we collect the links to all items in all categories.

In [6]:
# Walk through every category page by page and collect the link found in
# each <h3> heading.
items_set = set()
for href in tqdm(hrefs):
    item_page_prev = []
    for i in range(1, 100):  # visits at most pages 1..99 of a category
        sleep(0.3)  # pause between requests
        params = {'page': i}
        # requests applies no timeout by default; without one a stalled
        # connection would block the whole crawl.
        req = requests.get(main_url + href, params, timeout=30)
        parser = bs4.BeautifulSoup(req.text, 'lxml')
        item_page = parser.find_all('h3')
        if len(item_page) == 0:  # no more pages available
            break
        if item_page == item_page_prev:  # the previous page came back again
            break
        for item in item_page:
            link = item.find('a')
            item_href = link.get('href') if link is not None else None
            # Skip headings that carry no usable link: indexing an empty
            # result would raise IndexError, and storing None would break
            # the later `main_url + item_url` concatenation.
            if item_href:
                items_set.add(item_href)
        item_page_prev = item_page

  0%|          | 0/122 [00:00<?, ?it/s]  1%|          | 1/122 [00:10<21:22, 10.60s/it]  2%|▏         | 2/122 [00:18<17:37,  8.81s/it]  2%|▏         | 3/122 [00:18<10:03,  5.07s/it]  3%|▎         | 4/122 [00:20<07:01,  3.58s/it]  4%|▍         | 5/122 [00:27<09:45,  5.00s/it]  5%|▍         | 6/122 [00:55<24:56, 12.90s/it]  6%|▌         | 7/122 [01:10<25:34, 13.35s/it]  7%|▋         | 8/122 [01:20<23:27, 12.35s/it]  7%|▋         | 9/122 [01:42<28:53, 15.34s/it]  8%|▊         | 10/122 [01:54<27:07, 14.53s/it]  9%|▉         | 11/122 [01:57<19:52, 10.74s/it] 10%|▉         | 12/122 [02:01<16:14,  8.86s/it] 11%|█         | 13/122 [02:20<21:32, 11.85s/it] 11%|█▏        | 14/122 [02:26<18:04, 10.04s/it] 12%|█▏        | 15/122 [02:33<16:21,  9.17s/it] 13%|█▎        | 16/122 [02:39<14:28,  8.19s/it] 14%|█▍        | 17/122 [03:18<30:32, 17.45s/it] 15%|█▍        | 18/122 [03:22<23:14, 13.40s/it] 16%|█▌        | 19/122 [03:30<20:05, 11.71s/it] 16%|█▋        | 20/122 [03:44<21:21,

In [7]:
# Keep the crawled links on disk: reuse 'items_set.pickle' when it already
# exists, otherwise save the freshly crawled set so that later runs do not
# depend on repeating the crawl. (Previously this cell only loaded the file,
# so it failed with FileNotFoundError on a machine that had never saved it.)
# NOTE(review): pickle.load can execute arbitrary code -- only load a file
# that this notebook produced itself.
try:
    with open('items_set.pickle', 'rb') as handle:
        items_set = pickle.load(handle)
except FileNotFoundError:
    with open('items_set.pickle', 'wb') as handle:
        pickle.dump(items_set, handle)

In [8]:
def binarize_mark(mark):
    """Turn a numeric mark into a binary label: 0 if it is below 4, else 1."""
    return 0 if mark < 4 else 1

def get_train_df(items_num, pages_max, emotion):
    """Scrape reviews for a random sample of items and return them labelled.

    Args:
        items_num: number of item links drawn at random from ``items_set``.
        pages_max: maximum number of review pages requested per item.
        emotion: value sent as the ``sort`` query parameter (the notebook
            passes 'negative' or 'positive').

    Returns:
        A DataFrame with a 'review' column (review text) and a 'mark' column
        (the scraped mark passed through ``binarize_mark``).

    Raises:
        ValueError: if ``items_num`` is larger than the number of items in
            ``items_set`` (raised by ``random.sample``).
    """
    marks = []
    reviews = []
    # random.sample() requires a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError since Python 3.11.
    sampled_items = random.sample(list(items_set), items_num)
    for item_url in tqdm(sampled_items):
        sleep(0.5)
        for page in range(1, pages_max + 1):
            if pages_max > 1:
                sleep(0.3)
            # To avoid the problem of unbalanced classes, only reviews of one
            # sentiment are requested per call; each sentiment is collected
            # separately.
            params = {'page': page, 'sort': emotion}
            # Explicit timeout: requests would otherwise wait indefinitely.
            req = requests.get(main_url + item_url, params, timeout=30)
            parser = bs4.BeautifulSoup(req.text, 'lxml')
            marks_html = parser.find_all(
                'span', attrs={'class': 'bold', 'itemprop': 'ratingValue'})
            reviews_html = parser.find_all(
                'span', attrs={'itemprop': 'reviewBody'})
            # Stop paging when the page has no marks (no more pages) or when
            # marks and reviews cannot be paired one-to-one.
            if not marks_html or len(marks_html) != len(reviews_html):
                break
            marks.extend(int(mark.text) for mark in marks_html)
            reviews.extend(review.text for review in reviews_html)

    df = pd.DataFrame({'review': reviews, 'mark': marks})
    df['mark'] = df['mark'].apply(binarize_mark)
    return df

In [10]:
# There are much fewer negative reviews, so more items are sampled for them
# than for the positive ones (one review page per item in both cases).
neg_df = get_train_df(items_num=2000, pages_max=1, emotion='negative')
pos_df = get_train_df(items_num=500, pages_max=1, emotion='positive')
train_df = pd.concat([neg_df, pos_df])

# Persist the combined dataset without the index column.
train_df.to_csv('sentiment_df.csv', index=False)

  0%|          | 0/2000 [00:00<?, ?it/s]  0%|          | 1/2000 [00:00<24:39,  1.35it/s]  0%|          | 2/2000 [00:01<24:01,  1.39it/s]  0%|          | 3/2000 [00:02<24:17,  1.37it/s]  0%|          | 4/2000 [00:02<24:24,  1.36it/s]  0%|          | 5/2000 [00:03<25:18,  1.31it/s]  0%|          | 6/2000 [00:04<24:44,  1.34it/s]  0%|          | 7/2000 [00:05<25:34,  1.30it/s]  0%|          | 8/2000 [00:06<25:16,  1.31it/s]  0%|          | 9/2000 [00:06<24:49,  1.34it/s]  0%|          | 10/2000 [00:07<24:31,  1.35it/s]  1%|          | 11/2000 [00:08<25:26,  1.30it/s]  1%|          | 12/2000 [00:09<25:00,  1.32it/s]  1%|          | 13/2000 [00:09<24:41,  1.34it/s]  1%|          | 14/2000 [00:10<24:46,  1.34it/s]  1%|          | 15/2000 [00:11<24:27,  1.35it/s]  1%|          | 16/2000 [00:11<24:17,  1.36it/s]  1%|          | 17/2000 [00:12<24:32,  1.35it/s]  1%|          | 18/2000 [00:13<24:15,  1.36it/s]  1%|          | 19/2000 [00:14<24:01,  1.37it/s]  1%|          | 