In [1]:
import codecs
import json
import logging
import os
from collections import defaultdict
from multiprocessing.dummy import Pool as ThreadPool

import regex
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# On Windows, download the prebuilt pycurl wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs/#pycurl and install it:
# ! pip install --user ./pycurl-7.43.0.3-cp37-cp37m-win_amd64.whl

### Get All Movie Links

In [2]:
# Listing-page URL template with two positional placeholders:
# {0} goes into the path after 'navigator/', {1} is the value of the 'page' query parameter.
base_url = 'https://www.kinopoisk.ru/lists/navigator/{0}/?page={1}&tab=all'
# Collected film links; starts as a set, so a link added more than once is stored only once.
MOVIE_URLS = set()

In [3]:
def parse_cookie_header(raw_cookies):
    """Turn a ``name=value; name2=value2`` cookie header string into a dict.

    Only the first ``=`` of every pair separates the name from the value, so
    values that contain ``=`` themselves (for example base64 padding) are kept
    intact. A fragment without ``=`` maps to an empty string. Newlines are
    removed and blank fragments are skipped.

    Args:
        raw_cookies: cookie header text, pairs separated by ``;``.

    Returns:
        dict mapping cookie names to their string values.
    """
    parsed = {}
    for fragment in raw_cookies.replace('\n', '').split(';'):
        name, _, value = fragment.strip().partition('=')
        if name:
            parsed[name] = value
    return parsed


# SECURITY: the cookie header of a logged-in browser session must not be
# stored in the notebook. Copy the value of the browser's "Cookie" request
# header into the KINOPOISK_COOKIES environment variable before starting the
# kernel; it is parsed here into the dict passed to every request.
cookies = parse_cookie_header(os.environ.get('KINOPOISK_COOKIES', ''))
if not cookies:
    logging.warning('KINOPOISK_COOKIES is not set; requests will be sent without cookies')

In [4]:
# Walk the listing of every year, pages 1..29, and add each film link found
# on a page to MOVIE_URLS.
# The class patterns are compiled once instead of once per page / per item.
film_div_class = regex.compile('desktop-seo-selection-film-item selection-list__film')
film_link_class = regex.compile('selection-film-item-meta__link')

for year in [2021, 2020, 2019, 2018, 2017, 2016, 2015]:
    for page_idx in range(1, 30):
        page_url = base_url.format(year, page_idx)
        try:
            # Without a timeout a single stalled connection blocks the crawl forever.
            result = requests.request('GET', page_url, cookies=cookies, timeout=30)
        except requests.RequestException as e:
            # Best effort: report the failed page and carry on with the next one.
            logging.error('%s %s' % (page_url, str(e)))
            continue
        soup = BeautifulSoup(result.content.decode(), 'lxml')

        for film_div in soup.find_all('div', {'class': film_div_class}):
            film_link = film_div.find('a', {'class': film_link_class})
            # An item without a link (or without an href) is skipped rather
            # than aborting the whole crawl with IndexError / KeyError.
            if film_link is None or 'href' not in film_link.attrs:
                continue
            MOVIE_URLS.add(film_link.attrs['href'])

In [5]:
# Turn the collected links into a list so they can be sliced later on,
# then report how many there are and show the first five.
MOVIE_URLS = list(MOVIE_URLS)
url_count = len(MOVIE_URLS)
sample_urls = '\n'.join(MOVIE_URLS[:5])
print('Total movie urls: %d' % url_count)
print('Examples: \n%s' % sample_urls)

Total movie urls: 3135
Examples: 
/film/839646/
/film/780306/
/film/908348/
/film/765099/
/film/1179628/


In [6]:
# Reviews-page URL template. {0} is inserted directly after the host and directly
# before 'reviews/', so the substituted film path must carry its own leading and
# trailing slash (the collected links look like '/film/839646/').
# NOTE(review): 'perpage/200' presumably asks the site for 200 reviews per page — confirm.
base_reviews_url = 'https://www.kinopoisk.ru{0}reviews/ord/date/status/all/perpage/200/'

In [7]:
def text_with_newlines(elem):
    """Flatten the descendants of ``elem`` into one string.

    Every string node contributes its text with surrounding whitespace
    stripped; every ``br`` node contributes a single newline. All other
    nodes contribute nothing themselves (their string descendants still do).

    Args:
        elem: object exposing ``recursiveChildGenerator()``; non-string nodes
            it yields must have a ``name`` attribute.

    Returns:
        The concatenated text.
    """
    pieces = []
    for node in elem.recursiveChildGenerator():
        if isinstance(node, str):
            pieces.append(node.strip())
        elif node.name == 'br':
            pieces.append('\n')
    return ''.join(pieces)

In [8]:
def process_movie(movie_href, timeout=30):
    """Download the reviews page of one movie and extract its reviews.

    Args:
        movie_href: film path substituted into ``base_reviews_url``
            (for example '/film/839646/').
        timeout: seconds to wait for the HTTP response; without it a stalled
            connection would block its worker thread indefinitely.

    Returns:
        A list of dicts. Every dict has 'content' (the parsed page) and
        'name' (``movie_href``). Dicts built from a review additionally have
        'sentiment' (the second CSS class of the review's 'response ...' div)
        and 'text' (the review text with line breaks removed). If the page
        contains no review blocks, the list holds one dict without those two
        keys. Errors are logged, never raised: a failed request or parse
        yields whatever was collected so far, and a single malformed review
        is skipped without discarding the reviews that follow it.
    """
    results = []
    try:
        response = requests.request(
            'GET', base_reviews_url.format(movie_href), cookies=cookies, timeout=timeout
        )
        soup = BeautifulSoup(response.content.decode(), 'lxml')

        reviews = soup.find_all('div', {'class': 'reviewItem userReview'})
        if not reviews:
            # Keep a record of the page even though it has no reviews.
            results.append({'content': soup.extract(), 'name': movie_href})
            return results

        for review in reviews:
            try:
                review_content = review.find('div', {'class': regex.compile('response .*')})
                sentiment = review_content.attrs['class'][1]
                text_span = review_content.find('span', {'class': '_reachbanner_'})
                text = text_with_newlines(text_span).replace('\n', '')
            except (AttributeError, IndexError, KeyError) as e:
                # Missing div/span (None) or an unexpected class list: skip
                # only this review and keep going.
                logging.error('%s %s' % (movie_href, str(e)))
                continue

            # NOTE(review): every record references the whole parsed page
            # under 'content'; kept because later cells may rely on that key.
            base = {'content': soup.extract(), 'name': movie_href}
            base['sentiment'] = sentiment
            base['text'] = text
            results.append(base)
    except Exception as e:
        # Deliberate best effort: one failing movie must not stop the pool.
        logging.error('%s %s' % (movie_href, str(e)))
    return results

In [9]:
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(l), n)
    for offset in offsets:
        upper = offset + n
        yield l[offset:upper]

In [10]:
# Download the reviews of every collected movie in parallel, write one
# tab-separated row (movie, sentiment, text) per review to ./data.csv and
# count the reviews per sentiment class.
pool_size = 24

all_reviews = []

total_reviews = 0
class_counters = defaultdict(int)

# Ceiling division: the last, shorter chunk counts towards the progress total too.
total_chunks = (len(MOVIE_URLS) + pool_size - 1) // pool_size

# One pool serves the whole run; it is shut down even if a chunk fails.
pool = ThreadPool(pool_size)
try:
    with open('./data.csv', 'w', encoding='utf-8', errors='ignore') as file:
        for urls_chunk in tqdm(chunks(MOVIE_URLS, pool_size), total=total_chunks):
            movies_reviews = pool.map(process_movie, urls_chunk)
            for movie_reviews in movies_reviews:
                all_reviews += movie_reviews
                for review in movie_reviews:
                    # Records of pages without reviews carry no 'text' and are not written.
                    if 'text' in review:
                        # A tab or carriage return inside the text would break the row layout.
                        text = review['text'].replace('\t', ' ').replace('\r', ' ')
                        file.write('{0}\t{1}\t{2}\n'.format(review['name'], review['sentiment'], text))
                        total_reviews += 1
                        class_counters[review['sentiment']] += 1
            # Flush after every chunk so an interrupted run keeps what was scraped so far.
            file.flush()
finally:
    pool.close()
    pool.join()

print('Total {0:d} reviews'.format(total_reviews))
# '.format' (not ',format'): otherwise a literal '{0}' is printed before the JSON.
print('Class balance: \n{0}'.format(json.dumps(class_counters, indent=4)))

131it [01:36,  1.36it/s]                         

Total 4432 reviews
Class balance: 
{0} {
    "good": 2879,
    "neutral": 752,
    "bad": 801
}



