# Получение информации о фильмах и критиках с Кинопоиска

In [4]:
import math
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import numpy as np
import PySimpleGUI as sg


# Empty lists that accumulate the scraped values; they are combined into one DataFrame later
all_reviews_name = []
all_reviews_link = []
all_reviews_date = []
all_reviews_film = []
all_opinions = []
all_press_names = []
all_films_names = []
all_films_years = []
all_films_countries = []
all_films_platforms = []
all_films_genries = []
all_films_creators = []
all_films_main_directors = []
all_films_prices = []


def _first_text(driver, xpath):
    """Return the text of the first element matching *xpath*, or None when nothing matches."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return None


with webdriver.Chrome(service=Service(ChromeDriverManager().install())) as driver:
    # XPATHs
    show_selector_xpath = '//select[@class="navigator_per_page"]'
    max_show_selector_xpath = '//option[@value="200"]'
    num_reviews_xpath = '//div[@class="pagesFromTo"]'
    press_name_xpath = '//h1[@class="breadcrumbs__head"]'
    positives_xpath = '//div[contains(@class, "item positive")]'
    negatives_xpath = '//div[contains(@class, "item negative")]'
    film_name_xpath = '//span[@data-tid="75209b22" or @data-tid="2da92aed"]'
    film_year_xpath = '//a[contains(@href, "movies/year")]'
    film_countries_xpath = '//a[contains(@href, "movies/country")]'
    film_platforms_xpath = '//a[contains(@href, "movies/company-originals")]'
    film_genre_xpath = '//a[contains(@href, "movies/genre")]'
    film_creators_xpath = '//a[contains(@href, "/name/") and @data-tid="603f73a4"]'
    film_price_xpath = '//span[contains(@class, "film-rating-value styles_root")]'
    # Walk through every press outlet
    for press_id in range(1, 156 + 1):
        driver.get(f'https://www.kinopoisk.ru/press/source/{press_id}')
        try:
            press_name = driver.find_elements(By.XPATH, press_name_xpath)[1].text
        except IndexError:  # removed outlet URLs, e.g. https://www.kinopoisk.ru/press/source/5
            continue
        try:
            # Switch the listing to 200 reviews per page, then derive the page count
            driver.find_element(By.XPATH, show_selector_xpath).click()
            driver.find_element(By.XPATH, max_show_selector_xpath).click()
            num_reviews = driver.find_element(By.XPATH, num_reviews_xpath).text.split()[2]
            num_pages = math.ceil(int(num_reviews) / 200)
        except NoSuchElementException:
            # No per-page selector / counter on the page: treat it as a single page
            num_pages = 1
        # Walk through every listing page of the current outlet
        for page_number in range(1, num_pages + 1):
            driver.get(f'https://www.kinopoisk.ru/press/source/{press_id}/page/{page_number}/#list')
            # Collect all critic reviews on the page, positive ones first, then negative ones
            for press_opinion_xpath in [positives_xpath, negatives_xpath]:
                reviews_name = driver.find_elements(By.XPATH, press_opinion_xpath + '//div[@class="name"]/a')
                all_reviews_name.extend([review.text for review in reviews_name])
                reviews_link = driver.find_elements(By.XPATH, press_opinion_xpath + '//div[@class="fullText"]/a')
                all_reviews_link.extend([review.get_attribute('href') for review in reviews_link])
                reviews_date = driver.find_elements(By.XPATH, press_opinion_xpath + '//div[@class="date"]')
                all_reviews_date.extend([review.text for review in reviews_date])
                reviews_film = driver.find_elements(By.XPATH, press_opinion_xpath + '//div[@class="subject"]/a')
                all_reviews_film.extend([review.get_attribute('href') for review in reviews_film])
                # The label ('positive' / 'negative') is the same for every review matched by this XPath
                opinion = re.search(r'positive|negative', press_opinion_xpath)[0]
                all_opinions.extend([opinion] * len(reviews_name))
                all_press_names.extend([press_name] * len(reviews_name))
    # Walk through every film referenced by the collected reviews.
    # Many reviews point at the same film, so each film page is scraped only once
    # and the result is reused for every further review of that film.
    film_cache = {}
    for i, link in enumerate(all_reviews_film):
        # progress bar
        sg.one_line_progress_meter('Progress meter', i + 1, len(all_reviews_film) + 1, '-key-')
        film_id = re.search(r'\d+', link)[0]
        if film_id not in film_cache:
            driver.get(f'https://www.kinopoisk.ru/film/{film_id}/')
            film_name = _first_text(driver, film_name_xpath)
            if film_name is None:
                # No title found under /film/: retry the same id under /series/
                driver.get(f'https://www.kinopoisk.ru/series/{film_id}/')
                film_name = _first_text(driver, film_name_xpath)
            # find_elements returns an empty list (it never raises) when nothing matches
            creator_names = [creator.text for creator in driver.find_elements(By.XPATH, film_creators_xpath)]
            film_cache[film_id] = (
                film_name,
                _first_text(driver, film_year_xpath),
                [country.text for country in driver.find_elements(By.XPATH, film_countries_xpath)],
                [platform.text for platform in driver.find_elements(By.XPATH, film_platforms_xpath)],
                [genre.text for genre in driver.find_elements(By.XPATH, film_genre_xpath)],
                set(creator_names),
                # The first matched creator link is recorded as the main director
                creator_names[0] if creator_names else None,
                _first_text(driver, film_price_xpath),
            )
        (film_name, film_year, film_countries, film_platforms,
         film_genries, film_creators, film_main_director, film_price) = film_cache[film_id]
        # One entry per review link, so these lists stay aligned with all_reviews_film
        all_films_names.append(film_name)
        all_films_years.append(film_year)
        all_films_countries.append(film_countries)
        all_films_platforms.append(film_platforms)
        all_films_genries.append(film_genries)
        all_films_creators.append(film_creators)
        all_films_main_directors.append(film_main_director)
        all_films_prices.append(film_price)

# Создание pd.DataFrame с полученной информацией

In [6]:
# Assemble one row per collected review; every list below must have the same length.
data = {
    'press_name': all_press_names,
    'positive/negative': all_opinions,
    'film_name': all_films_names,
    'film_year': all_films_years,
    'film_director': all_films_main_directors,
    'film_price': all_films_prices,
    'film_countries': all_films_countries,
    'film_platforms': all_films_platforms,
    # The accumulator list is named all_films_genries (not all_films_genres)
    'film_genries': all_films_genries,
    'film_creators': all_films_creators,
    'links': all_reviews_film,
    'reviewers': all_reviews_name,
    'full_link': all_reviews_link,
}
df = pd.DataFrame(data)

# Обработка и анализ данных

In [124]:
def _as_plain_text(value):
    """Flatten one cell of a multi-value column into a plain comma-separated string.

    Strings (e.g. a stringified list such as "['a', 'b']") are stripped of
    brackets, braces and single quotes. Lists and sets, which is what the
    scraping step stores, are joined with ', ' instead of being passed to
    re.sub, which only accepts strings.
    """
    if isinstance(value, str):
        return re.sub(r'[\[\]\'\{\}]', '', value)
    if isinstance(value, (set, frozenset)):
        value = sorted(value)  # sets have no stable order; sort for reproducible output
    return ', '.join(value)


# Columns 6..9 are film_countries, film_platforms, film_genries and film_creators
for col in df.columns[6:10]:
    df[col] = df[col].apply(_as_plain_text)
# Encode the opinion label as 1 for 'positive' and 0 for anything else
df['positive/negative'] = df['positive/negative'].apply(lambda x: 1 if x == 'positive' else 0)

# Россия

In [125]:
# Keep the rows whose country string mentions Russia, then aggregate per outlet:
# 'count' and 'sum' of the 'positive/negative' column, and percent = sum / count * 100 (rounded).
df_ru = df[df['film_countries'].map(lambda countries: re.search(r'Россия', countries) is not None)]
df_ru_groupby = (
    df_ru
    .groupby('press_name')['positive/negative']
    .agg(['count', 'sum'])
)
df_ru_groupby['percent'] = (df_ru_groupby['sum'] / df_ru_groupby['count'] * 100).round(0)

# Кроме России

In [128]:
# Keep the rows whose country string does NOT mention Russia, then aggregate per outlet:
# 'count' and 'sum' of the 'positive/negative' column, and percent = sum / count * 100 (rounded).
df_not_ru = df[df['film_countries'].map(lambda countries: re.search(r'Россия', countries) is None)]
df_groupby = (
    df_not_ru
    .groupby('press_name')['positive/negative']
    .agg(['count', 'sum'])
)
df_groupby['percent'] = (df_groupby['sum'] / df_groupby['count'] * 100).round(0)

# Сортировка СМИ по предпочтению иностранного кино российскому

In [130]:
# Put the non-Russian and Russian per-outlet statistics side by side (aligned on press_name)
# and drop outlets that are missing from either group.
df_all = pd.DataFrame(
    {
        'count': df_groupby['count'],
        'percent': df_groupby['percent'],
        'ru_count': df_ru_groupby['count'],
        'ru_percent': df_ru_groupby['percent'],
    }
).dropna()
# delta > 0: the outlet's percent for non-Russian films is higher than for Russian ones
df_all['delta'] = df_all['percent'] - df_all['ru_percent']
# Show the top 20 outlets by delta among those with more than 10 reviews in each group
df_all[(df_all['count'] > 10) & (df_all['ru_count'] > 10)].sort_values(by='delta', ascending=False).head(20)

Unnamed: 0_level_0,count,percent,ru_count,ru_percent,delta
press_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Огонек,13.0,92.0,82.0,56.0,36.0
Вечерняя Москва,53.0,89.0,51.0,67.0,22.0
Мир фантастики,244.0,75.0,19.0,53.0,22.0
Эксперт,63.0,87.0,42.0,67.0,20.0
Профиль,49.0,98.0,29.0,79.0,19.0
7 дней,260.0,87.0,50.0,70.0,17.0
Лайфхакер,222.0,75.0,30.0,60.0,15.0
Канобу,572.0,64.0,104.0,49.0,15.0
Газета.ru,932.0,73.0,235.0,58.0,15.0
Коммерсантъ Стиль,35.0,97.0,12.0,83.0,14.0
