In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import logging
import time

In [6]:
# Route every log record (DEBUG and above) to debug.log, stamped with time and level.
logging.basicConfig(
    filename='debug.log',
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s:%(message)s',
)

In [7]:
# Base URL that every relative IMDb link is appended to.
imdb = 'https://www.imdb.com'

# Query parameters of the title search. An empty string / empty list leaves the
# corresponding filter unset.
params = {
    # One of: feature, tv_movie, tv_series, tv_episode, tv_special, tv_miniseries,
    # documentary, video_game, short, video, tv_short
    'title_type': 'feature',

    # Release date bounds. Format: YYYY-MM-DD, YYYY-MM, or YYYY (e.g. '2019-10-01')
    'release_date-min': '',
    'release_date-max': '2019-10-25',

    # Any of: action adventure animation biography comedy crime documentary drama
    # family fantasy film_noir game_show history horror music musical mystery news
    # reality_tv romance sci_fi sport talk_show thriller war western
    'genres': [],

    # User rating bounds, 1.0 to 10.0 (e.g. '7.0')
    'user_rating-min': '1',
    'user_rating-max': '',

    'countries': ['ru'],

    # Number of films per page
    'count': '250',

    'start': '1',
}

# HTTP headers sent with every request.
headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "ru-RU,ru;q=0.9",
    "content-type": "application/x-www-form-urlencoded",
}

In [8]:
# Parse every film listed on the page

def parse_one_page(soup):
    """Scrape every title listed on one search-results page.

    For each poster block found in ``soup`` the title's own page is downloaded,
    then its JSON-LD metadata and its "titleDetails" block are collected.

    Args:
        soup: BeautifulSoup tree of a search-results page.

    Returns:
        A ``(res, films_parsed)`` tuple. ``res`` maps a title id (the
        second-to-last path segment of the title link) to a dict of scraped
        fields. ``films_parsed`` counts every list entry that was attempted,
        including entries that failed and are therefore absent from ``res``.
    """
    films_soup = soup.findAll('div', class_="lister-item-image float-left")
    films_parsed = 0

    res = {}

    for film in films_soup:
        try:
            films_parsed += 1

            film_link = film.a.get('href')
            film_id = film_link.split('/')[-2]

            film_dict = {
                'title_link': imdb + film_link,
                'title_name': film.img.get('alt'),
            }

            film_html = _fetch_film_page(imdb + film_link, film_id)
            film_soup = BeautifulSoup(film_html.text, 'html.parser')

            _add_json_ld_fields(film_dict, film_soup)
            # Official-site links are deliberately not collected: they are very long.
            _add_details_block(film_dict, film_soup)

            res[film_id] = film_dict

        except Exception as exception:
            # Best effort: one broken title must not abort the whole page.
            logging.error(f'Failed parsing of this title {exception}')

    return res, films_parsed


def _fetch_film_page(url, film_id, max_retries=4, retry_delay=5):
    """Download a title page, retrying a bounded number of times on a non-OK status."""
    film_html = requests.post(url, headers=headers)

    retry_number = 0
    # Stop as soon as the page loads OR the retry budget is spent. The previous
    # `not ok or retry != 4` condition kept retrying after a success and never
    # terminated once the counter went past 4.
    while not film_html.ok and retry_number < max_retries:
        time.sleep(retry_delay)
        logging.error(f'Error in page loading, retry №{retry_number}')
        film_html = requests.post(url, headers=headers)
        retry_number += 1

    if film_html.ok:
        logging.info(f'Successfully loaded page id={film_id}')
    else:
        logging.error(f'Page id={film_id} still failing after {max_retries} retries')

    return film_html


def _add_json_ld_fields(film_dict, film_soup):
    """Copy genre, rating, actors and type from the page's JSON-LD script into film_dict."""
    json_film_soup = film_soup.find("script", type="application/ld+json")

    if json_film_soup is None:
        logging.error('Json not found')
        return

    try:
        # `.string` is None for an empty/complex script tag -> TypeError.
        json_film = json.loads(json_film_soup.string)
    except (TypeError, ValueError) as exception:
        logging.error(f'Json not parsed {exception}')
        return

    try:
        film_dict['genre'] = json_film['genre']
        logging.info('Successfully added genre')
    except (KeyError, TypeError) as exception:
        logging.error(f'Genre not found {exception}')

    try:
        film_dict['rating'] = json_film['aggregateRating']['ratingValue']
        logging.info('Successfully added rating')
    except (KeyError, TypeError) as exception:
        logging.error(f'Rating not found {exception}')

    try:
        actors_raw = json_film['actor']
        # A single actor given as one object instead of a list of objects.
        if isinstance(actors_raw, dict):
            actors_raw = [actors_raw]
        film_dict['actors'] = [actor['name'] for actor in actors_raw]
        logging.info('Successfully added actors')
    except (KeyError, TypeError) as exception:
        logging.error(f'Actors not found {exception}')

    try:
        film_dict['type'] = json_film['@type']
        logging.info('Successfully added type')
    except (KeyError, TypeError) as exception:
        logging.error(f'Type not found {exception}')


def _add_details_block(film_dict, film_soup):
    """Store each section of the 'titleDetails' div in film_dict as {heading: {label: value}}."""
    film_details_soup = film_soup.find('div', id='titleDetails')

    if film_details_soup is None:
        logging.error('Details block not found')
        return

    # Initialised so a txt-block that precedes any heading is skipped instead of
    # raising NameError.
    cell_title = None
    cell_is_good = False

    for child in film_details_soup.children:
        if child.name in ('h2', 'h3'):
            cell_title = child.text.strip()
            # Exact comparison: the old `not in 'Company Credits'` was a substring
            # test and also dropped headings such as 'Credits'. Empty headings
            # stay excluded, as before.
            cell_is_good = bool(cell_title) and cell_title != 'Company Credits'

            if cell_is_good:
                film_dict[cell_title] = {}

        elif cell_is_good and child.name == 'div' and 'txt-block' in (child.get('class') or []):
            key_word = child.find('h4')

            if key_word is None:
                continue

            good_key_word = key_word.text.strip(':')
            # Remove the label so that only the value text is left in the div.
            key_word.decompose()

            value_text = child.text.replace('See more\xa0»', '').strip()
            details_value = [part.replace('\n', '').strip() for part in value_text.split('|')]

            # A lone value is stored as a plain string, several values as a list.
            if len(details_value) == 1:
                film_dict[cell_title][good_key_word] = details_value[0]
            else:
                film_dict[cell_title][good_key_word] = details_value

    logging.info('Successfully added details block')

In [10]:
# Search using the parameters defined above

def load_html_list(head=headers, par=params):
    """Request one search-results page and return it as a BeautifulSoup tree.

    Args:
        head: HTTP headers to send.
        par: Query parameters of the search.

    Returns:
        The parsed page, or ``None`` when the request raised or the response
        status was not OK.
    """
    try:
        html_page = requests.post(imdb + '/search/title/', headers=head, params=par)
    except requests.RequestException as exception:
        # Callers already treat None as "failed to load"; report network errors
        # the same way instead of letting them abort the whole run.
        logging.error(f'Failed to get html from page {exception}')
        return None

    if not html_page.ok:
        logging.error('Failed to get html from page')
        return None

    logging.info('Successfully got html from page')
    return BeautifulSoup(html_page.text, 'html.parser')

In [11]:
# Something like a main entry point

logging.info('Started parsing')

# Upper bound on the number of result pages fetched in one run (first page included).
MAX_PAGES = 4

# Defined before the first request so that later cells still find them if it fails.
all_res = []
films_parsed = 0

soup = load_html_list()

if soup is not None:
    # The 'desc' div holds the result counter; the largest number in its text is
    # taken as the total number of titles.
    number_of_films_soup = soup.find('div', class_='desc')

    if number_of_films_soup is None:
        logging.error('Total number of titles not found')
        number_of_films = 0
    else:
        number_of_films = max(
            (int(el) for el in number_of_films_soup.text.replace(',', '').split() if el.isdigit()),
            default=0,
        )

    res_dict, films_parsed = parse_one_page(soup)

    all_res.append(res_dict)
    print(f'Parsed {films_parsed} of {number_of_films} titles')

    # Paging is derived from the configured values instead of a hard-coded 250.
    page_size = int(params['count'])
    first_start = int(params['start'])

    for i in range(1, MAX_PAGES):
        # `>=` rather than `==`: the counter may overshoot the advertised total.
        if films_parsed >= number_of_films:
            break

        # Work on a copy so the shared `params` keeps its configured 'start' and
        # re-running this cell begins at the first page again.
        page_params = dict(params, start=str(first_start + page_size * i))

        soup = load_html_list(par=page_params)

        if soup is not None:
            tmp_res_dict, tmp_films_parsed = parse_one_page(soup)

            films_parsed += tmp_films_parsed

            logging.info(f'Parsed {films_parsed} films')
            all_res.append(tmp_res_dict)
            print(f'Parsed {films_parsed} of {number_of_films} titles')

        else:
            logging.error('Failed to load')

    logging.info('Parsing finished')

else:
    logging.error('Parsing failed, stopping')

Parsed 250 of 2696 titles
Parsed 500 of 2696 titles
Parsed 750 of 2696 titles
Parsed 1000 of 2696 titles


In [15]:
# Save to file

# Persist all scraped pages as one JSON document; non-ASCII text is written
# as-is (UTF-8) rather than as \u escapes.
try:
    with open('parsedTitles.json', 'w', encoding='utf-8') as titles_file:
        json.dump(all_res, titles_file, ensure_ascii=False)
        logging.info('Json file created')

except Exception as exception:
    # Best effort: a failed save is logged, not raised.
    logging.error(f'Json file failed to create {exception}')

# The cells below were only used for debugging; they are not important

In [None]:
# Debug: show the scraped dict of each result page, one per line.
for page_result in all_res:
    print(page_result)

In [83]:
# Debug: dump the most recently loaded results page to out.html for inspection.
html = soup.prettify()
# errors='ignore' drops characters that cannot be encoded, which is what the
# former one-character-at-a-time loop with `except: pass` did, but in a single
# write and without hiding real I/O errors.
with open("out.html", "w", encoding='utf-8', errors='ignore') as out:
    out.write(html)

In [118]:
import webbrowser

# Debug: open the dumped page in Chrome.
url = './out.html'

# Windows
# Command line of the Chrome executable; '%s' is replaced by the URL.
chrome_path = 'C:/Program Files/Google/Chrome/Application/chrome.exe %s'

chrome = webbrowser.get(chrome_path)
chrome.open(url)


True