The `requests` and `BeautifulSoup` libraries are used to fetch and parse the Wikipedia pages; `os` and `re` are auxiliary libraries.

In [2]:
import requests
from bs4 import BeautifulSoup
import os
import re

# URL of the Wikipedia page listing the highest-grossing films
url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

# Prefix that turns the relative film links of the list page into absolute URLs
WIKIPEDIA_BASE_URL = 'https://en.wikipedia.org'

# Seconds to wait for a server response; without a timeout a request can hang forever
REQUEST_TIMEOUT = 30

# Characters removed from director names (footnote markers such as '[1]'),
# built once instead of on every loop iteration
FOOTNOTE_CHARS = str.maketrans('', '', '0123456789[]')


def _clean_box_office(raw):
    """Return the gross starting at its first '$'.

    Drops leading markers such as 'F8' or 'T' ('F8$1,238,764,765',
    'T$2,257,844,554'). Text without a '$' is returned unchanged rather
    than being cut down to its last character.
    """
    dollar_index = raw.find('$')
    return raw[dollar_index:] if dollar_index != -1 else raw


def _save_cover(infobox, save_path):
    """Download the first image of the infobox to save_path.

    Returns True when a file was written, False when the infobox has no
    image or the image request did not answer with status 200.
    """
    image = infobox.find('img')
    src = image.get('src') if image is not None else None
    if not src:
        # No cover available: nothing is requested and nothing is written
        return False

    # Protocol-relative sources ('//host/path') need an explicit scheme
    image_url = 'https:' + src if src.startswith('//') else src
    cover_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
    if cover_response.status_code != 200:
        return False

    with open(save_path, 'wb') as file:
        file.write(cover_response.content)
    return True


def _extract_country(infobox):
    """Return the film country(-ies) as a comma-separated string.

    Handles a single 'Country' row as well as a 'Countries' row given
    either as a list or as plain text. Returns '' when neither row exists.
    """
    country_th = infobox.find('th', string='Country')
    if country_th is not None:
        country_td = country_th.find_next('td')
        return country_td.text.strip() if country_td is not None else ''

    country_th = infobox.find('th', string='Countries')
    country_td = country_th.find_next('td') if country_th is not None else None
    if country_td is None:
        return ''

    li_tags = country_td.find_all('li')
    if li_tags:
        # Only the first text fragment of each item is the country name;
        # later fragments are footnote markers
        return ', '.join(next(li.stripped_strings, '') for li in li_tags)
    return ', '.join(country_td.stripped_strings)


def _extract_directors(infobox):
    """Return the film director(s) as a comma-separated string.

    Names are kept in page order and without duplicates, so the result is
    the same on every run. Returns '' when there is no 'Directed by' row.
    """
    director_th = infobox.find('th', string='Directed by')
    director_td = director_th.find_next('td') if director_th is not None else None
    if director_td is None:
        return ''

    names = []

    def add(name):
        # Skip empty fragments: an empty string is a substring of every
        # name and would wipe out all of them in the duplicate filter below
        if name and name not in names:
            names.append(name)

    li_tags = director_td.find_all('li')
    if li_tags:
        # Several directors: one per list item, possibly newline separated
        for li in li_tags:
            for piece in li.get_text().split('\n'):
                add(piece)
        for li in li_tags:
            if li.find('a') is None:
                add(li.get_text(strip=True))
    else:
        # A single director
        add(director_td.get_text(separator=', '))

    # Drop every entry that contains another entry as a substring,
    # e.g. a combined string that repeats an individual name
    unique = [
        name for name in names
        if not any(other != name and other in name for other in names)
    ]
    return ', '.join(unique).translate(FOOTNOTE_CHARS)


# GET request to fetch the page content and ensure that request was successful
response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()

# Parse HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table containing the highest-grossing films
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    raise RuntimeError('No wikitable found on ' + url)

# Create folder to save film covers (once, not per film)
output_folder = "covers"
os.makedirs(output_folder, exist_ok=True)

# a list to store the extracted film data
films = []

# index for the sequential count of films; names the cover files
index = 1

# Iterate through each row in the table except the header
for row in table.find_all('tr')[1:]:

    # 'td' cells hold gross (index 2) and year (index 3); 'th' holds the title
    data_cells = row.find_all('td')
    header_cells = row.find_all('th')
    if len(data_cells) < 4 or not header_cells:
        # Not a film row (e.g. an extra header or spacer row)
        continue

    link = header_cells[0].find('a')
    if link is None or not link.get('href'):
        # Without a link there is no film page to read the details from
        continue

    # Extract film release year
    release_year = int(data_cells[3].text.strip())

    # Extract film box office without leading marker symbols
    box_office = _clean_box_office(data_cells[2].text.strip())

    # Extract film title and drop the '†' marker, e.g. 'Ne Zha 2 †'
    title = header_cells[0].text.replace('†', '').strip()

    # Fetch and parse the page of the specific film; separate names keep
    # the list-page objects (url, response, soup, table) intact
    film_url = WIKIPEDIA_BASE_URL + link['href']
    film_response = requests.get(film_url, timeout=REQUEST_TIMEOUT)
    film_response.raise_for_status()
    film_soup = BeautifulSoup(film_response.text, 'html.parser')

    # Extract table with info about film
    infobox = film_soup.find('table', {'class': 'infobox vevent'})

    if infobox is not None:
        # Save cover as 'cover{sequential number of the film}.jpg'
        _save_cover(infobox, os.path.join(output_folder, f"cover{index}.jpg"))
        country = _extract_country(infobox)
        director = _extract_directors(infobox)
    else:
        # Film page without an infobox: keep the film, leave details empty
        country = ''
        director = ''

    # Add a dictionary with film data to the list with all films
    films.append({
        'title': title,
        'release_year': release_year,
        'director': director,
        'box_office': box_office,
        'country': country,
    })
    index += 1

SQLite is used for data storage.

In [3]:
import sqlite3

# Connect to the SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('films.db')
try:
    cursor = conn.cursor()

    # Create the 'films' table; an existing table is left untouched
    cursor.execute('''
CREATE TABLE IF NOT EXISTS films (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    release_year INTEGER,
    director TEXT,
    box_office TEXT,
    country TEXT
)
''')

    # Commit the changes
    conn.commit()

    # Extract data from list films and add it to the database in one batch.
    # NOTE: rows are appended; running this again on an existing films.db
    # inserts the same films a second time under new ids.
    cursor.executemany('''
    INSERT INTO films (title, release_year, director, box_office, country)
    VALUES (?, ?, ?, ?, ?)
    ''', [
        (film['title'], film['release_year'], film['director'],
         film['box_office'], film['country'])
        for film in films
    ])

    # Commit the changes
    conn.commit()
finally:
    # Close the connection even if creating the table or an insert failed
    conn.close()

Since GitHub Pages is a static hosting service, the database content is exported to a JSON file.

In [4]:
import json

# Connect to database
conn = sqlite3.connect('films.db')
try:
    cursor = conn.cursor()

    # Fetch all films with their id and features.
    # NOTE: this rebinds `films` to a list of row tuples.
    cursor.execute('SELECT id, title, release_year, director, box_office, country FROM films')
    films = cursor.fetchall()
finally:
    # The rows are already in memory, so the connection can be released
    # here; it is closed even if the query fails
    conn.close()

# Keys of the exported dictionaries, in the order of the selected columns
film_columns = ('id', 'title', 'release_year', 'director', 'box_office', 'country')

# Convert to a list of dictionaries
films_list = [dict(zip(film_columns, film)) for film in films]

# Write to a JSON file; the explicit encoding keeps the result independent
# of the platform default
with open('films.json', 'w', encoding='utf-8') as f:
    json.dump(films_list, f, indent=4)