In [4]:
!pip install requests beautifulsoup4



In [79]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import json
import re
import time
from typing import Dict, List, Optional, Any

# Scheme and host of English Wikipedia; relative article paths taken from
# links are appended to this to build absolute URLs.
BASE_WIKI_URL = "https://en.wikipedia.org"
# HTTP headers sent with every request issued by fetch_page. The User-Agent
# is a desktop Chrome browser identification string.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def fetch_page(url: str, timeout: float = 10.0) -> BeautifulSoup:
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without this a stalled
            connection would block the whole scrape indefinitely.

    Returns:
        The response body parsed with the built-in ``html.parser``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status (raised by ``raise_for_status``).
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def extract_currency_value(amount_str: str):
    """Return the first dollar amount found in *amount_str* as a float.

    The amount is the run of digits, commas and dots that directly follows
    a ``$`` sign; commas are dropped before conversion. ``None`` is returned
    when the input is empty, contains no such amount, or the captured text
    is not a valid number (for example ``$1.2.3``).
    """
    found = re.search(r'\$([\d,.]+)', amount_str) if amount_str else None
    if found is None:
        return None

    digits = found.group(1).replace(',', '')
    try:
        value = float(digits)
    except ValueError:
        # Captured text such as "," or "1.2.3" is not a number.
        value = None
    return value

def parse_release_year(text: str, fallback_cell: Optional[BeautifulSoup]):
    """Extract a release year from *text*, falling back to a table cell.

    The leftmost match in *text* wins: either four digits in parentheses,
    or a standalone four-digit number starting with 19 or 20. When *text*
    holds no year, *fallback_cell* is used if it is truthy and its stripped
    ``.text`` consists only of digits. Returns the year as an ``int``, or
    ``None`` when neither source yields one.
    """
    hit = re.search(r'(?:\((\d{4})\)|(\b(19|20)\d{2}\b))', text)
    if hit:
        parenthesised, bare = hit.group(1), hit.group(2)
        return int(parenthesised or bare)

    if not fallback_cell:
        return None

    cell_text = fallback_cell.text.strip()
    return int(cell_text) if cell_text.isdigit() else None

def extract_infobox_data(infobox: BeautifulSoup):
    """Read the director(s) and first listed country from a film infobox.

    Args:
        infobox: The infobox ``<table>`` element, or ``None`` when the
            article has no infobox.

    Returns:
        A dict with keys ``'director'`` and ``'country'``. Each value stays
        ``'Unknown'`` when the corresponding row is missing or empty.
        Multiple directors are joined with ``', '``; only the first country
        is kept.
    """
    data = {'director': 'Unknown', 'country': 'Unknown'}
    # Articles without an infobox give None here; report defaults rather
    # than failing on the attribute access below.
    if infobox is None:
        return data

    director_th = infobox.find('th', string=re.compile(r'Direct(ed by|or)'))
    director_td = director_th.find_next_sibling('td') if director_th else None
    if director_td is not None:
        directors = [name for name in director_td.stripped_strings if name]
        # An empty cell must not replace the 'Unknown' placeholder with ''.
        if directors:
            data['director'] = ', '.join(directors)

    country_th = infobox.find('th', string=re.compile(r'Countr(y|ies)'))
    country_td = country_th.find_next_sibling('td') if country_th else None
    if country_td is not None:
        # Drop numeric footnote markers such as "[1]".
        country_text = re.sub(r'\[\d+\]', '', country_td.get_text())
        # Strip before taking the first line: list-style cells can start
        # with a newline, which would otherwise make the first line empty.
        first_line = country_text.strip().split('\n')[0]
        country = re.split(r'[,/]\s*', first_line)[0].strip()
        if country:
            data['country'] = country

    return data

def get_film_details(film_path: str):
    """Fetch a film's article and return its director and country.

    This is deliberately best-effort: any failure while fetching or parsing
    is reported on stdout and the placeholder values are returned, so one
    bad article never aborts the whole scrape.

    Args:
        film_path: Site-relative path of the article (e.g. ``/wiki/Avatar``).
            An empty or ``None`` path yields the defaults without a request.

    Returns:
        A dict with keys ``'director'`` and ``'country'``; each is
        ``'Unknown'`` when it could not be determined.
    """
    details = {'director': 'Unknown', 'country': 'Unknown'}
    # A link without an href would otherwise build the bogus URL "...orgNone".
    if not film_path:
        return details

    try:
        soup = fetch_page(f"{BASE_WIKI_URL}{film_path}")
        infobox = soup.find('table', class_='infobox')
        # Not every article has an infobox; keep the defaults in that case.
        if infobox is not None:
            details.update(extract_infobox_data(infobox))
    except Exception as exc:
        # Best-effort boundary: report the problem instead of hiding it.
        print(f"Could not read details for {film_path}: {exc}")

    return details

def find_data_table(soup: BeautifulSoup):
    """Return the first ``wikitable`` whose headers include the ranking columns.

    A table qualifies when the stripped text of its ``<th>`` cells contains
    all of ``Rank``, ``Title`` and ``Worldwide gross``. Returns ``None`` when
    no table on the page qualifies.
    """
    required = {'Rank', 'Title', 'Worldwide gross'}

    def has_required_headers(candidate):
        labels = {cell.get_text(strip=True) for cell in candidate.find_all('th')}
        return required <= labels

    candidates = soup.find_all('table', class_='wikitable')
    return next((tbl for tbl in candidates if has_required_headers(tbl)), None)

def save_to_database(films: List[Dict[str, Any]],
                     db_path: str = 'highest_grossing_films.db'):
    """Append *films* to the ``films`` table of a SQLite database.

    The table is created on first use. Rows are always appended; existing
    rows are left alone, so saving the same list twice stores it twice.

    Args:
        films: Film records. Each must provide the keys ``'title'``,
            ``'release_year'``, ``'director'``, ``'box_office'`` and
            ``'country'``; any other keys are ignored.
        db_path: Path of the SQLite database file.

    Raises:
        KeyError: If a record lacks one of the required keys.
        sqlite3.Error: If the database cannot be opened or written.
    """
    rows = [
        (f['title'], f['release_year'], f['director'], f['box_office'], f['country'])
        for f in films
    ]

    conn = sqlite3.connect(db_path)
    try:
        # ``with conn`` only commits or rolls back the transaction; it does
        # not close the connection, hence the explicit close() below.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS films (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT NOT NULL,
                    release_year INTEGER,
                    director TEXT,
                    box_office REAL,
                    country TEXT
                )
            ''')
            conn.executemany('''
                INSERT INTO films (title, release_year, director, box_office, country)
                VALUES (?, ?, ?, ?, ?)
            ''', rows)
    finally:
        conn.close()

def main():
    """Scrape the highest-grossing films list and persist the results.

    Fetches the Wikipedia list page, locates the ranking table, builds one
    record per row (title, release year, box office, plus director and
    country from the film's own article), then writes all records to the
    SQLite database and to ``films_data.json``.

    Rows are read by position: cell 2 is the title, cell 3 the gross and
    cell 4 the year. NOTE(review): this assumes the table's column order
    stays as it is today; confirm if the page layout changes.
    """
    main_page = fetch_page(f"{BASE_WIKI_URL}/wiki/List_of_highest-grossing_films")
    data_table = find_data_table(main_page)

    if data_table is None:
        print("Main data table not found")
        return

    films = []
    # Skip the first row, which holds the column headers.
    for row in data_table.find_all('tr')[1:]:
        cells = row.find_all(['th', 'td'])
        if len(cells) < 5:
            continue

        title_cell = cells[2].find('a')
        if not title_cell:
            continue

        title = title_cell.get_text(strip=True)
        try:
            film_data = {
                # Number only the rows that are actually kept, so ids stay
                # contiguous even when a row is skipped.
                'id': len(films) + 1,
                'title': title,
                'release_year': parse_release_year(title_cell.get_text(), cells[4]),
                'box_office': extract_currency_value(cells[3].get_text()),
            }
            film_data.update(get_film_details(title_cell.get('href')))
        except Exception as exc:
            # One malformed row must not abort the run, but say so.
            print(f"Skipping row for {title!r}: {exc}")
            continue

        films.append(film_data)

    save_to_database(films)

    with open('films_data.json', 'w', encoding='utf-8') as f:
        json.dump(films, f, indent=2, ensure_ascii=False)

# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()