# 1. Parsing

In [2]:
import requests
from bs4 import BeautifulSoup
import json
import time  # To avoid excessive requests
from tqdm import tqdm
import re
import sqlite3

# Wikipedia URL
BASE_URL = "https://en.wikipedia.org"
LIST_URL = BASE_URL + "/wiki/List_of_highest-grossing_films"

# Database setup
def setup_database():
    """Open ``films.db`` and rebuild an empty ``films`` table.

    Any existing ``films`` table is dropped first, so rows left over from a
    previous run are discarded and every call starts from an empty table.

    Returns:
        A ``(connection, cursor)`` tuple; the cursor is created from the
        returned connection. Closing the connection is left to the caller.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS films (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            rank TEXT,
            peak TEXT,
            title TEXT NOT NULL,
            box_office_revenue TEXT,
            release_year INTEGER,
            country TEXT,
            director TEXT
        )
    """
    connection = sqlite3.connect("films.db")
    db_cursor = connection.cursor()

    # Wipe the previous run's table, then recreate the schema.
    for statement in ("DROP TABLE IF EXISTS films", create_table_sql):
        db_cursor.execute(statement)

    return connection, db_cursor

# Clean text with joined words and remove duplicates
def clean_joined_words(text):
    """Normalise scraped text into a comma-separated list without repeats.

    Steps, in order: split values that were glued together at a
    lowercase-to-uppercase boundary, turn newlines into separators, collapse
    doubled commas and runs of whitespace, trim, and finally drop empty and
    repeated entries while keeping first-seen order.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. When it contains commas the entries are rejoined
        with ``", "``; otherwise the trimmed text is returned as-is.
    """
    # A lowercase letter directly followed by an uppercase one marks two
    # values fused together, e.g. "United KingdomUnited States".
    normalized = re.sub(r'([a-z])([A-Z])', r'\1, \2', text)
    normalized = re.sub(r'\n', ', ', normalized)
    normalized = re.sub(r',\s*,', ',', normalized)
    normalized = re.sub(r'\s+', ' ', normalized).strip()

    if ',' not in normalized:
        return normalized

    # dict.fromkeys keeps the first occurrence of each entry, in order.
    pieces = (piece.strip() for piece in normalized.split(','))
    return ', '.join(dict.fromkeys(piece for piece in pieces if piece))

# Extract plainlist content properly
def extract_plainlist_content(element):
    """Turn an infobox cell into a comma-separated string.

    Three strategies are tried in order, and the first one that yields
    anything wins:

    1. the ``<li>`` items of a ``<div class="plainlist">`` inside the cell;
    2. the text of every ``<a>`` inside the cell;
    3. the cell's whole text, passed through ``clean_joined_words``.

    Args:
        element: the cell to read; it must offer ``find``, ``find_all`` and
            ``get_text`` (the callers in this file pass BeautifulSoup tags).
            A falsy value is treated as a missing cell.

    Returns:
        The extracted string, or ``"Unknown"`` when ``element`` is falsy.
    """
    if not element:
        return "Unknown"

    def joined_text(nodes):
        # Stripped text of each node, separated by ", ".
        return ', '.join(node.get_text(strip=True) for node in nodes)

    list_wrapper = element.find('div', class_='plainlist')
    list_items = list_wrapper.find_all('li') if list_wrapper else None
    if list_items:
        return joined_text(list_items)

    anchors = element.find_all('a')
    if anchors:
        return joined_text(anchors)

    # Last resort: take all text, then repair fused words and repeats.
    return clean_joined_words(element.get_text(separator=', ', strip=True))

# Fetch film data from Wikipedia
def fetch_film_data(request_delay=1, timeout=10):
    """Scrape the highest-grossing-films list and each film's infobox.

    The first ``wikitable`` on ``LIST_URL`` supplies rank, peak, title,
    box-office revenue and release year. For every row whose title links to
    a film page, that page's infobox is read for country and director.
    A failure on an individual film page is reported with ``print`` and the
    film keeps ``"Unknown"`` for both fields, so one bad page never aborts
    the whole scrape.

    Args:
        request_delay: seconds to sleep after each film-page request, to
            avoid hitting Wikipedia too quickly.
        timeout: per-request timeout in seconds passed to ``requests``;
            without one a stalled connection would block indefinitely.

    Returns:
        A list of dicts with the keys ``rank``, ``peak``, ``title``,
        ``box_office_revenue``, ``release_year`` (int or ``None``),
        ``country`` and ``director``.

    Raises:
        requests.RequestException: if the list page cannot be fetched or
            answers with an HTTP error status.
        RuntimeError: if the list page contains no ``wikitable`` table.
    """
    films = []

    # One session reuses the HTTP connection across all film-page requests.
    with requests.Session() as session:
        response = session.get(LIST_URL, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Locate the table (first table with class 'wikitable')
        table = soup.find('table', {'class': 'wikitable'})
        if table is None:
            raise RuntimeError(f"No 'wikitable' table found at {LIST_URL}")

        for table_row in tqdm(table.find_all('tr')[1:]):  # Skip header row
            cols = table_row.find_all('td')
            title_elem = table_row.find('th')  # Title is inside <th> instead of <td>

            if len(cols) < 4 or not title_elem:  # Ensure we have enough columns
                continue

            title = title_elem.text.strip()
            country = director = "Unknown"

            # Visit the film page if a link is available
            link_tag = title_elem.find('a')
            href = link_tag.get('href') if link_tag else None
            if href:
                try:
                    country, director = _fetch_infobox_details(
                        session, BASE_URL + href, timeout
                    )
                except Exception as e:
                    # Deliberately broad: any per-film failure is reported
                    # and the film is kept with "Unknown" details.
                    print(f"Error fetching {title}: {e}")
                time.sleep(request_delay)  # Prevent hitting Wikipedia too quickly

            films.append({
                "rank": cols[0].text.strip(),
                "peak": cols[1].text.strip(),
                "title": title,
                # Drop anything that precedes the first '$' in the cell
                "box_office_revenue": re.sub(r'^.*?(\$)', r'\1', cols[2].text.strip()),
                "release_year": _parse_year(cols[3].text.strip()),
                "country": country,
                "director": director,
            })

    return films


def _parse_year(year_text):
    """Return the year in *year_text* as an int, or None if none is found."""
    try:
        return int(year_text)
    except ValueError:
        # The cell may hold extra characters (e.g. a footnote marker);
        # fall back to the first run of four digits.
        match = re.search(r'\d{4}', year_text)
        return int(match.group()) if match else None


def _clean_infobox_cell(cell):
    """Extract an infobox cell's text, strip ``[n]`` citations and dedupe."""
    value = extract_plainlist_content(cell)
    value = re.sub(r'\[\d+\]', '', value)  # Remove citation numbers
    return clean_joined_words(value)


def _fetch_infobox_details(session, film_url, timeout):
    """Return ``(country, director)`` read from the infobox at *film_url*.

    Either value is ``"Unknown"`` when the page has no infobox or no
    matching row. Request errors and HTTP error statuses propagate.
    """
    country = director = "Unknown"

    film_response = session.get(film_url, timeout=timeout)
    film_response.raise_for_status()
    film_soup = BeautifulSoup(film_response.text, 'html.parser')

    # Find the infobox (table with class 'infobox')
    infobox = film_soup.find('table', {'class': 'infobox'})
    if not infobox:
        return country, director

    for info_row in infobox.find_all('tr'):
        header = info_row.find('th')
        content_cell = info_row.find('td')
        if not header or not content_cell:
            continue

        header_text = header.text.strip()
        if "Directed by" in header_text:
            director = _clean_infobox_cell(content_cell)
        elif 'Countries' in header_text or 'Country' in header_text:
            country = _clean_infobox_cell(content_cell)

    return country, director

# Store films in the database
def store_films_in_db(conn, cursor, films):
    """Insert every film record into the ``films`` table and commit.

    Args:
        conn: connection on which ``commit()`` is called after the insert.
        cursor: cursor used to execute the insert (the script creates it
            from ``conn``).
        films: sequence of dicts providing the keys ``rank``, ``peak``,
            ``title``, ``box_office_revenue``, ``release_year``, ``country``
            and ``director``, which are bound as named SQL parameters.
    """
    insert_sql = """
        INSERT INTO films (rank, peak, title, box_office_revenue, release_year, country, director)
        VALUES (:rank, :peak, :title, :box_office_revenue, :release_year, :country, :director)
    """
    cursor.executemany(insert_sql, films)
    conn.commit()

# Read films from the database
def read_films_from_db(conn):
    """Load every row of the ``films`` table as a list of dicts.

    Args:
        conn: connection to a database that contains the ``films`` table.

    Returns:
        One dict per row, keyed by ``rank``, ``peak``, ``title``,
        ``box_office_revenue``, ``release_year``, ``country`` and
        ``director``, in the order the query returns the rows.
    """
    # Must stay in the same order as the columns in the SELECT below.
    field_names = (
        "rank",
        "peak",
        "title",
        "box_office_revenue",
        "release_year",
        "country",
        "director",
    )

    reader = conn.cursor()
    reader.execute("SELECT rank, peak, title, box_office_revenue, release_year, country, director FROM films")

    return [dict(zip(field_names, record)) for record in reader.fetchall()]

# Save films to JSON
def save_to_json(films, filename="temp/films.json"):
    """Write *films* to *filename* as JSON indented by four spaces.

    Missing parent directories are created first, so the default
    ``temp/films.json`` works even when ``temp/`` does not exist yet.

    Args:
        films: any JSON-serialisable object (the script passes a list of
            film dicts).
        filename: destination path; an existing file is overwritten.
    """
    from pathlib import Path  # local import keeps this block self-contained

    target = Path(filename)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding so the result does not depend on the platform default.
    with target.open("w", encoding="utf-8") as f:
        json.dump(films, f, indent=4)
        

# Main execution
# Pipeline: scrape -> SQLite -> read back -> JSON export.
# Everything stays at module level so the resulting names (conn, films,
# db_films) remain available after this cell has run.
print("Setting up database...")
conn, cursor = setup_database()

# try/finally guarantees the connection is closed even when a step fails
# (e.g. a network error during scraping).
try:
    print("Fetching film data from Wikipedia...")
    films = fetch_film_data()

    print("Storing data in database...")
    store_films_in_db(conn, cursor, films)

    print("Reading data from database...")
    db_films = read_films_from_db(conn)

    print("Saving data to JSON file...")
    save_to_json(db_films)

    print("Sample of parsed data:")
    for film in db_films[:3]:
        print(json.dumps(film, indent=2))
finally:
    conn.close()

print("Process completed successfully!")

Setting up database...
Fetching film data from Wikipedia...


100%|██████████| 50/50 [01:22<00:00,  1.66s/it]

Storing data in database...
Reading data from database...
Saving data to JSON file...
Sample of parsed data:
{
  "rank": "1",
  "peak": "1",
  "title": "Avatar",
  "box_office_revenue": "$2,923,706,026",
  "release_year": 2009,
  "country": "United Kingdom, United States",
  "director": "James Cameron"
}
{
  "rank": "2",
  "peak": "1",
  "title": "Avengers: Endgame",
  "box_office_revenue": "$2,797,501,328",
  "release_year": 2019,
  "country": "United States",
  "director": "Anthony Russo, Joe Russo"
}
{
  "rank": "3",
  "peak": "3",
  "title": "Avatar: The Way of Water",
  "box_office_revenue": "$2,320,250,281",
  "release_year": 2022,
  "country": "United States",
  "director": "James Cameron"
}
Process completed successfully!



