# Box Office Bomb Data Pipeline Assignment

**Total Marks: 20**

This notebook implements an end-to-end pipeline to:
1. Scrape box office bomb data from Wikipedia
2. Validate & clean data using Pydantic
3. Enrich with OMDb API data
4. Perform consistency checks
5. Create final categorized dataset

## Setup and Imports

In [None]:
# Import required libraries
import datetime
import os
import re
import time
from pathlib import Path
from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, field_validator, ValidationError

## Task 1: Scrape the "Bombs" Table (4 Marks)

Extract raw data from the Wikipedia HTML file:
- Film Title (with symbols like § and †)
- Year
- Net production budget (may contain ranges like "$100–160")
- Estimated loss (Nominal column)

Note:
- You need to extract the entire raw string, including symbols, references, etc., along with the titles.
- You must handle the nested headers in the Wikipedia table (Budget and Loss columns have sub-headers).

In [None]:
def scrape_box_office_bombs(
    url="https://en.wikipedia.org/wiki/List_of_biggest_box-office_bombs",
    timeout=30,
):
    """
    Scrape the box office bombs table from Wikipedia.

    Args:
        url: Page to download. Defaults to the Wikipedia list of biggest
            box-office bombs.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dictionaries with the raw extracted strings, one per film:
        'raw_title', 'raw_year', 'raw_budget', 'raw_loss'. An empty list is
        returned when the page cannot be downloaded or no table is found, so
        callers can always iterate or take len() of the result.
    """
    header = {
        "User-Agent": "Chrome/120.0.0.0"
    }
    try:
        # A timeout keeps the notebook from hanging forever on a stalled connection
        response = requests.get(url, headers=header, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        print(response.status_code)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    table = _find_bombs_table(soup)
    if table is None:
        print("No table found on the page")
        return []

    raw_data = []

    # The first two <tr> rows are the nested header rows (Budget / Loss sub-headers)
    for row in table.select('tr')[2:]:
        title_cell = row.select_one('th')
        cols = row.select('td')
        # Skip rows that are not complete film rows instead of crashing on them
        if title_cell is None or len(cols) < 5:
            continue

        raw_data.append({
            'raw_title': title_cell.get_text(strip=True),
            'raw_year': cols[0].get_text(strip=True),
            'raw_budget': cols[1].get_text(strip=True),
            # cols[2] is skipped; cols[3] is read as the nominal estimated loss
            'raw_loss': cols[3].get_text(strip=True)
        })

    return raw_data


def _find_bombs_table(soup):
    """Return the wikitable captioned "Biggest box-office bombs", or a fallback table, or None."""
    wikitables = soup.select('table.wikitable')
    for candidate in wikitables:
        caption = candidate.find('caption')
        if caption and 'box-office bombs' in caption.get_text(' ', strip=True).lower():
            return candidate
    # No caption match: fall back to the first wikitable, then to the first
    # table of any kind (the table the previous implementation always used).
    if wikitables:
        return wikitables[0]
    return soup.select_one('table')

In [None]:
# Test the scraping function
# Fall back to an empty list if the scraper yields nothing (e.g. None after a
# failed request) so the summary below still runs instead of raising TypeError.
raw_movies = scrape_box_office_bombs() or []
print(f"Scraped {len(raw_movies)} movies")
print("\nLast 15 raw entries:")
for i, movie in enumerate(raw_movies[-15:]):
    print(f"\n{i+1}. {movie}")

200
Scraped 139 movies

Last 15 raw entries:

1. {'raw_title': 'Town & Country', 'raw_year': '2001', 'raw_budget': '$90', 'raw_loss': '$85'}

2. {'raw_title': 'Transformers: The Last Knight', 'raw_year': '2017', 'raw_budget': '$217–260', 'raw_loss': '$100+'}

3. {'raw_title': 'Treasure Planet', 'raw_year': '2002', 'raw_budget': '$140', 'raw_loss': '$85'}

4. {'raw_title': 'Tron: Ares†', 'raw_year': '2025', 'raw_budget': '$220', 'raw_loss': '$132.7'}

5. {'raw_title': 'Turning Red§', 'raw_year': '2022', 'raw_budget': '$175', 'raw_loss': '$173'}

6. {'raw_title': 'Valerian and the City of a Thousand Planets', 'raw_year': '2017', 'raw_budget': '$177.2–180', 'raw_loss': '$82'}

7. {'raw_title': 'West Side Story', 'raw_year': '2021', 'raw_budget': '$100', 'raw_loss': '$104'}

8. {'raw_title': 'Wild Wild West', 'raw_year': '1999', 'raw_budget': '$170', 'raw_loss': '$66.2'}

9. {'raw_title': 'Windtalkers', 'raw_year': '2002', 'raw_budget': '$115–120', 'raw_loss': '$76–81'}

10. {'raw_title': 

## Task 2: Pydantic Data Parsing & Validation (6 Marks)

Create a Pydantic model that:
- Cleans titles (removes §, †, and footnotes like [nb 2], [1])
- Parses numeric values (handles ranges, currency symbols)
- Validates year as integer

In [None]:
class MovieData(BaseModel):
    """
    Pydantic model for validating and cleaning movie data.

    Every field has a ``mode='before'`` validator, so the raw scraped strings
    are normalised first and pydantic's own type coercion runs afterwards.
    """
    title: str               # film title with symbols and footnotes removed
    year: int                # release year, or 0 when outside the accepted range
    budget_millions: float   # budget; ranges are averaged
    loss_millions: float     # estimated loss; ranges are averaged

    @field_validator('title', mode='before')
    @classmethod
    def clean_title(cls, v):
        """
        Remove footnote markers and special symbols from title.
        - Remove § (streaming symbol)
        - Remove † (currently playing symbol)
        - Remove footnotes like [nb 2], [1], etc.
        - Trim whitespace left behind by the removals
        """
        if not isinstance(v, str):
            # Leave non-string input untouched so pydantic reports the type error
            return v
        v = v.replace("§", "").replace("†", "")
        v = re.sub(r"\[.*?\]", "", v)
        return v.strip()

    @field_validator('year', mode='before')
    @classmethod
    def validate_year(cls, v):
        """
        Ensure year is a valid integer.

        For string input, footnotes are dropped and the first four-digit run is
        taken as the year, so values such as "2001[nb 1]" still parse. A year
        outside 1888..(current year + 1) is replaced by 0.

        Raises:
            ValueError: if a string contains no four-digit year (surfaced by
                pydantic as a ValidationError).
        """
        if isinstance(v, str):
            match = re.search(r"\d{4}", re.sub(r"\[.*?\]", "", v))
            if match is None:
                raise ValueError(f"no four-digit year found in {v!r}")
            v = int(match.group())
        else:
            v = int(v)
        return v if 1888 <= v < datetime.date.today().year + 2 else 0

    @field_validator('budget_millions', mode='before')
    @classmethod
    def parse_budget(cls, v):
        """
        Parse budget value:
        - Strip $ and other currency symbols
        - Handle ranges (e.g., "100–160") by calculating average
        - Remove reference tags
        """
        return cls._parse_numeric_value(v)

    @field_validator('loss_millions', mode='before')
    @classmethod
    def parse_loss(cls, v):
        """
        Parse loss value with same logic as budget.
        """
        return cls._parse_numeric_value(v)

    @staticmethod
    def _parse_numeric_value(v):
        """
        Helper method to parse numeric values with ranges.

        Accepts an int/float (returned as float) or a string such as "$90",
        "$100+", "$217–260" or "$100[nb 2]". Footnotes and thousands separators
        are removed, every number in the remaining text is extracted, and their
        mean is returned, so a single value maps to itself and a two-value
        range maps to its midpoint.

        Raises:
            ValueError: if no number can be found in the input.
        """
        if isinstance(v, (int, float)):
            return float(v)
        text = re.sub(r"\[.*?\]", "", str(v)).replace(",", "")
        # Unsigned pattern on purpose: a "-" or "–" between numbers is a range
        # separator here, never a minus sign.
        numbers = [float(n) for n in re.findall(r"\d+(?:\.\d+)?", text)]
        if not numbers:
            raise ValueError(f"no numeric value found in {v!r}")
        return sum(numbers) / len(numbers)

## Task 3: Enrich with OMDb Data (4 Marks)

Query OMDb API for each movie to get:
- Plot
- Metascore
- IMDb Rating
- Director
- Language

Handle API failures (Response='False') or missing fields ('N/A') gracefully by storing them as None. Do not delete the row.

In [None]:
# Run every scraped record through the MovieData model, keeping the successes
# and recording the failures together with their error text.
validated_movies, failed_validations = [], []

for record in raw_movies:
    try:
        validated_movies.append(
            MovieData(
                title=record['raw_title'],
                year=record['raw_year'],
                budget_millions=record['raw_budget'],
                loss_millions=record['raw_loss'],
            )
        )
    except ValidationError as exc:
        failed_validations.append({'raw_data': record, 'error': str(exc)})
        print(f"Validation failed for {record['raw_title']}: {exc}")

banner = '=' * 60
print(f"\n{banner}")
print("Validation Results:")
print(f"Successfully validated: {len(validated_movies)} movies")
print(f"Failed validations: {len(failed_validations)} movies")
print(banner)

# Show the last 15 validated movies
print("\nLast 15 validated movies:")
for position, entry in enumerate(validated_movies[-15:], start=1):
    print(f"\n{position}. {entry.model_dump()}")


Validation Results:
Successfully validated: 139 movies
Failed validations: 0 movies

Last 15 validated movies:

1. {'title': 'Town & Country', 'year': 2001, 'budget_millions': 90.0, 'loss_millions': 85.0}

2. {'title': 'Transformers: The Last Knight', 'year': 2017, 'budget_millions': 238.5, 'loss_millions': 100.0}

3. {'title': 'Treasure Planet', 'year': 2002, 'budget_millions': 140.0, 'loss_millions': 85.0}

4. {'title': 'Tron: Ares', 'year': 2025, 'budget_millions': 220.0, 'loss_millions': 132.7}

5. {'title': 'Turning Red', 'year': 2022, 'budget_millions': 175.0, 'loss_millions': 173.0}

6. {'title': 'Valerian and the City of a Thousand Planets', 'year': 2017, 'budget_millions': 178.6, 'loss_millions': 82.0}

7. {'title': 'West Side Story', 'year': 2021, 'budget_millions': 100.0, 'loss_millions': 104.0}

8. {'title': 'Wild Wild West', 'year': 1999, 'budget_millions': 170.0, 'loss_millions': 66.2}

9. {'title': 'Windtalkers', 'year': 2002, 'budget_millions': 117.5, 'loss_millions': 

In [None]:
# OMDb API configuration
# NOTE: Get a free API key from http://www.omdbapi.com/apikey.aspx and export it
# as the OMDB_API_KEY environment variable so the secret stays out of the notebook.
# FIXME(security): the literal fallback below is a committed credential; rotate
# the key and remove the fallback once the environment variable is in place.
OMDB_API_KEY = os.environ.get("OMDB_API_KEY", "1baeaa9")
# HTTPS so the API key in the query string is not sent in clear text
OMDB_BASE_URL = "https://www.omdbapi.com/"

def query_omdb(title: str, year: int) -> dict:
    """
    Query OMDb API for movie metadata.

    The lookup is first made by title and release year (OMDb's ``y``
    parameter). If that finds nothing, it is retried by title only, so a film
    whose OMDb year differs from the given year is still returned and can be
    compared by the caller.

    Args:
        title: Cleaned movie title.
        year: Release year; a falsy value (e.g. 0) skips the year-filtered attempt.

    Returns:
        The OMDb response dict, with 'N/A' replaced by None for the keys
        'Plot', 'Metascore', 'imdbRating', 'Director', 'Language' and 'Year'.
        If the movie is not found or the request fails, a dict containing
        those six keys all set to None is returned instead.
    """
    fields = ['Plot', 'Metascore', 'imdbRating', 'Director', 'Language', 'Year']
    not_found = dict.fromkeys(fields)

    base_params = {
        "apikey": OMDB_API_KEY,
        "t": title,  # Search by title
        "type": "movie"
    }
    attempts = []
    if year:
        # OMDb's year filter is named "y"; an unknown "year" key is not applied
        attempts.append({**base_params, "y": year})
    attempts.append(base_params)

    data = None
    payload = {}
    for params in attempts:
        try:
            response = requests.get(OMDB_BASE_URL, params=params, timeout=10)
            response.raise_for_status()
            payload = response.json()
        except (requests.exceptions.RequestException, ValueError) as exc:
            # Network/HTTP failure or a non-JSON body: report it and keep the row
            print(f"OMDb request failed for {title!r}: {exc}")
            return not_found
        if payload.get("Response") == "True":
            data = payload
            break

    if data is None:
        print(f"Movie not found: {payload.get('Error')}")
        return not_found

    # OMDb marks missing values with the literal string 'N/A'
    for field in fields:
        if data.get(field) == 'N/A':
            data[field] = None

    return data

In [None]:
# Merge OMDb metadata into every validated movie. The OMDb release year is
# collected in a parallel list (same order as enriched_data).
enriched_data, omdb_year = [], []
omdb_fields = ['Plot', 'Metascore', 'imdbRating', 'Director', 'Language']

print("Querying OMDb API...")
for validated in validated_movies:
    omdb_record = query_omdb(validated.title, validated.year)
    # Model fields first, then the OMDb fields in the order listed above
    merged = {
        **validated.model_dump(),
        **{name: omdb_record.get(name) for name in omdb_fields},
    }
    omdb_year.append(omdb_record.get('Year'))
    enriched_data.append(merged)

print(f"\nEnriched {len(enriched_data)} movies with OMDb data")

# Show sample enriched data
print("\nFirst enriched entry:")
print(enriched_data[0])

Querying OMDb API...
Movie not found: Movie not found!
Movie not found: Movie not found!

Enriched 139 movies with OMDb data

First enriched entry:
{'title': 'The 13th Warrior', 'year': 1999, 'budget_millions': 130.0, 'loss_millions': 99.0, 'Plot': 'A man, having fallen in love with the wrong woman, is sent by the sultan himself on a diplomatic mission to a distant land as an ambassador. Stopping at a Viking village port to restock on supplies, he finds himself unwittingly em...', 'Metascore': '42', 'imdbRating': '6.6', 'Director': 'John McTiernan', 'Language': 'English, Latin, Swedish, Norse, Old, Danish, Arabic'}


In [None]:
# Sanity check: number of OMDb years collected (one is appended per validated movie)
len(omdb_year)

139

## Task 4: Data Consistency Check (2 Marks)

Compare Wikipedia year with OMDb year:
- "Verified": Years match (±1 year tolerance)
- "Mismatch": Years differ by >1
- "Not Found": OMDb returned no data

In [None]:
def determine_match_status(wiki_year: int, omdb_year: Optional[int]) -> str:
    """
    Determine the match status between Wikipedia and OMDb years.

    Args:
        wiki_year: Release year taken from Wikipedia.
        omdb_year: Release year reported by OMDb; None or 0 means OMDb
            returned no data.

    Returns:
        "Not Found" when there is no OMDb year, "Verified" when the two years
        differ by at most one, otherwise "Mismatch".
    """
    # Check for missing data first: None cannot be compared arithmetically, and
    # the 0 sentinel must never count as "within one year" of a small wiki_year.
    if not omdb_year:
        return "Not Found"
    if abs(wiki_year - omdb_year) <= 1:
        return "Verified"
    return "Mismatch"

# Attach the OMDb year and the resulting match status to each enriched record.
# A missing OMDb year is stored as 0; omdb_year is normalised to ints in place.
for idx, record in enumerate(enriched_data):
    reported_year = omdb_year[idx]
    omdb_year[idx] = 0 if reported_year is None else int(reported_year)
    status = determine_match_status(record['year'], omdb_year[idx])
    record['omdb_year'] = omdb_year[idx]
    record['match_status'] = status

# Show match status distribution
df_temp = pd.DataFrame(enriched_data)
status_counts = df_temp['match_status'].value_counts()
print("Match Status Distribution:")
print(status_counts)

# Show some mismatches if any
mismatches = df_temp.loc[df_temp['match_status'] == 'Mismatch']
if len(mismatches):
    print("\nSample Mismatches (showing up to 5):")
    mismatch_columns = ['title', 'year', 'omdb_year', 'match_status']
    print(mismatches[mismatch_columns].head())

Match Status Distribution:
match_status
Verified     128
Mismatch       9
Not Found      2
Name: count, dtype: int64

Sample Mismatches (showing up to 5):
              title  year  omdb_year match_status
17          Ben-Hur  2016       1959     Mismatch
33  Doctor Dolittle  1967       1998     Mismatch
39   Fantastic Four  2015       2005     Mismatch
76           Lolita  1997       1962     Mismatch
87            Mulan  2020       1998     Mismatch


In [None]:
# Inspect the enriched records, now including the omdb_year and match_status columns
df_temp

Unnamed: 0,title,year,budget_millions,loss_millions,Plot,Metascore,imdbRating,Director,Language,omdb_year,match_status
0,The 13th Warrior,1999,130.0,99.0,"A man, having fallen in love with the wrong wo...",42,6.6,John McTiernan,"English, Latin, Swedish, Norse, Old, Danish, A...",1999,Verified
1,The 355,2022,57.5,93.0,When a top-secret weapon falls into mercenary ...,40,5.6,Simon Kinberg,"English, Chinese, Spanish, French, German, Ara...",2022,Verified
2,47 Ronin,2013,200.0,96.0,A band of samurai sets out to avenge the death...,28,6.2,Carl Rinsch,"English, Japanese",2013,Verified
3,The Adventures of Baron Munchausen,1988,46.6,38.5,An account of Baron Munchausen's supposed trav...,69,7.1,Terry Gilliam,English,1988,Verified
4,The Adventures of Pluto Nash,2002,100.0,96.0,"In the future, a man struggles to keep his lun...",12,3.9,Ron Underwood,English,2002,Verified
...,...,...,...,...,...,...,...,...,...,...,...
134,The Wolfman,2010,150.0,76.0,"Upon his return to his ancestral homeland, an ...",43,5.9,Joe Johnston,"English, Romany, Romanian, Ukrainian",2010,Verified
135,Wonder Woman 1984,2020,200.0,117.5,Wonder Woman finds herself battling two oppone...,60,5.3,Patty Jenkins,"English, Arabic, Russian, Mandarin",2020,Verified
136,A Wrinkle in Time,2018,125.0,130.6,After the disappearance of her scientist fathe...,53,4.3,Ava DuVernay,English,2018,Verified
137,xXx: State of the Union,2005,113.1,78.0,"Darius Stone, a new agent in the xXx program, ...",37,4.5,Lee Tamahori,English,2005,Verified


## Task 5: Final Dataset & Categorization (4 Marks)

Create final DataFrame with:
- Loss_Category based on estimated loss:
  - "Catastrophic": Loss ≥ \$100M

  - "Severe": \$50M ≤ Loss < \$100M

  - "Moderate": Loss < \$50M
- Save to `box_office_failures.csv`

Required columns: Title, Year, Director, Language, Budget_Millions, Loss_Millions, Loss_Category, IMDb_Rating, Metascore, Match_Status

In [None]:
def categorize_loss(loss_millions: float) -> str:
    """
    Map a loss (in millions) to its tier label.

    Returns "Catastrophic" for losses of at least 100, "Severe" for losses of
    at least 50 but below 100, and "Moderate" for everything else.
    """
    # Tiers ordered from highest floor to lowest; the first floor reached wins
    tiers = (("Catastrophic", 100), ("Severe", 50))
    for label, floor in tiers:
        if loss_millions >= floor:
            return label
    return "Moderate"

# Build the final DataFrame from the enriched records
df_final = pd.DataFrame(enriched_data)
# Add Loss_Category
df_final['Loss_Category'] = df_final['loss_millions'].apply(categorize_loss)

# OMDb delivers ratings as strings (or None). Convert them to numbers so that
# describe() below includes them instead of silently dropping the columns.
for rating_col in ['imdbRating', 'Metascore']:
    df_final[rating_col] = pd.to_numeric(df_final[rating_col], errors='coerce')

# TODO: Select and rename columns to match requirements
# NOTE: the columns are not renamed in place here because later cells reference
# the current names (e.g. 'title', 'loss_millions', 'imdbRating').

# Display summary statistics
print("Final Dataset Summary:")
print(f"Total movies: {len(df_final)}")
print(f"\nLoss Category Distribution:")
print(df_final['Loss_Category'].value_counts())
print(f"\nBasic Statistics:")
print(df_final[['budget_millions', 'loss_millions', 'imdbRating', 'Metascore']].describe())

# Display first few rows
print(f"\nFirst 10 rows of final dataset:")
df_final.head(10)

Final Dataset Summary:
Total movies: 139

Loss Category Distribution:
Loss_Category
Severe          85
Catastrophic    46
Moderate         8
Name: count, dtype: int64

Basic Statistics:
       budget_millions  loss_millions
count       139.000000     139.000000
mean        129.552518      91.917266
std          58.126686      32.407538
min          17.000000      10.800000
25%          81.250000      70.550000
50%         117.500000      85.000000
75%         175.000000     108.500000
max         326.000000     218.000000

First 10 rows of final dataset:


Unnamed: 0,title,year,budget_millions,loss_millions,Plot,Metascore,imdbRating,Director,Language,omdb_year,match_status,Loss_Category
0,The 13th Warrior,1999,130.0,99.0,"A man, having fallen in love with the wrong wo...",42,6.6,John McTiernan,"English, Latin, Swedish, Norse, Old, Danish, A...",1999,Verified,Severe
1,The 355,2022,57.5,93.0,When a top-secret weapon falls into mercenary ...,40,5.6,Simon Kinberg,"English, Chinese, Spanish, French, German, Ara...",2022,Verified,Severe
2,47 Ronin,2013,200.0,96.0,A band of samurai sets out to avenge the death...,28,6.2,Carl Rinsch,"English, Japanese",2013,Verified,Severe
3,The Adventures of Baron Munchausen,1988,46.6,38.5,An account of Baron Munchausen's supposed trav...,69,7.1,Terry Gilliam,English,1988,Verified,Moderate
4,The Adventures of Pluto Nash,2002,100.0,96.0,"In the future, a man struggles to keep his lun...",12,3.9,Ron Underwood,English,2002,Verified,Severe
5,The Adventures of Rocky & Bullwinkle,2000,87.3,63.5,"When enemies Boris, Natasha, and Fearless Lead...",36,4.3,Des McAnuff,English,2000,Verified,Severe
6,The Alamo,2004,107.0,94.0,Based on the 1836 standoff between a group of ...,47,6.1,John Lee Hancock,"English, Spanish",2004,Verified,Severe
7,Alexander,2004,155.0,71.0,"Alexander, the King of Macedonia and one of th...",40,5.6,Oliver Stone,English,2004,Verified,Severe
8,Ali,2001,107.0,63.0,"A biography of sports legend Muhammad Ali, foc...",65,6.7,Michael Mann,"English, French, Swahili",2001,Verified,Severe
9,Allied,2016,85.0,82.5,"In 1942, a Canadian intelligence officer in No...",60,7.1,Robert Zemeckis,"English, French, German, Arabic",2016,Verified,Severe


In [None]:
# Save to CSV
output_path = 'box_office_failures.csv'

# Required output columns, mapped from the working column names. The selection
# and renaming are done on a copy so df_final keeps the names that other cells
# reference.
export_columns = {
    'title': 'Title',
    'year': 'Year',
    'Director': 'Director',
    'Language': 'Language',
    'budget_millions': 'Budget_Millions',
    'loss_millions': 'Loss_Millions',
    'Loss_Category': 'Loss_Category',
    'imdbRating': 'IMDb_Rating',
    'Metascore': 'Metascore',
    'match_status': 'Match_Status',
}
df_export = df_final[list(export_columns)].rename(columns=export_columns)
df_export.to_csv(output_path, index=False)

print(f"✓ Dataset saved to: {output_path}")
print(f"✓ Total records: {len(df_export)}")

✓ Dataset saved to: box_office_failures.csv
✓ Total records: 139


## Additional Analysis (Optional)

In [None]:
# A few summary views of the final dataset
separator = "\n" + "="*60

print("Top 10 Biggest Box Office Bombs (by loss):")
biggest_losses = df_final.nlargest(10, 'loss_millions')
print(biggest_losses[['title', 'year', 'Director', 'loss_millions', 'Loss_Category']])

print(separator)
print("\nMovies with Lowest IMDb Ratings:")
# Ratings must be numeric for nsmallest; this overwrites the column on df_final
df_final['imdbRating'] = df_final['imdbRating'].astype(float)
rated_only = df_final.dropna(subset=['imdbRating'])
lowest_rated = rated_only.nsmallest(5, 'imdbRating')
print(lowest_rated[['title', 'year', 'Director', 'imdbRating', 'Metascore', 'loss_millions']])

print(separator)
print("\nAverage Loss by Category:")
loss_by_category = df_final.groupby('Loss_Category')['loss_millions'].agg(['mean', 'count'])
print(loss_by_category)

print(separator)
print("\nTop 5 Most Common Directors in Box Office Bombs:")
director_counts = df_final['Director'].value_counts()
print(director_counts.head())

Top 10 Biggest Box Office Bombs (by loss):
                   title  year                   Director  loss_millions  \
80           The Marvels  2023                Nia DaCosta          218.0   
116        Strange World  2022       Don Hall, Qui Nguyen          187.5   
77       The Lone Ranger  2013             Gore Verbinski          175.0   
86        Mortal Engines  2018           Christian Rivers          174.8   
128          Turning Red  2022                  Domee Shi          173.0   
67           John Carter  2012             Andrew Stanton          156.0   
42             The Flash  2023            Andy Muschietti          155.0   
69         Jungle Cruise  2021         Jaume Collet-Serra          150.0   
68   Joker: Folie à Deux  2024              Todd Phillips          144.3   
87                 Mulan  2020  Tony Bancroft, Barry Cook          141.0   

    Loss_Category  
80   Catastrophic  
116  Catastrophic  
77   Catastrophic  
86   Catastrophic  
128  Catastrophic  
