In [2]:
import pandas as pd
import os

# Historical + Current Box Office Data

Sourced from sovai - includes film titles, distributers, total gross and approximate gross per theatre, etc.

In [8]:
def normalize_title(s):
    s = s.astype(str).str.lower().str.strip()
    return s

In [125]:
sov_df = pd.read_csv("../data/raw/sov_data.csv")
print(sov_df.shape)

(236367, 16)


In [126]:
sov_df.head()

Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,total_gross,days_in_release,parent company,distributor address,distributorwebsite,release_date,year
0,600579,2011-02-11,Raymond Did It,Plastic Age …,2999,0.0,0.0,1.0,2999.0,2999,1,KraussMaffei Group,"7295 Tellier St, Montreal, Quebec H1N 3S9, CA",https://plastic-age.com/en/,2011-02-10,2011
1,600579,2011-02-12,Raymond Did It,Plastic Age …,193,-0.94,0.0,1.0,193.0,3192,2,KraussMaffei Group,"7295 Tellier St, Montreal, Quebec H1N 3S9, CA",https://plastic-age.com/en/,2011-02-10,2011
2,600579,2011-02-18,Raymond Did It,Plastic Age …,225,0.0,-0.92,1.0,225.0,3417,8,KraussMaffei Group,"7295 Tellier St, Montreal, Quebec H1N 3S9, CA",https://plastic-age.com/en/,2011-02-10,2011
3,600579,2011-02-24,Raymond Did It,Plastic Age …,215,0.0,0.0,1.0,215.0,3632,14,KraussMaffei Group,"7295 Tellier St, Montreal, Quebec H1N 3S9, CA",https://plastic-age.com/en/,2011-02-10,2011
4,6758,2000-01-17,All About My Mother,Sony Picture…,64715,0.0,0.0,98.0,660.0,3441867,74,Sony Corporation,"10202 West Washington Boulevard, Culver City, ...",https://www.sonypictures.com/,1999-11-04,2000


In [127]:
sov_df['title_key'] = normalize_title(sov_df['title'])

## Feature Engineering

In [128]:
# Feature engineering - Weekday/weekend flag based on SOVAI date
sov_df['weekday'] = pd.to_datetime(sov_df['date']).dt.weekday
sov_df['release_month'] = pd.to_datetime(sov_df['release_date']).dt.month
sov_df['release_weekday'] = pd.to_datetime(sov_df['release_date']).dt.dayofweek
sov_df['is_weekend'] = sov_df['weekday'].isin([4,5,6]).astype(int)

In [129]:
sov_df['average_theaters'] = sov_df.groupby('title_key')['theaters'].transform('mean')

In [130]:
# Clean SOVAI data
# Drop duplicates: keep latest/highest grossing per movie
sov_df = sov_df.sort_values(
    by=['title_key', 'date','total_gross'],
    ascending=[True, False, False]
)

In [131]:
# Calculate average gross per movie title using groupby + transform
sov_df['average_gross'] = sov_df.groupby('title_key')['gross'].transform('mean')
sov_df['average_gross_per_theaters'] = sov_df['total_gross'] / sov_df['average_theaters']

In [132]:
sov_df['weekday'] = pd.to_datetime(sov_df['date']).dt.weekday  # 0=Monday, 6=Sunday
sov_df['is_weekend'] = sov_df['weekday'].isin([4, 5, 6]).astype(int)

day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

for day in range(7):
    # Calculate average gross for each movie on this specific day
    day_avg = sov_df[sov_df['weekday'] == day].groupby('title_key')['gross'].mean()
    # Map it back to all rows
    sov_df[f'avg_gross_{day_names[day]}'] = sov_df['title_key'].map(day_avg).fillna(0)

# Also add average for weekends vs weekdays
weekend_avg = sov_df[sov_df['is_weekend'] == 1].groupby('title_key')['gross'].mean()
weekday_avg = sov_df[sov_df['is_weekend'] == 0].groupby('title_key')['gross'].mean()

sov_df['avg_gross_weekend'] = sov_df['title_key'].map(weekend_avg).fillna(0)
sov_df['avg_gross_weekday'] = sov_df['title_key'].map(weekday_avg).fillna(0)

In [133]:
sov_df.head()

Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,total_gross,...,average_gross_per_theaters,avg_gross_monday,avg_gross_tuesday,avg_gross_wednesday,avg_gross_thursday,avg_gross_friday,avg_gross_saturday,avg_gross_sunday,avg_gross_weekend,avg_gross_weekday
126325,Private,2011-09-05,!Women Art Revolution,Zeitgeist,435,0.0,0.0,1.0,435.0,46059,...,46059.0,435.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,435.0
51774,DIS,2012-11-15,'Luv Shuv Tey Chicken Khu…,UTV Communic…,407,-0.17,-0.9,38.0,11.0,143900,...,2936.734694,3416.5,3611.5,2457.0,2189.5,15414.5,27556.0,17108.5,20026.333333,2918.625
51772,DIS,2012-11-14,'Luv Shuv Tey Chicken Khu…,UTV Communic…,488,0.12,-0.89,38.0,13.0,143079,...,2919.979592,3416.5,3611.5,2457.0,2189.5,15414.5,27556.0,17108.5,20026.333333,2918.625
51767,DIS,2012-11-13,'Luv Shuv Tey Chicken Khu…,UTV Communic…,437,-0.79,-0.94,38.0,12.0,142591,...,2910.020408,3416.5,3611.5,2457.0,2189.5,15414.5,27556.0,17108.5,20026.333333,2918.625
51760,DIS,2012-11-12,'Luv Shuv Tey Chicken Khu…,UTV Communic…,2119,-0.69,-0.55,38.0,56.0,142154,...,2901.102041,3416.5,3611.5,2457.0,2189.5,15414.5,27556.0,17108.5,20026.333333,2918.625


In [134]:

sov_latest = sov_df.drop_duplicates(subset=['title_key'], keep='first').copy()

# Convert numeric columns
for col in ['average_gross', 'per_theater', 'theaters','total_gross', 'days_in_release']:
    if col in sov_latest.columns:
        sov_latest[col] = pd.to_numeric(sov_latest[col], errors='coerce')

# Drop rows missing zcritical data
sov_latest = sov_latest.dropna(subset=['title_key', 'average_gross', 'total_gross', 'theaters', 'date'])

In [135]:
sov_latest.head(10)

Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,total_gross,...,average_gross_per_theaters,avg_gross_monday,avg_gross_tuesday,avg_gross_wednesday,avg_gross_thursday,avg_gross_friday,avg_gross_saturday,avg_gross_sunday,avg_gross_weekend,avg_gross_weekday
126325,Private,2011-09-05,!Women Art Revolution,Zeitgeist,435,0.0,0.0,1.0,435.0,46059,...,46059.0,435.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,435.0
51774,DIS,2012-11-15,'Luv Shuv Tey Chicken Khu…,UTV Communic…,407,-0.17,-0.9,38.0,11.0,143900,...,2936.734694,3416.5,3611.5,2457.0,2189.5,15414.5,27556.0,17108.5,20026.33,2918.625
118107,Private,2005-09-05,...And They Lived Happily…,Kino Interna…,550,0.0,0.0,3.0,183.0,233465,...,63672.272727,1239.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1239.333333
111763,PARA,2016-06-02,10 Cloverfield Lane,Paramount Pi…,11414,0.32,-0.12,120.0,95.0,72082999,...,52306.296121,410377.583333,484828.0,362675.5,440960.307692,1473445.0,1834276.0,1113608.0,1473776.0,425041.979592
167713,Private,2025-10-16,100 Meters,GKIDS,313,-0.03,0.0,1.0,313.0,140991,...,140991.0,0.0,0.0,0.0,313.0,0.0,0.0,0.0,0.0,313.0
207920,WBD,2008-06-19,10000 B.C.,Warner Bros.,3661,0.05,-0.36,45.0,81.0,94784201,...,69195.049555,411076.133333,368612.8,337544.066667,373576.571429,1691484.0,2085060.0,1409515.0,1728686.0,372687.576271
189686,SONY,2018-06-03,102 Not Out,Sony Pictures,2806,-0.48,-0.83,17.0,165.0,1339909,...,16654.843224,23025.25,33422.25,21569.0,15996.0,49874.2,77773.8,65123.8,64257.27,23503.125
118476,Private,2006-09-04,10th & Wolf,ThinkFilm,1791,0.0,0.0,6.0,299.0,49783,...,8297.166667,1791.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1791.0
206691,WBD,2007-09-03,11th Hour,Warner Indep…,64888,0.0,0.0,111.0,585.0,417913,...,3764.981982,64888.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64888.0
386,6758,2009-05-25,12,Sony Picture…,344,0.0,0.0,5.0,69.0,119587,...,23917.4,344.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,344.0


# Film Ratings + Other Film Metadata

Sourced from TMDB (the movie database)

Includes aggregate runtimes, popularity ratings, descriptions of movies, number of raters, also includes revenue etc.

[Movie details for a given movie id](https://developer.themoviedb.org/reference/movie-details)


Here is the drive folder with the bulk movie id data:
[download tmdb_movie_ids.csv](https://drive.google.com/file/d/1gOMNDu7MLIriftb3audXMP0UEmZAiHQt/view?usp=sharing)


In [136]:
RAW_DATA_PATH = "raw"

In [137]:
tmdb_df = pd.read_csv(f"../data/{RAW_DATA_PATH}/tmdb_movie_ids.csv")
print(tmdb_df.columns)

Index(['adult', 'id', 'original_title', 'popularity', 'video'], dtype='object')


In [138]:
tmdb_df['title_key'] = normalize_title(tmdb_df['original_title'])

In [139]:
tmdb_df.sort_values(by='popularity', ascending=False)

Unnamed: 0,adult,id,original_title,popularity,video,title_key
811727,False,1156594,Culpa nuestra,532.1524,False,culpa nuestra
1073852,False,1511789,Captain Hook - The Cursed Tides,340.5134,False,captain hook - the cursed tides
534839,False,755898,War of the Worlds,317.3022,False,war of the worlds
907504,False,1280450,Stolen Girl,304.1407,False,stolen girl
926600,False,1305717,Hunting Grounds,300.4624,False,hunting grounds
...,...,...,...,...,...,...
807723,False,1151169,Fade Away,0.0000,False,fade away
807749,False,1151216,Scenic National Parks: Zion and Bryce,0.0000,True,scenic national parks: zion and bryce
807758,False,1151233,Pompeii: The Doomed City,0.0000,True,pompeii: the doomed city
807760,False,1151235,"When It Rayns, It Pours",0.0000,False,"when it rayns, it pours"


# DATA CLEANING TO MERGE



1.   Normalizing Titles: Movie titles often contain variations in capitalization, spacing, punctuation, etc., which can prevent matching across different datasets. To address this, we normalize all titles to the same format, creating a consistent key for merging called title_key.
2. SOVAI Data Cleaning: This dataset contains historical performance data, including gross revenue, number of theaters, and release dates. To clean up this data:

*   Duplicate entries are removed, keeping the row with the latest release data and highest revenue
*   Numeric columns (gross, total_gross, theaters) are converted to numeric types
* Rows missing critical data (title_key, gross, date) are dropped
3. TMDB Data Cleaning: This dataset contains basic metadata including ID, title, popularity – which we keep and normalize as needed, also converting popularity to a numeric value
4. Merging Datasets: The cleaned datasets are merged on title_key, ensuring that only movies present in both datasets are included and thus also combining data from both sources.
5. Feature Engineering/Extraction: Additional features can be derived from the newly created dataset to improve predictive power. One example  includes a flag indicating if the date released is_weekend.


In [140]:
# Clean TMDB CSV (minimal)
# Keep only relevant columns
tmdb_df = tmdb_df[['id', 'original_title', 'popularity', 'title_key']].copy()
tmdb_df['popularity'] = pd.to_numeric(tmdb_df['popularity'], errors='coerce')

# Merge

In [145]:

# Merge SOVAI + TMDB
merged_df = pd.merge(
    sov_latest,
    tmdb_df,
    how='inner',  # only keep movies present in both
    on='title_key',
    suffixes=('_sov', '_tmdb')
)

n=30
merged_df[['title_key', 'date', 'gross', 'average_gross','popularity', 'is_weekend', 'average_gross_per_theaters']].head(n) #printing first n


Unnamed: 0,title_key,date,gross,average_gross,popularity,is_weekend,average_gross_per_theaters
0,10 cloverfield lane,2016-06-02,11414,869211.8,7.8372,0,52306.296121
1,100 meters,2025-10-16,313,313.0,0.0143,0,140991.0
2,102 not out,2018-06-03,2806,43222.87,3.4544,1,16654.843224
3,10th & wolf,2006-09-04,1791,1791.0,3.6557,0,8297.166667
4,11th hour,2007-09-03,64888,64888.0,2.3114,0,3764.981982
5,11th hour,2007-09-03,64888,64888.0,0.1843,0,3764.981982
6,12,2009-05-25,344,344.0,2.6577,0,23917.4
7,12,2009-05-25,344,344.0,0.205,0,23917.4
8,12,2009-05-25,344,344.0,0.2622,0,23917.4
9,12,2009-05-25,344,344.0,1.3215,0,23917.4


In [142]:
merged_df

Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,total_gross,...,avg_gross_wednesday,avg_gross_thursday,avg_gross_friday,avg_gross_saturday,avg_gross_sunday,avg_gross_weekend,avg_gross_weekday,id,original_title,popularity
0,PARA,2016-06-02,10 Cloverfield Lane,Paramount Pi…,11414,0.32,-0.12,120.0,95.0,72082999,...,362675.5,440960.307692,1473445.25,1.834276e+06,1.113608e+06,1.473776e+06,425041.979592,333371,10 Cloverfield Lane,7.8372
1,Private,2025-10-16,100 Meters,GKIDS,313,-0.03,0.00,1.0,313.0,140991,...,0.0,313.000000,0.00,0.000000e+00,0.000000e+00,0.000000e+00,313.000000,911001,100 meters,0.0143
2,SONY,2018-06-03,102 Not Out,Sony Pictures,2806,-0.48,-0.83,17.0,165.0,1339909,...,21569.0,15996.000000,49874.20,7.777380e+04,6.512380e+04,6.425727e+04,23503.125000,460713,102 Not Out,3.4544
3,Private,2006-09-04,10th & Wolf,ThinkFilm,1791,0.00,0.00,6.0,299.0,49783,...,0.0,0.000000,0.00,0.000000e+00,0.000000e+00,0.000000e+00,1791.000000,13197,10th & Wolf,3.6557
4,WBD,2007-09-03,11th Hour,Warner Indep…,64888,0.00,0.00,111.0,585.0,417913,...,0.0,0.000000,0.00,0.000000e+00,0.000000e+00,0.000000e+00,64888.000000,476899,11th Hour,2.3114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15805,SONY,2006-09-04,Zoom,Sony Pictures,94312,0.00,0.00,303.0,311.0,11406340,...,460782.0,444390.000000,1576554.00,1.732885e+06,1.200969e+06,1.503469e+06,438289.600000,1044350,Zoom,0.0448
15806,SONY,2006-09-04,Zoom,Sony Pictures,94312,0.00,0.00,303.0,311.0,11406340,...,460782.0,444390.000000,1576554.00,1.732885e+06,1.200969e+06,1.503469e+06,438289.600000,1286668,Zoom,0.0286
15807,SONY,2006-09-04,Zoom,Sony Pictures,94312,0.00,0.00,303.0,311.0,11406340,...,460782.0,444390.000000,1576554.00,1.732885e+06,1.200969e+06,1.503469e+06,438289.600000,1373857,Zoom,0.0382
15808,SONY,2006-09-04,Zoom,Sony Pictures,94312,0.00,0.00,303.0,311.0,11406340,...,460782.0,444390.000000,1576554.00,1.732885e+06,1.200969e+06,1.503469e+06,438289.600000,1491463,ZOOM,0.0311


In [146]:
merged_df.to_csv("../data/cleaned/merged_df.csv", index=False)

In [147]:
import pandas as pd
from tabulate import tabulate

# --- SOV.AI SCHEMA ---
schema_sovai = pd.DataFrame({
    "Data Field": sov_latest.columns,
    "Data Type": sov_latest.dtypes.astype(str)
})

print("SOV.AI Dataset Schema:")
print(tabulate(schema_sovai, headers="keys", tablefmt="github"))
print("\n")


# --- TMDB SCHEMA ---
schema_tmdb = pd.DataFrame({
    "Data Field": tmdb_df.columns,
    "Data Type": tmdb_df.dtypes.astype(str)
})

print("TMDB Dataset Schema:")
print(tabulate(schema_tmdb, headers="keys", tablefmt="github"))
print("\n")


# --- MERGED DATASET SCHEMA ---
schema_merged = pd.DataFrame({
    "Data Field": merged_df.columns,
    "Data Type": merged_df.dtypes.astype(str)
})

print("Merged Dataset Schema:")
print(tabulate(schema_merged, headers="keys", tablefmt="github"))
print("\n")


# OPTIONAL: save to CSV files
schema_sovai.to_csv("schemas/schema_sovai.csv", index=False)
schema_tmdb.to_csv("schemas/schema_tmdb.csv", index=False)
schema_merged.to_csv("schemas/schema_merged.csv", index=False)


SOV.AI Dataset Schema:
|                            | Data Field                 | Data Type   |
|----------------------------|----------------------------|-------------|
| ticker                     | ticker                     | object      |
| date                       | date                       | object      |
| title                      | title                      | object      |
| distributor                | distributor                | object      |
| gross                      | gross                      | int64       |
| percent_yd                 | percent_yd                 | float64     |
| percent_lw                 | percent_lw                 | float64     |
| theaters                   | theaters                   | float64     |
| per_theater                | per_theater                | float64     |
| total_gross                | total_gross                | int64       |
| days_in_release            | days_in_release            | int64       |
| parent compan