In [3]:
!pip install sovai
!pip install uv && uv pip install sovai['full'] --system > output.log 2>&1

Collecting sovai
  Downloading sovai-0.2.77-py3-none-any.whl.metadata (3.8 kB)
Collecting aiobotocore==2.21.1 (from sovai)
  Downloading aiobotocore-2.21.1-py3-none-any.whl.metadata (24 kB)
Collecting boto3==1.37.1 (from sovai)
  Downloading boto3-1.37.1-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore==1.37.1 (from sovai)
  Downloading botocore-1.37.1-py3-none-any.whl.metadata (5.7 kB)
Collecting brotli<2.0.0,>=1.1.0 (from sovai)
  Downloading brotli-1.2.0-cp312-cp312-macosx_10_13_x86_64.whl.metadata (6.1 kB)
Collecting edgar-tool>=1.3.3 (from sovai)
  Downloading edgar_tool-2.1.2-py3-none-any.whl.metadata (20 kB)
Collecting fastapi-cache>=0.1.0 (from sovai)
  Downloading fastapi-cache-0.1.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting fastapi-cache2>=0.2.1 (from fastapi-cache2[redis]>=0.2.1->sovai)
  Downloading fastapi_cache2-0.2.2-py3-none-any.whl.metadata (8.8 kB)
Collecting fsspec==2024.10.0 (from sovai)
  Downloading fsspec-2024.10.0-py3-

In [4]:
import sovai as sov
import pandas as pd
import os

Downloading https://storage.googleapis.com/sovai-public/accounting/tickers_transformed.parq to data/tickers.parq
Downloading https://storage.googleapis.com/sovai-public/sovai-master/output/df_codes.parquet to data/codes.parq


# Historical + Current Box Office Data

Sourced from sovai - includes film titles, distributers, total gross and approximate gross per theatre, etc.

In [None]:
sov_token = os.getenv("SOV_API_KEY")
sov.token_auth(token=sov_token)

In [None]:
df_movies = sov.data("movies/boxoffice")

In [None]:
sov_df = df_movies.reset_index().copy()
# Extract the year from the 'date' column
sov_df = sov_df.dropna(subset=["date"])
sov_df['year'] = sov_df['date'].dt.year
sov_df = sov_df.drop_duplicates()
sov_df.sort_values(by='year', ascending=False)

Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,total_gross,days_in_release,parent company,distributor address,distributorwebsite,release_date,year
236222,WBD,2025-10-30,One Battle After Another,Warner Bros.,188951,-0.070,-0.460,1473.000,128.000,66607827,35,Warner Bros. Entertainment Inc.,"4000 Warner Blvd., Burbank, California, US",https://www.warnerbros.com/,2025-09-25,2025
236220,WBD,2025-10-29,One Battle After Another,Warner Bros.,203608,-0.290,-0.520,1473.000,138.000,66418876,34,Warner Bros. Entertainment Inc.,"4000 Warner Blvd., Burbank, California, US",https://www.warnerbros.com/,2025-09-25,2025
236229,WBD,2025-11-02,The Conjuring: Last Rites,Warner Bros.,105000,-0.320,-0.410,542.000,194.000,177403000,59,Warner Bros. Entertainment Inc.,"4000 Warner Blvd., Burbank, California, US",https://www.warnerbros.com/,2025-09-04,2025
236211,WBD,2025-10-26,The Conjuring: Last Rites,Warner Bros.,180000,-0.460,-0.570,961.000,187.000,176741000,52,Warner Bros. Entertainment Inc.,"4000 Warner Blvd., Burbank, California, US",https://www.warnerbros.com/,2025-09-04,2025
236212,WBD,2025-10-26,One Battle After Another,Warner Bros.,740000,-0.210,-0.270,1473.000,502.000,65786000,31,Warner Bros. Entertainment Inc.,"4000 Warner Blvd., Burbank, California, US",https://www.warnerbros.com/,2025-09-25,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68938,LGF.A,1997-11-07,Eve's Bayou,Trimark,960000,0.000,0.000,659.000,1457.000,960000,1,Lionsgate,"Santa Monica, California, United States",,1997-11-06,1997
68937,LGF.A,1997-08-24,Dirty Dancing (1987) (Re-…,Vestron,42498,-0.280,0.000,54.000,787.000,63608760,3657,Lionsgate,"Chicago, Illinois, United States",,1987-08-20,1997
68936,LGF.A,1997-08-23,Dirty Dancing (1987) (Re-…,Vestron,59257,-0.020,0.000,54.000,1097.000,63566262,3656,Lionsgate,"Chicago, Illinois, United States",,1987-08-20,1997
68933,LGF.A,1997-07-30,Box of Moonlight,Trimark,1537,-0.190,0.000,1.000,1537.000,18862,6,Lionsgate,"Santa Monica, California, United States",,1997-07-24,1997


In [None]:
print(f"Unique titles: {sov_df['title'].nunique()}")
print(f"Total rows: {len(sov_df)}")

Unique titles: 5996
Total rows: 236244


In [None]:
def normalize_title(s):
    s = s.astype(str).str.lower().str.strip()
    return s

# Film Ratings + Other Film Metadata

Sourced from TMDB (the movie database)

Includes aggregate runtimes, popularity ratings, descriptions of movies, number of raters, also includes revenue etc.

[Movie details for a given movie id](https://developer.themoviedb.org/reference/movie-details)


Here is the drive folder with the bulk movie id data:
[download tmdb_movie_ids.csv](https://drive.google.com/file/d/1gOMNDu7MLIriftb3audXMP0UEmZAiHQt/view?usp=sharing)


In [13]:
import requests
import json

In [11]:
RAW_DATA_PATH = "raw"

In [None]:
tmdb_df = pd.read_csv(f"{RAW_DATA_PATH}/tmdb_movie_ids.csv")
print(tmdb_df.columns)

Index(['adult', 'id', 'original_title', 'popularity', 'video'], dtype='object')


In [None]:
tmdb_df.sort_values(by='popularity', ascending=False)

Unnamed: 0,adult,id,original_title,popularity,video
811727,False,1156594,Culpa nuestra,532.152,False
1073852,False,1511789,Captain Hook - The Cursed Tides,340.513,False
534839,False,755898,War of the Worlds,317.302,False
907504,False,1280450,Stolen Girl,304.141,False
926600,False,1305717,Hunting Grounds,300.462,False
...,...,...,...,...,...
781645,False,1114005,Tio år på en minut - 2000-talet,0.000,False
781637,False,1113990,The Wheel'd Universe,0.000,False
477584,False,677812,Revolver cachas de oro,0.000,False
477583,False,677811,Perros de pelea,0.000,False


# DATA CLEANING TO MERGE



1.   Normalizing Titles: Movie titles often contain variations in capitalization, spacing, punctuation, etc., which can prevent matching across different datasets. To address this, we normalize all titles to the same format, creating a consistent key for merging called title_key.
2. SOVAI Data Cleaning: This dataset contains historical performance data, including gross revenue, number of theaters, and release dates. To clean up this data:

*   Duplicate entries are removed, keeping the row with the latest release data and highest revenue
*   Numeric columns (gross, total_gross, theaters) are converted to numeric types
* Rows missing critical data (title_key, gross, date) are dropped
3. TMDB Data Cleaning: This dataset contains basic metadata including ID, title, popularity – which we keep and normalize as needed, also converting popularity to a numeric value
4. Merging Datasets: The cleaned datasets are merged on title_key, ensuring that only movies present in both datasets are included and thus also combining data from both sources.
5. Feature Engineering/Extraction: Additional features can be derived from the newly created dataset to improve predictive power. One example  includes a flag indicating if the date released is_weekend.


In [None]:
sov_df['title_key'] = normalize_title(sov_df['title'])
tmdb_df['title_key'] = normalize_title(tmdb_df['original_title'])

In [None]:
# Clean SOVAI data
# Drop duplicates: keep latest/highest grossing per movie
sov_df = sov_df.sort_values(
    by=['title_key', 'date', 'total_gross', 'theaters'],
    ascending=[True, False, False, False]
)
sov_latest = sov_df.drop_duplicates(subset=['title_key'], keep='first').copy()

# Convert numeric columns
for col in ['gross', 'per_theater', 'theaters', 'total_gross', 'days_in_release']:
    if col in sov_latest.columns:
        sov_latest[col] = pd.to_numeric(sov_latest[col], errors='coerce')

# Drop rows missing critical data
sov_latest = sov_latest.dropna(subset=['title_key', 'gross', 'date'])

In [None]:
# Clean TMDB CSV (minimal)
# Keep only relevant columns
tmdb_df = tmdb_df[['id', 'original_title', 'popularity', 'title_key']].copy()
tmdb_df['popularity'] = pd.to_numeric(tmdb_df['popularity'], errors='coerce')

# Merge

In [None]:

# Merge SOVAI + TMDB

merged_df = pd.merge(
    sov_latest,
    tmdb_df,
    how='inner',  # only keep movies present in both
    on='title_key',
    suffixes=('_sov', '_tmdb')
)

# Feature engineering - Weekday/weekend flag based on SOVAI date
merged_df['weekday'] = merged_df['date'].dt.weekday
merged_df['is_weekend'] = merged_df['weekday'].isin([4,5,6]).astype(int)

n=30
print(merged_df[['title_key', 'date', 'gross', 'popularity', 'is_weekend']].head(n)) #printing first n


              title_key       date   gross  popularity  is_weekend
0   10 cloverfield lane 2016-06-02   11414       7.837           0
1            100 meters 2025-10-16     313       0.014           0
2           102 not out 2018-06-03    2806       3.454           1
3           10th & wolf 2006-09-04    1791       3.656           0
4             11th hour 2007-09-03   64888       2.311           0
5             11th hour 2007-09-03   64888       0.184           0
6                    12 2009-05-25     344       2.658           0
7                    12 2009-05-25     344       0.205           0
8                    12 2009-05-25     344       0.262           0
9                    12 2009-05-25     344       1.321           0
10                   12 2009-05-25     344       1.701           0
11                   12 2009-05-25     344       0.256           0
12                   12 2009-05-25     344       0.139           0
13                   12 2009-05-25     344       0.305        

In [None]:
import pandas as pd
from tabulate import tabulate

# --- SOV.AI SCHEMA ---
schema_sovai = pd.DataFrame({
    "Data Field": sov_latest.columns,
    "Data Type": sov_latest.dtypes.astype(str)
})

print("SOV.AI Dataset Schema:")
print(tabulate(schema_sovai, headers="keys", tablefmt="github"))
print("\n")


# --- TMDB SCHEMA ---
schema_tmdb = pd.DataFrame({
    "Data Field": tmdb_df.columns,
    "Data Type": tmdb_df.dtypes.astype(str)
})

print("TMDB Dataset Schema:")
print(tabulate(schema_tmdb, headers="keys", tablefmt="github"))
print("\n")


# --- MERGED DATASET SCHEMA ---
schema_merged = pd.DataFrame({
    "Data Field": merged_df.columns,
    "Data Type": merged_df.dtypes.astype(str)
})

print("Merged Dataset Schema:")
print(tabulate(schema_merged, headers="keys", tablefmt="github"))
print("\n")


# OPTIONAL: save to CSV files
schema_sovai.to_csv("schema_sovai.csv", index=False)
schema_tmdb.to_csv("schema_tmdb.csv", index=False)
schema_merged.to_csv("schema_merged.csv", index=False)


SOV.AI Dataset Schema:
|                     | Data Field          | Data Type      |
|---------------------|---------------------|----------------|
| ticker              | ticker              | object         |
| date                | date                | datetime64[ns] |
| title               | title               | string         |
| distributor         | distributor         | object         |
| gross               | gross               | Int64          |
| percent_yd          | percent_yd          | Float64        |
| percent_lw          | percent_lw          | Float64        |
| theaters            | theaters            | Float64        |
| per_theater         | per_theater         | Float64        |
| total_gross         | total_gross         | Int64          |
| days_in_release     | days_in_release     | Int64          |
| parent company      | parent company      | object         |
| distributor address | distributor address | object         |
| distributorwebsite  | distribu