# Comprehensive Data Merge: Final Dataset with OMDB

This notebook merges the final cleaned dataset (SOVAI + TMDB) with OMDB ratings data.

**Key improvements over previous merge:**
- Uses LEFT JOIN to preserve ALL movies from final_df.csv (not just those with OMDB ratings)
- Properly handles duplicate columns from merges
- Cleans up redundant/empty columns
- Provides detailed merge statistics


In [1]:
import pandas as pd
import glob
import os
from pathlib import Path


## 1. Load Final Dataset (SOVAI + TMDB merged)


In [2]:
# Locations of the cleaned inputs and the raw OMDB batch exports,
# relative to the notebook's working directory.
CLEAN_DATA_PATH = "../data/cleaned"
OMDB_DATA_PATH = "../omdb_api"

# final_df is the upstream SOVAI + TMDB frame; it is only read here.
final_df = pd.read_csv(CLEAN_DATA_PATH + "/final_df.csv")

# Report the frame size and how many rows carry the imdb_id merge key.
row_count, column_count = final_df.shape
has_imdb_id = final_df["imdb_id"].notna()
print(f"Loaded {row_count} rows, {column_count} columns")
print(f"Movies with imdb_id: {has_imdb_id.sum()}")
print(f"Movies without imdb_id: {(~has_imdb_id).sum()}")
final_df.head()


Loaded 10067 rows, 39 columns
Movies with imdb_id: 8522
Movies without imdb_id: 1545


Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,total_gross,...,vote_average,vote_count,origin_country,spoken_languages,genre_ids,genre_names,production_company_ids,production_company_names,belongs_to_collection,gross_per_theater
0,PARA,2016-06-02,10 Cloverfield Lane,Paramount Pi…,11414,0.32,-0.12,120.0,95.0,72082999,...,7.0,8351.0,US,English,"53, 878, 18, 27","Thriller, Science Fiction, Drama, Horror",11461,Bad Robot,,95.116667
1,Private,2006-09-04,10th & Wolf,ThinkFilm,1791,0.0,0.0,6.0,299.0,49783,...,5.856,108.0,US,English,"28, 80, 18, 9648, 53","Action, Crime, Drama, Mystery, Thriller",41427,Suzanne DeLaurentiis Productions,,298.5
2,6758,2009-05-25,12,Sony Picture…,344,0.0,0.0,5.0,69.0,119587,...,5.6,50.0,US,English,18,Drama,,,,68.8
3,6758,2009-05-25,12,Sony Picture…,344,0.0,0.0,5.0,69.0,119587,...,5.0,57.0,US,English,35,Comedy,,,,68.8
4,DIS,2009-05-25,12 Rounds,20th Century…,4832,0.0,0.98,29.0,167.0,12187944,...,5.904,819.0,US,English,"28, 53, 80","Action, Thriller, Crime","1557, 17887, 2890, 10339","The Mark Gordon Company, Midnight Sun Pictures...",12 Rounds Collection,166.62069


## 2. Load and Combine OMDB Batch Files


In [4]:
# Find all OMDB batch files
csv_files = sorted(glob.glob(f"{OMDB_DATA_PATH}/omdbmovies_batch_*.csv"))
print(f"Found {len(csv_files)} OMDB batch files")

# Load and combine all batches
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    # Remove rows where Title is null or empty
    df = df[df["Title"].notna()]
    df = df[df["Title"].str.strip() != ""]
    dfs.append(df)
    print(f"  Loaded {os.path.basename(file)}: {len(df)} rows")

# Combine all batches
omdb_merged = pd.concat(dfs, ignore_index=True)
print(f"\nTotal OMDB records before deduplication: {len(omdb_merged)}")

# Remove duplicates based on imdbID (keep first occurrence)
initial_count = len(omdb_merged)
omdb_merged = omdb_merged.drop_duplicates(subset=["imdbID"], keep="first")
duplicates_removed = initial_count - len(omdb_merged)
if duplicates_removed > 0:
    print(f"Removed {duplicates_removed} duplicate entries")

print(f"Total OMDB records: {len(omdb_merged)}")
print(f"Unique IMDb IDs: {omdb_merged['imdbID'].nunique()}")


Found 11 OMDB batch files
  Loaded omdbmovies_batch_0.csv: 814 rows
  Loaded omdbmovies_batch_1.csv: 833 rows
  Loaded omdbmovies_batch_10.csv: 59 rows
  Loaded omdbmovies_batch_2.csv: 861 rows
  Loaded omdbmovies_batch_3.csv: 824 rows
  Loaded omdbmovies_batch_4.csv: 848 rows
  Loaded omdbmovies_batch_5.csv: 846 rows
  Loaded omdbmovies_batch_6.csv: 811 rows
  Loaded omdbmovies_batch_7.csv: 860 rows
  Loaded omdbmovies_batch_8.csv: 848 rows
  Loaded omdbmovies_batch_9.csv: 877 rows

Total OMDB records before deduplication: 8481
Total OMDB records: 8481
Unique IMDb IDs: 8481


## 3. Clean OMDB Data


In [5]:
# Give the OMDB key column the same name as the key in final_df.
omdb_merged = omdb_merged.rename(columns={"imdbID": "imdb_id"})

# Keep only releases from 2000-01-01 onwards (matching final_df filter).
# "Released" values that cannot be parsed are coerced to NaT; NaT fails the
# >= comparison, so those rows are removed by this filter as well.
omdb_merged["omdb_release_date"] = pd.to_datetime(omdb_merged["Released"], errors="coerce")
release_cutoff = pd.Timestamp("2000-01-01")
released_after_cutoff = omdb_merged["omdb_release_date"] >= release_cutoff
omdb_merged = omdb_merged[released_after_cutoff]
print(f"After filtering to post-2000 releases: {len(omdb_merged)} rows")

# OMDB fields carried into the merge (fields such as Type, Season and
# Episode are intentionally left out).
columns_to_keep = [
    "imdb_id", "Title", "Year", "Rated", "Released", "Runtime",
    "Genre", "Director", "Writer", "Actors", "Plot",
    "Language", "Country", "Awards", "Poster",
    "Metascore", "imdbRating", "imdbVotes", "BoxOffice", "Production",
    "Rating_InternetMovieDatabase", "Rating_RottenTomatoes", "Rating_Metacritic",
]

# Silently skip any wanted column that the batch files did not provide.
present_columns = set(omdb_merged.columns)
available_columns = [name for name in columns_to_keep if name in present_columns]
omdb_cleaned = omdb_merged.loc[:, available_columns].copy()

# Lower-case every OMDB column and prefix it with "omdb_" so it cannot clash
# with a final_df column; the merge key imdb_id keeps its name.
omdb_columns = {
    name: name if name == "imdb_id" else f"omdb_{name.lower()}"
    for name in available_columns
}
omdb_cleaned = omdb_cleaned.rename(columns=omdb_columns)

print(f"Final OMDB data: {len(omdb_cleaned)} rows, {len(omdb_cleaned.columns)} columns")
omdb_cleaned.head()


After filtering to post-2000 releases: 6235 rows
Final OMDB data: 6235 rows, 23 columns


Unnamed: 0,imdb_id,omdb_title,omdb_year,omdb_rated,omdb_released,omdb_runtime,omdb_genre,omdb_director,omdb_writer,omdb_actors,...,omdb_awards,omdb_poster,omdb_metascore,omdb_imdbrating,omdb_imdbvotes,omdb_boxoffice,omdb_production,omdb_rating_internetmoviedatabase,omdb_rating_rottentomatoes,omdb_rating_metacritic
1,tt9362736,Die My Love,2025,R,07 Nov 2025,119 min,"Drama, Thriller",Lynne Ramsay,"Enda Walsh, Lynne Ramsay, Alice Birch","Jennifer Lawrence, Robert Pattinson, Sissy Spacek",...,10 nominations total,https://m.media-amazon.com/images/M/MV5BYjc5OW...,72.0,6.6,9529.0,"$4,884,888",,6.6/10,,72/100
2,tt29567915,Nuremberg,2025,PG-13,07 Nov 2025,148 min,"Drama, History, Thriller",James Vanderbilt,"Jack El-Hai, James Vanderbilt","Rami Malek, Russell Crowe, Richard E. Grant",...,1 win & 4 nominations total,https://m.media-amazon.com/images/M/MV5BMjZhNG...,,,,,,,67%,
3,tt31227572,Predator: Badlands,2025,PG-13,07 Nov 2025,107 min,"Action, Adventure, Sci-Fi",Dan Trachtenberg,"Patrick Aison, Jim Thomas, John Thomas","Elle Fanning, Dimitrius Schuster-Koloamatangi",...,,https://m.media-amazon.com/images/M/MV5BNTdjZG...,71.0,7.6,18100.0,"$40,000,000",,7.6/10,85%,71/100
4,tt12583926,Anniversary,2025,R,29 Oct 2025,,Thriller,Jan Komasa,"Lori Rosene-Gambino, Jan Komasa","Diane Lane, Kyle Chandler, Zoey Deutch",...,,,,,,,,,62%,
5,tt14661372,Anniversary,2021,,26 Aug 2021,7 min,"Short, Horror",Craig Ouellette,Craig Ouellette,"David Crane, David T. Crane, Katie Peabody",...,1 win,https://m.media-amazon.com/images/M/MV5BZjQ2Yj...,,,,,,,,


## 4. Merge Final Dataset with OMDB Data

**Important:** We use a LEFT JOIN on `imdb_id` to preserve ALL movies from final_df, even if they don't have OMDB data; for those movies every `omdb_*` column is left empty (NaN).


In [6]:
# pandas matches null merge keys to each other, so a single OMDB row with a
# missing imdb_id would be attached to every final_df row that lacks an
# imdb_id. Drop such rows before merging.
omdb_for_merge = omdb_cleaned.dropna(subset=["imdb_id"])

# Left join to preserve all movies from final_df.
# validate="many_to_one" raises MergeError if imdb_id is not unique on the
# OMDB side, which would otherwise silently duplicate final_df rows.
final_merged = final_df.merge(
    omdb_for_merge,
    on="imdb_id",
    how="left",  # Keep all movies from final_df
    suffixes=("", "_omdb"),
    validate="many_to_one",
)

print(f"Merge complete: {len(final_merged)} rows, {len(final_merged.columns)} columns")
if 'omdb_title' in final_merged.columns:
    # omdb_title is used as the marker for "this row found an OMDB match".
    with_omdb = final_merged['omdb_title'].notna().sum()
    without_omdb = final_merged['omdb_title'].isna().sum()
    print(f"Movies with OMDB data: {with_omdb}")
    print(f"Movies without OMDB data: {without_omdb}")
    print(f"Percentage with OMDB data: {(with_omdb / len(final_merged) * 100):.1f}%")
else:
    print("Warning: OMDB data columns not found in merged dataset")


Merge complete: 10067 rows, 61 columns
Movies with OMDB data: 6233
Movies without OMDB data: 3834
Percentage with OMDB data: 61.9%


## 5. Clean Up Duplicate/Redundant Columns


In [7]:
initial_cols = len(final_merged.columns)
columns_to_drop = []

# Duplicate date columns carrying pandas' default merge suffixes (_x / _y):
# keep date_x (from final_df) under the name "date" and drop date_y.
if "date_x" in final_merged.columns and "date_y" in final_merged.columns:
    columns_to_drop.append("date_y")
    final_merged = final_merged.rename(columns={"date_x": "date"})

# The final_df/OMDB merge in this notebook uses suffixes=("", "_omdb"), so a
# name collision shows up as "<name>" (final_df) plus "<name>_omdb" (OMDB).
# Keep the final_df column and drop the OMDB copy.
merge_suffix = "_omdb"
existing_columns = set(final_merged.columns)
for col in final_merged.columns:
    if (
        isinstance(col, str)
        and col.endswith(merge_suffix)
        and col[: -len(merge_suffix)] in existing_columns
    ):
        columns_to_drop.append(col)

# Drop columns with all null values
null_cols = final_merged.columns[final_merged.isnull().all()].tolist()
columns_to_drop.extend(null_cols)

# A column can qualify under more than one rule; list it once so the
# reported count matches the number of columns actually removed.
columns_to_drop = list(dict.fromkeys(columns_to_drop))

if columns_to_drop:
    final_merged = final_merged.drop(columns=columns_to_drop)
    print(f"Dropped {len(columns_to_drop)} redundant/empty columns")

print(f"Final columns: {len(final_merged.columns)} (reduced from {initial_cols})")


Final columns: 61 (reduced from 61)


## 6. Dataset Summary and Missing Value Analysis


In [9]:
# Overall size of the merged frame.
total_rows = len(final_merged)
print("Final Dataset Summary:")
print(f"Total rows: {total_rows}")
print(f"Total columns: {len(final_merged.columns)}")

# Per-column null counts, restricted to columns that have at least one
# missing value, largest first; only the 15 worst columns are printed.
print("\nMissing values per column (top 15):")
missing_counts = (
    final_merged.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
for column_name, n_missing in missing_counts.iloc[:15].items():
    pct = (n_missing / total_rows) * 100
    print(f"  {column_name}: {n_missing} ({pct:.1f}%)")


Final Dataset Summary:
Total rows: 10067
Total columns: 61

Missing values per column (top 15):
  omdb_production: 10038 (99.7%)
  belongs_to_collection: 9216 (91.5%)
  omdb_metascore: 6183 (61.4%)
  omdb_rating_metacritic: 6182 (61.4%)
  omdb_boxoffice: 6044 (60.0%)
  omdb_rating_rottentomatoes: 5918 (58.8%)
  omdb_awards: 5815 (57.8%)
  omdb_rated: 5534 (55.0%)
  omdb_rating_internetmoviedatabase: 4698 (46.7%)
  omdb_imdbrating: 4698 (46.7%)
  omdb_imdbvotes: 4480 (44.5%)
  omdb_writer: 4170 (41.4%)
  omdb_runtime: 4051 (40.2%)
  omdb_poster: 4022 (40.0%)
  omdb_actors: 4022 (40.0%)


## 7. Preview Final Dataset


In [10]:
# Render the first five rows of the merged frame as a quick visual check.
final_merged.head(n=5)


Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,total_gross,...,omdb_awards,omdb_poster,omdb_metascore,omdb_imdbrating,omdb_imdbvotes,omdb_boxoffice,omdb_production,omdb_rating_internetmoviedatabase,omdb_rating_rottentomatoes,omdb_rating_metacritic
0,PARA,2016-06-02,10 Cloverfield Lane,Paramount Pi…,11414,0.32,-0.12,120.0,95.0,72082999,...,16 wins & 48 nominations total,https://m.media-amazon.com/images/M/MV5BMjEzMj...,76.0,7.2,377108.0,"$72,082,998",,7.2/10,91%,76/100
1,Private,2006-09-04,10th & Wolf,ThinkFilm,1791,0.0,0.0,6.0,299.0,49783,...,1 win,https://m.media-amazon.com/images/M/MV5BMjE1ND...,36.0,6.3,7033.0,"$54,702",,6.3/10,19%,36/100
2,6758,2009-05-25,12,Sony Picture…,344,0.0,0.0,5.0,69.0,119587,...,,,,,,,,,,
3,6758,2009-05-25,12,Sony Picture…,344,0.0,0.0,5.0,69.0,119587,...,,https://m.media-amazon.com/images/M/MV5BN2I5Yj...,,,,,,,,
4,DIS,2009-05-25,12 Rounds,20th Century…,4832,0.0,0.98,29.0,167.0,12187944,...,,https://m.media-amazon.com/images/M/MV5BZDI5NG...,38.0,5.6,30927.0,"$12,234,694",,5.6/10,31%,38/100


## 8. Save Final Merged Dataset


In [12]:
# Show a ten-row sample, the shape and the full column list before writing.
separator = "=" * 80
print("First few rows of final merged dataset:")
print(separator)
display(final_merged.head(10))
print(f"\n{separator}")
print(f"\nDataset shape: {final_merged.shape}")
print(f"Columns: {final_merged.columns.tolist()}")

# Write the merged frame next to the other cleaned data, without the index.
output_path = "/".join((CLEAN_DATA_PATH, "final_merged_dataset.csv"))
final_merged.to_csv(output_path, index=False)
print(f"\n✓ Saved final merged dataset to: {output_path}")
print(f"Shape: {final_merged.shape}")


First few rows of final merged dataset:


Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,total_gross,...,omdb_awards,omdb_poster,omdb_metascore,omdb_imdbrating,omdb_imdbvotes,omdb_boxoffice,omdb_production,omdb_rating_internetmoviedatabase,omdb_rating_rottentomatoes,omdb_rating_metacritic
0,PARA,2016-06-02,10 Cloverfield Lane,Paramount Pi…,11414,0.32,-0.12,120.0,95.0,72082999,...,16 wins & 48 nominations total,https://m.media-amazon.com/images/M/MV5BMjEzMj...,76.0,7.2,377108.0,"$72,082,998",,7.2/10,91%,76/100
1,Private,2006-09-04,10th & Wolf,ThinkFilm,1791,0.0,0.0,6.0,299.0,49783,...,1 win,https://m.media-amazon.com/images/M/MV5BMjE1ND...,36.0,6.3,7033.0,"$54,702",,6.3/10,19%,36/100
2,6758,2009-05-25,12,Sony Picture…,344,0.0,0.0,5.0,69.0,119587,...,,,,,,,,,,
3,6758,2009-05-25,12,Sony Picture…,344,0.0,0.0,5.0,69.0,119587,...,,https://m.media-amazon.com/images/M/MV5BN2I5Yj...,,,,,,,,
4,DIS,2009-05-25,12 Rounds,20th Century…,4832,0.0,0.98,29.0,167.0,12187944,...,,https://m.media-amazon.com/images/M/MV5BZDI5NG...,38.0,5.6,30927.0,"$12,234,694",,5.6/10,31%,38/100
5,WBD,2018-03-29,12 Strong,Warner Bros.,4502,0.08,-0.45,95.0,47.0,45500164,...,3 nominations total,https://m.media-amazon.com/images/M/MV5BNTEzMj...,54.0,6.5,97951.0,"$45,819,713",,6.5/10,50%,54/100
6,SONY,2004-06-03,13 Going On 30,Sony Pictures,115000,0.01,-0.59,1164.0,99.0,54901000,...,11 nominations total,https://m.media-amazon.com/images/M/MV5BMjE1Nz...,57.0,6.3,239662.0,"$57,231,747",,6.3/10,65%,57/100
7,AMZN,2007-09-03,1408,MGM,38250,0.0,0.0,218.0,175.0,71519946,...,4 wins & 12 nominations total,https://m.media-amazon.com/images/M/MV5BMjQ2OD...,64.0,6.8,309249.0,"$71,985,628",,6.8/10,79%,64/100
8,WBD,2001-04-05,15 Minutes,New Line,89000,-0.04,-0.56,936.0,95.0,23917000,...,1 nomination total,https://m.media-amazon.com/images/M/MV5BOTg5MD...,34.0,6.1,53088.0,"$24,403,552",,6.1/10,32%,34/100
9,WBD,2001-04-05,15 Minutes,New Line,89000,-0.04,-0.56,936.0,95.0,23917000,...,,,,,,,,,,




Dataset shape: (10067, 61)
Columns: ['ticker', 'date', 'title', 'distributor', 'gross', 'percent_yd', 'percent_lw', 'theaters', 'per_theater', 'total_gross', 'days_in_release', 'parent company', 'release_date', 'year', 'title_key', 'tmdb_id', 'popularity', 'weekday', 'release_month', 'release_weekday', 'is_weekend', 'imdb_id', 'original_language', 'status', 'budget', 'revenue', 'adult', 'overview', 'runtime', 'vote_average', 'vote_count', 'origin_country', 'spoken_languages', 'genre_ids', 'genre_names', 'production_company_ids', 'production_company_names', 'belongs_to_collection', 'gross_per_theater', 'omdb_title', 'omdb_year', 'omdb_rated', 'omdb_released', 'omdb_runtime', 'omdb_genre', 'omdb_director', 'omdb_writer', 'omdb_actors', 'omdb_plot', 'omdb_language', 'omdb_country', 'omdb_awards', 'omdb_poster', 'omdb_metascore', 'omdb_imdbrating', 'omdb_imdbvotes', 'omdb_boxoffice', 'omdb_production', 'omdb_rating_internetmoviedatabase', 'omdb_rating_rottentomatoes', 'omdb_rating_metacr