In [11]:
import pandas as pd
import numpy as np
import os
import json
import requests
import time
import random
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

# Load environment variables from .env file
try:
    from dotenv import load_dotenv
    load_dotenv()
    print("✓ Loaded environment variables from .env file")
except ImportError:
    print("⚠ python-dotenv not installed. Installing...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "python-dotenv"])
    from dotenv import load_dotenv
    load_dotenv()
    print("✓ Installed python-dotenv and loaded environment variables")

✓ Loaded environment variables from .env file


In [12]:
from utils import get_genres

In [13]:
# Load the dataset
df = pd.read_csv("../data/cleaned/final_merged_dataset.csv")
print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows of genre columns:")
print(df[['title', 'genre_ids', 'genre_names']].head())

Dataset shape: (3971, 72)

First few rows of genre columns:
                 title             genre_ids  \
0  10 Cloverfield Lane       53, 878, 18, 27   
1          10th & Wolf  28, 80, 18, 9648, 53   
2            12 Rounds            28, 53, 80   
3            12 Strong     10752, 18, 28, 36   
4       13 Going On 30         35, 14, 10749   

                                genre_names  
0  Thriller, Science Fiction, Drama, Horror  
1   Action, Crime, Drama, Mystery, Thriller  
2                   Action, Thriller, Crime  
3               War, Drama, Action, History  
4                  Comedy, Fantasy, Romance  


In [14]:
tmdb_key = os.getenv("TMDB_API_KEY")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_key}"
}

In [15]:
genres = get_genres(headers)

In [16]:
genres

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 80, 'name': 'Crime'},
 {'id': 99, 'name': 'Documentary'},
 {'id': 18, 'name': 'Drama'},
 {'id': 10751, 'name': 'Family'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 36, 'name': 'History'},
 {'id': 27, 'name': 'Horror'},
 {'id': 10402, 'name': 'Music'},
 {'id': 9648, 'name': 'Mystery'},
 {'id': 10749, 'name': 'Romance'},
 {'id': 878, 'name': 'Science Fiction'},
 {'id': 10770, 'name': 'TV Movie'},
 {'id': 53, 'name': 'Thriller'},
 {'id': 10752, 'name': 'War'},
 {'id': 37, 'name': 'Western'}]

In [17]:
from sklearn.preprocessing import MultiLabelBinarizer                                                                                              
                                                                                                                                                
df['genres_list'] = df['genre_names'].str.split(',').apply(lambda x: [g.strip().replace(" ","_").lower() for g in x] if isinstance(x, list) else [])                       
                                                                                                                                                    # Create binary columns                                                                                                                            │
mlb = MultiLabelBinarizer()                                                                                                                        
genre_matrix = mlb.fit_transform(df['genres_list'])                                                                                                
genre_cols = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=df.index)                                                                      
                                                                                                                                                    
df_encoded = pd.concat([df, genre_cols], axis=1)                                                                                                   
                                                                                                                                                    
print(f"Created {len(mlb.classes_)} genre columns: {list(mlb.classes_)}")                                                                          
print(f"\nFinal shape: {df_encoded.shape}")        

Created 19 genre columns: ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'history', 'horror', 'music', 'mystery', 'romance', 'science_fiction', 'thriller', 'tv_movie', 'war', 'western']

Final shape: (3971, 92)


In [18]:
df_encoded

Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,total_gross,...,history,horror,music,mystery,romance,science_fiction,thriller,tv_movie,war,western
0,PARA,2016-06-02,10 Cloverfield Lane,Paramount Pi…,11414,0.32,-0.12,120.0,95.0,72082999,...,0,1,0,0,0,1,1,0,0,0
1,Private,2006-09-04,10th & Wolf,ThinkFilm,1791,0.00,0.00,6.0,299.0,49783,...,0,0,0,1,0,0,1,0,0,0
2,DIS,2009-05-25,12 Rounds,20th Century…,4832,0.00,0.98,29.0,167.0,12187944,...,0,0,0,0,0,0,1,0,0,0
3,WBD,2018-03-29,12 Strong,Warner Bros.,4502,0.08,-0.45,95.0,47.0,45500164,...,1,0,0,0,0,0,0,0,1,0
4,SONY,2004-06-03,13 Going On 30,Sony Pictures,115000,0.01,-0.59,1164.0,99.0,54901000,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3966,Private,2007-05-28,Zoo,ThinkFilm,1741,0.00,0.00,5.0,348.0,57644,...,0,0,0,0,0,0,0,0,0,0
3967,SONY,2011-10-23,Zookeeper,Sony Pictures,13006,-0.32,-0.57,94.0,138.0,80360866,...,0,0,0,0,1,0,0,0,0,0
3968,PARA,2001-10-20,Zoolander,Paramount Pi…,1430000,0.49,-0.38,2285.0,626.0,39272000,...,0,0,0,0,0,0,0,0,0,0
3969,PARA,2016-03-31,Zoolander 2,Paramount Pi…,2649,-0.21,-0.74,60.0,44.0,28848693,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Save the encoded dataset with genre columns
import csv

output_path = "../data/cleaned/final_merged_dataset_with_genres.csv"
df_encoded.to_csv(output_path, 
                  index=False, 
                  quoting=csv.QUOTE_MINIMAL,
                  doublequote=True,
                  lineterminator='\n')

print(f"✓ Saved dataset with genre encoding to: {output_path}")
print(f"  Shape: {df_encoded.shape}")
print(f"  Original columns: {len(df.columns)}")
print(f"  Genre columns added: {len(mlb.classes_)}")
print(f"  Total columns: {len(df_encoded.columns)}")


✓ Saved dataset with genre encoding to: ../data/cleaned/final_merged_dataset_with_genres.csv
  Shape: (3971, 92)
  Original columns: 73
  Genre columns added: 19
  Total columns: 92
