## Data Import and Preprocessing
### Movie Data

In [1]:
import pandas as pd

# Load movie data
movies = pd.read_csv("data/omdb_enriched_data.csv")
movies.head(1)

Unnamed: 0,title,release_date,original_language,genres,budget,revenue,runtime,year,Rated,Poster,Ratings,Metascore,imdbRating,imdbVotes,imdbID
0,Meg 2: The Trench,2023-08-02,en,Action-Science Fiction-Horror,129000000.0,352056482.0,116.0,2023.0,PG-13,posters/Meg_2:_The_Trench_2023_photo.jpg,"[{'Source': 'Internet Movie Database', 'Value'...",,5.0,86089,tt9224104


In [2]:
# Drop unused columns
movies.drop(columns=['imdbID', 'imdbVotes', 'Metascore', 'year', 'original_language'], inplace=True)

# Check updated DataFrame
movies.head(1)

Unnamed: 0,title,release_date,genres,budget,revenue,runtime,Rated,Poster,Ratings,imdbRating
0,Meg 2: The Trench,2023-08-02,Action-Science Fiction-Horror,129000000.0,352056482.0,116.0,PG-13,posters/Meg_2:_The_Trench_2023_photo.jpg,"[{'Source': 'Internet Movie Database', 'Value'...",5.0


In [3]:
# Convert release_date to datetime and extract Month
movies['release_date'] = pd.to_datetime(movies['release_date'])
movies['Month'] = movies['release_date'].dt.strftime('%m-%Y')

# Drop release_date column
movies = movies.drop(columns=['release_date'])


In [4]:
# Map 'Rated' to numerical values
rating_map = {
    'G': 0,
    'PG': 1,
    'PG-13': 2,
    'R': 3,
    'NC-17': 4
}
movies['Age Rating'] = movies['Rated'].map(rating_map)

# Handle missing values by assigning -1
movies['Age Rating'] = movies['Age Rating'].fillna(-1)

# Drop the original 'Rated' column
movies = movies.drop(columns=['Rated'])

In [5]:
import numpy as np

# Turn string representations of Ratings dictionary into actual python object
movies['Ratings'] = movies['Ratings'].apply(lambda x: eval(x) if isinstance(x, str) else x)

# Function to extract Rotten Tomatoes ratings
def extract_rt_rating(ratings_list):
    if isinstance(ratings_list, list):
        for rating in ratings_list:
            if rating['Source'] == 'Rotten Tomatoes':
                return float(rating['Value'].replace('%', '')) / 10  # Convert percentage to 0-10 scale
    return np.nan

# Apply the function to extract RT_Rating
movies['RT_Rating'] = movies['Ratings'].apply(extract_rt_rating)

# Drop the Ratings column if no longer needed
movies.drop(columns=['Ratings'], inplace=True)


In [6]:
# Fill missing IMDb and RT ratings with their respective medians
movies['imdbRating'] = movies['imdbRating'].fillna(movies['imdbRating'].median())
movies['RT_Rating'] = movies['RT_Rating'].fillna(movies['RT_Rating'].median())

# Check result
movies.head(1)

Unnamed: 0,title,genres,budget,revenue,runtime,Poster,imdbRating,Month,Age Rating,RT_Rating
0,Meg 2: The Trench,Action-Science Fiction-Horror,129000000.0,352056482.0,116.0,posters/Meg_2:_The_Trench_2023_photo.jpg,5.0,08-2023,2.0,2.7


In [7]:
# Fill any missing genre values
movies['genres'] = movies['genres'].fillna('') 

# Split genres and get all unique genres
genres_split = movies['genres'].str.split('-', expand=True)
all_genres = set(genres_split.stack())  # Get unique genres

# Create one-hot encoded columns for each genre
for genre in all_genres:
    movies[genre] = movies['genres'].apply(lambda x: 1 if genre in x.split('-') else 0)

# Drop the original genres column
movies = movies.drop(columns=['genres'])


In [8]:
movies.head(1)

Unnamed: 0,title,budget,revenue,runtime,Poster,imdbRating,Month,Age Rating,RT_Rating,Unnamed: 10,...,Science Fiction,Music,Animation,Documentary,Adventure,Comedy,Romance,Action,War,TV Movie
0,Meg 2: The Trench,129000000.0,352056482.0,116.0,posters/Meg_2:_The_Trench_2023_photo.jpg,5.0,08-2023,2.0,2.7,0,...,1,0,0,0,0,0,0,1,0,0


In [9]:
from sklearn.preprocessing import StandardScaler

# Fill missing values with the median
movies['budget'] = movies['budget'].fillna(movies['budget'].median())
movies['revenue'] = movies['revenue'].fillna(movies['revenue'].median())

# Standardize budget and revenue to better suit NN model
scaler = StandardScaler()
movies[['budget', 'revenue']] = scaler.fit_transform(movies[['budget', 'revenue']])


In [10]:
pd.set_option('display.max_columns', None)

In [11]:
movies.head(1)

Unnamed: 0,title,budget,revenue,runtime,Poster,imdbRating,Month,Age Rating,RT_Rating,Unnamed: 10,Thriller,History,Fantasy,Horror,Mystery,Crime,Drama,Family,Western,Science Fiction,Music,Animation,Documentary,Adventure,Comedy,Romance,Action,War,TV Movie
0,Meg 2: The Trench,2.634428,1.732435,116.0,posters/Meg_2:_The_Trench_2023_photo.jpg,5.0,08-2023,2.0,2.7,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0


In [12]:
# Fill missing runtime with the mean value
movies['runtime'] = movies['runtime'].fillna(movies['runtime'].mean())
movies.head()

Unnamed: 0,title,budget,revenue,runtime,Poster,imdbRating,Month,Age Rating,RT_Rating,Unnamed: 10,Thriller,History,Fantasy,Horror,Mystery,Crime,Drama,Family,Western,Science Fiction,Music,Animation,Documentary,Adventure,Comedy,Romance,Action,War,TV Movie
0,Meg 2: The Trench,2.634428,1.732435,116.0,posters/Meg_2:_The_Trench_2023_photo.jpg,5.0,08-2023,2.0,2.7,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,The Pope's Exorcist,-0.154311,-0.034583,103.0,posters/The_Pope's_Exorcist_2023_photo.jpg,6.1,04-2023,3.0,5.0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Deadpool & Wolverine,4.418216,7.744225,128.0,posters/Deadpool_&_Wolverine_2024_photo.jpg,7.9,07-2024,3.0,7.8,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0
3,Transformers: Rise of the Beasts,4.418216,2.071727,127.0,posters/Transformers:_Rise_of_the_Beasts_2023_...,6.0,06-2023,2.0,5.1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0
4,Dune: Part Two,4.166979,3.779435,167.0,posters/Dune:_Part_Two_2024_photo.jpg,8.5,02-2024,2.0,9.2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0


In [13]:
# Poster is of greater importance to the project so drop rows where Poster is missing
movies = movies.dropna(subset=['Poster'])

In [14]:
movies.head()


Unnamed: 0,title,budget,revenue,runtime,Poster,imdbRating,Month,Age Rating,RT_Rating,Unnamed: 10,Thriller,History,Fantasy,Horror,Mystery,Crime,Drama,Family,Western,Science Fiction,Music,Animation,Documentary,Adventure,Comedy,Romance,Action,War,TV Movie
0,Meg 2: The Trench,2.634428,1.732435,116.0,posters/Meg_2:_The_Trench_2023_photo.jpg,5.0,08-2023,2.0,2.7,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,The Pope's Exorcist,-0.154311,-0.034583,103.0,posters/The_Pope's_Exorcist_2023_photo.jpg,6.1,04-2023,3.0,5.0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Deadpool & Wolverine,4.418216,7.744225,128.0,posters/Deadpool_&_Wolverine_2024_photo.jpg,7.9,07-2024,3.0,7.8,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0
3,Transformers: Rise of the Beasts,4.418216,2.071727,127.0,posters/Transformers:_Rise_of_the_Beasts_2023_...,6.0,06-2023,2.0,5.1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0
4,Dune: Part Two,4.166979,3.779435,167.0,posters/Dune:_Part_Two_2024_photo.jpg,8.5,02-2024,2.0,9.2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0


### Crime Data

In [15]:
# Load crime data
Reports = pd.read_csv("data/States_Reports.csv")

# Inspect the structure of the Reports data
# Reports.info()
Reports.head(2)

Unnamed: 0,Month,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Utah.1,Washington,West Virginia,Wisconsin,Wyoming
0,10-2014,8196.0,1094.0,7581.0,4719.0,34958.0,4151.0,3227.0,1652.0,2270.0,0.0,11895.0,451.0,1315.0,4482.0,5829.0,2472.0,2652.0,3466.0,6862.0,942.0,7197.0,6980.0,12901.0,3685.0,2233.0,7533.0,978.0,1770.0,4602.0,1495.0,6792.0,2290.0,13821.0,10770.0,897.0,12860.0,4483.0,2739.0,10661.0,871.0,8830.0,867.0,12574.0,35436.0,2655.0,260.0,2655.0,6685.0,1376.0,3627.0,533.0
1,11-2014,7729.0,1123.0,6942.0,4060.0,31782.0,3842.0,2838.0,1455.0,1857.0,0.0,10471.0,1229.0,1153.0,3817.0,5186.0,2278.0,2534.0,3227.0,6138.0,860.0,5201.0,6267.0,11360.0,3182.0,2038.0,6476.0,868.0,1554.0,4270.0,1192.0,6114.0,2035.0,12492.0,9348.0,820.0,11094.0,3897.0,2417.0,9366.0,828.0,7337.0,792.0,10996.0,31637.0,2475.0,215.0,2475.0,6279.0,1256.0,3266.0,560.0


In [16]:
# Drop Florida (inconsistent data) and Utah.1 (duplicated column)
Reports = Reports.drop(columns=['Florida', 'Utah.1'])
# Drop rows with missing values
Reports = Reports.dropna()


### Merge Movie Data with Crime Data

In [17]:
# Merge movies and Reports on 'Month'
merged_data = pd.merge(movies, Reports, on='Month', how='inner')

In [18]:
merged_data.head(2)

Unnamed: 0,title,budget,revenue,runtime,Poster,imdbRating,Month,Age Rating,RT_Rating,Unnamed: 10,Thriller,History,Fantasy,Horror,Mystery,Crime,Drama,Family,Western,Science Fiction,Music,Animation,Documentary,Adventure,Comedy,Romance,Action,War,TV Movie,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Washington,West Virginia,Wisconsin,Wyoming
0,Meg 2: The Trench,2.634428,1.732435,116.0,posters/Meg_2:_The_Trench_2023_photo.jpg,5.0,08-2023,2.0,2.7,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,6795.0,1310.0,8724.0,5462.0,43727.0,6387.0,2604.0,1263.0,2390.0,11413.0,923.0,1510.0,18144.0,7360.0,2960.0,4197.0,4337.0,6562.0,1218.0,7045.0,6362.0,14452.0,4235.0,1747.0,7830.0,1279.0,2152.0,5574.0,1074.0,6921.0,3746.0,29029.0,11740.0,1092.0,13831.0,6157.0,3855.0,9791.0,857.0,8517.0,1243.0,12738.0,40485.0,3389.0,377.0,7866.0,1444.0,4250.0,613.0
1,Retribution,-0.104064,-0.360185,93.0,posters/Retribution_2023_photo.jpg,5.3,08-2023,3.0,3.0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,6795.0,1310.0,8724.0,5462.0,43727.0,6387.0,2604.0,1263.0,2390.0,11413.0,923.0,1510.0,18144.0,7360.0,2960.0,4197.0,4337.0,6562.0,1218.0,7045.0,6362.0,14452.0,4235.0,1747.0,7830.0,1279.0,2152.0,5574.0,1074.0,6921.0,3746.0,29029.0,11740.0,1092.0,13831.0,6157.0,3855.0,9791.0,857.0,8517.0,1243.0,12738.0,40485.0,3389.0,377.0,7866.0,1444.0,4250.0,613.0


In [19]:
# merged_data.info()
merged_data.shape

(1730, 78)

### Process Poster Images

In [20]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np

# Define function to preprocess posters into ResNet compatible form
def preprocess_image(poster_path, target_size=(224, 224)):
    try:
        # Load the image and resize to 224x224
        img = load_img(poster_path, target_size=target_size)
        # Convert to NumPy array
        img_array = img_to_array(img)
        # Normalize pixel values to [0, 1]
        img_array /= 255.0
        # Add batch dimension for ResNet
        return np.expand_dims(img_array, axis=0)
    except Exception as e:
        print(f"Error loading image at {poster_path}: {e}")
        return None


In [21]:
# movies['Poster'] = movies['Poster'].str.replace(':_', '_', regex=False)

In [22]:
# Test preprocessing on a single poster
sample_poster_path = "posters/Retribution_2023_photo.jpg"  # A given image path
preprocessed_image = preprocess_image(sample_poster_path)

# Check the shape of the preprocessed image
if preprocessed_image is not None:
    print("Preprocessed image shape:", preprocessed_image.shape)

Preprocessed image shape: (1, 224, 224, 3)


In [23]:
import os

# Windows won't allow colons in filenames
if os.name == 'nt':
    merged_data['Poster'] = merged_data['Poster'].str.replace(":", "", regex=False)

# Update function to use the paths directly from the DataFrame
poster_images = []

poster_folder_path = '' #"path/to/posters/folder"

for path in merged_data['Poster']:
    full_path = os.path.join(poster_folder_path, path)
    processed_image = preprocess_image(full_path)
    poster_images.append(processed_image)


Error loading image at posters/Are_You_There_God?_It's_Me,_Margaret._2023_photo.jpg: [Errno 22] Invalid argument: "posters/Are_You_There_God?_It's_Me,_Margaret._2023_photo.jpg"
Error loading image at posters/Do_You_Want_To_Win?_2017_photo.jpg: [Errno 22] Invalid argument: 'posters/Do_You_Want_To_Win?_2017_photo.jpg'
Error loading image at posters/Do_You_Want_To_Win?_2017_photo.jpg: [Errno 22] Invalid argument: 'posters/Do_You_Want_To_Win?_2017_photo.jpg'
Error loading image at posters/Do_You_Want_To_Win?_2017_photo.jpg: [Errno 22] Invalid argument: 'posters/Do_You_Want_To_Win?_2017_photo.jpg'
Error loading image at posters/TÁR_2022_photo.jpg: [Errno 2] No such file or directory: 'posters/TÁR_2022_photo.jpg'
Error loading image at posters/Dalíland_2022_photo.jpg: [Errno 2] No such file or directory: 'posters/Dalíland_2022_photo.jpg'
Error loading image at posters/Pokémon_Detective_Pikachu_2019_photo.jpg: [Errno 2] No such file or directory: 'posters/Pokémon_Detective_Pikachu_2019_photo.

In [24]:
poster_images[0:5]

[array([[[[0.44705883, 0.61960787, 0.7647059 ],
          [0.49019608, 0.654902  , 0.8039216 ],
          [0.5137255 , 0.68235296, 0.81960785],
          ...,
          [0.69411767, 0.8117647 , 0.85490197],
          [0.69411767, 0.8117647 , 0.85490197],
          [0.69803923, 0.8156863 , 0.85490197]],
 
         [[0.4509804 , 0.62352943, 0.7607843 ],
          [0.4392157 , 0.60784316, 0.7411765 ],
          [0.4627451 , 0.6313726 , 0.7647059 ],
          ...,
          [0.69411767, 0.8117647 , 0.85490197],
          [0.69411767, 0.8117647 , 0.85490197],
          [0.69803923, 0.8156863 , 0.85490197]],
 
         [[0.49411765, 0.67058825, 0.7921569 ],
          [0.45882353, 0.627451  , 0.7529412 ],
          [0.46666667, 0.6392157 , 0.7529412 ],
          ...,
          [0.69411767, 0.8117647 , 0.85490197],
          [0.69411767, 0.8117647 , 0.85490197],
          [0.7019608 , 0.8117647 , 0.85490197]],
 
         ...,
 
         [[0.01568628, 0.01568628, 0.16470589],
          [0.     

In [25]:
from tensorflow.keras.applications import ResNet50

# Load pre-trained ResNet50 model
# include_top=False to remove final classification layer, returning final features instead
resnet = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Extract features for all valid poster images
poster_features = []
for img in poster_images:
    if img is not None:
        features = resnet.predict(img).flatten()  # Extract features and flatten the array
        poster_features.append(features)
    else:
        poster_features.append(None)  # Keep alignment with the dataset


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65

In [26]:
len(poster_features)

1730

In [27]:
merged_data.shape

(1730, 78)

In [28]:
# Add Poster_Features to the DataFrame
merged_data['Poster_Features'] = poster_features

# Drop rows with missing Poster_Features
merged_data = merged_data[merged_data['Poster_Features'].notna()]

# Drop NA values from poster_features 
poster_features = [poster for poster in poster_features if poster is not None]

# Drop the original Poster column (optional if not done already)
merged_data = merged_data.drop(columns=['Poster'])



In [29]:
# Verify and save the cleaned dataset
# merged_data.info()
merged_data.shape

(1714, 78)

In [30]:
merged_data.columns

Index(['title', 'budget', 'revenue', 'runtime', 'imdbRating', 'Month',
       'Age Rating', 'RT_Rating', '', 'Thriller', 'History', 'Fantasy',
       'Horror', 'Mystery', 'Crime', 'Drama', 'Family', 'Western',
       'Science Fiction', 'Music', 'Animation', 'Documentary', 'Adventure',
       'Comedy', 'Romance', 'Action', 'War', 'TV Movie', 'Alabama', 'Alaska',
       'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
       'Delaware', 'District of Columbia', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Washington', 'West Virgin

## Build Models

### Prepare Data for Initial Total Crime Model

In [31]:
state_columns = [
    'Alabama', 'Alaska','Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
    'Delaware', 'District of Columbia', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Washington', 'West Virginia',
    'Wisconsin', 'Wyoming'
]

In [32]:
# For initial model sum the crime reports accross all states
merged_data['Total_Crime'] = merged_data[state_columns].sum(axis=1)

In [33]:
# Select month Total_Crime values as y
y = merged_data['Total_Crime'].values

In [34]:
merged_data.columns

Index(['title', 'budget', 'revenue', 'runtime', 'imdbRating', 'Month',
       'Age Rating', 'RT_Rating', '', 'Thriller', 'History', 'Fantasy',
       'Horror', 'Mystery', 'Crime', 'Drama', 'Family', 'Western',
       'Science Fiction', 'Music', 'Animation', 'Documentary', 'Adventure',
       'Comedy', 'Romance', 'Action', 'War', 'TV Movie', 'Alabama', 'Alaska',
       'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
       'Delaware', 'District of Columbia', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Washington', 'West Virgin

In [35]:
# Define columns to exclude
non_feature_columns = ['title', 'Month', 'Poster_Features', 'Total_Crime'] + state_columns

# Drop these columns to get numerical/categorical features
num_features = merged_data.drop(columns=non_feature_columns)

In [36]:
num_features.head(2)

Unnamed: 0,budget,revenue,runtime,imdbRating,Age Rating,RT_Rating,Unnamed: 7,Thriller,History,Fantasy,Horror,Mystery,Crime,Drama,Family,Western,Science Fiction,Music,Animation,Documentary,Adventure,Comedy,Romance,Action,War,TV Movie
0,2.634428,1.732435,116.0,5.0,2.0,2.7,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,-0.104064,-0.360185,93.0,5.3,3.0,3.0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [37]:
num_features = num_features.values

In [38]:
num_features.shape

(1714, 26)

In [39]:
from sklearn.model_selection import train_test_split

# Train-test split for single-output regression
X_num_train, X_num_test, X_poster_train, X_poster_test, y_train, y_test = train_test_split(
    num_features, poster_features, y, test_size=0.2, random_state=42
)

In [40]:
# Ensure poster features are structured as NumPy arrays
X_poster_train = np.vstack(X_poster_train)
X_poster_test = np.vstack(X_poster_test)

In [41]:
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Input layers
num_input = Input(shape=(X_num_train.shape[1],), name="Numerical_Input")
poster_input = Input(shape=(X_poster_train.shape[1],), name="Poster_Input")

# Dense layers for numerical features
x_num = Dense(64, activation='relu')(num_input)
x_num = Dense(32, activation='relu')(x_num)

# Merge poster and numerical features
merged = Concatenate()([poster_input, x_num])

# Fully connected layers
x = Dense(128, activation='relu')(merged)
x = Dense(64, activation='relu')(x)
output = Dense(1, activation='linear', name="Output")(x)

# Define and compile the model
model = Model(inputs=[poster_input, num_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='mae', metrics=['mse'])

# Print model summary
print(model.summary())

# Train the model
history = model.fit(
    [X_poster_train, X_num_train], y_train,
    validation_data=([X_poster_test, X_num_test], y_test),
    epochs=20,
    batch_size=32
)


None
Epoch 1/20




[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 300751.0000 - mse: 91089584128.0000 - val_loss: 301671.5000 - val_mse: 91656978432.0000
Epoch 2/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 301683.1250 - mse: 91668094976.0000 - val_loss: 301652.9062 - val_mse: 91645747200.0000
Epoch 3/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 300871.9062 - mse: 91163320320.0000 - val_loss: 301616.7188 - val_mse: 91623915520.0000
Epoch 4/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 300742.1250 - mse: 91086626816.0000 - val_loss: 301545.5312 - val_mse: 91580948480.0000
Epoch 5/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 300900.1250 - mse: 91181424640.0000 - val_loss: 301415.5938 - val_mse: 91502534656.0000
Epoch 6/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 299235.0625 - mse: 

In [42]:
# Evaluate on the test set
results = model.evaluate([X_poster_test, X_num_test], y_test, verbose=1)
print(f"Test Loss (MAE): {results[0]}")
print(f"Test MSE: {results[1]}")

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 204911.9062 - mse: 42848358400.0000 
Test Loss (MAE): 203421.828125
Test MSE: 42259038208.0


In [43]:
merged_data.Total_Crime.min()

203017.0

In [44]:
merged_data.Total_Crime.max()

373433.0

#### MAE for a model that simply predicts the mean each time

In [45]:
# Compute the baseline predictions
baseline_prediction = np.mean(y_train)  # You can also try np.median(y_train)

# Calculate baseline MAE
baseline_mae = np.mean(np.abs(y_test - baseline_prediction))

print(f"Baseline MAE (mean): {baseline_mae}")

Baseline MAE (mean): 20395.01101109403


#### MAE for a model that predicts the median each time

In [46]:
# Compute the baseline predictions
baseline_prediction2 = np.median(y_train)  # You can also try np.median(y_train)

# Calculate baseline MAE
baseline_mae2 = np.mean(np.abs(y_test - baseline_prediction2))

print(f"Baseline MAE (mean): {baseline_mae2}")

Baseline MAE (mean): 20107.032069970846


In [47]:
# Export to CSV
merged_data.to_csv('merged_data.csv', index=False)