---
title: Modeling Box Office Prices with Stock Information
author: Rachna, Amina, Lizzie, Sophia

format: 
    html: 
        embed-resources: true
        code-fold: true
        toc: true
---

Ideas:

1. Model without stock data
2. Model with stock data
3. Artificial neural network (ANN)
4. Word embeddings

In [53]:
import pandas as pd 
import numpy as np 
import sklearn 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor



In [47]:
# Load the merged movie/stock table and keep only rows that have both an
# opening price and an IMDB rating.
DATA_PATH = "merged_data/merged.csv"
df = pd.read_csv(DATA_PATH, low_memory=False)
df = df.dropna(subset=['Open', 'IMDB Rating'])

# Spread the 'Stock' column into one count column per distinct stock value.
# Every other column is used as the row key, so each distinct combination of
# the remaining columns becomes one output row.
# NOTE(review): pivot_table groups on the index columns; rows with a NaN in any
# of them appear to be dropped by that grouping - confirm this is intended.
df = df.pivot_table(
    index=[col for col in df.columns if col != 'Stock'],  # Keep all columns except 'Stock'
    columns='Stock',
    aggfunc='size',  # Count occurrences of each stock
    fill_value=0  # Fill with 0 where there is no occurrence
).reset_index()

# Drop the free-text / identifier columns that are not used further on.
df = df.drop(columns = ["Date", "Title", "Director", "Writer", "Actors", "Awards", "Description", "Poster URL", "Unnamed: 0"])

# Strip currency symbols, thousands separators and the ' min' suffix so the
# columns can be cast to float.
df['Box Office'] = df['Box Office'].replace({r'[\$,]': ''}, regex=True).astype(float)
df['IMDB Votes'] = df['IMDB Votes'].replace({r'[,]': ''}, regex=True).astype(float)
df['Runtime'] = df['Runtime'].str.replace(' min', '', regex=False).astype(float)

# '^SPX' is renamed to a plain identifier-like column name.
df = df.rename(columns={'^SPX': 'SPX'})

# Integer-encode the categorical columns. A separate fitted encoder is kept
# per column in `encoders` so that every column's codes can be mapped back to
# labels later (encoders[col].inverse_transform / .classes_). Refitting one
# shared encoder would only retain the mapping of the last column.
# After the loop `encoder` is the encoder fitted on the last column, which
# matches what a single re-fitted encoder would hold.
# NOTE(review): these integer codes impose an arbitrary order on nominal
# categories; the linear models below will treat them as numeric - confirm
# this is acceptable or switch to one-hot encoding.
columns_to_encode = ['Genre', 'Rated', 'Country', 'Language']
encoders = {}
for column in columns_to_encode:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    encoders[column] = encoder

# Everything is numeric at this point; use a single dtype for modelling.
df = df.astype('float64')

df.head()


Stock,Open,High,Low,Close,Adj Close,Volume,Year,Genre,Runtime,Rated,...,T,TECH,UBER,VIAC,VOYA,WIX,XOM,YETI,ZION,SPX
0,17.05,17.05,17.05,17.05,17.05,1810000.0,1949.0,166.0,93.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,19.77,19.77,19.77,19.77,19.77,1800000.0,1950.0,156.0,138.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,24.11,24.11,24.11,24.11,24.11,1130000.0,1952.0,130.0,103.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,26.15,26.15,26.15,26.15,26.15,1900000.0,1953.0,76.0,77.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,43.85,43.85,43.85,43.85,43.85,2260000.0,1958.0,212.0,128.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Models

## Just Stock Data

In [59]:
# Baseline 1: predict box office using only the stock columns and the
# price/volume fields.
stock_only_features = [
    'SPX', 'XOM', 'DIS', 'PFE', 'HUBB', 'ZION', 'AAPL', 'T', 'BBY', 'FCNCA',
    'CMCSA', 'ORCL', 'QCOM', 'NFLX', 'GOOG', 'DISCA', 'LDOS', 'MASI', 'YETI',
    'UBER', 'FOUR', 'INFA',
    'Open', 'High', 'Low', 'Adj Close', 'Volume',
]
x = df[stock_only_features]
y = df['Box Office']

# 80/20 random row split with a fixed seed.
# NOTE(review): if one movie spans several rows (one per stock), a random row
# split can place the same movie in both train and test - confirm.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Standardise features with statistics learned from the training rows only.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Ordinary least squares.
model = LinearRegression().fit(x_train, y_train)
y_pred_linear = model.predict(x_test)

# Ridge regression with a very small penalty.
ridge = Ridge(alpha=0.001).fit(x_train, y_train)
y_pred_ridge = ridge.predict(x_test)

# Score both models on the held-out rows, then report.
mse_lin, r2_lin = mean_squared_error(y_test, y_pred_linear), r2_score(y_test, y_pred_linear)
mse_ridge, r2_ridge = mean_squared_error(y_test, y_pred_ridge), r2_score(y_test, y_pred_ridge)

print(f"Mean Squared Error - Linear Model: {mse_lin}")
print(f"R² Score - Linear Model: {r2_lin}")
print(f"Mean Squared Error - Ridge Regression: {mse_ridge}")
print(f"R² Score - Ridge Regression: {r2_ridge}")

Mean Squared Error - Linear Model: 5419361762198487.0
R² Score - Linear Model: 0.005411211474170119
Mean Squared Error - Ridge Regression: 5419364134755189.0
R² Score - Ridge Regression: 0.0054107760505169855


## Just Movie Data

In [62]:
# Baseline 2: predict box office using only the movie-level columns.
movie_only_features = [
    "Year", "Genre", "Runtime", "Rated", "IMDB Rating",
    "Metascore", "IMDB Votes", "Country", "Language",
]
x = df[movie_only_features]
y = df['Box Office']

# 80/20 random row split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Standardise features with statistics learned from the training rows only.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Ordinary least squares.
model = LinearRegression().fit(x_train, y_train)
y_pred_linear = model.predict(x_test)

# Ridge regression (alpha = 0.1).
ridge = Ridge(alpha=0.1).fit(x_train, y_train)
y_pred_ridge = ridge.predict(x_test)

# Score both models on the held-out rows, then report.
mse_lin, r2_lin = mean_squared_error(y_test, y_pred_linear), r2_score(y_test, y_pred_linear)
mse_ridge, r2_ridge = mean_squared_error(y_test, y_pred_ridge), r2_score(y_test, y_pred_ridge)

print(f"Mean Squared Error - Linear Model: {mse_lin}")
print(f"R² Score - Linear Model: {r2_lin}")
print(f"Mean Squared Error - Ridge Regression: {mse_ridge}")
print(f"R² Score - Ridge Regression: {r2_ridge}")

Mean Squared Error - Linear Model: 2838364868671196.0
R² Score - Linear Model: 0.4790888669183907
Mean Squared Error - Ridge Regression: 2838365305266892.0
R² Score - Ridge Regression: 0.4790887867921316


## Combination

In [65]:
# Combined model: movie-level columns together with the stock columns and
# price/volume fields.
combined_features = [
    # movie columns
    "Year", "Genre", "Runtime", "Rated", "IMDB Rating",
    "Metascore", "IMDB Votes", "Country", "Language",
    # stock columns
    'SPX', 'XOM', 'DIS', 'PFE', 'HUBB', 'ZION', 'AAPL', 'T', 'BBY', 'FCNCA',
    'CMCSA', 'ORCL', 'QCOM', 'NFLX', 'GOOG', 'DISCA', 'LDOS', 'MASI', 'YETI',
    'UBER', 'FOUR', 'INFA',
    # price / volume fields
    'Open', 'High', 'Low', 'Adj Close', 'Volume',
]
x = df[combined_features]
y = df['Box Office']

# 80/20 random row split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Standardise features with statistics learned from the training rows only.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Ordinary least squares.
model = LinearRegression().fit(x_train, y_train)
y_pred_linear = model.predict(x_test)

# Ridge regression (alpha = 1).
ridge = Ridge(alpha=1).fit(x_train, y_train)
y_pred_ridge = ridge.predict(x_test)

# Score both models on the held-out rows, then report.
mse_lin, r2_lin = mean_squared_error(y_test, y_pred_linear), r2_score(y_test, y_pred_linear)
mse_ridge, r2_ridge = mean_squared_error(y_test, y_pred_ridge), r2_score(y_test, y_pred_ridge)

print(f"Mean Squared Error - Linear Model: {mse_lin}")
print(f"R² Score - Linear Model: {r2_lin}")
print(f"Mean Squared Error - Ridge Regression: {mse_ridge}")
print(f"R² Score - Ridge Regression: {r2_ridge}")

Mean Squared Error - Linear Model: 2840474606008099.5
R² Score - Linear Model: 0.4787016772096957
Mean Squared Error - Ridge Regression: 2839030516308060.5
R² Score - Ridge Regression: 0.4789667038841101


## Neural Network (MLP)

In [67]:
# Neural-network model on the combined movie + stock feature set.
nn_features = [
    # movie columns
    "Year", "Genre", "Runtime", "Rated", "IMDB Rating",
    "Metascore", "IMDB Votes", "Country", "Language",
    # stock columns
    'SPX', 'XOM', 'DIS', 'PFE', 'HUBB', 'ZION', 'AAPL', 'T', 'BBY', 'FCNCA',
    'CMCSA', 'ORCL', 'QCOM', 'NFLX', 'GOOG', 'DISCA', 'LDOS', 'MASI', 'YETI',
    'UBER', 'FOUR', 'INFA',
    # price / volume fields
    'Open', 'High', 'Low', 'Adj Close', 'Volume',
]
x = df[nn_features]
y = df['Box Office']

# 80/20 random row split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Standardise features with statistics learned from the training rows only.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Multi-layer perceptron: hidden layers of 5 and 100 units, alpha = 1,
# at most 1000 iterations, seeded for repeatability.
# NOTE(review): the target is left in raw units here - confirm whether
# scaling y would help the optimiser.
model = MLPRegressor(
    hidden_layer_sizes=(5, 100),
    max_iter=1000,
    random_state=42,
    alpha=1,
)
model.fit(x_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(x_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Mean Squared Error: 2892003583800038.0
R² Score: 0.46924481755630576


