This is the Model module

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import numpy as np
import math

In [7]:
# Loads train and test data and computes log revenue for the adj merged revenue feature
train_df = pd.read_csv("./3.3 Construct Data/Final Datasets/imputed_budget_train.csv")
train_df['Log Revenue'] = train_df['Adj Merged Revenue'].apply(lambda x: None if pd.isnull(x) else (None if x == 0 else math.log(x)))
test_df = pd.read_csv("./3.3 Construct Data/Final Datasets/imputed_budget_test.csv")

In [19]:
X_vars = ['Runtime', 'Genre Cluster', 'G', 'NC-17', 'NR', 'PG', 'PG-13', 'R', 'Holiday', 'Adj Merged Budget', 'Has Star Score', 'Has Director Score', 'Has Production Company Score', 'Has Domestic Distributor Score', 'Unweighted Star Score_normalized', 'Simple Weight Star Score_normalized', 'Log Weight Star Score_normalized', 'Exponential Weight Star Score_normalized', 'Total Director Score_normalized', 'Avg Director Score_normalized', 'Total Production Company Score_normalized', 'Avg Production Company Score_normalized', 'Domestic Distributor Score_normalized', 'Season_ASO_4', 'Season_FMA_2', 'Season_MJJ_3', 'Season_NDJ_1']
# Take the log of revenue
y = ['Log Revenue']

In [20]:
train_X = train_df[X_vars]
train_y = train_df[y]
test_X = test_df[X_vars]
test_y = test_df['Adj Merged Revenue']

We decided to use the iterative importer with a random forrest regressor

In [21]:
imp = IterativeImputer(min_value=0, max_iter=1000, random_state=102, estimator=RandomForestRegressor())

Now to build the pipeline, we need the type of models we want to try.

In [22]:
# Step 3: Building the Pipeline
# Assuming train_X and train_y are your training features and target variable
# Convert train_y to a 1D array
train_y = np.ravel(train_y)

pipeline = Pipeline([
    ('imputer', imp),
    ('regressor', RandomForestRegressor())  # RandomForestRegressor without grid search
])

# Train the RandomForestRegressor
pipeline.fit(train_X, train_y)

# Get the trained model
trained_model = pipeline.named_steps['regressor']


In [23]:
# Assuming you have test_X as your test features
# Make predictions using the trained model
predictions = pipeline.predict(test_X)

# Now you can use these predictions for further analysis or evaluation
# For instance, if you have test_y (actual target values), you can evaluate the model performance
# For example, using metrics like mean squared error (MSE) or R-squared
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Assuming test_y is your actual target variable for the test data
# Evaluate the model
mae = mean_absolute_error(test_y, np.exp(predictions))
mse = mean_squared_error(test_y, np.exp(predictions))
r_squared = r2_score(test_y, np.exp(predictions))

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared:", r_squared)

Mean Squared Error (MSE): 827038722676370.9
Mean Absolute Error (MAE): 10261868.982437762
R-squared: 0.4316076830417983
