In [11]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import numpy as np

In [12]:
import warnings
warnings.filterwarnings("ignore")

In [13]:
# Load train and test data
train_df = pd.read_csv("./3.3 Construct Data/Final Datasets/imputed_budget_train.csv")
test_df = pd.read_csv("./3.3 Construct Data/Final Datasets/imputed_budget_test.csv")

# Define bin edges for Adjusted Merged Budget
bin_edges = np.percentile(train_df['Adj Merged Budget'], [0, 33, 67, 100])
bin_labels = ['1', '2', '3']  # Adjusted the labels to match the number of bins (one less than edges)

# Split train data into three bins based on Adjusted Merged Budget
train_df['Budget Bins'] = pd.cut(train_df['Adj Merged Budget'], bins=bin_edges, labels=bin_labels)
# Split test data into three bins based on Adjusted Merged Budget
test_df['Budget Bins'] = pd.cut(test_df['Adj Merged Budget'], bins=bin_edges, labels=bin_labels)

In [14]:
X_vars = ['Runtime', 'Genre Cluster', 'G', 'NC-17', 'NR', 'PG', 'PG-13', 'R', 'Holiday', 'Adj Merged Budget', 'Has Star Score', 'Has Director Score', 'Has Production Company Score', 'Has Domestic Distributor Score', 'Unweighted Star Score_normalized', 'Simple Weight Star Score_normalized', 'Log Weight Star Score_normalized', 'Exponential Weight Star Score_normalized', 'Total Director Score_normalized', 'Avg Director Score_normalized', 'Total Production Company Score_normalized', 'Avg Production Company Score_normalized', 'Domestic Distributor Score_normalized', 'Season_ASO_4', 'Season_FMA_2', 'Season_MJJ_3', 'Season_NDJ_1']
y = ["Adj Merged Revenue"]

In [15]:
imp = IterativeImputer(min_value=0, max_iter=1000, random_state=102, estimator=RandomForestRegressor())

In [17]:
for b in ['2', '3','4']:
    print(f"For bin {b}:")
    print()
    if b == '4':
        to_train = train_df
        to_test = test_df
    else:   
        to_train = train_df[train_df["Budget Bins"] == b]
        to_test = test_df[test_df["Budget Bins"] == b]

    train_X = to_train[X_vars]
    train_y = to_train[y]
    test_X = to_test[X_vars]
    test_y = to_test[y]
    for reg in [RandomForestRegressor(), SVR(), GradientBoostingRegressor(), RandomForestRegressor(n_estimators=100, max_depth=10), RandomForestRegressor(n_estimators=100, max_depth=20), RandomForestRegressor(n_estimators=100, max_depth=50)]:
        
        train_y = np.ravel(train_y)

        pipeline = Pipeline([
            ('imputer', imp),
            ('regressor', reg)  # RandomForestRegressor without grid search
        ])

        # Train the RandomForestRegressor
        pipeline.fit(train_X, train_y)
        r2cv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="r2")
        rmsecv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="neg_root_mean_squared_error")
        mapecv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="neg_mean_absolute_percentage_error")
        maecv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="neg_mean_absolute_error")

        # Get the trained model
        trained_model = pipeline.named_steps['regressor']

        # Assuming you have test_X as your test features
        # Make predictions using the trained model
        predictions = pipeline.predict(test_X)
        # Now you can use these predictions for further analysis or evaluation
        # For instance, if you have test_y (actual target values), you can evaluate the model performance
        # For example, using metrics like mean squared error (MSE) or R-squared
        from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

        # Assuming test_y is your actual target variable for the test data
        # Evaluate the model
        mse = mean_squared_error(test_y, predictions)
        mae = mean_absolute_error(test_y, predictions)
        mape = mean_absolute_percentage_error(test_y, predictions)
        r_squared = r2_score(test_y, predictions)

        print("Regressor:", reg)
        print("ON TRAINING, CROSS VALIDATION:")
        print("Root Mean Squared Error (MSE):", np.mean(rmsecv))
        print("Mean Absolute Error (MAE):", np.mean(maecv))
        print("Mean Absolute Percentage Error (MAPE):", np.mean(mapecv))
        print("R-squared:", np.mean(r2cv))
        print("ON TESTING:")
        print("Mean Squared Error (MSE):", mse)
        print("Mean Absolute Error (MAE):", mae)
        print("Mean Absolute Percentage Error (MAPE):", mape)
        print("R-squared:", r_squared)

For bin 2:

Regressor: RandomForestRegressor()
ON TRAINING, CROSS VALIDATION:
Root Mean Squared Error (MSE): -4196533.23738346
Mean Absolute Error (MAE): -1753908.3658680809
Mean Absolute Percentage Error (MAPE): -120.29180491785625
R-squared: 0.16691347166373013
ON TESTING:
Mean Squared Error (MSE): 26138343898391.37
Mean Absolute Error (MAE): 4136059.616075003
Mean Absolute Percentage Error (MAPE): 11.739737315659605
R-squared: 0.14700730013438967
Regressor: SVR()
ON TRAINING, CROSS VALIDATION:
Root Mean Squared Error (MSE): -5035059.876461744
Mean Absolute Error (MAE): -1740740.9615721558
Mean Absolute Percentage Error (MAPE): -12.108946274465458
R-squared: -0.12391921138927908
ON TESTING:
Mean Squared Error (MSE): 48585136416407.836
Mean Absolute Error (MAE): 4239134.76101088
Mean Absolute Percentage Error (MAPE): 0.8319961495031468
R-squared: -0.5855161614780502
Regressor: GradientBoostingRegressor()
ON TRAINING, CROSS VALIDATION:
Root Mean Squared Error (MSE): -4129573.8949203854