In [25]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import numpy as np

In [26]:
import warnings
warnings.filterwarnings("ignore")

In [27]:
# Load train and test data
train_df = pd.read_csv("./3.3 Construct Data/Final Datasets/imputed_budget_train.csv")
test_df = pd.read_csv("./3.3 Construct Data/Final Datasets/imputed_budget_test.csv")

# Define bin edges for Adjusted Merged Budget
bin_edges = np.percentile(train_df['Adj Merged Budget'], [0, 33, 67, 100])
bin_labels = ['1', '2', '3']  # Adjusted the labels to match the number of bins (one less than edges)

# Split train data into three bins based on Adjusted Merged Budget
train_df['Budget Bins'] = pd.cut(train_df['Adj Merged Budget'], bins=bin_edges, labels=bin_labels)
# Split test data into three bins based on Adjusted Merged Budget
test_df['Budget Bins'] = pd.cut(test_df['Adj Merged Budget'], bins=bin_edges, labels=bin_labels)

In [28]:
X_vars = ['Runtime', 'Genre Cluster', 'G', 'NC-17', 'NR', 'PG', 'PG-13', 'R', 'Holiday', 'Adj Merged Budget', 'Has Star Score', 'Has Director Score', 'Has Production Company Score', 'Has Domestic Distributor Score', 'Unweighted Star Score_normalized', 'Simple Weight Star Score_normalized', 'Log Weight Star Score_normalized', 'Exponential Weight Star Score_normalized', 'Total Director Score_normalized', 'Avg Director Score_normalized', 'Total Production Company Score_normalized', 'Avg Production Company Score_normalized', 'Domestic Distributor Score_normalized', 'Season_ASO_4', 'Season_FMA_2', 'Season_MJJ_3', 'Season_NDJ_1']
y = ["Adj Merged Revenue"]

In [29]:
imp = IterativeImputer(min_value=0, max_iter=1000, random_state=102, estimator=RandomForestRegressor())

In [24]:
for b in ['2', '3','4']:
    print(f"For bin {b}:")
    print()
    if b == '4':
        to_train = train_df
        to_test = test_df
    else:   
        to_train = train_df[train_df["Budget Bins"] == b]
        to_test = test_df[test_df["Budget Bins"] == b]

    train_X = to_train[X_vars]
    train_y = to_train[y]
    test_X = to_test[X_vars]
    test_y = to_test[y]
    for reg in [RandomForestRegressor(), SVR(), GradientBoostingRegressor(), RandomForestRegressor(n_estimators=100, max_depth=10), RandomForestRegressor(n_estimators=100, max_depth=20), RandomForestRegressor(n_estimators=100, max_depth=50)]:
        
        train_y = np.ravel(train_y)

        pipeline = Pipeline([
            ('imputer', imp),
            ('regressor', reg)  # RandomForestRegressor without grid search
        ])

        # Train the RandomForestRegressor
        pipeline.fit(train_X, train_y)
        r2cv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="r2")
        rmsecv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="neg_root_mean_squared_error")
        mapecv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="neg_mean_absolute_percentage_error")
        maecv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="neg_mean_absolute_error")

        # Get the trained model
        trained_model = pipeline.named_steps['regressor']

        # Assuming you have test_X as your test features
        # Make predictions using the trained model
        predictions = pipeline.predict(test_X)
        # Now you can use these predictions for further analysis or evaluation
        # For instance, if you have test_y (actual target values), you can evaluate the model performance
        # For example, using metrics like mean squared error (MSE) or R-squared
        from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

        # Assuming test_y is your actual target variable for the test data
        # Evaluate the model
        mse = mean_squared_error(test_y, predictions)
        mae = mean_absolute_error(test_y, predictions)
        mape = mean_absolute_percentage_error(test_y, predictions)
        r_squared = r2_score(test_y, predictions)

        print("Regressor:", reg)
        print("ON TRAINING, CROSS VALIDATION:")
        print("Root Mean Squared Error (MSE):", np.mean(rmsecv))
        print("Mean Absolute Error (MAE):", np.mean(maecv))
        print("Mean Absolute Percentage Error (MAPE):", np.mean(mapecv))
        print("R-squared:", np.mean(r2cv))
        print("ON TESTING:")
        print("Mean Squared Error (MSE):", mse)
        print("Mean Absolute Error (MAE):", mae)
        print("Mean Absolute Percentage Error (MAPE):", mape)
        print("R-squared:", r_squared)

For bin 2:



KeyboardInterrupt: 

In [30]:
for b in ['2', '3','4']:
    print(f"For bin {b}:")
    print()
    if b == '4':
        to_train = train_df
        to_test = test_df
    else:   
        to_train = train_df[train_df["Budget Bins"] == b]
        to_test = test_df[test_df["Budget Bins"] == b]

    train_X = to_train[X_vars]
    train_y = to_train[y]
    test_X = to_test[X_vars]
    test_y = to_test[y]
    if b == 2:
        for reg in [RandomForestRegressor(n_estimators=100, max_depth=10)]:
            
            train_y = np.ravel(train_y)

            pipeline = Pipeline([
                ('imputer', imp),
                ('regressor', reg)  # RandomForestRegressor without grid search
            ])

            # Train the RandomForestRegressor
            pipeline.fit(train_X, train_y)

            # Get the trained model
            trained_model = pipeline.named_steps['regressor']

            # Assuming you have test_X as your test features
            # Make predictions using the trained model
            predictions = pipeline.predict(test_X)
            to_save = test_df[test_df["Budget Bins"] == b]
            to_save["Predictions"] = predictions
            to_save.to_csv(f"bin_{b}_{reg}_pred.csv")
    elif b == 3:
        for reg in [RandomForestRegressor(n_estimators=100, max_depth=10), GradientBoostingRegressor()]:
                
            train_y = np.ravel(train_y)

            pipeline = Pipeline([
                ('imputer', imp),
                ('regressor', reg)  # RandomForestRegressor without grid search
            ])

            # Train the RandomForestRegressor
            pipeline.fit(train_X, train_y)

            # Get the trained model
            trained_model = pipeline.named_steps['regressor']

            # Assuming you have test_X as your test features
            # Make predictions using the trained model
            predictions = pipeline.predict(test_X)
            to_save = test_df[test_df["Budget Bins"] == b]
            to_save["Predictions"] = predictions
            to_save.to_csv(f"bin_{b}_{reg}_pred.csv")
    else:
        for reg in [RandomForestRegressor(n_estimators=100, max_depth=50)]:
                
            train_y = np.ravel(train_y)

            pipeline = Pipeline([
                ('imputer', imp),
                ('regressor', reg)  # RandomForestRegressor without grid search
            ])

            # Train the RandomForestRegressor
            pipeline.fit(train_X, train_y)

            # Get the trained model
            trained_model = pipeline.named_steps['regressor']

            # Assuming you have test_X as your test features
            # Make predictions using the trained model
            predictions = pipeline.predict(test_X)
            to_save = test_df[test_df["Budget Bins"] == b]
            to_save["Predictions"] = predictions
            to_save.to_csv(f"bin_{b}_{reg}_pred.csv")

For bin 2:

For bin 3:

For bin 4:



In [None]:
X_vars = ['Adj Merged Budget']
y = ["Adj Merged Revenue"]
imp = IterativeImputer(min_value=0, max_iter=1000, random_state=102, estimator=RandomForestRegressor())
for b in ['2', '3','4']:
    print(f"For bin {b}:")
    print()
    if b == '4':
        to_train = train_df
        to_test = test_df
    else:   
        to_train = train_df[train_df["Budget Bins"] == b]
        to_test = test_df[test_df["Budget Bins"] == b]

    train_X = to_train[X_vars]
    train_y = to_train[y]
    test_X = to_test[X_vars]
    test_y = to_test[y]
    for reg in [RandomForestRegressor(), SVR(), GradientBoostingRegressor(), RandomForestRegressor(n_estimators=100, max_depth=10), RandomForestRegressor(n_estimators=100, max_depth=20), RandomForestRegressor(n_estimators=100, max_depth=50)]:
        
        train_y = np.ravel(train_y)

        pipeline = Pipeline([
            ('imputer', imp),
            ('regressor', reg)  # RandomForestRegressor without grid search
        ])

        # Train the RandomForestRegressor
        pipeline.fit(train_X, train_y)
        r2cv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="r2")
        rmsecv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="neg_root_mean_squared_error")
        mapecv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="neg_mean_absolute_percentage_error")
        maecv = cross_val_score(pipeline, train_X, train_y, cv=5, scoring="neg_mean_absolute_error")

        # Get the trained model
        trained_model = pipeline.named_steps['regressor']

        # Assuming you have test_X as your test features
        # Make predictions using the trained model
        predictions = pipeline.predict(test_X)
         # Now you can use these predictions for further analysis or evaluation
        # For instance, if you have test_y (actual target values), you can evaluate the model performance
        # For example, using metrics like mean squared error (MSE) or R-squared
        from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

        # Assuming test_y is your actual target variable for the test data
        # Evaluate the model
        mse = mean_squared_error(test_y, predictions)
        mae = mean_absolute_error(test_y, predictions)
        mape = mean_absolute_percentage_error(test_y, predictions)
        r_squared = r2_score(test_y, predictions)

        print("Regressor:", reg)
        print("ON TRAINING, CROSS VALIDATION:")
        print("Root Mean Squared Error (MSE):", np.mean(rmsecv))
        print("Mean Absolute Error (MAE):", np.mean(maecv))
        print("Mean Absolute Percentage Error (MAPE):", np.mean(mapecv))
        print("R-squared:", np.mean(r2cv))
        print("ON TESTING:")
        print("Mean Squared Error (MSE):", mse)
        print("Mean Absolute Error (MAE):", mae)
        print("Mean Absolute Percentage Error (MAPE):", mape)
        print("R-squared:", r_squared)

For bin 2:

Regressor: RandomForestRegressor()
ON TRAINING, CROSS VALIDATION:
Root Mean Squared Error (MSE): -4923125.461628975
Mean Absolute Error (MAE): -2153577.130476358
Mean Absolute Percentage Error (MAPE): -168.03323615604899
R-squared: -0.14301584316743562
ON TESTING:
Mean Squared Error (MSE): 44009066515790.65
Mean Absolute Error (MAE): 4483804.246194173
Mean Absolute Percentage Error (MAPE): 8.695092325709556
R-squared: -0.436181749379301
Regressor: SVR()
ON TRAINING, CROSS VALIDATION:
Root Mean Squared Error (MSE): -5035058.830429592
Mean Absolute Error (MAE): -1740740.2870439775
Mean Absolute Percentage Error (MAPE): -12.108989710878094
R-squared: -0.12391863363611111
ON TESTING:
Mean Squared Error (MSE): 48585129808362.1
Mean Absolute Error (MAE): 4239134.709657704
Mean Absolute Percentage Error (MAPE): 0.8320129131795717
R-squared: -0.5855159458326065
Regressor: GradientBoostingRegressor()
ON TRAINING, CROSS VALIDATION:
Root Mean Squared Error (MSE): -4930226.049674685
Me

In [31]:
X_vars = ['Adj Merged Budget']
y = ["Adj Merged Revenue"]
imp = IterativeImputer(min_value=0, max_iter=1000, random_state=102, estimator=RandomForestRegressor())
for b in ['2', '3','4']:
    print(f"For bin {b}:")
    print()
    if b == '4':
        to_train = train_df
        to_test = test_df
    else:   
        to_train = train_df[train_df["Budget Bins"] == b]
        to_test = test_df[test_df["Budget Bins"] == b]

    train_X = to_train[X_vars]
    train_y = to_train[y]
    test_X = to_test[X_vars]
    test_y = to_test[y]
    for reg in [RandomForestRegressor(n_estimators=100, max_depth=10)]:
        
        train_y = np.ravel(train_y)

        pipeline = Pipeline([
            ('imputer', imp),
            ('regressor', reg)  # RandomForestRegressor without grid search
        ])
        # Train the RandomForestRegressor
        pipeline.fit(train_X, train_y)
        # Get the trained model
        trained_model = pipeline.named_steps['regressor']

        # Assuming you have test_X as your test features
        # Make predictions using the trained model
        predictions = pipeline.predict(test_X)
        if b == '4':
            to_save = test_df
            to_save["Predictions"] = predictions
            to_save.to_csv(f"bin_{b}_rf_10_BASELINE_predictions.csv")
        else:
            to_save = test_df[test_df["Budget Bins"] == b]
            to_save["Predictions"] = predictions
            to_save.to_csv(f"bin_{b}_rf_10_BASELINE_predictions.csv")

For bin 2:

For bin 3:

For bin 4:

