In [2]:
import pandas as pd

### Preprocessing
First, drop the movie_title column if it's only being used as an index and is not part of the feature set.

In [3]:
df = pd.read_csv('training_data.csv')
df.head()

Unnamed: 0,movie_title,available_globally,viewing_hours,release_month,runtime,imdb_rating,total_seasons,gn_drama,gn_action,gn_fantasy,...,up,us,war,we,who,wild,with,world,you,your
0,The Night Agent,1,812100000,3,30,7.5,2.0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ginny & Georgia,1,665100000,2,30,7.5,2.0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,The Glory,1,622800000,12,30,8.1,1.0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Wednesday,1,507700000,11,30,8.1,2.0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Queen Charlotte: A Bridgerton Story,1,503000000,5,30,7.4,1.0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df.drop(['movie_title'], axis=1, inplace=True)

### Splitting the Data
Split the data into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop(['viewing_hours', 'imdb_rating'], axis=1)
y = df[['viewing_hours', 'imdb_rating']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Model Selection & Training
Use the RandomForestRegressor within a MultiOutputRegressor. This wraps the random forest model to enable it to predict multiple targets.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

model = MultiOutputRegressor(RandomForestRegressor(random_state=42))

In [None]:
model.fit(X_train, y_train)

### Model Evaluation
Evaluate the model on the test set using appropriate metrics. Mean Squared Error (MSE) and R-squared are common choices.

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate MSE and R-squared for each target
mse_viewing_hours = mean_squared_error(y_test['viewing_hours'], y_pred[:, 0])
mse_imdb_rating = mean_squared_error(y_test['imdb_rating'], y_pred[:, 1])
r2_viewing_hours = r2_score(y_test['viewing_hours'], y_pred[:, 0])
r2_imdb_rating = r2_score(y_test['imdb_rating'], y_pred[:, 1])

print(f'Viewing Hours - MSE: {mse_viewing_hours}, R2: {r2_viewing_hours}')
print(f'IMDb Rating - MSE: {mse_imdb_rating}, R2: {r2_imdb_rating}')
