---

## Modelo de Regressão Random Forest

---

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
df = pd.read_csv('filmes_tratado_novas_features.csv')

target_column = 'vote_average'
features = [col for col in df.columns if col != target_column]
X = df[features]
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1,
                                 max_depth=15, min_samples_leaf=5, min_samples_split=10)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"\n--- Resultado Final ---")
print(f"R²: {r2:.4f}")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")


--- Resultado Final ---
R²: 0.8634
MSE:  0.7000
RMSE: 0.8367
MAE:  0.5278


---

## Modelo de Regressão XGBoost

---

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [24]:
df = pd.read_csv('filmes_tratado_novas_features.csv')

X = df.drop('vote_average', axis=1)
y = df['vote_average']
X = X.loc[:,~X.columns.duplicated()]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgbr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50 
)

print("\nIniciando o treinamento do modelo XGBoost...")
xgbr.fit(X_train, y_train,
         eval_set=[(X_test, y_test)],
         verbose=False)
print("Treinamento concluído!")

# Predição e Avaliação
y_pred = xgbr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\n--- Resultado Final ---")
print(f"R²: {r2:.4f}")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")

# Análise das Features
feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': xgbr.feature_importances_
}).sort_values('importance', ascending=False)

print("\nAs 15 features mais importantes para o modelo:")
print(feature_importances.head(15))


Iniciando o treinamento do modelo XGBoost...
Treinamento concluído!

--- Resultado Final ---
R²: 0.8688
MSE:  3.2478
RMSE: 0.8201
MAE:  0.5307

As 15 features mais importantes para o modelo:
              feature  importance
15         vote_count    0.461525
19      genre_Western    0.043544
4          popularity    0.040841
13        genre_Drama    0.037443
14    genre_Animation    0.036294
27       genre_Action    0.033577
7         has_tagline    0.025037
12       genre_Horror    0.024290
10            runtime    0.024080
21       release_year    0.023557
23          genre_War    0.020981
26        genre_Music    0.020217
3   genre_Documentary    0.020040
1              budget    0.019022
20      genre_History    0.016611
