### 1. Testando com novas features (tratamento de gabriel)

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

df = pd.read_csv('filmes_gabriel.csv')


df['log_budget'] = np.log1p(df['budget'])
df['log_popularity'] = np.log1p(df['popularity'])

# Interação entre budget e popularidade (ambos log)
df['log_budget_popularity'] = df['log_budget'] * df['log_popularity']

# Contagens existentes
df['genre_count'] = df[['Action','Adventure','Animation','Comedy','Crime',
                       'Documentary','Drama','Family','Fantasy','History',
                       'Horror','Music','Mystery','Romance','Science Fiction',
                       'TV Movie','Thriller','War','Western']].sum(axis=1)

df['company_count'] = df[['20th Century Fox','Canal+','Columbia Pictures',
                         'Goldwyn','Mayer','Metro','New Line Cinema',
                         'Outros','Paramount','Universal Pictures',
                         'Warner Bros. Pictures']].sum(axis=1)

# Cíclicas para mês e estação
df['season'] = df['release_month'] % 12 // 3 + 1
df['month_sin'] = np.sin(2 * np.pi * df['release_month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['release_month']/12)

# Estatísticas agregadas: média de vote_average por idioma e por estação
lang_mean = df.groupby('original_language')['vote_average'].mean()
season_mean = df.groupby('season')['vote_average'].mean()
df['lang_mean_vote'] = df['original_language'].map(lang_mean)
df['season_mean_vote'] = df['season'].map(season_mean)

# Frequência de idioma
lang_freq = df['original_language'].value_counts(normalize=True)
df['lang_freq'] = df['original_language'].map(lang_freq)

base_features = [
    'log_budget', 'log_popularity', 'log_budget_popularity',
    'runtime', 'release_year', 'season', 'month_sin', 'month_cos',
    'genre_count', 'company_count', 'lang_freq',
    'lang_mean_vote', 'season_mean_vote'
]
genre_cols = ['Action','Adventure','Animation','Comedy','Crime',
              'Documentary','Drama','Family','Fantasy','History',
              'Horror','Music','Mystery','Romance','Science Fiction',
              'TV Movie','Thriller','War','Western']
feature_cols = base_features + genre_cols

X = df[feature_cols].fillna(0)
y = df['vote_average']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"R²: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")






R²: 0.5069
MSE: 0.3986
RMSE: 0.6313
MAE: 0.4868
