In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

df = pd.read_csv('filmes_gabriel.csv')


df['log_budget'] = np.log1p(df['budget'])
df['log_popularity'] = np.log1p(df['popularity'])

# Interação entre budget e popularidade (ambos log)
df['log_budget_popularity'] = df['log_budget'] * df['log_popularity']

# Contagens existentes
df['genre_count'] = df[['Action','Adventure','Animation','Comedy','Crime',
                       'Documentary','Drama','Family','Fantasy','History',
                       'Horror','Music','Mystery','Romance','Science Fiction',
                       'TV Movie','Thriller','War','Western']].sum(axis=1)

df['company_count'] = df[['20th Century Fox','Canal+','Columbia Pictures',
                         'Goldwyn','Mayer','Metro','New Line Cinema',
                         'Outros','Paramount','Universal Pictures',
                         'Warner Bros. Pictures']].sum(axis=1)

# Cíclicas para mês e estação
df['season'] = df['release_month'] % 12 // 3 + 1
df['month_sin'] = np.sin(2 * np.pi * df['release_month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['release_month']/12)

# Estatísticas agregadas: média de vote_average por idioma e por estação
lang_mean = df.groupby('original_language')['vote_average'].mean()
season_mean = df.groupby('season')['vote_average'].mean()
df['lang_mean_vote'] = df['original_language'].map(lang_mean)
df['season_mean_vote'] = df['season'].map(season_mean)

# Frequência de idioma
lang_freq = df['original_language'].value_counts(normalize=True)
df['lang_freq'] = df['original_language'].map(lang_freq)

base_features = [
    'log_budget', 'log_popularity', 'log_budget_popularity',
    'runtime', 'release_year', 'season', 'month_sin', 'month_cos',
    'genre_count', 'company_count', 'lang_freq',
    'lang_mean_vote', 'season_mean_vote'
]
genre_cols = ['Action','Adventure','Animation','Comedy','Crime',
              'Documentary','Drama','Family','Fantasy','History',
              'Horror','Music','Mystery','Romance','Science Fiction',
              'TV Movie','Thriller','War','Western']
feature_cols = base_features + genre_cols

X = df[feature_cols].fillna(0)
y = df['vote_average']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"R²: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")



R²: 0.5069
MSE: 0.3986
RMSE: 0.6313
MAE: 0.4868


In [19]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Carregar o dataset
df = pd.read_csv('filmes_rodrigo.csv')

# --- 1. Limpeza e Pré-processamento ---

# Substituir orçamentos e receitas implausivelmente baixos por NaN
df['budget'] = df['budget'].replace(0, np.nan)
df['revenue'] = df['revenue'].replace(0, np.nan)

# Preencher os valores ausentes com a mediana (sintaxe corrigida)
df['budget'] = df['budget'].fillna(df['budget'].median())
df['revenue'] = df['revenue'].fillna(df['revenue'].median())

# Função para converter strings de listas/json em objetos Python de forma segura
def safe_literal_eval(val):
    try:
        if isinstance(val, str):
            return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return []
    return []

df['production_companies'] = df['production_companies'].apply(safe_literal_eval)
df['keywords'] = df['keywords'].apply(safe_literal_eval)

# --- 2. Engenharia de Features ---

df['profit'] = df['revenue'] - df['budget']
df['roi'] = (df['profit']) / (df['budget'].replace(0, 1))
df['num_production_companies'] = df['production_companies'].apply(len)
df['num_keywords'] = df['keywords'].apply(len)
df['has_tagline'] = df['tagline'].notna().astype(int)
df['overview_length'] = df['overview'].astype(str).str.len()

# --- 3. Preparação para o Modelo ---

features_to_keep = [
    'popularity', 'budget', 'revenue', 'runtime', 'vote_count',
    'release_year', 'release_month', 'release_day_of_week',
    'profit', 'roi', 'num_production_companies', 'num_keywords',
    'has_tagline', 'overview_length'
]
genre_cols = [col for col in df.columns if col.startswith('genre_')]
features_to_keep.extend(genre_cols)

# Criar X como uma cópia explícita para evitar o SettingWithCopyWarning
X = df[features_to_keep].copy()
y = df['vote_average']

# Lidar com possíveis valores infinitos e NaNs restantes em X
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(0, inplace=True)

# --- 4. Treinamento e Avaliação do Modelo ---

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1,
                                 max_depth=15, min_samples_leaf=5, min_samples_split=10)

print("Treinando o modelo RandomForest...")
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"\n--- Resultado Final ---")
print(f"Score R² com Novas Features no dataset 'filmes_rodrigo': {r2}")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")

Treinando o modelo RandomForest...

--- Resultado Final ---
Score R² com Novas Features no dataset 'filmes_rodrigo': 0.8671733259810854
MSE:  0.6808
RMSE: 0.8251
MAE:  0.5231


In [None]:
# Parte 2: Modelo de Regressão (Usando o Arquivo Tratado)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Carregar o dataset JÁ TRATADO
df = pd.read_csv('filmes_tratado.csv')


# --- 1. Preparação dos Dados para o Modelo ---

# Definir a variável alvo (y) e as features (X)
# 'vote_average' é nosso alvo, todo o resto são as features
target_column = 'vote_average'
features = [col for col in df.columns if col != target_column]

X = df[features]
y = df[target_column]


# --- 2. Treinamento e Avaliação do Modelo ---

# Dividir os dados em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instanciar e treinar o modelo
rf_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1,
                                 max_depth=15, min_samples_leaf=5, min_samples_split=10)

print("Treinando o modelo RandomForest com os dados tratados...")
rf_model.fit(X_train, y_train)

# Fazer previsões no conjunto de teste
y_pred = rf_model.predict(X_test)

# Avaliar o desempenho do modelo
r2 = r2_score(y_test, y_pred)

print(f"\n--- Resultado Final do Modelo ---")
print(f"Score R²: {r2}")