## Importações

In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, median_absolute_error

from xgboost import XGBRegressor
from sklearn.svm import SVR 
from sklearn.ensemble import RandomForestRegressor 

---

In [10]:
df = pd.read_csv('../data/filmes_filtrados_credits.csv')

---

In [11]:


def load_data(file_path):
    """Carrega o dataset a partir de um arquivo CSV."""
    df = pd.read_csv('../data/filmes_filtrados_credits.csv')
    return df

def feature_engineering(df):
    """Realiza a engenharia de features no DataFrame."""
    # Contagem de créditos
    # Garante que 'credits' é uma string antes de aplicar split
    df['credits_count'] = df['credits'].astype(str).apply(lambda x: len(x.split('-')))

    # Extração de features de data
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce') # 'coerce' para NaN em datas inválidas
    df['year'] = df['release_date'].dt.year
    df['month'] = df['release_date'].dt.month

    return df

def handle_outliers(X_train, X_test, numeric_cols, lower_percentile=0.01, upper_percentile=0.99):
    """Lida com outliers aplicando capping (limitação) em colunas numéricas."""
    for col in numeric_cols:
        # Verifica se a coluna existe e é numérica
        if col in X_train.columns and pd.api.types.is_numeric_dtype(X_train[col]):
            lower = X_train[col].quantile(lower_percentile)
            upper = X_train[col].quantile(upper_percentile)
            X_train[col] = X_train[col].clip(lower, upper)
            X_test[col] = X_test[col].clip(lower, upper)
        else:
            print(f"Aviso: Coluna '{col}' não encontrada ou não é numérica para tratamento de outliers. Pulando.")
    return X_train, X_test

def group_rare_categories(series_train, series_test, min_count_threshold=50, replacement_label='Other'):
    """Agrupa categorias raras em uma coluna categórica."""
    counts = series_train.value_counts()
    rare_categories = counts[counts < min_count_threshold].index
    series_train_processed = series_train.replace(rare_categories, replacement_label)
    series_test_processed = series_test.replace(rare_categories, replacement_label)
    return series_train_processed, series_test_processed

def process_multilabel_column(train_series, test_series, sep='-', top_n=12, prefix='feature'):
    """
    Processa colunas multi-rótulo (e.g., gêneros, produtoras) usando MultiLabelBinarizer.
    Filtra para os top_n mais frequentes e agrupa o restante em 'Other'.
    """
    # Extrai os top_n categorias do conjunto de treino
    exploded_train = train_series.astype(str).str.split(sep).explode().str.strip()
    top_categories = exploded_train.value_counts().nlargest(top_n).index.tolist()

    def filter_and_process(vals_series):
        filtered_lists = []
        for val_str in vals_series.astype(str).str.split(sep):
            filtered_list = [v.strip() for v in val_str if v.strip() in top_categories]
            if not filtered_list: # Se não houver top_n categorias, adicione 'Other'
                filtered_list = ['Other']
            filtered_lists.append(filtered_list)
        return filtered_lists

    train_filtered_lists = filter_and_process(train_series)
    test_filtered_lists = filter_and_process(test_series)

    mlb = MultiLabelBinarizer()
    train_enc = pd.DataFrame(
        mlb.fit_transform(train_filtered_lists),
        columns=[f"{prefix}_{cls}" for cls in mlb.classes_],
        index=train_series.index
    )
    test_enc = pd.DataFrame(
        mlb.transform(test_filtered_lists),
        columns=[f"{prefix}_{cls}" for cls in mlb.classes_],
        index=test_series.index
    )
    return train_enc, test_enc


def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, kf_splits=5, random_state=87):
    """Treina e avalia o modelo usando validação cruzada e no conjunto de teste."""
    kf = KFold(n_splits=kf_splits, shuffle=True, random_state=random_state)

    print("\nRealizando validação cruzada no conjunto de treino...")
    y_pred_cv_train = cross_val_predict(model, X_train, y_train, cv=kf)

    print("Treinando o modelo no conjunto de treino completo...")
    model.fit(X_train, y_train)

    print("Realizando previsões no conjunto de teste...")
    y_test_pred = model.predict(X_test)

    # Métricas de avaliação
    mse_train_cv = mean_squared_error(y_train, y_pred_cv_train)
    rmse_train_cv = np.sqrt(mse_train_cv)

    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)

    metrics = {
        'R2_train_cv': r2_score(y_train, y_pred_cv_train),
        'MAE_train_cv': mean_absolute_error(y_train, y_pred_cv_train),
        'RMSE_train_cv': rmse_train_cv,
        'MedAE_train_cv': median_absolute_error(y_train, y_pred_cv_train),
        'R2_test': r2_score(y_test, y_test_pred),
        'MAE_test': mean_absolute_error(y_test, y_test_pred),
        'RMSE_test': rmse_test,
        'MedAE_test': median_absolute_error(y_test, y_test_pred),
    }

    metrics_df = pd.DataFrame(metrics, index=[0]).T
    metrics_df.columns = ['Value']

    print("\nMétricas de Avaliação:")
    print(metrics_df)
    return model, metrics_df


# --- Pipeline Principal ---
if __name__ == "__main__":
    file_path = 'filmes_filtrados_credits.csv'
    random_seed = 87
    test_size = 0.2

    print("Passo 1: Carregando os dados...")
    df = load_data(file_path)

    print("Passo 2: Engenharia de Features...")
    df = feature_engineering(df)

    # Drop colunas originais que não são mais necessárias e podem causar erro
    # Esta é a correção do erro anterior: 'credits' e 'release_date'
    # 'genres' e 'production_companies' serão dropadas mais abaixo, após o processamento MultiLabelBinarizer
    df = df.drop(columns=['credits', 'release_date'])


    print("Passo 3: Preparando X e y e dividindo em treino/teste...")
    X = df.drop(columns=['vote_average'])
    y = df['vote_average']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_seed
    )

    print("Passo 4: Tratamento de Outliers (Capping)...")
    numeric_cols_for_outliers = ['budget', 'runtime', 'credits_count', 'year', 'month']
    X_train, X_test = handle_outliers(X_train.copy(), X_test.copy(), numeric_cols_for_outliers)
    # Usar .copy() para evitar SettingWithCopyWarning, embora o clip já crie uma cópia em muitos casos.

    print("Passo 5: Agrupamento de Categorias Raras (original_language)...")
    X_train['original_language'], X_test['original_language'] = group_rare_categories(
        X_train['original_language'], X_test['original_language'], min_count_threshold=50
    )

    print("Passo 6: One-Hot Encoding (original_language)...")
    X_train = pd.get_dummies(X_train, columns=['original_language'], drop_first=True)
    X_test = pd.get_dummies(X_test, columns=['original_language'], drop_first=True)
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0) # Alinhar após get_dummies

    print("Passo 7: MultiLabel Binarization (genres e production_companies)...")
    genres_train_encoded, genres_test_encoded = process_multilabel_column(
        X_train['genres'], X_test['genres'], sep='-', top_n=12, prefix='genre'
    )
    X_train = X_train.drop(columns=['genres']).join(genres_train_encoded)
    X_test = X_test.drop(columns=['genres']).join(genres_test_encoded)

    prod_train_encoded, prod_test_encoded = process_multilabel_column(
        X_train['production_companies'], X_test['production_companies'], sep='-', top_n=8, prefix='production'
    )
    X_train = X_train.drop(columns=['production_companies']).join(prod_train_encoded)
    X_test = X_test.drop(columns=['production_companies']).join(prod_test_encoded)

    # Alinhar novamente após MultiLabelBinarizer para garantir que as colunas correspondem
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    print("Passo 8: Inicializando e avaliando o modelo XGBoost...")
    # Removendo a coluna 'title' se ela existir e não for usada, pois é do tipo 'object'
    # Se 'title' fosse para ser usada, precisaria de um processamento de texto como TF-IDF ou embeddings
    if 'title' in X_train.columns:
        print("Aviso: 'title' detectada e será removida, pois não é numérica/categórica processada.")
        X_train = X_train.drop(columns=['title'])
        X_test = X_test.drop(columns=['title'])


    model = XGBRegressor(random_state=random_seed, n_estimators=100, verbosity=0)
    final_model, evaluation_metrics = train_and_evaluate_model(
        model, X_train, y_train, X_test, y_test, kf_splits=5, random_state=random_seed
    )

    print("\nRefatoração concluída e modelo executado.")

Passo 1: Carregando os dados...
Passo 2: Engenharia de Features...
Passo 3: Preparando X e y e dividindo em treino/teste...
Passo 4: Tratamento de Outliers (Capping)...
Passo 5: Agrupamento de Categorias Raras (original_language)...
Passo 6: One-Hot Encoding (original_language)...
Passo 7: MultiLabel Binarization (genres e production_companies)...
Passo 8: Inicializando e avaliando o modelo XGBoost...

Realizando validação cruzada no conjunto de treino...
Treinando o modelo no conjunto de treino completo...
Realizando previsões no conjunto de teste...

Métricas de Avaliação:
                   Value
R2_train_cv     0.416747
MAE_train_cv    0.529386
RMSE_train_cv   0.690032
MedAE_train_cv  0.425330
R2_test         0.452009
MAE_test        0.514377
RMSE_test       0.675054
MedAE_test      0.411738

Refatoração concluída e modelo executado.


---

In [12]:


# Load dataset
df = pd.read_csv('../data/filmes_filtrados_credits.csv')
# --- Pré-processamento de Dados ---

# Feature: count of credits
df['credits_count'] = df['credits'].apply(lambda x: len(x.split('-')))

# Extract date features
df['release_date'] = pd.to_datetime(df['release_date'])
df['year'] = df['release_date'].dt.year
df['month'] = df['release_date'].dt.month

# Drop unused columns
df = df.drop(columns=['credits', 'release_date'])

# Prepare X and y
X = df.drop(columns=['vote_average'])
y = df['vote_average']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=87
)

# Outlier handling: cap numeric features at 1st and 99th percentiles
numeric_cols = [ 'budget', 'runtime', 'credits_count']
for col in numeric_cols:
    lower = X_train[col].quantile(0.01)
    upper = X_train[col].quantile(0.99)
    X_train[col] = X_train[col].clip(lower, upper)
    X_test[col] = X_test[col].clip(lower, upper)

# Rare category grouping for original_language
lang_counts = X_train['original_language'].value_counts()
rare_langs = lang_counts[lang_counts < 50].index
X_train['original_language'] = X_train['original_language'].replace(rare_langs, 'Other')
X_test['original_language'] = X_test['original_language'].replace(rare_langs, 'Other')

# One-hot encode original_language
X_train = pd.get_dummies(X_train, columns=['original_language'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['original_language'], drop_first=True)
# Align columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# MultiLabelBinarizer for genres and production_companies
def process_multilabel_column(train_series, test_series, sep='-', top_n=12, prefix='genre'):
    # Extract top_n categories
    exploded = train_series.str.split(sep).explode().str.strip()
    top = exploded.value_counts().nlargest(top_n).index.tolist()

    def filter_top(vals):
        vals = [v.strip() for v in vals.split(sep)]
        return [v if v in top else 'Other' for v in vals]

    train_filtered = train_series.apply(filter_top)
    test_filtered = test_series.apply(filter_top)

    mlb = MultiLabelBinarizer()
    train_enc = pd.DataFrame(
        mlb.fit_transform(train_filtered),
        columns=[f"{prefix}_{cls}" for cls in mlb.classes_],
        index=train_series.index
    )
    test_enc = pd.DataFrame(
        mlb.transform(test_filtered),
        columns=[f"{prefix}_{cls}" for cls in mlb.classes_],
        index=test_series.index
    )
    return train_enc, test_enc

# Genres
genres_train, genres_test = process_multilabel_column(
    X_train['genres'], X_test['genres'], sep='-', top_n=12, prefix='genre'
)
X_train = X_train.drop(columns=['genres']).join(genres_train)
X_test = X_test.drop(columns=['genres']).join(genres_test)

# Production companies
prod_train, prod_test = process_multilabel_column(
    X_train['production_companies'], X_test['production_companies'], sep='-', top_n=8, prefix='production'
)
X_train = X_train.drop(columns=['production_companies']).join(prod_train)
X_test = X_test.drop(columns=['production_companies']).join(prod_test)

# Align again after multilabel
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# --- Escalonamento de Features Numéricas para SVR ---
# Identificar todas as colunas numéricas após o pré-processamento
# Excluímos as colunas dummy (0/1) para escalonamento
all_numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
# As colunas dummy (original_language_X, genre_X, production_X) não precisam ser escalonadas.
# Vamos assumir que 'year', 'month', 'popularity', 'budget', 'runtime', 'credits_count' são as numéricas que precisam de escalonamento.
# É uma boa prática verificar isso dinamicamente se a estrutura das colunas puder mudar.
cols_to_scale = ['budget', 'runtime', 'credits_count', 'year', 'month']

scaler = StandardScaler()
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# --- Treinamento e Avaliação do Modelo SVR ---

kf = KFold(n_splits=5, shuffle=True, random_state=87)
model = SVR(kernel='rbf', C=1.0, epsilon=0.1) # Parâmetros iniciais para SVR

# Predições de validação cruzada no conjunto de treino
y_pred_cv_train = cross_val_predict(model, X_train, y_train, cv=kf)

# Treinar o modelo no conjunto de treino completo
model.fit(X_train, y_train)

# Predições no conjunto de teste
y_test_pred = model.predict(X_test)

# --- Métricas de Avaliação ---

mse_train_cv = mean_squared_error(y_train, y_pred_cv_train)
rmse_train_cv = np.sqrt(mse_train_cv)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)

metrics = {
    'R2': r2_score(y_test, y_test_pred),
    'MAE': mean_absolute_error(y_test, y_test_pred),
    'RMSE': rmse_test,
    'MedAE': median_absolute_error(y_test, y_test_pred),
}

metrics_df = pd.DataFrame(metrics, index=[0]).T
metrics_df.columns = ['Value']

print(metrics_df)

          Value
R2     0.218136
MAE    0.626591
RMSE   0.806339
MedAE  0.515157


---

In [13]:


# Load dataset
df = pd.read_csv('../data/filmes_filtrados_credits.csv')
# --- Pré-processamento de Dados ---

# Feature: count of credits
df['credits_count'] = df['credits'].apply(lambda x: len(x.split('-')))

# Extract date features
df['release_date'] = pd.to_datetime(df['release_date'])
df['year'] = df['release_date'].dt.year
df['month'] = df['release_date'].dt.month

# Drop unused columns
df = df.drop(columns=['credits', 'release_date'])

# Prepare X and y
X = df.drop(columns=['vote_average'])
y = df['vote_average']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=87
)

# Outlier handling: cap numeric features at 1st and 99th percentiles
numeric_cols = ['budget', 'runtime', 'credits_count']
for col in numeric_cols:
    lower = X_train[col].quantile(0.01)
    upper = X_train[col].quantile(0.99)
    X_train[col] = X_train[col].clip(lower, upper)
    X_test[col] = X_test[col].clip(lower, upper)

# Rare category grouping for original_language
lang_counts = X_train['original_language'].value_counts()
rare_langs = lang_counts[lang_counts < 50].index
X_train['original_language'] = X_train['original_language'].replace(rare_langs, 'Other')
X_test['original_language'] = X_test['original_language'].replace(rare_langs, 'Other')

# One-hot encode original_language
X_train = pd.get_dummies(X_train, columns=['original_language'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['original_language'], drop_first=True)
# Align columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# MultiLabelBinarizer for genres and production_companies
def process_multilabel_column(train_series, test_series, sep='-', top_n=12, prefix='genre'):
    # Extract top_n categories
    exploded = train_series.str.split(sep).explode().str.strip()
    top = exploded.value_counts().nlargest(top_n).index.tolist()

    def filter_top(vals):
        vals = [v.strip() for v in vals.split(sep)]
        return [v if v in top else 'Other' for v in vals]

    train_filtered = train_series.apply(filter_top)
    test_filtered = test_series.apply(filter_top)

    mlb = MultiLabelBinarizer()
    train_enc = pd.DataFrame(
        mlb.fit_transform(train_filtered),
        columns=[f"{prefix}_{cls}" for cls in mlb.classes_],
        index=train_series.index
    )
    test_enc = pd.DataFrame(
        mlb.transform(test_filtered),
        columns=[f"{prefix}_{cls}" for cls in mlb.classes_],
        index=test_series.index
    )
    return train_enc, test_enc

# Genres
genres_train, genres_test = process_multilabel_column(
    X_train['genres'], X_test['genres'], sep='-', top_n=12, prefix='genre'
)
X_train = X_train.drop(columns=['genres']).join(genres_train)
X_test = X_test.drop(columns=['genres']).join(genres_test)

# Production companies
prod_train, prod_test = process_multilabel_column(
    X_train['production_companies'], X_test['production_companies'], sep='-', top_n=8, prefix='production'
)
X_train = X_train.drop(columns=['production_companies']).join(prod_train)
X_test = X_test.drop(columns=['production_companies']).join(prod_test)

# Align again after multilabel
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# --- Treinamento e Avaliação do Modelo Random Forest ---

kf = KFold(n_splits=5, shuffle=True, random_state=87)
# Parâmetros iniciais para RandomForestRegressor
# n_estimators: número de árvores na floresta
# random_state: para reprodutibilidade
model = RandomForestRegressor(n_estimators=100, random_state=87, n_jobs=-1) # n_jobs=-1 para usar todos os núcleos da CPU

# Predições de validação cruzada no conjunto de treino
y_pred_cv_train = cross_val_predict(model, X_train, y_train, cv=kf)

# Treinar o modelo no conjunto de treino completo
model.fit(X_train, y_train)

# Predições no conjunto de teste
y_test_pred = model.predict(X_test)

# --- Métricas de Avaliação ---

mse_train_cv = mean_squared_error(y_train, y_pred_cv_train)
rmse_train_cv = np.sqrt(mse_train_cv)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)

metrics = {
    'R2': r2_score(y_test, y_test_pred),
    'MAE': mean_absolute_error(y_test, y_test_pred),
    'RMSE': rmse_test,
    'MedAE': median_absolute_error(y_test, y_test_pred),
}

metrics_df = pd.DataFrame(metrics, index=[0]).T
metrics_df.columns = ['Value']

print(metrics_df)

          Value
R2     0.455687
MAE    0.513751
RMSE   0.672785
MedAE  0.415920


---