## Importações

In [86]:
import pandas as pd
import numpy as np

#### Modelos

In [87]:
from xgboost import XGBRegressor
from sklearn.svm import SVR 
from sklearn.ensemble import RandomForestRegressor 

#### Divisão dos dados, pré-processamento e métricas

In [88]:
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score

## Carregando dataset com nova coluna 'credits' 

In [89]:
def load_data(file_path):
    """Read the CSV file at ``file_path`` into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, using ``read_csv`` defaults for every option.
    """
    dataset = pd.read_csv(file_path)
    return dataset

## Dividindo os dados

### Engenharia de features

In [90]:
def feature_engineering(df):
    """Derive numeric features from the raw ``credits`` and ``release_date`` columns.

    The input frame is not modified; a transformed copy is returned.

    Added columns
    -------------
    credits_count : number of ``'-'``-separated entries in ``credits``.
        Rows whose ``credits`` value is missing get ``0`` (previously the
        missing value was stringified and counted as one entry).
    year, month : calendar year and month of ``release_date``. Dates that
        cannot be parsed become ``NaT`` and therefore yield ``NaN`` here.

    Removed columns
    ---------------
    ``credits`` and ``release_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``credits`` and ``release_date`` columns.

    Returns
    -------
    pandas.DataFrame
    """
    df_copy = df.copy()

    credits = df_copy['credits']
    # N entries joined by '-' contain N-1 separators, so count('-') + 1 equals
    # len(split('-')). NOTE(review): this assumes '-' never occurs inside a
    # single credit entry (e.g. a hyphenated name) - confirm against the data.
    entries_per_row = credits.astype(str).str.count('-') + 1
    # A missing credits cell means "no credits known", not "one credit".
    df_copy['credits_count'] = entries_per_row.where(credits.notna(), 0)

    release_date = pd.to_datetime(df_copy['release_date'], errors='coerce')
    df_copy['year'] = release_date.dt.year
    df_copy['month'] = release_date.dt.month

    return df_copy.drop(columns=['credits', 'release_date'])

### Lidando com outliers

In [91]:
def handle_outliers(X_train, X_test, numeric_cols, lower_percentile=0.01, upper_percentile=0.99):
    """Clip numeric columns to quantile bounds learned from the training frame.

    For each name in ``numeric_cols`` that exists in ``X_train`` with a numeric
    dtype, the lower/upper quantiles of the training column are computed and
    both the training and the test column are clipped to that range. Names that
    are absent from ``X_train`` or refer to non-numeric columns are skipped.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; neither is modified.
    numeric_cols : iterable of str
        Candidate column names.
    lower_percentile, upper_percentile : float
        Quantile levels (0-1) passed to ``Series.quantile``.

    Returns
    -------
    tuple of pandas.DataFrame
        Clipped copies ``(X_train, X_test)``.
    """
    train_clipped = X_train.copy()
    test_clipped = X_test.copy()

    for name in numeric_cols:
        if name not in train_clipped.columns:
            continue
        if not pd.api.types.is_numeric_dtype(train_clipped[name]):
            continue

        # Both bounds come from the still-unclipped training column.
        floor = train_clipped[name].quantile(lower_percentile)
        ceiling = train_clipped[name].quantile(upper_percentile)

        train_clipped[name] = train_clipped[name].clip(lower=floor, upper=ceiling)
        test_clipped[name] = test_clipped[name].clip(lower=floor, upper=ceiling)

    return train_clipped, test_clipped

### Agrupando categorias raras e codificando colunas multirrótulo

In [92]:
def group_rare_categories(series_train, series_test, min_count_threshold=50, replacement_label='Other'):
    """Collapse infrequent categories into a single replacement label.

    Category frequencies are taken from ``series_train`` only. Every value
    seen fewer than ``min_count_threshold`` times there is replaced by
    ``replacement_label`` in both series. Afterwards, any value still present
    in the test series that does not occur in the processed training series
    is replaced by ``replacement_label`` as well.

    Parameters
    ----------
    series_train, series_test : pandas.Series
        Categorical columns from the training and test splits.
    min_count_threshold : int
        Minimum training frequency for a category to be kept.
    replacement_label : str
        Label substituted for rare or unseen categories.

    Returns
    -------
    tuple of pandas.Series
        ``(processed_train, processed_test)``.
    """
    train_freq = series_train.value_counts()
    rare_labels = train_freq.loc[train_freq < min_count_threshold].index

    train_grouped = series_train.replace(rare_labels, replacement_label)
    test_grouped = series_test.replace(rare_labels, replacement_label)

    # Values the training split never produced cannot be encoded consistently.
    known_values = set(train_grouped.unique())
    unseen_values = set(test_grouped.unique()) - known_values
    if not unseen_values:
        return train_grouped, test_grouped

    test_grouped = test_grouped.replace(list(unseen_values), replacement_label)
    return train_grouped, test_grouped

def process_multilabel_column(train_series, test_series, sep='-', top_n=12, prefix='feature'):
    """Multi-hot encode a delimiter-separated column using the top training labels.

    Each cell is converted to ``str`` and split on ``sep``. The ``top_n`` most
    frequent (whitespace-stripped) labels in the training series are kept; in
    every row, labels outside that set are discarded, and a row left with no
    label is assigned the single label ``'Other'``. A ``MultiLabelBinarizer``
    is fitted on the filtered training rows and applied to the test rows.

    Because cells go through ``astype(str)``, a missing value may reach the
    splitter as ordinary text and be counted like any other label.

    Parameters
    ----------
    train_series, test_series : pandas.Series
        Raw multi-label columns from the training and test splits.
    sep : str
        Delimiter between labels inside one cell.
    top_n : int
        Number of most frequent training labels to keep.
    prefix : str
        Prefix for the generated column names (``f"{prefix}_{label}"``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_enc, test_enc)`` indexed like the corresponding input series.
    """
    train_labels = train_series.astype(str).str.split(sep).explode().str.strip()
    top_categories = train_labels.value_counts().nlargest(top_n).index.tolist()
    allowed_labels = set(top_categories)  # constant-time membership checks

    def filter_and_process(vals_series):
        rows = []
        for tokens in vals_series.astype(str).str.split(sep):
            # Anything that did not split into a list is treated as "no labels".
            if not isinstance(tokens, list):
                tokens = []
            kept = [label for label in (tok.strip() for tok in tokens) if label in allowed_labels]
            rows.append(kept if kept else ['Other'])
        return rows

    encoder = MultiLabelBinarizer()
    train_matrix = encoder.fit_transform(filter_and_process(train_series))
    test_matrix = encoder.transform(filter_and_process(test_series))

    column_names = [f"{prefix}_{cls}" for cls in encoder.classes_]
    train_enc = pd.DataFrame(train_matrix, columns=column_names, index=train_series.index)
    test_enc = pd.DataFrame(test_matrix, columns=column_names, index=test_series.index)
    return train_enc, test_enc

### Pipeline para carregar, processar e dividir os dados


In [93]:
def prepare_data(file_path, target_col='vote_average', test_size=0.2, random_state=87):
    """Load the dataset, split it, and build model-ready train/test features.

    Steps, in order:
      1. Read the CSV and run ``feature_engineering``.
      2. Separate the target and split into train/test.
      3. Fill missing ``budget``/``runtime``/``year``/``month`` values with
         the training median.
      4. Clip numeric columns to training quantiles (``handle_outliers``).
      5. Group rare ``original_language`` values and one-hot encode them.
      6. Multi-hot encode ``genres`` and ``production_companies``.
      7. Drop ``title`` when present and align test columns to train.

    Every statistic (medians, quantiles, category frequencies, top labels) is
    computed from the training split and then applied to the test split.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to load.
    target_col : str
        Name of the target column.
    test_size : float
        Passed to ``train_test_split``.
    random_state : int
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; ``X_test`` has exactly the
        columns of ``X_train``, in the same order.
    """
    print("Iniciando preparação dos dados...")

    df = load_data(file_path)
    df = feature_engineering(df)
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # The split frames are assigned to column-by-column below; take explicit
    # copies so those writes never target a view of ``X`` (and do not raise
    # SettingWithCopyWarning).
    X_train, X_test = X_train.copy(), X_test.copy()

    numeric_cols_to_impute = ['budget', 'runtime', 'year', 'month']

    for col in numeric_cols_to_impute:
        if col in X_train.columns:
            median_val = X_train[col].median()
            X_train[col] = X_train[col].fillna(median_val)
            X_test[col] = X_test[col].fillna(median_val)

    numeric_cols = ['budget', 'runtime', 'credits_count', 'year', 'month']
    X_train, X_test = handle_outliers(X_train, X_test, numeric_cols)

    X_train['original_language'], X_test['original_language'] = group_rare_categories(
        X_train['original_language'], X_test['original_language'], min_count_threshold=50
    )
    X_train = pd.get_dummies(X_train, columns=['original_language'], drop_first=True)
    # Do NOT drop_first on the test side: get_dummies drops the first category
    # *present in that frame*, so a test split lacking the training baseline
    # would lose a different indicator, which the alignment below would then
    # re-create filled with zeros. Keeping every test indicator and letting the
    # left-align discard the training baseline keeps both encodings identical.
    X_test = pd.get_dummies(X_test, columns=['original_language'], drop_first=False)

    genres_train_enc, genres_test_enc = process_multilabel_column(X_train['genres'], X_test['genres'], sep='-', top_n=12, prefix='genre')
    X_train = X_train.drop(columns=['genres']).join(genres_train_enc)
    X_test = X_test.drop(columns=['genres']).join(genres_test_enc)

    prod_train_enc, prod_test_enc = process_multilabel_column(X_train['production_companies'], X_test['production_companies'], sep='-', top_n=8, prefix='prod')
    X_train = X_train.drop(columns=['production_companies']).join(prod_train_enc)
    X_test = X_test.drop(columns=['production_companies']).join(prod_test_enc)

    if 'title' in X_train.columns:
        X_train = X_train.drop(columns=['title'])
        X_test = X_test.drop(columns=['title'])

    # Left join on columns: the test frame ends up with exactly the training
    # columns (extras removed, missing ones added as 0).
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    print("Preparação dos dados concluída.")
    return X_train, X_test, y_train, y_test

## Preparação dos dados para Treinamento e Avaliação

In [None]:
# Run configuration: input file, seed and hold-out fraction.
FILE_PATH = '../data/filmes_filtrados_credits.csv'
RANDOM_SEED = 87
TEST_SIZE = 0.2

# Build the preprocessed train/test split.
X_train, X_test, y_train, y_test = prepare_data(
    FILE_PATH,
    target_col='vote_average',
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# Standardised copies of the features. The scaler is fitted on the training
# frame only and then applied to the test frame; the unscaled frames are kept.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

Iniciando preparação dos dados...
Preparação dos dados concluída.


## Definição dos modelos
- Modelo RandomForest
- Modelo XGBRegressor
- Modelo SVR

In [95]:
# Regressors to compare, keyed by the display name used in the metrics table.
models = {
    # Bagged trees with limited depth and a minimum leaf size.
    "RandomForest": RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        min_samples_leaf=4,
        random_state=RANDOM_SEED,
        n_jobs=-1,
    ),
    # Gradient-boosted trees with row and column subsampling.
    "XGBoost": XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        verbosity=0,
    ),
    # RBF-kernel support vector regression.
    "SVR": SVR(
        kernel='rbf',
        C=1.0,
        gamma='scale',
    ),
}

## Treinamento e Avaliação dos Modelos de Regressão

In [96]:
def _regression_metrics(y_true, y_pred):
    """Return R2, MAE, RMSE and MedAE for one set of predictions."""
    return {
        'R2': r2_score(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MedAE': median_absolute_error(y_true, y_pred)
    }


kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
all_metrics = {}      # hold-out (test) metrics per model
all_cv_metrics = {}   # out-of-fold metrics on the training split per model

for name, model in models.items():
    # SVR is trained on the standardised features; the tree models use the
    # unscaled frames.
    if name == "SVR":
        X_train_fit, X_test_fit = X_train_scaled, X_test_scaled
    else:
        X_train_fit, X_test_fit = X_train, X_test

    # Out-of-fold predictions on the training split. These were previously
    # computed and then discarded; they are now scored so the 5-fold run
    # actually contributes an estimate that does not touch the test set.
    # NOTE(review): X_train_scaled was standardised before the folds are
    # formed, so the SVR cross-validation scores are slightly optimistic.
    y_pred_cv_train = cross_val_predict(model, X_train_fit, y_train, cv=kf, n_jobs=-1)
    all_cv_metrics[name] = _regression_metrics(y_train, y_pred_cv_train)

    model.fit(X_train_fit, y_train)

    y_test_pred = model.predict(X_test_fit)

    metrics = _regression_metrics(y_test, y_test_pred)
    all_metrics[name] = metrics

print("\n\n==================================================")
print("      Métricas de Avaliação Consolidadas      ")
print("==================================================")

metrics_df = pd.DataFrame(all_metrics).T
print(metrics_df)

print("\n==================================================")
print("   Métricas de Validação Cruzada (5 folds, treino)   ")
print("==================================================")

cv_metrics_df = pd.DataFrame(all_cv_metrics).T
print(cv_metrics_df)



      Métricas de Avaliação Consolidadas      
                    R2       MAE      RMSE     MedAE
RandomForest  0.399492  0.969570  1.511209  0.592048
XGBoost       0.407013  0.951707  1.501715  0.561051
SVR           0.144854  1.089977  1.803372  0.595216
