# Importações

In [1]:
#!pip install scikit-learn pandas numpy matplotlib xgboost

In [None]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt

# Pré-processamento e pipelines
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Modelos de ML
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Validação e busca de hiperparâmetros
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform, loguniform

# Métricas
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score

# Carregando dataset

In [4]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('filmes_gabriel.csv')

# Dividindo os dados
- X_train, X_test, y_train, y_test

In [5]:
# Column groups referenced later by the preprocessing pipelines.
# numerical_cols is a pandas Index (obtained by selecting the columns from df),
# so a missing column raises KeyError right here.
numerical_cols = df[['popularity', 'budget', 'runtime']].columns
categorical_col = ['original_language']
date_column = 'release_date'

# Target is 'vote_average'; every other column of df is kept as a feature.
X = df.drop(columns=['vote_average'])
y = df['vote_average']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=87)

# Transformadores

## Colunas: Gêneros Cinematográficos e Produtora
Aplicando MultiLabelBinarizer, mantendo as 12 e 10 categorias mais frequentes, respectivamente

In [6]:
def process_multilabel_column(train_series, test_series, sep='-', top_n=20, outros=True, prefix=''):
    """Binarize a delimited multi-label text column, keeping only frequent labels.

    The ``top_n`` most frequent labels are learned from ``train_series`` only,
    and the binarizer is fitted on the training data only; the same vocabulary
    is then applied to ``test_series``.

    Parameters
    ----------
    train_series, test_series : pandas.Series
        Text values holding one or more labels separated by ``sep``.
        Missing values (NaN/None) are treated as "no labels" and yield an
        all-zero row.
    sep : str
        Separator between labels inside one value.
    top_n : int
        Number of most frequent training labels to keep.
    outros : bool
        If True, labels outside the top ``top_n`` are replaced by the single
        label 'Outros'; if False they are dropped.
    prefix : str
        Prepended to every output column name as ``f'{prefix}_{label}'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``, indexed like the input series, with
        one 0/1 column per label known to the fitted binarizer.
    """
    # Learn the label vocabulary from the training data only.
    exploded = train_series.str.split(sep).explode().str.strip()
    top = exploded.value_counts().nlargest(top_n).index

    def filter_top(vals):
        # ``.str.split`` above silently skips missing values, but a plain
        # ``vals.split`` would raise AttributeError on NaN/None, so guard here.
        if pd.isna(vals):
            return []
        labels = [v.strip() for v in vals.split(sep)]
        if outros:
            return [v if v in top else 'Outros' for v in labels]
        return [v for v in labels if v in top]

    train_processed = train_series.apply(filter_top)
    test_processed = test_series.apply(filter_top)

    # Fit on train, then reuse the fitted classes for test so both frames
    # share exactly the same columns.
    mlb = MultiLabelBinarizer()
    train_matrix = mlb.fit_transform(train_processed)
    column_names = [f'{prefix}_{cls}' for cls in mlb.classes_]

    train_encoded = pd.DataFrame(train_matrix, columns=column_names, index=train_series.index)
    test_encoded = pd.DataFrame(
        mlb.transform(test_processed),
        columns=column_names,
        index=test_series.index
    )

    return train_encoded, test_encoded


### Gêneros cinematográficos

In [7]:
# Frequency of each individual genre label in the training split:
# split every 'genres' string on '-', put one label per row, then count.
teste = (
    X_train['genres']
    .apply(lambda genre_string: genre_string.split('-'))
    .explode()
    .reset_index(drop=True)
)
genre_counts = teste.value_counts()
print(genre_counts)
print(len(genre_counts))

genres
Drama              3380
Comedy             2486
Thriller           1974
Action             1893
Romance            1253
Adventure          1196
Horror             1174
Crime              1161
Science Fiction     864
Fantasy             691
Family              683
Mystery             666
History             381
Animation           364
War                 303
Music               220
Western             129
Documentary          90
TV Movie             59
Name: count, dtype: int64
19


In [8]:
# Encode 'genres' keeping the 12 most frequent training labels; with
# outros=True every other label is mapped to 'Outros'.
genres_train, genres_test = process_multilabel_column(
    X_train['genres'], X_test['genres'], sep='-', top_n=12, outros=True, prefix='genre'
)

# Replace the original text column with the binary indicator columns.
# NOTE(review): X_train/X_test are rebound without 'genres', so running this
# cell a second time fails on X_train['genres'] until the split cell is re-run.
X_train = X_train.drop(columns='genres').join(genres_train)
X_test = X_test.drop(columns='genres').join(genres_test)

### Produtora

In [9]:
# Frequency of each individual production company in the training split:
# split every 'production_companies' string on '-', one label per row, then count.
teste = (
    X_train['production_companies']
    .apply(lambda company_string: company_string.split('-'))
    .explode()
    .reset_index(drop=True)
)
production_counts = teste.value_counts()
print(production_counts)
print(len(production_counts))

production_companies
Universal Pictures                        444
Warner Bros. Pictures                     441
Columbia Pictures                         343
Paramount                                 333
20th Century Fox                          315
                                         ... 
October Pictures                            1
Eidos Films                                 1
Samuelson Productions                       1
Gorai / Samuelson Productions               1
Arlington Road Productions Corporation      1
Name: count, Length: 9045, dtype: int64
9045


In [10]:
# Encode 'production_companies' keeping the 10 most frequent training labels;
# with outros=True every other company is mapped to 'Outros'.
# NOTE(review): values are split on '-', so a company name that itself contains
# a hyphen is broken into several labels — confirm the source data avoids this.
production_train, production_test = process_multilabel_column(
    X_train['production_companies'], X_test['production_companies'], sep='-', top_n=10, outros=True, prefix='production'
)

# Replace the original text column with the binary indicator columns.
# NOTE(review): X_train/X_test are rebound without 'production_companies', so
# running this cell a second time fails until the split cell is re-run.
X_train = X_train.drop(columns='production_companies').join(production_train)
X_test = X_test.drop(columns='production_companies').join(production_test)

## Colunas de data
Transformer: Data para ano/mês

In [11]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Transformer that replaces a date column with numeric ``year`` and ``month`` columns."""

    def __init__(self, date_column):
        self.date_column = date_column

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame without ``date_column`` and with ``year``/``month`` added.

        Values that cannot be parsed as dates become NaT (``errors='coerce'``),
        which produces missing ``year``/``month`` entries for those rows.
        The input frame is not modified.
        """
        parsed_dates = pd.to_datetime(X[self.date_column], errors='coerce')
        without_date = X.drop(columns=[self.date_column])
        return without_date.assign(
            year=parsed_dates.dt.year,
            month=parsed_dates.dt.month,
        )

## Outliers
- CapTransformer: Cap
- LogCapTransformer: Cap + Log

In [12]:
class CapTransformer(BaseEstimator, TransformerMixin):
    """Clip selected columns to IQR-based bounds learned during ``fit``.

    For every column in ``columns`` the bounds are
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed on the data passed
    to ``fit``. ``transform`` clips values to those bounds; no rows are removed.

    Parameters
    ----------
    columns : iterable of column labels
        Columns of the input DataFrame to cap.

    Attributes
    ----------
    bounds_ : dict
        Maps each column label to its ``(lower, upper)`` pair. Only exists
        after ``fit`` has been called.
    """

    def __init__(self, columns):
        # Only constructor parameters are stored here. Learned state (names
        # ending in "_") is created in fit, so an unfitted instance is not
        # mistaken for a fitted one.
        self.columns = columns

    def fit(self, X, y=None):
        """Compute the IQR bounds of each configured column of DataFrame ``X``."""
        # Build a fresh dict on every fit so a refit never keeps stale entries.
        bounds = {}
        for col in self.columns:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            bounds[col] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
        self.bounds_ = bounds
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns clipped to the fitted bounds."""
        X = X.copy()
        for col in self.columns:
            lower, upper = self.bounds_[col]
            X[col] = X[col].clip(lower, upper)
        return X


In [13]:
class LogCapTransformer(BaseEstimator, TransformerMixin):
    """Clip selected columns to IQR-based bounds, then apply ``log1p``.

    Bounds are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` per column, computed
    on the data passed to ``fit``. ``transform`` first clips to those bounds
    and then replaces each value ``x`` with ``log(1 + x)``.

    Parameters
    ----------
    columns : iterable of column labels
        Columns of the input DataFrame to cap and log-transform.

    Attributes
    ----------
    bounds_ : dict
        Maps each column label to its ``(lower, upper)`` pair. Only exists
        after ``fit`` has been called.
    """

    def __init__(self, columns):
        # Only constructor parameters are stored here. Learned state (names
        # ending in "_") is created in fit, so an unfitted instance is not
        # mistaken for a fitted one.
        self.columns = columns

    def fit(self, X, y=None):
        """Compute the IQR bounds of each configured column of DataFrame ``X``."""
        # Build a fresh dict on every fit so a refit never keeps stale entries.
        bounds = {}
        for col in self.columns:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            bounds[col] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
        self.bounds_ = bounds
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns clipped and log1p-transformed."""
        X = X.copy()
        for col in self.columns:
            lower, upper = self.bounds_[col]
            X[col] = X[col].clip(lower, upper)
            # NOTE(review): log1p is undefined below -1; this assumes the capped
            # columns hold non-negative values — confirm against the data.
            X[col] = np.log1p(X[col])
        return X

# Identificando outliers (gráficos-modelo, se quiser)

# Same three columns as numerical_cols, under the name used by the outlier code below.
numeric_cols = df[['popularity', 'budget', 'runtime']].columns

# One compact horizontal boxplot per numeric column, drawn from df as it currently stands.
for col in numeric_cols:
    plt.figure(figsize=(8, 1))
    plt.boxplot(df[col], vert=False)
    plt.title(f'Boxplot de {col}')
    plt.show()

### Tratamento com Cap nos outliers
 - Função para aplicar o "cap" nos outliers usando o método do IQR (Interquartile Range).
 - O cap limita os valores extremos (outliers) aos limites Q1 − 1,5·IQR e Q3 + 1,5·IQR,
   reduzindo o impacto desses valores sem removê-los do dataset.


def log_transform(df, columns):
    """Return a copy of ``df`` in which each listed column is replaced by ``log1p`` of itself.

    The input frame is left untouched; columns not listed are carried over as-is.
    """
    transformed = df.copy()
    for name in columns:
        transformed[name] = np.log1p(transformed[name])
    return transformed


def cap_outliers_iqr(df, columns):
    """Clip each listed column to its IQR fence and report the fences used.

    For every column the fence is ``(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)``,
    computed from ``df`` itself. Returns ``(capped_copy, bounds)`` where
    ``bounds`` maps column label -> ``(lower, upper)``. The input frame is
    not modified.
    """
    capped = df.copy()
    bounds = {}
    for name in columns:
        first_q, third_q = (capped[name].quantile(q) for q in (0.25, 0.75))
        spread = third_q - first_q
        fence = (first_q - 1.5 * spread, third_q + 1.5 * spread)
        capped[name] = capped[name].clip(*fence)
        bounds[name] = fence
    return capped, bounds


def apply_outlier_bounds(df, bounds):
    """Clip columns of a copy of ``df`` using precomputed ``{column: (lower, upper)}`` bounds.

    Meant for applying bounds learned on one frame to another; the input
    frame is not modified.
    """
    clipped = df.copy()
    for name in bounds:
        low, high = bounds[name]
        clipped[name] = clipped[name].clip(low, high)
    return clipped


# Learn the IQR bounds on the training split only, then reuse them on the test split.
# NOTE(review): the model pipelines further down also cap these columns through
# CapTransformer, so running this cell as well caps the data twice and feeds
# log-transformed values to the pipelines — confirm which path is intended.
X_train, bounds = cap_outliers_iqr(X_train, numeric_cols)

X_test = apply_outlier_bounds(X_test, bounds)

# log1p is applied after capping, to both splits.
X_train = log_transform(X_train, numeric_cols)
X_test = log_transform(X_test, numeric_cols)

def log_transform(df, columns):
    """Return a copy of ``df`` with ``log1p`` applied to each listed column.

    Works on a copy so the caller's DataFrame is never modified in place;
    callers must use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
    columns : iterable of column labels to transform.

    Returns
    -------
    pandas.DataFrame
    """
    result = df.copy()
    for col in columns:
        # log1p(x) = log(1 + x), so a value of 0 maps to 0 instead of -inf.
        result[col] = np.log1p(result[col])
    return result

def cap_outliers_iqr(df, columns):
    """Return a copy of ``df`` with each listed column clipped to its IQR fence.

    The fence of a column is ``(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)``, computed
    from ``df`` itself. Works on a copy so the caller's DataFrame is never
    modified in place.

    NOTE(review): this rebinds the name of the earlier ``cap_outliers_iqr``
    that returns ``(df, bounds)``; once this definition has run, code that
    unpacks two return values no longer works. Consider giving the two
    variants distinct names.

    Parameters
    ----------
    df : pandas.DataFrame
    columns : iterable of column labels to cap.

    Returns
    -------
    pandas.DataFrame
    """
    result = df.copy()
    for col in columns:
        q1 = result[col].quantile(0.25)
        q3 = result[col].quantile(0.75)
        iqr = q3 - q1
        result[col] = result[col].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    return result

# Log-transform first, then cap, on the whole dataset.
# NOTE(review): df is rebound to the transformed frame, so running this cell
# again applies log1p a second time; the order (log, then cap) is also the
# reverse of the cap-then-log order used on X_train/X_test above.
df = log_transform(df, numeric_cols)
df = cap_outliers_iqr(df, numeric_cols)

# Boxplots of the transformed columns, for comparison with the raw ones.
for col in numeric_cols:
    plt.figure(figsize=(8, 1))
    plt.boxplot(df[col], vert=False)
    plt.title(f'Boxplot de {col} (log + cap_outliers_iqr)')
    plt.show()


# Treinamento
- Modelo XGBRegressor
- Modelo SVR

### Definindo Kf

In [14]:
# 5-fold cross-validation splitter (shuffled, fixed seed) passed as `cv` to the searches below.
kf = KFold(n_splits=5, shuffle=True, random_state=87)

### XGBRegressor

In [15]:
# Numeric columns: IQR capping followed by standardization.
num_pipeline = Pipeline([
    ('cap', CapTransformer(columns=numerical_cols)),
    ('scale', StandardScaler())
])

# Categorical column: one-hot encoding; categories unseen during fit are ignored.
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combined preprocessor: each sub-pipeline only sees its own columns.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_col)
], remainder='passthrough')  # every other column (year/month, binarized labels) passes through unchanged


# Full pipeline: date features -> preprocessing -> regressor.
pipeline = Pipeline([
    ('date', DateFeatureExtractor(date_column=date_column)),  # adds year and month, drops the date column
    ('preprocess', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1))
])

# Search space for the randomized search.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) samples from
# [loc, loc + scale], e.g. uniform(0.7, 0.3) covers 0.7 to 1.0.
param_dist = {
    'regressor__n_estimators': randint(100, 300),
    'regressor__max_depth': randint(3, 10),
    'regressor__learning_rate': uniform(0.01, 0.3),
    'regressor__subsample': uniform(0.7, 0.3),
    'regressor__colsample_bytree': uniform(0.7, 0.3),
}

# Same settings as the kf defined in the KFold cell; repeated here.
kf = KFold(n_splits=5, shuffle=True, random_state=87)

# 30 sampled configurations x 5 folds, scored by R².
XGBRegressor_rand = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=30,
    cv=kf,
    scoring='r2',
    random_state=87,
    verbose=2,
    n_jobs=-1
)

# Fit the search on the training split and report the best configuration.
XGBRegressor_rand.fit(X_train, y_train)

print("Melhores parâmetros:", XGBRegressor_rand.best_params_)
print("Melhor R² (validação cruzada):", XGBRegressor_rand.best_score_)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Melhores parâmetros: {'regressor__colsample_bytree': np.float64(0.8696231377727208), 'regressor__learning_rate': np.float64(0.05992056035524427), 'regressor__max_depth': 5, 'regressor__n_estimators': 241, 'regressor__subsample': np.float64(0.8173221947525602)}
Melhor R² (validação cruzada): 0.48837201095390803


### SVR

In [16]:
# Numeric columns: IQR capping followed by standardization.
# NOTE(review): num_pipeline, cat_pipeline, preprocessor and pipeline are rebound
# here, replacing the objects of the same names built in the XGBRegressor cell.
num_pipeline = Pipeline([
    ('cap', CapTransformer(columns=numerical_cols)),
    ('scale', StandardScaler())
])

# Categorical column: one-hot encoding; categories unseen during fit are ignored.
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# ColumnTransformer combining both sub-pipelines.
# NOTE(review): passthrough columns ('year', 'month', binarized labels) reach
# the SVR without scaling — confirm this is intended for a kernel model.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_col)
], remainder='passthrough')  # keeps columns such as 'year', 'month' and the binarized multi-label ones


pipeline = Pipeline([
    ('date', DateFeatureExtractor(date_column=date_column)),  # adds 'year' and 'month', drops the date column
    ('preprocess', preprocessor),
    ('regressor', SVR())
])

# Search space: C is sampled log-uniformly between 1e-2 and 1e2;
# uniform(0.01, 0.3) samples epsilon from [0.01, 0.31].
param_dist = {
    'regressor__kernel': ['rbf', 'linear'],
    'regressor__C': loguniform(1e-2, 1e2),
    'regressor__epsilon': uniform(0.01, 0.3),
    'regressor__gamma': ['scale', 'auto']
}


svm_rand = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,  # NOTE(review): the original comment said this had been increased, but the value is 10 — confirm the intended budget
    cv=kf,
    scoring='r2',
    random_state=87,
    verbose=2,
    n_jobs=-1
)

# Fit the search on the training split and report the best configuration.
svm_rand.fit(X_train, y_train)

print("Melhores parâmetros:", svm_rand.best_params_)
print("Melhor R² (validação cruzada):", svm_rand.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Melhores parâmetros: {'regressor__C': np.float64(4.217516020234043), 'regressor__epsilon': np.float64(0.29647041307396826), 'regressor__gamma': 'auto', 'regressor__kernel': 'rbf'}
Melhor R² (validação cruzada): 0.45791552953965875


# Avaliação dos modelos

In [None]:
def evaluate_model(model_name, model, X_test, y_test):
    """Predict on ``X_test`` with ``model`` and summarize the regression errors.

    Returns a dict with the keys 'Modelo' (the given name), 'R²', 'MAE',
    'RMSE' (square root of the mean squared error) and 'MedAE'.
    """
    predictions = model.predict(X_test)

    report = {'Modelo': model_name}
    report['R²'] = r2_score(y_test, predictions)
    report['MAE'] = mean_absolute_error(y_test, predictions)
    report['RMSE'] = np.sqrt(mean_squared_error(y_test, predictions))
    report['MedAE'] = median_absolute_error(y_test, predictions)
    return report


In [18]:
# Score the best pipeline found by each search on the held-out test split.
results = [
    evaluate_model("XGBRegressor", XGBRegressor_rand.best_estimator_, X_test, y_test),
    evaluate_model("SVR", svm_rand.best_estimator_, X_test, y_test),
]

# pandas is already imported as `pd` in the notebook's import cell, so no
# re-import is needed here.
results_df = pd.DataFrame(results)
print(results_df)

         Modelo        R²       MAE      RMSE     MedAE
0  XGBRegressor  0.482426  0.491358  0.644614  0.395054
1           SVR  0.441534  0.504149  0.669594  0.389334
