# Importações

In [37]:
#!pip install scikit-learn pandas numpy matplotlib xgboost smogn imbalanced-learn scipy

# Caso dê algum erro nas importações, rode os comandos abaixo:
#!pip uninstall -y scikit-learn imbalanced-learn scipy
#!pip install scikit-learn==1.3.2 scipy==1.11.4 imbalanced-learn==0.11.0

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR
from scipy.stats import randint, uniform, loguniform

from xgboost import XGBRegressor

import smogn
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

# Carregando dataset

In [39]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('filmes_luan.csv')

# Extraindo data para ano e mês

In [40]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace one date column with numeric ``year`` and ``month`` columns.

    Parameters
    ----------
    date_column : str
        Name of the column holding the dates to decompose.
    """

    def __init__(self, date_column):
        self.date_column = date_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a new frame without the date column and with ``year``/``month`` added.

        Values that cannot be parsed as dates are coerced to ``NaT``, so their
        ``year`` and ``month`` come out as missing values. The input frame is
        not modified.
        """
        parsed_dates = pd.to_datetime(X[self.date_column], errors='coerce')
        without_date = X.drop(columns=[self.date_column])
        return without_date.assign(
            year=parsed_dates.dt.year,
            month=parsed_dates.dt.month,
        )

### Tratamento com CAP + LOG nos outliers
 - CAP: substitui os outliers extremos pelos limites do IQR (Q1 − 1,5·IQR e Q3 + 1,5·IQR).
 - LOG: aplica `log1p` em seguida, comprimindo a cauda da distribuição de forma suave, sem truncar bruscamente como o CAP faz.
 - Em conjunto, reduzem o impacto desses valores sem removê-los do dataset.

In [41]:
class LogCapTransformer(BaseEstimator, TransformerMixin):
    """Clip selected columns to their 1.5 * IQR fences, then apply ``log1p``.

    Parameters
    ----------
    columns : list of str
        Columns of the input DataFrame to cap and log-transform.

    Attributes
    ----------
    bounds_ : dict
        Maps each column name to its ``(lower, upper)`` clipping limits.
        Created by ``fit``; it does not exist on an unfitted instance.
    """

    def __init__(self, columns):
        # Only hyper-parameters are stored here. Learned state is created in
        # fit() so an unfitted instance is recognisable as unfitted.
        self.columns = columns

    def fit(self, X, y=None):
        """Compute the clipping limits of every configured column from ``X``.

        The limits are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. They are
        rebuilt from scratch on every call, so refitting never keeps limits
        learned from a previous fit.
        """
        bounds = {}
        for col in self.columns:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            bounds[col] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
        self.bounds_ = bounds
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns clipped and log1p-scaled.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        # Local import: only this method needs it.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'bounds_')
        X = X.copy()
        for col in self.columns:
            lower, upper = self.bounds_[col]
            # Cap first so the log is applied to the already-limited values.
            X[col] = np.log1p(X[col].clip(lower, upper))
        return X

## Função para processar colunas multilabel

In [42]:
def process_multilabel_column(train_series, test_series, sep='-', top_n=20, outros=True, prefix=''):
    """Multi-hot encode a delimiter-separated label column using train-only statistics.

    The ``top_n`` most frequent labels are determined from ``train_series``
    alone and then applied to both series, so no information from the test
    split influences the encoding.

    Parameters
    ----------
    train_series, test_series : pandas.Series of str
        Cells hold labels joined by ``sep``. Missing (non-string) cells are
        treated as having no labels and produce an all-zero row.
    sep : str
        Delimiter between labels inside a cell.
    top_n : int
        Number of most frequent train labels that keep their own column.
    outros : bool
        If True, labels outside the top set are replaced by ``'Outros'``;
        if False, they are dropped.
    prefix : str
        Prepended (followed by ``_``) to every generated column name.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames. They share the same columns and keep
        the index of their respective input series.
    """
    exploded = train_series.str.split(sep).explode().str.strip()
    # A set gives constant-time membership checks inside filter_top.
    top = set(exploded.value_counts().nlargest(top_n).index)

    def filter_top(vals):
        # NaN / None cells have no .split(); encode them as "no labels".
        if not isinstance(vals, str):
            return []
        labels = [v.strip() for v in vals.split(sep)]
        if outros:
            return [v if v in top else 'Outros' for v in labels]
        return [v for v in labels if v in top]

    train_processed = train_series.apply(filter_top)
    test_processed = test_series.apply(filter_top)

    mlb = MultiLabelBinarizer()
    # Fit on train only; the test split reuses the learned classes.
    train_matrix = mlb.fit_transform(train_processed)
    columns = [f'{prefix}_{cls}' for cls in mlb.classes_]

    train_encoded = pd.DataFrame(train_matrix, columns=columns, index=train_series.index)
    test_encoded = pd.DataFrame(mlb.transform(test_processed), columns=columns, index=test_series.index)
    return train_encoded, test_encoded

## DIVISÃO E TRANSFORMAÇÃO

In [43]:
# Separate the features from the regression target.
X = df.drop(columns=['vote_average'])
y = df['vote_average']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=87)

# Multi-hot encode 'genres': the 12 most frequent train labels keep their own
# column, every other label is mapped to 'Outros'.
genres_train, genres_test = process_multilabel_column(
    X_train['genres'], X_test['genres'], sep='-', top_n=12, outros=True, prefix='genre'
)
# Swap the raw text column for its encoded columns (joined on the row index).
X_train = X_train.drop(columns='genres').join(genres_train)
X_test = X_test.drop(columns='genres').join(genres_test)

# Same treatment for 'production_companies', keeping the 10 most frequent labels.
production_train, production_test = process_multilabel_column(
    X_train['production_companies'], X_test['production_companies'], sep='-', top_n=10, outros=True, prefix='production'
)
X_train = X_train.drop(columns='production_companies').join(production_train)
X_test = X_test.drop(columns='production_companies').join(production_test)

# Pipelines

In [44]:
# Column groups consumed by the preprocessing pipelines.
numerical_cols = ['popularity', 'budget', 'runtime']  # capped, log-transformed and scaled
categorical_col = ['original_language']  # one-hot encoded
date_column = 'release_date'  # decomposed into year and month

### Pré-processamento completo

In [45]:
# Numeric columns: cap outliers + log1p, then standardise.
num_pipeline = Pipeline([
    ('logcap', LogCapTransformer(columns=numerical_cols)),
    ('scale', StandardScaler())
])

# Categorical column: dense one-hot encoding; categories unseen during fit are
# encoded as all zeros instead of raising.
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Columns not listed here (e.g. year/month and the multi-hot columns) are
# passed through untouched and placed after the transformed ones.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_col)
], remainder='passthrough')

# The date step runs first so that year/month exist when the
# ColumnTransformer is applied.
full_pipeline = Pipeline([
    ('date', DateFeatureExtractor(date_column=date_column)),
    ('preprocess', preprocessor)
])

# Fit on the training split only; the test split reuses the fitted statistics.
X_train_transf = full_pipeline.fit_transform(X_train)
X_test_transf = full_pipeline.transform(X_test)

### Aplicando SMOGN, SMOTE e SMOTEENN
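- O alvo contínuo é discretizado em faixas (bins) para que SMOTE e SMOTEENN, que são técnicas de balanceamento para classificação, possam ser usados na regressão.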

In [46]:
# Transformar alvo contínuo em bins
y_train_binned = pd.qcut(y_train, q=5, labels=False)

# SMOTE
X_smote, y_smote = SMOTE(random_state=42).fit_resample(X_train_transf, y_train_binned)

# SMOTEENN
X_smoteenn, y_smoteenn = SMOTEENN(random_state=42).fit_resample(X_train_transf, y_train_binned)

# SMOGN
X_smogn_df = pd.DataFrame(X_train_transf)
X_smogn_df['vote_average'] = y_train.values
X_smogn = smogn.smoter(data=X_smogn_df, y='vote_average', k=3, samp_method='balance')
y_smogn = X_smogn['vote_average']
X_smogn = X_smogn.drop(columns=['vote_average'])

dist_matrix: 100%|##########| 695/695 [01:46<00:00,  6.50it/s]
synth_matrix: 100%|##########| 695/695 [00:01<00:00, 371.02it/s]
r_index: 100%|##########| 386/386 [00:00<00:00, 733.91it/s]
dist_matrix: 100%|##########| 587/587 [01:33<00:00,  6.29it/s]
synth_matrix: 100%|##########| 587/587 [00:02<00:00, 219.79it/s]
r_index: 100%|##########| 123/123 [00:00<00:00, 643.31it/s]


# Treinamento dos Modelos
- Modelo XGBRegressor
- Modelo SVR (regressão por vetores de suporte)

### XGBRegressor

In [47]:
# Single-step pipeline so the search parameters can use the 'regressor__' prefix.
xgb_pipeline = Pipeline([
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1))
])

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale],
# and randint(low, high) excludes the upper bound.
param_dist_xgb = {
    'regressor__n_estimators': randint(100, 300),
    'regressor__max_depth': randint(3, 10),
    'regressor__learning_rate': uniform(0.01, 0.3),  # [0.01, 0.31]
    'regressor__subsample': uniform(0.7, 0.3),  # [0.7, 1.0]
    'regressor__colsample_bytree': uniform(0.7, 0.3),  # [0.7, 1.0]
}

# Shuffled 5-fold CV with a fixed seed; this object is reused by the SVR search.
kf = KFold(n_splits=5, shuffle=True, random_state=87)

# NOTE(review): both the regressor and the search use n_jobs=-1, so they may
# compete for the same cores — confirm this is intended.
xgb_rand = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist_xgb,
    n_iter=60,
    cv=kf,
    scoring='r2',
    random_state=87,
    verbose=2,
    n_jobs=-1
)
# The search runs on the original (non-resampled) training data.
xgb_rand.fit(X_train_transf, y_train)
print("Melhores parâmetros XGB:", xgb_rand.best_params_)
print("Melhor R² (XGB):", xgb_rand.best_score_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Melhores parâmetros XGB: {'regressor__colsample_bytree': 0.755226088407728, 'regressor__learning_rate': 0.07272737227112255, 'regressor__max_depth': 5, 'regressor__n_estimators': 170, 'regressor__subsample': 0.8300538566031057}
Melhor R² (XGB): 0.49198798633056706


### SVR

In [48]:
# Single-step pipeline so the search parameters can use the 'regressor__' prefix.
svr_pipeline = Pipeline([
    ('regressor', SVR())
])

# Search space. C is sampled log-uniformly over [1e-2, 1e2]; epsilon uniformly
# over [0.01, 0.31] (scipy's uniform takes loc and scale).
param_dist_svr = {
    'regressor__kernel': ['rbf', 'linear'],
    'regressor__C': loguniform(1e-2, 1e2),
    'regressor__epsilon': uniform(0.01, 0.3),
    'regressor__gamma': ['scale', 'auto']
}

# Reuses the KFold splitter `kf` created for the XGB search; only 10 candidates
# are sampled here versus 60 for XGB.
svm_rand = RandomizedSearchCV(
    estimator=svr_pipeline,
    param_distributions=param_dist_svr,
    n_iter=10,
    cv=kf,
    scoring='r2',
    random_state=42,
    verbose=2,
    n_jobs=-1
)
# The search runs on the original (non-resampled) training data.
svm_rand.fit(X_train_transf, y_train)
print("Melhores parâmetros SVR:", svm_rand.best_params_)
print("Melhor R² (SVR):", svm_rand.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Melhores parâmetros SVR: {'regressor__C': 0.017073967431528128, 'regressor__epsilon': 0.26985284373248053, 'regressor__gamma': 'auto', 'regressor__kernel': 'linear'}
Melhor R² (SVR): 0.4270260012299543


# Avaliação final

In [51]:
from sklearn.metrics import mean_squared_error, r2_score

def strip_regressor_prefix(params):
    """Return a copy of ``params`` whose keys have every 'regressor__' removed."""
    cleaned = {}
    for key, value in params.items():
        cleaned[key.replace('regressor__', '')] = value
    return cleaned

# Best hyper-parameters from the searches, in the form the bare estimators accept.
xgb_best_params = strip_regressor_prefix(xgb_rand.best_params_)
svr_best_params = strip_regressor_prefix(svm_rand.best_params_)

# Resampled training sets, keyed by the technique name used in the model labels.
conjuntos_bal = {
    'SMOTE': (X_smote, y_smote),
    'SMOTEENN': (X_smoteenn, y_smoteenn),
    'SMOGN': (X_smogn, y_smogn),
}

# One factory per model family so every combination gets its own fresh estimator.
construtores = {
    'XGBRegressor': lambda: XGBRegressor(**xgb_best_params, objective='reg:squarederror', n_jobs=-1),
    'SVR': lambda: SVR(**svr_best_params),
}

# Every (family, technique) combination, labelled '<family> + <technique>'.
modelos_bal = {
    f'{familia} + {tecnica}': construir()
    for familia, construir in construtores.items()
    for tecnica in conjuntos_bal
}

resultados = []

for nome, modelo in modelos_bal.items():
    # The technique name is the part of the label after ' + '.
    X_bal, y_bal = conjuntos_bal[nome.split(' + ')[1]]

    modelo.fit(X_bal, y_bal)
    y_pred = modelo.predict(X_test_transf)

    mse = mean_squared_error(y_test, y_pred)
    resultados.append({
        'Modelo': nome,
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'R²': r2_score(y_test, y_pred)
    })

# Best model (highest R² on the held-out test split) first.
resultados_df = pd.DataFrame(resultados)
print(resultados_df.sort_values(by='R²', ascending=False))

                    Modelo        MSE      RMSE         R²
2     XGBRegressor + SMOGN   0.467544  0.683772   0.417634
5              SVR + SMOGN   0.625158  0.790670   0.221312
1  XGBRegressor + SMOTEENN  19.161391  4.377373 -22.867152
4           SVR + SMOTEENN  19.654185  4.433304 -23.480970
3              SVR + SMOTE  19.829205  4.453000 -23.698972
0     XGBRegressor + SMOTE  19.904447  4.461440 -23.792692
