# Importações

In [16]:
#!pip install scikit-learn pandas numpy matplotlib xgboost smogn imbalanced-learn scipy

#Caso dê algum erro nas importacoes rodar os comandos abaixo:
#!pip uninstall -y scikit-learn imbalanced-learn scipy
#!pip install scikit-learn==1.3.2 scipy==1.11.4 imbalanced-learn==0.11.0

In [8]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import randint, uniform, loguniform

from xgboost import XGBRegressor
from sklearn.svm import SVR

import smogn
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

# Carregando dataset

In [9]:
df = pd.read_csv('filmes_luan.csv')

# Extraindo data para ano e mês

In [10]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, date_column):
        self.date_column = date_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.date_column] = pd.to_datetime(X[self.date_column], errors='coerce')
        X['year'] = X[self.date_column].dt.year
        X['month'] = X[self.date_column].dt.month
        return X.drop(columns=[self.date_column])

### Tratamento com CAP + LOG nos outliers
 - Substitui outliers extremos pelos percentis limite.
 - reduz o impacto de valores extremos sem truncar bruscamente como o CAP faz.
 - reduzindo o impacto desses valores sem removê-los do dataset.

In [11]:
class LogCapTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.bounds_ = {}

    def fit(self, X, y=None):
        for col in self.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            self.bounds_[col] = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            lower, upper = self.bounds_[col]
            X[col] = X[col].clip(lower, upper)
            X[col] = np.log1p(X[col])
        return X

## Alternativas para tratamento de outliers

In [12]:
from sklearn.preprocessing import StandardScaler

num_pipeline_std = Pipeline([
    ('scale', StandardScaler())
])

from sklearn.preprocessing import RobustScaler

num_pipeline_robust = Pipeline([
    ('scale', RobustScaler())
])

from sklearn.preprocessing import PowerTransformer

num_pipeline_yeojohnson = Pipeline([
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scale', StandardScaler())
])

from sklearn.preprocessing import QuantileTransformer

num_pipeline_quantile = Pipeline([
    ('quantile', QuantileTransformer(output_distribution='normal')),
    ('scale', StandardScaler())
])

class CapTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.bounds_ = {}

    def fit(self, X, y=None):
        for col in self.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            self.bounds_[col] = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            lower, upper = self.bounds_[col]
            X[col] = X[col].clip(lower, upper)
        return X

## Função para processar colunas multilabel

In [13]:
def process_multilabel_column(train_series, test_series, sep='-', top_n=20, outros=True, prefix=''):
    exploded = train_series.str.split(sep).explode().str.strip()
    top = exploded.value_counts().nlargest(top_n).index

    def filter_top(vals):
        vals = [v.strip() for v in vals.split(sep)]
        return [v if v in top else 'Outros' for v in vals] if outros else [v for v in vals if v in top]

    train_processed = train_series.apply(filter_top)
    test_processed = test_series.apply(filter_top)

    mlb = MultiLabelBinarizer()
    train_encoded = pd.DataFrame(
        mlb.fit_transform(train_processed),
        columns=[f'{prefix}_{cls}' for cls in mlb.classes_],
        index=train_series.index
    )
    test_encoded = pd.DataFrame(
        mlb.transform(test_processed),
        columns=[f'{prefix}_{cls}' for cls in mlb.classes_],
        index=test_series.index
    )
    return train_encoded, test_encoded

## DIVISÃO E TRANSFORMAÇÃO

In [14]:
X = df.drop(columns=['vote_average'])
y = df['vote_average']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

genres_train, genres_test = process_multilabel_column(
    X_train['genres'], X_test['genres'], sep='-', top_n=12, outros=True, prefix='genre'
)
X_train = X_train.drop(columns='genres').join(genres_train)
X_test = X_test.drop(columns='genres').join(genres_test)

production_train, production_test = process_multilabel_column(
    X_train['production_companies'], X_test['production_companies'], sep='-', top_n=10, outros=True, prefix='production'
)
X_train = X_train.drop(columns='production_companies').join(production_train)
X_test = X_test.drop(columns='production_companies').join(production_test)

# Pipelines

In [15]:
numerical_cols = ['popularity', 'budget', 'runtime']
categorical_col = ['original_language']
date_column = 'release_date'

### Pré-processamento completo

In [16]:
num_pipeline = Pipeline([
    ('logcap', LogCapTransformer(columns=numerical_cols)),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_col)
], remainder='passthrough')

full_pipeline = Pipeline([
    ('date', DateFeatureExtractor(date_column=date_column)),
    ('preprocess', preprocessor)
])

X_train_transf = full_pipeline.fit_transform(X_train)
X_test_transf = full_pipeline.transform(X_test)

### Aplicando SMOGN, SMOTE e SMOTEENN
- Balanceamento para usar SMOTE e SMOTEENN na regressão

In [13]:
# Transformar alvo contínuo em bins
y_train_binned = pd.qcut(y_train, q=5, labels=False)

# SMOTE
X_smote, y_smote = SMOTE(random_state=42).fit_resample(X_train_transf, y_train_binned)

# SMOTEENN
X_smoteenn, y_smoteenn = SMOTEENN(random_state=42).fit_resample(X_train_transf, y_train_binned)

# SMOGN
X_smogn_df = pd.DataFrame(X_train_transf)
X_smogn_df['vote_average'] = y_train.values
X_smogn = smogn.smoter(data=X_smogn_df, y='vote_average', k=3, samp_method='balance')
y_smogn = X_smogn['vote_average']
X_smogn = X_smogn.drop(columns=['vote_average'])

dist_matrix: 100%|##########| 706/706 [01:52<00:00,  6.27it/s]
synth_matrix: 100%|##########| 706/706 [00:01<00:00, 386.31it/s]
r_index: 100%|##########| 353/353 [00:00<00:00, 658.62it/s]
dist_matrix: 100%|##########| 542/542 [01:23<00:00,  6.51it/s]
synth_matrix: 100%|##########| 542/542 [00:02<00:00, 215.28it/s]
r_index: 100%|##########| 303/303 [00:00<00:00, 639.20it/s]


# Treinamento dos Modelos
- Modelo XGBRegressor
- Modelo SVM

### XGBRegressor

In [18]:
xgb_pipeline = Pipeline([
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1))
])

param_dist_xgb = {
    'regressor__n_estimators': randint(100, 300),
    'regressor__max_depth': randint(3, 10),
    'regressor__learning_rate': uniform(0.01, 0.3),
    'regressor__subsample': uniform(0.7, 0.3),
    'regressor__colsample_bytree': uniform(0.7, 0.3),
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

xgb_rand = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist_xgb,
    n_iter=60,
    cv=kf,
    scoring='r2',
    random_state=42,
    verbose=2,
    n_jobs=-1
)
xgb_rand.fit(X_train_transf, y_train)
print("Melhores parâmetros XGB:", xgb_rand.best_params_)
print("Melhor R² (XGB):", xgb_rand.best_score_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Melhores parâmetros XGB: {'regressor__colsample_bytree': 0.8508037069686584, 'regressor__learning_rate': 0.025443625374996805, 'regressor__max_depth': 7, 'regressor__n_estimators': 228, 'regressor__subsample': 0.7718685672000917}
Melhor R² (XGB): 0.48698226649020937


### SVR

In [15]:
svr_pipeline = Pipeline([
    ('regressor', SVR())
])

param_dist_svr = {
    'regressor__kernel': ['rbf', 'linear'],
    'regressor__C': loguniform(1e-2, 1e2),
    'regressor__epsilon': uniform(0.01, 0.3),
    'regressor__gamma': ['scale', 'auto']
}

svm_rand = RandomizedSearchCV(
    estimator=svr_pipeline,
    param_distributions=param_dist_svr,
    n_iter=10,
    cv=kf,
    scoring='r2',
    random_state=42,
    verbose=2,
    n_jobs=-1
)
svm_rand.fit(X_train_transf, y_train)
print("Melhores parâmetros SVR:", svm_rand.best_params_)
print("Melhor R² (SVR):", svm_rand.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Melhores parâmetros SVR: {'regressor__C': 0.017073967431528128, 'regressor__epsilon': 0.26985284373248053, 'regressor__gamma': 'auto', 'regressor__kernel': 'linear'}
Melhor R² (SVR): 0.4141686272365884


# Avaliação final

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

def strip_regressor_prefix(params):
    return {k.replace('regressor__', ''): v for k, v in params.items()}

xgb_best_params = strip_regressor_prefix(xgb_rand.best_params_)
svr_best_params = strip_regressor_prefix(svm_rand.best_params_)

modelos_bal = {
    'XGBRegressor + SMOTE': XGBRegressor(**xgb_best_params, objective='reg:squarederror', n_jobs=-1),
    'XGBRegressor + SMOTEENN': XGBRegressor(**xgb_best_params, objective='reg:squarederror', n_jobs=-1),
    'XGBRegressor + SMOGN': XGBRegressor(**xgb_best_params, objective='reg:squarederror', n_jobs=-1),
    'SVR + SMOTE': SVR(**svr_best_params),
    'SVR + SMOTEENN': SVR(**svr_best_params),
    'SVR + SMOGN': SVR(**svr_best_params),
}

resultados2 = []

for nome, modelo in modelos_bal.items():
    if 'SMOGN' in nome:
        X_bal, y_bal = X_smogn, y_smogn
    elif 'SMOTEENN' in nome:
        X_bal, y_bal = X_smoteenn, y_smoteenn
    else:
        X_bal, y_bal = X_smote, y_smote

    modelo.fit(X_bal, y_bal)
    y_pred = modelo.predict(X_test_transf)

    resultados2.append({
        'Modelo': nome,
        'MSE': mean_squared_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R²': r2_score(y_test, y_pred)
    })

resultados_df2 = pd.DataFrame(resultados2)
print(resultados_df2.sort_values(by='R²', ascending=False))

```markdown
Modelo                      |     MSE     |   RMSE   |     R²
---------------------------|-------------|----------|-----------
XGBRegressor + SMOGN       |   0.467544  | 0.683772 |  0.417634
SVR + SMOGN                |   0.625158  | 0.790670 |  0.221312
XGBRegressor + SMOTEENN    |  19.161391  | 4.377373 | -22.867152
SVR + SMOTEENN             |  19.654185  | 4.433304 | -23.480970
SVR + SMOTE                |  19.829205  | 4.453000 | -23.698972
XGBRegressor + SMOTE       |  19.904447  | 4.461440 | -23.792692
```

## Testes com modelos de tratamento de outliers diferentes (Com SMOGN)

In [31]:
from sklearn.metrics import mean_squared_error, r2_score

def strip_regressor_prefix(params):
    return {k.replace('regressor__', ''): v for k, v in params.items()}

xgb_best_params = strip_regressor_prefix(xgb_rand.best_params_)
svr_best_params = strip_regressor_prefix(svm_rand.best_params_)

pipelines_num = {
    'LogCap': Pipeline([('logcap', LogCapTransformer(columns=numerical_cols)), ('scale', StandardScaler())]),
    'Standard': Pipeline([('scale', StandardScaler())]),
    'Robust': Pipeline([('scale', RobustScaler())]),
    'YeoJohnson': Pipeline([('power', PowerTransformer(method='yeo-johnson')), ('scale', StandardScaler())]),
    'Quantile': Pipeline([('quantile', QuantileTransformer(output_distribution='normal')), ('scale', StandardScaler())]),
    'CapOnly': Pipeline([('cap', CapTransformer(columns=numerical_cols)), ('scale', StandardScaler())])
}

resultados = []

for nome_transf, pipeline_num in pipelines_num.items():
    # Reconstroi o preprocessor com essa transformação
    preprocessor = ColumnTransformer([
        ('num', pipeline_num, numerical_cols),
        ('cat', cat_pipeline, categorical_col)
    ], remainder='passthrough')

    full_pipeline = Pipeline([
        ('date', DateFeatureExtractor(date_column=date_column)),
        ('preprocess', preprocessor)
    ])

    X_train_transf = full_pipeline.fit_transform(X_train)
    X_test_transf = full_pipeline.transform(X_test)

    # Aplique SMOGN aqui (ou SMOTE/SMOTEENN, se quiser variar também)
    X_smogn_df = pd.DataFrame(X_train_transf)
    X_smogn_df['vote_average'] = y_train.values
    X_smogn = smogn.smoter(data=X_smogn_df, y='vote_average', k=3, samp_method='balance')
    y_smogn = X_smogn['vote_average']
    X_smogn = X_smogn.drop(columns=['vote_average'])

    modelo = XGBRegressor(**xgb_best_params, objective='reg:squarederror', n_jobs=-1)
    modelo.fit(X_smogn, y_smogn)
    y_pred = modelo.predict(X_test_transf)

    resultados.append({
        'Transformação': nome_transf,
        'MSE': mean_squared_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R²': r2_score(y_test, y_pred)
    })

df_resultados = pd.DataFrame(resultados).sort_values(by='R²', ascending=False)
print(df_resultados)

dist_matrix: 100%|##########| 706/706 [01:51<00:00,  6.34it/s]
synth_matrix: 100%|##########| 706/706 [00:01<00:00, 383.56it/s]
r_index: 100%|##########| 353/353 [00:00<00:00, 775.77it/s]
dist_matrix: 100%|##########| 542/542 [01:23<00:00,  6.45it/s]
synth_matrix: 100%|##########| 542/542 [00:02<00:00, 207.48it/s]
r_index: 100%|##########| 303/303 [00:00<00:00, 602.69it/s]
dist_matrix: 100%|##########| 706/706 [02:03<00:00,  5.74it/s]
synth_matrix: 100%|##########| 706/706 [00:03<00:00, 198.57it/s]
r_index: 100%|##########| 353/353 [00:00<00:00, 397.85it/s]
dist_matrix: 100%|##########| 542/542 [01:25<00:00,  6.36it/s]
synth_matrix: 100%|##########| 542/542 [00:02<00:00, 213.38it/s]
r_index: 100%|##########| 303/303 [00:00<00:00, 621.87it/s]
dist_matrix: 100%|##########| 706/706 [02:12<00:00,  5.33it/s]
synth_matrix: 100%|##########| 706/706 [00:01<00:00, 380.92it/s]
r_index: 100%|##########| 353/353 [00:00<00:00, 746.50it/s]
dist_matrix: 100%|##########| 542/542 [01:22<00:00,  6.56it/

  Transformação       MSE      RMSE        R²
1      Standard  0.434842  0.659426  0.482482
4      Quantile  0.439627  0.663044  0.476788
3    YeoJohnson  0.441031  0.664101  0.475117
5       CapOnly  0.442977  0.665565  0.472801
2        Robust  0.446271  0.668035  0.468880
0        LogCap  0.447623  0.669046  0.467272


## Testes com modelos de tratamento de outliers diferentes (Sem SMOGN)

In [21]:
def strip_regressor_prefix(params):
    return {k.replace('regressor__', ''): v for k, v in params.items()}

xgb_best_params = strip_regressor_prefix(xgb_rand.best_params_)

resultados1 = []

pipelines_num = {
    'LogCap': Pipeline([('logcap', LogCapTransformer(columns=numerical_cols)), ('scale', StandardScaler())]),
    'Standard': Pipeline([('scale', StandardScaler())]),
    'Robust': Pipeline([('scale', RobustScaler())]),
    'YeoJohnson': Pipeline([('power', PowerTransformer(method='yeo-johnson')), ('scale', StandardScaler())]),
    'Quantile': Pipeline([('quantile', QuantileTransformer(output_distribution='normal')), ('scale', StandardScaler())]),
    'CapOnly': Pipeline([('cap', CapTransformer(columns=numerical_cols)), ('scale', StandardScaler())])
}

for nome_transf, pipeline_num in pipelines_num.items():
    preprocessor = ColumnTransformer([
        ('num', pipeline_num, numerical_cols),
        ('cat', cat_pipeline, categorical_col)
    ], remainder='passthrough')

    full_pipeline = Pipeline([
        ('date', DateFeatureExtractor(date_column=date_column)),
        ('preprocess', preprocessor)
    ])

    X_train_transf = full_pipeline.fit_transform(X_train)
    X_test_transf = full_pipeline.transform(X_test)

    modelo = XGBRegressor(**xgb_best_params, objective='reg:squarederror', n_jobs=-1)
    modelo.fit(X_train_transf, y_train)
    y_pred = modelo.predict(X_test_transf)

    resultados1.append({
        'Transformação': nome_transf,
        'MSE': mean_squared_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R²': r2_score(y_test, y_pred)
    })

df_resultados1 = pd.DataFrame(resultados1).sort_values(by='R²', ascending=False)
print(df_resultados1)

  Transformação       MSE      RMSE        R²
1      Standard  0.414784  0.644037  0.506354
2        Robust  0.414784  0.644037  0.506354
3    YeoJohnson  0.414784  0.644037  0.506354
4      Quantile  0.414784  0.644037  0.506354
0        LogCap  0.417818  0.646389  0.502743
5       CapOnly  0.417818  0.646389  0.502743
