# Importações

In [None]:
# Optional: install the notebook's dependencies (uncomment the line below to run it).
#!pip install scikit-learn pandas numpy matplotlib xgboost smogn imbalanced-learn scipy

# If any of the imports fail, run the commands below to reinstall these pinned versions:
#!pip uninstall -y scikit-learn imbalanced-learn scipy
#!pip install scikit-learn==1.3.2 scipy==1.11.4 imbalanced-learn==0.11.0

In [None]:
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Pré-processamento e pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Modelos de ML
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Validação e busca de hiperparâmetros
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform, loguniform
from scipy.stats.mstats import winsorize

# Métricas
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# SMOGN, SMOTE e SMOTEENN
import smogn
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

# Carregando dataset

In [None]:
# Location of the movies dataset, resolved relative to the notebook's working directory.
DATASET_PATH = 'filmes_luan.csv'

# Load the raw dataset into the DataFrame used by all following cells.
df = pd.read_csv(DATASET_PATH)

# Identificando e tratando outliers em variáveis numéricas do df

In [None]:
# Columns inspected for outliers (currently only 'popularity').
numeric_cols = df.loc[:, ['popularity']].columns

# Draw one slim horizontal boxplot per inspected column.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 1))
    ax.boxplot(df[col], vert=False)
    ax.set_title(f'Boxplot de {col}')
    plt.show()

### Tratamento com Winsorization + LOG nos outliers
 - Aplica `log1p` para comprimir a escala dos valores de `popularity`.
 - Substitui os outliers extremos pelos valores dos percentis limite (1% inferior e 1% superior).
 - Reduz o impacto de valores extremos sem truncar bruscamente, como o CAP faz.
 - Mantém essas observações no dataset, em vez de removê-las.

In [None]:
# Compress the scale of 'popularity' with log1p, then winsorize: the lowest 1%
# and the highest 1% of values are replaced by the nearest remaining value.
# NOTE: this cell reads and overwrites df['popularity'], so running it a second
# time applies the transform again to already-transformed values.
# NOTE(review): the winsorize limits are derived from the full DataFrame, i.e.
# before the train/test split done further down — confirm this is acceptable.
popularity_log = np.log1p(df['popularity'])
df['popularity'] = winsorize(popularity_log, limits=[0.01, 0.01])

# Re-plot the column to inspect the distribution after the treatment.
fig, ax = plt.subplots(figsize=(8, 1))
ax.boxplot(df['popularity'], vert=False)
ax.set_title('Boxplot de popularity (log + winsorize)')
plt.show()

# Treinamento
- Modelo XGBRegressor
- Modelo SVM

### Definindo as colunas de features e o target

In [None]:
# Target is 'vote_average'; every other column of df is used as a feature.
y = df['vote_average']
X = df.drop('vote_average', axis=1)

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=87,
)

# Column groups for preprocessing.
# NOTE(review): every non-categorical column is treated as numeric — confirm
# the dataset has no other text/object columns.
categorical_col = ['original_language']
is_numeric = ~X_train.columns.isin(categorical_col)
numerical_cols = X_train.columns[is_numeric].tolist()

In [None]:
# Fit both transformers on the training split only, then reuse them on the test split.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
scaler = StandardScaler()

# Training split: learn the categories / scaling statistics and transform.
X_cat_train = ohe.fit_transform(X_train[categorical_col])
X_num_train = scaler.fit_transform(X_train[numerical_cols])

# Test split: transform only; categories unseen in training are ignored by the encoder.
X_cat_test = ohe.transform(X_test[categorical_col])
X_num_test = scaler.transform(X_test[numerical_cols])

# Final dense matrices: one-hot columns first, scaled numeric columns after.
X_train_transf = np.hstack([X_cat_train, X_num_train])
X_test_transf = np.hstack([X_cat_test, X_num_test])

# Aplicando SMOGN, SMOTE e SMOTEENN

In [None]:
# SMOTE and SMOTEENN are classification resamplers, so the continuous target is
# first discretised into 5 quantile bins that act as class labels.
# NOTE(review): y_smote / y_smoteenn therefore hold bin labels rather than
# 'vote_average' values — confirm how the cells that consume them handle this.
y_train_binned = pd.qcut(y_train, q=5, labels=False)

# Both resamplers use the same seed and are fitted on the encoded training matrix.
smote = SMOTE(random_state=42)
smoteenn = SMOTEENN(random_state=42)

X_smote, y_smote = smote.fit_resample(X_train_transf, y_train_binned)
X_smoteenn, y_smoteenn = smoteenn.fit_resample(X_train_transf, y_train_binned)

In [None]:
# Apply SMOGN (over-sampling designed for regression targets).
# smogn.smoter takes features and target in a single DataFrame, so the target
# is appended as a column of the already-encoded training matrix.
X_smogn_df = pd.DataFrame(X_train_transf)
X_smogn_df['vote_average'] = y_train.values

# Every other stochastic step in this notebook pins a seed, but this call
# receives none. Seed the global RNGs for the duration of the call so the
# resampled set is reproducible, then restore them so later cells see the
# same random streams they would have seen otherwise.
# NOTE(review): assumes smogn samples through `random` / `np.random` — confirm
# against the installed smogn version.
_py_rng_state = random.getstate()
_np_rng_state = np.random.get_state()
random.seed(42)
np.random.seed(42)
try:
    X_smogn = smogn.smoter(
        data=X_smogn_df,
        y='vote_average',
        k=3,
        samp_method='balance'
    )
finally:
    random.setstate(_py_rng_state)
    np.random.set_state(_np_rng_state)

# Split the resampled frame back into target and features.
y_smogn = X_smogn['vote_average']
X_smogn = X_smogn.drop(columns=['vote_average'])

# Pipeline com XGBoost (RandomizedSearchCV)

In [None]:
# Preprocessing is part of the pipeline, so the encoder and scaler are fitted
# together with the regressor each time the search fits a candidate.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_col),
    ('num', StandardScaler(), numerical_cols)
])

# The regressor runs single-threaded on purpose: parallelism is handled by the
# search below (n_jobs=-1). Requesting all cores at both levels makes the
# workers compete for the same CPUs and slows the search down.
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1))
])

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_dist = {
    'regressor__n_estimators': randint(100, 300),       # 100 .. 299 trees
    'regressor__max_depth': randint(3, 10),             # depth 3 .. 9
    'regressor__learning_rate': uniform(0.01, 0.3),     # 0.01 .. 0.31
    'regressor__subsample': uniform(0.7, 0.3),          # 0.7 .. 1.0
    'regressor__colsample_bytree': uniform(0.7, 0.3),   # 0.7 .. 1.0
}

# Shuffled 5-fold CV with a fixed seed so every candidate sees the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=87)

# 60 sampled candidates x 5 folds, scored by R² and evaluated in parallel.
xgb_rand = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=60,
    cv=kf,
    scoring='r2',
    random_state=87,
    verbose=2,
    n_jobs=-1
)

# Fit on the raw training split; the pipeline applies the preprocessing itself.
xgb_rand.fit(X_train, y_train)
print("Melhores parâmetros:", xgb_rand.best_params_)
print("Melhor R² (validação cruzada):", xgb_rand.best_score_)