# Importações

In [None]:
#!pip install scikit-learn shap pandas numpy matplotlib xgboost seaborn 

In [1]:
import pickle
import joblib
import shap
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np  
import os
import sys

# Pré-processamento e pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Modelos de ML
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Validação e busca de hiperparâmetros
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from scipy.stats import randint, uniform, loguniform

# Métricas
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score

# Custom transformers: the folder ../Streamlit/src (resolved against the current
# working directory) is put on sys.path so `custom_transformers` can be imported below.
src_path = os.path.abspath(os.path.join("..", "Streamlit", "src"))
if src_path not in sys.path:  # guard: re-running the cell must not append duplicates
    sys.path.append(src_path)

from custom_transformers import (
    DateFeatureExtractor,
    CapTransformer,
    RareCategoryGrouper,
    TopNMultiLabelTransformer,
    BudgetRuntimeRatioTransformer
)

  from .autonotebook import tqdm as notebook_tqdm


# Carregando dataset

In [2]:
# Load the dataset from ../data/filmes_filtrados.csv (path is relative to the working directory).
df = pd.read_csv('../data/filmes_filtrados.csv')

# Dividindo os dados
- X_train, X_test, y_train, y_test

In [3]:
# Column groups consumed by the preprocessing ColumnTransformer in the training cells.
# NOTE(review): 'budget_runtime_ratio', 'year' and 'month' are presumably created inside
# the pipeline by BudgetRuntimeRatioTransformer / DateFeatureExtractor — confirm against
# custom_transformers.
mean_impute_cols = ['popularity', 'budget', 'runtime', 'budget_runtime_ratio']
mode_impute_cols = ['year', 'month']
categorical_col = ['original_language']
date_column = 'release_date'

# Features are every column except the target; the target is 'vote_average'.
X = df.drop(columns=['vote_average'])
y = df['vote_average']

# Hold out 20% of the rows as the test set; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=87)

# Transformadores
- MultiLabelBinarizer -> Gêneros cinematográficos, produtoras e atores
- Data -> Extrai o ano e o mês das datas
- Outliers -> Cap, Log + Cap, Winsorizer
- Agrupamento de categorias raras -> original language
- Criação da feature budget_runtime_ratio

In [4]:
# Custom per-column transformers, one instance per multi-label column.
# NOTE(review): presumably each keeps the `top_n` most frequent labels and emits one
# indicator column per label named with `prefix` — confirm in TopNMultiLabelTransformer.
credits_transformer = TopNMultiLabelTransformer(top_n=60, prefix='credits')
genres_transformer = TopNMultiLabelTransformer(top_n=12, prefix='genre')
prod_companies_transformer = TopNMultiLabelTransformer(top_n=60, prefix='prod_company')

# Treinamento
- Modelo XGBRegressor
- Modelo SVR
- Modelo RandomForest

### Definindo o KFold

In [5]:
# 5-fold shuffled KFold used as `cv` by every hyper-parameter search in this notebook;
# the seed is fixed so all models are compared on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=87)

### XGBRegressor

In [None]:
# Preprocessing sub-pipelines.
# Numeric columns imputed with the mean, then capped and standardized.
mean_numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('cap', CapTransformer(columns=mean_impute_cols)),
    ('scale', StandardScaler())
])

# Numeric columns imputed with the most frequent value, then capped and standardized.
mode_numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('cap', CapTransformer(columns=mode_impute_cols)),
    ('scale', StandardScaler())
])

# Categorical column: missing values become 'unknown', rare categories are grouped,
# then the result is one-hot encoded (categories unseen at fit time are ignored).
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('rare', RareCategoryGrouper(column='original_language', top_n=10, other_label='Other')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Main preprocessor: routes each column group to its sub-pipeline / transformer.
# remainder='drop' discards every column that is not listed here.
preprocessor = ColumnTransformer([
    ('mean_numeric', mean_numeric_pipeline, mean_impute_cols),
    ('mode_numeric', mode_numeric_pipeline, mode_impute_cols),
    ('categorical', cat_pipeline, categorical_col),
    ('credits', credits_transformer, 'credits'),
    ('genres', genres_transformer, 'genres'),
    ('prod_companies', prod_companies_transformer, 'production_companies')
], remainder='drop')

# Full pipeline: feature creation -> preprocessing -> XGBoost regressor.
pipeline = Pipeline([
    ('budget_ratio', BudgetRuntimeRatioTransformer()),
    ('date', DateFeatureExtractor(date_column=date_column)),
    ('preprocess', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=87, n_jobs=-1))
])

# Ask every transformer step to return pandas DataFrames.
# NOTE(review): presumably required because the custom transformers address columns
# by name — confirm in custom_transformers.
pipeline.set_output(transform="pandas")

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale] and
# randint(low, high) samples integers in [low, high).
param_dist = {
    'regressor__n_estimators': randint(100, 300),
    'regressor__max_depth': randint(3, 10),
    'regressor__learning_rate': uniform(0.01, 0.3),
    'regressor__subsample': uniform(0.7, 0.3),
    'regressor__colsample_bytree': uniform(0.7, 0.3),
}

# Randomized search: 30 sampled candidates x 5 folds, scored by R².
# NOTE(review): both the search and the regressor use n_jobs=-1; check for CPU
# oversubscription if this cell is slow.
XGBRegressor_rand = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=30,
    cv=kf,
    scoring='r2',
    random_state=87,
    verbose=2,
    n_jobs=-1
)

XGBRegressor_rand.fit(X_train, y_train)

# Report the search results (nothing is written to disk in this cell).
print("Melhores parâmetros:", XGBRegressor_rand.best_params_)
print("Melhor R² (validação cruzada):", XGBRegressor_rand.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Melhores parâmetros: {'regressor__colsample_bytree': np.float64(0.8696231377727208), 'regressor__learning_rate': np.float64(0.05992056035524427), 'regressor__max_depth': 5, 'regressor__n_estimators': 241, 'regressor__subsample': np.float64(0.8173221947525602)}
Melhor R² (validação cruzada): 0.4910352281586327


FileNotFoundError: [Errno 2] No such file or directory: 'Streamlit/models/best_model.pkl'

### SVR

In [None]:
# --- 1. Preprocessing sub-pipelines ---
# Same definitions as in the XGBoost cell, rebuilt here so this cell can run
# without executing the XGBoost cell first.
mean_numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('cap', CapTransformer(columns=mean_impute_cols)),
    ('scale', StandardScaler())
])

mode_numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('cap', CapTransformer(columns=mode_impute_cols)),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('rare', RareCategoryGrouper(column='original_language', top_n=10, other_label='Other')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# --- 2. Main preprocessor ---
# A fresh ColumnTransformer with the same layout as the XGBoost one;
# remainder='drop' discards every column that is not listed.
preprocessor = ColumnTransformer([
    ('mean_numeric', mean_numeric_pipeline, mean_impute_cols),
    ('mode_numeric', mode_numeric_pipeline, mode_impute_cols),
    ('categorical', cat_pipeline, categorical_col),
    ('credits', credits_transformer, 'credits'),
    ('genres', genres_transformer, 'genres'),
    ('prod_companies', prod_companies_transformer, 'production_companies')
], remainder='drop')


# --- 3. Full SVR pipeline ---
# Feature creation -> preprocessing -> SVR; only the final estimator differs
# from the XGBoost pipeline.
pipeline_svr = Pipeline([
    ('budget_ratio', BudgetRuntimeRatioTransformer()),
    ('date', DateFeatureExtractor(date_column=date_column)),
    ('preprocess', preprocessor),
    ('regressor', SVR())
])

# Ask every transformer step to return pandas DataFrames.
pipeline_svr.set_output(transform="pandas")


# --- 4. Hyper-parameter search ---
# SVR-specific search space. loguniform samples C on a log scale over [1e-2, 1e2];
# uniform(loc, scale) samples epsilon from [0.01, 0.31].
# gamma only affects the 'rbf' kernel; it is ignored when kernel='linear'.
param_dist_svr = {
    'regressor__kernel': ['rbf', 'linear'],
    'regressor__C': loguniform(1e-2, 1e2),
    'regressor__epsilon': uniform(0.01, 0.3),
    'regressor__gamma': ['scale', 'auto']
}

# 20 sampled candidates x 5 folds, scored by R².
svm_rand = RandomizedSearchCV(
    estimator=pipeline_svr,
    param_distributions=param_dist_svr,
    n_iter=20,
    cv=kf,
    scoring='r2',
    random_state=87,
    verbose=2,
    n_jobs=-1
)

svm_rand.fit(X_train, y_train)

# --- 5. Report and persist the results ---
print("Melhores parâmetros SVR:", svm_rand.best_params_)
print("Melhor R² (CV) SVR:", svm_rand.best_score_)

# Persist the best SVR pipeline. 'seu_caminho' is a placeholder directory: point
# this at the real models folder. The parent directory is created first so a
# missing folder cannot raise FileNotFoundError after the whole search has run.
melhor_svr = svm_rand.best_estimator_
svr_model_path = 'seu_caminho/best_model_svr.pkl'
os.makedirs(os.path.dirname(svr_model_path), exist_ok=True)
joblib.dump(melhor_svr, svr_model_path)

### RandomForest

In [None]:
# --- 1. Preprocessing sub-pipelines ---
# Same definitions as in the other model cells, rebuilt here so this cell can run
# without executing those cells first.
mean_numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('cap', CapTransformer(columns=mean_impute_cols)),
    ('scale', StandardScaler())
])

mode_numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('cap', CapTransformer(columns=mode_impute_cols)),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('rare', RareCategoryGrouper(column='original_language', top_n=10, other_label='Other')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# --- 2. Main preprocessor ---
# A fresh ColumnTransformer with the same layout as in the other model cells;
# remainder='drop' discards every column that is not listed.
preprocessor = ColumnTransformer([
    ('mean_numeric', mean_numeric_pipeline, mean_impute_cols),
    ('mode_numeric', mode_numeric_pipeline, mode_impute_cols),
    ('categorical', cat_pipeline, categorical_col),
    ('credits', credits_transformer, 'credits'),
    ('genres', genres_transformer, 'genres'),
    ('prod_companies', prod_companies_transformer, 'production_companies')
], remainder='drop')


# --- 3. Full Random Forest pipeline ---
# Feature creation -> preprocessing -> RandomForestRegressor; only the final
# estimator differs from the other pipelines.
pipeline_rf = Pipeline([
    ('budget_ratio', BudgetRuntimeRatioTransformer()),
    ('date', DateFeatureExtractor(date_column=date_column)),
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(random_state=87, n_jobs=-1))
])

# Ask every transformer step to return pandas DataFrames.
pipeline_rf.set_output(transform="pandas")


# --- 4. Hyper-parameter search ---
# Random-Forest-specific search space. randint(low, high) samples integers in [low, high).
param_dist_rf = {
    'regressor__n_estimators': randint(100, 500),
    'regressor__max_depth': randint(3, 20),
    'regressor__min_samples_split': randint(2, 10),
    'regressor__min_samples_leaf': randint(1, 10),
    'regressor__max_features': ['sqrt', 'log2', None]
}

# 30 sampled candidates x 5 folds, scored by R².
# error_score='raise' makes a failing fit abort the search instead of being scored as NaN.
rf_rand = RandomizedSearchCV(
    estimator=pipeline_rf,
    param_distributions=param_dist_rf,
    n_iter=30,
    cv=kf,
    scoring='r2',
    error_score='raise',
    random_state=87,
    verbose=2,
    n_jobs=-1
)

rf_rand.fit(X_train, y_train)

# --- 5. Report and persist the results ---
print("Melhores parâmetros RF:", rf_rand.best_params_)
print("Melhor R² (CV) RF:", rf_rand.best_score_)

# Persist the best Random Forest pipeline. 'seu_caminho' is a placeholder directory:
# point this at the real models folder. The parent directory is created first so a
# missing folder cannot raise FileNotFoundError after the whole search has run.
melhor_rf = rf_rand.best_estimator_
rf_model_path = 'seu_caminho/best_model_rf.pkl'
os.makedirs(os.path.dirname(rf_model_path), exist_ok=True)
joblib.dump(melhor_rf, rf_model_path)

# Avaliação dos modelos

In [8]:
def evaluate_model(model_name, model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model_name
        Label stored under the 'Modelo' key of the result.
    model
        Object exposing ``predict(X_test)``.
    X_test
        Features passed to ``model.predict``.
    y_test
        True target values the predictions are compared against.

    Returns
    -------
    dict
        Keys 'Modelo', 'R²', 'MAE', 'RMSE' and 'MedAE', in that order.
    """
    predictions = model.predict(X_test)

    scores = {'Modelo': model_name}
    scores['R²'] = r2_score(y_test, predictions)
    scores['MAE'] = mean_absolute_error(y_test, predictions)
    # RMSE is obtained as the square root of the mean squared error.
    scores['RMSE'] = np.sqrt(mean_squared_error(y_test, predictions))
    scores['MedAE'] = median_absolute_error(y_test, predictions)
    return scores

In [9]:
# Test-set metrics of the best XGBoost pipeline found by the randomized search.
evaluate_model("XGBRegressor", XGBRegressor_rand.best_estimator_, X_test, y_test)

{'Modelo': 'XGBRegressor',
 'R²': 0.49586137081672366,
 'MAE': 0.49254510745452607,
 'RMSE': np.float64(0.6474811267522815),
 'MedAE': 0.38919582366943395}

In [None]:
# Evaluate the best pipeline of each search on the test set, one row per model.
fitted_searches = [
    ("XGBRegressor", XGBRegressor_rand),
    ("RandomForestRegressor", rf_rand),
    ("SVR", svm_rand),
]
results = [
    evaluate_model(label, search.best_estimator_, X_test, y_test)
    for label, search in fitted_searches
]
results_df = pd.DataFrame(results)
print(results_df)

# Gráficos

## SHAP

In [None]:
# 1. Take the best fitted pipeline from the XGBoost search.
best_pipeline = XGBRegressor_rand.best_estimator_

# 2. Build a pipeline with ONLY the preprocessing steps (everything except the final
#    regressor) and transform the training features with it.
preprocessing_pipeline = Pipeline(best_pipeline.steps[:-1])
X_train_transformed = preprocessing_pipeline.transform(X_train)


# 3. Read the output feature names from the ColumnTransformer (the 'preprocess' step).
col_transformer = best_pipeline.named_steps['preprocess']
feature_names = col_transformer.get_feature_names_out()

# 4. Wrap the transformed data in a DataFrame so SHAP can label the features.
# NOTE(review): if X_train_transformed is already a DataFrame, pandas selects columns
# by label here, so names that do not match would yield all-NaN columns — confirm they agree.
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=feature_names)

# 5. Extract the final model and compute the SHAP values.
xgb_model = best_pipeline.named_steps['regressor']
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer(X_train_transformed_df) # the explainer is called like a function on the data
print("Valores SHAP calculados com sucesso. Gerando gráficos...")

# 6. Draw the plots; show=False leaves each figure open so tight_layout() can be
#    applied before plt.show().
plt.figure()
plt.title("SHAP Summary Plot (Beeswarm)")
shap.summary_plot(shap_values, X_train_transformed_df, show=False)
plt.tight_layout() 
plt.show()
# Bar plot limited to 40 features (max_display=40).
plt.figure()
plt.title("Importância Média das Top 40 Features (SHAP Bar Plot)")
shap.summary_plot(shap_values, X_train_transformed_df, plot_type="bar", show=False, max_display=40)
plt.tight_layout()
plt.show()

## Importance

In [None]:
# Per-feature importance = mean absolute SHAP value, averaged over axis 0 of
# shap_values.values. Relies on `shap_values` and `feature_names` from the SHAP cell.
feature_importance = np.abs(shap_values.values).mean(axis=0)
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'SHAP Importance': feature_importance
})

# Rank features from most to least important and renumber the rows.
sorted_importance_df = importance_df.sort_values(
    by='SHAP Importance', 
    ascending=False
).reset_index(drop=True)

print("Ranking de Importância das Features (baseado em SHAP):")
# Show every row of the ranking; this changes the pandas display option for the
# rest of the session, not only for this cell.
pd.set_option('display.max_rows', None)
print(len(sorted_importance_df))
display(sorted_importance_df) 

# Gerando Pickle

In [10]:
# Pipeline to export (best estimator of the XGBoost search) and the target file name;
# the file name has no directory part, so it is relative to the working directory.
melhor_pipeline_final = XGBRegressor_rand.best_estimator_
nome_arquivo_modelo = 'best_model.pkl'

In [12]:
#joblib.dump(melhor_pipeline_final, nome_arquivo_modelo)