<a href="https://colab.research.google.com/github/Caiopsc/lia1_2025_1/blob/main/Entregas%20-%20Caio%20Pantale%C3%A3o_atividade_capitulo_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""Tarefa semana santa.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1uwddm91zdks8C_OpRx1DyQ9536T34kki
"""

# Mount Google Drive at /content/drive inside the Colab runtime.
# NOTE(review): nothing in the visible code reads from or writes to
# /content/drive (the dataset is downloaded to the relative path
# datasets/housing), so this mount looks unnecessary -- confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

import os
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from scipy.stats import randint, uniform

# Initial configuration: remote location of the dataset archive and the
# local directory it is downloaded into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
HOUSING_PATH = os.path.join("datasets", "housing")

# Download the housing archive and unpack it locally
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Local directory that receives both the downloaded
            archive and its extracted contents; created if missing.
    """
    # exist_ok=True replaces the isdir()-then-makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

# Load the extracted CSV file
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return the DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Download and load the data
fetch_housing_data()
housing = load_housing_data()

# Initial preprocessing: bucket the median income into categories (capped at
# 5) so the split below can be stratified on them.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the result back instead of calling where(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate object, emits a
# pandas FutureWarning, and no longer updates the DataFrame in pandas 3.0.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

# Stratified split (n_splits=1, so the loop body runs exactly once)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# Remove the helper income category; rebinding avoids mutating a slice of
# `housing` in place.
strat_train_set = strat_train_set.drop("income_cat", axis=1)
strat_test_set = strat_test_set.drop("income_cat", axis=1)

# Separate the training predictors from the training labels
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Custom transformers
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns computed from fixed column positions of ``X``.

    Two ratios are always appended (column 3 / column 6 and column 5 /
    column 6); a third (column 4 / column 3) is appended last when
    ``add_bedrooms_per_room`` is true. The names used in ``transform`` imply
    the positions are rooms (3), bedrooms (4), population (5) and
    households (6) -- the caller must supply columns in that order.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms, bedrooms, population, households = (
            X[:, position] for position in (3, 4, 5, 6)
        )
        appended = [rooms / households, population / households]
        if self.add_bedrooms_per_room:
            appended.append(bedrooms / rooms)
        return np.c_[(X, *appended)]

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the configured columns of ``X`` and return their ``.values``."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Index ``X`` by ``attribute_names`` and return the underlying values."""
        selected = X[self.attribute_names]
        return selected.values

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the columns of ``X`` whose importance reaches ``threshold``.

    Args:
        feature_importances: Array-like holding one importance value per
            column of the array later passed to ``transform``.
        threshold: Minimum importance (inclusive) a column needs to be kept.
    """

    def __init__(self, feature_importances, threshold=0.01):
        # Constructor arguments are stored unmodified; the array conversion
        # happens in transform().
        self.feature_importances = feature_importances
        self.threshold = threshold

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by the importance mask."""
        # np.asarray lets plain lists/tuples work too; comparing a list with
        # a float directly would raise TypeError.
        important_features = np.asarray(self.feature_importances) >= self.threshold
        return X[:, important_features]

# Attribute lists: every numeric column, plus the single categorical column
numeric_columns = housing.select_dtypes(include=[np.number]).columns
num_attribs = list(numeric_columns)
cat_attribs = ["ocean_proximity"]

# Numeric pipeline: select columns, impute with the median, append the ratio
# columns, then standardize
num_steps = [
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(num_steps)

# Categorical pipeline: select the column and one-hot encode it
cat_steps = [
    ('selector', DataFrameSelector(cat_attribs)),
    ('one_hot_encoder', OneHotEncoder()),
]
cat_pipeline = Pipeline(cat_steps)

# Full pipeline: numeric output first, categorical output after it
full_pipeline = FeatureUnion(
    transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
)

# Fit the preparation pipeline on the training predictors and transform them
housing_prepared = full_pipeline.fit_transform(housing)

# 1. Trying out SVR with a grid search over linear and RBF kernels
print("\n=== Testando SVR ===")
svr_c_values = [0.1, 1, 10, 100]
svr_param_grid = [
    dict(kernel=['linear'], C=svr_c_values),
    dict(kernel=['rbf'], C=svr_c_values, gamma=[0.01, 0.1, 1, 'scale', 'auto']),
]

svr = SVR()
svr_grid_search = GridSearchCV(
    estimator=svr,
    param_grid=svr_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
svr_grid_search.fit(housing_prepared, housing_labels)

# The scorer is negated MSE, so flip the sign before taking the square root.
svr_best_score = np.sqrt(-svr_grid_search.best_score_)
svr_best_params = svr_grid_search.best_params_
print(f"Melhores parâmetros SVR: {svr_best_params}")
print(f"Melhor RMSE SVR: {svr_best_score:.2f}")

# 2. RandomizedSearchCV over Random Forest hyperparameters
print("\n=== Testando RandomizedSearchCV ===")
param_distribs = dict(
    n_estimators=randint(10, 200),
    max_features=randint(2, 8),
    bootstrap=[True, False],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
)

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(
    estimator=forest_reg,
    param_distributions=param_distribs,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
rnd_search.fit(housing_prepared, housing_labels)

# The scorer is negated MSE, so flip the sign before taking the square root.
rnd_best_score = np.sqrt(-rnd_search.best_score_)
rnd_best_params = rnd_search.best_params_
print(f"Melhores parâmetros (Randomized): {rnd_best_params}")
print(f"Melhor RMSE (Randomized): {rnd_best_score:.2f}")

# 3. Adding selection of the important features
print("\n=== Adicionando seleção de features ===")
best_forest = rnd_search.best_estimator_
feature_importances = best_forest.feature_importances_

# The numeric pipeline's output comes first in the prepared matrix, so its
# importances are the leading entries: one per numeric attribute plus the
# three ratio columns CombinedAttributesAdder appends by default.
selector_importances = feature_importances[:len(num_attribs) + 3]
selector_steps = [
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
    ('feature_selector', FeatureSelector(selector_importances, threshold=0.01)),
]
num_pipeline_with_selector = Pipeline(selector_steps)

full_pipeline_with_selector = FeatureUnion(
    transformer_list=[
        ("num_pipeline", num_pipeline_with_selector),
        ("cat_pipeline", cat_pipeline),
    ]
)

housing_prepared_important = full_pipeline_with_selector.fit_transform(housing)

# NOTE: the forest is fitted and scored on the same training data, so this
# RMSE is a training error, not a validation estimate.
housing_predictions = forest_reg.fit(
    housing_prepared_important, housing_labels
).predict(housing_prepared_important)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
print(f"RMSE com features selecionadas: {forest_rmse:.2f}")

# 4. Single end-to-end pipeline (preparation + model)
print("\n=== Criando pipeline único completo ===")
# Importances of the columns produced by the numeric branch: one per numeric
# attribute plus the three ratios CombinedAttributesAdder appends by default.
num_branch_importances = feature_importances[:len(num_attribs) + 3]
full_pipeline_with_model = Pipeline([
    ('preparation', ColumnTransformer([
        # Steps are named explicitly so the grid-search keys further down
        # ("...__imputer__strategy", "...__attribs_adder__...") resolve.
        ("num", Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('attribs_adder', CombinedAttributesAdder()),
            ('std_scaler', StandardScaler()),
            ('feature_selector', FeatureSelector(num_branch_importances, threshold=0.01)),
        ]), num_attribs),
        # handle_unknown="ignore": a rare category can be missing from a
        # cross-validation training fold yet present in its validation fold.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    ])),
    # Fixed seed for reproducible scores, matching the forest used in the
    # randomized search.
    ('model', RandomForestRegressor(random_state=42, **rnd_best_params))
])

full_pipeline_with_model.fit(housing, housing_labels)
cv_scores = cross_val_score(full_pipeline_with_model, housing, housing_labels,
                          scoring="neg_mean_squared_error", cv=5)
rmse_scores = np.sqrt(-cv_scores)
print("Scores de validação cruzada:")
print("Scores:", rmse_scores)
print("Média:", rmse_scores.mean())
print("Desvio padrão:", rmse_scores.std())

# 5. Exploring preparation options with GridSearchCV
print("\n=== Explorando opções de preparação ===")
common_grid = {
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'model__n_estimators': [50, 100, 200],
    'model__max_features': [4, 6, 8]
}
param_grid = [
    {
        **common_grid,
        'preparation__num__attribs_adder__add_bedrooms_per_room': [True],
    },
    {
        **common_grid,
        'preparation__num__attribs_adder__add_bedrooms_per_room': [False],
        # Without bedrooms_per_room (the last column the adder appends) the
        # numeric branch emits one column fewer, so the selector needs an
        # importance vector without that last entry; otherwise the boolean
        # mask length would not match and every such fit would fail.
        'preparation__num__feature_selector': [
            FeatureSelector(num_branch_importances[:-1], threshold=0.01)
        ],
    },
]

grid_search_prep = GridSearchCV(full_pipeline_with_model, param_grid, cv=5,
                              scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search_prep.fit(housing, housing_labels)

best_prep_params = grid_search_prep.best_params_
# The scorer is negated MSE, so flip the sign before taking the square root.
best_prep_score = np.sqrt(-grid_search_prep.best_score_)
print(f"Melhores parâmetros incluindo preparação: {best_prep_params}")
print(f"Melhor RMSE incluindo preparação: {best_prep_score:.2f}")

# Final evaluation on the held-out test set
final_model = grid_search_prep.best_estimator_

# Split the test set into labels and predictors
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(f"\nRMSE final no conjunto de teste: {final_rmse:.2f}")

Mounted at /content/drive

=== Testando SVR ===
Fitting 5 folds for each of 24 candidates, totalling 120 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
