# In [1]:
import os
import tarfile
import urllib.request
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

# Where the housing archive lives remotely and where it is stored locally
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    The directory is created if it does not exist. The archive is saved as
    ``housing.tgz`` inside ``housing_path`` and its members are extracted
    next to it.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Local directory that receives the archive and its
            extracted contents.

    Returns:
        None.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # The archive comes from the network: use the "data" extraction
        # filter where the running Python provides it, so members cannot
        # escape housing_path. Older interpreters lack the `filter` argument.
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

# Load the extracted CSV into memory
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The result of ``pandas.read_csv`` on that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Download and load the data
fetch_housing_data()
housing = load_housing_data()

# Split the data into training and test sets (test_set is held out and is
# not used anywhere below)
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Separate the label attribute (house prices) from the predictors
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop("median_house_value", axis=1)

# Preprocessing: keep everything except the "ocean_proximity" column.
# Missing values are filled by the pipeline's own median imputer, so no
# separate imputation pass is run beforehand.
housing_num = housing.drop("ocean_proximity", axis=1)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

housing_prepared = num_pipeline.fit_transform(housing_num)
# Keep the name `imputer` bound to the fitted median imputer for any code
# that still refers to it.
imputer = num_pipeline.named_steps['imputer']

# Train a Linear Regression model
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Evaluate Linear Regression on the same data it was fitted on
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("RMSE com Regressão Linear:", lin_rmse)

# Train a Decision Tree model; the fixed seed makes the fitted tree, and
# therefore the scores below, reproducible from run to run
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

# Evaluate the Decision Tree on the same data it was fitted on, so this
# number measures fit to the training set, not generalization
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print("RMSE com Árvore de Decisão:", tree_rmse)

# 10-fold cross-validation of the Decision Tree. The scorer returns negated
# MSE values, hence the sign flip before taking the square root.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
print("RMSE médio com Validação Cruzada (Árvore de Decisão):", tree_rmse_scores.mean())


# Recorded output of the cell above:
# RMSE com Regressão Linear: 69362.34135238081
# RMSE com Árvore de Decisão: 0.0
# RMSE médio com Validação Cruzada (Árvore de Decisão): 69774.90758281192
