<a href="https://colab.research.google.com/github/rafflds/Previsao_precos_casas/blob/main/Previs%C3%A3o_de_Precos_de_Casas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Configuração do Ambiente

In [1]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=26382b2bbba72c9c5e8a81109bddc26a8594909a17254c1933eae168a0c2054a
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


## Importar bibliotecas necessárias

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

from pyspark.sql.functions import isnan, when, count, col

## Criar a SparkSession

In [3]:
# Configure the session builder with the application name, then create the
# session (or reuse one that is already active in this runtime).
session_builder = SparkSession.builder.appName("HousePricesML")
spark = session_builder.getOrCreate()


## Carregar e Visualizar o Conjunto de Dados

In [4]:
# Location of the house-prices CSV (a path under /content/drive — presumably a
# mounted Google Drive; the mount step is not part of this notebook).
file_path = '/content/drive/MyDrive/Cientista_de_Dados/Bases de dados/house_prices.csv'

# Read the CSV, treating the first row as a header and letting Spark infer column types.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv(file_path)

# Preview the first five rows of the dataset.
data.show(n=5)


+----------+---------------+--------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|        id|           date|   price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|yr_renovated|zipcode|    lat|    long|sqft_living15|sqft_lot15|
+----------+---------------+--------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|7129300520|20141013T000000|221900.0|       3|      1.0|       1180|    5650|   1.0|         0|   0|        3|    7|      1180|            0|    1955|           0|  98178|47.5112|-122.257|         1340|      5650|
|6414100192|20141209T000000|538000.0|       3|     2.25|       2570|    7242|   2.0|         0|   0|        3|    7|      2170|          400|   

## Análise Exploratória de Dados (EDA)

In [7]:
# Print the column names and the types Spark inferred for them.
data.printSchema()

# Show summary statistics for the dataset's columns.
summary_stats = data.describe()
summary_stats.show()

# Count null entries per column: `when` without `otherwise` yields NULL for
# rows that are not null, and `count` skips NULLs, so only null cells are tallied.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in data.columns
]
data.select(null_counts).show()


root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- price: double (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- sqft_living: integer (nullable = true)
 |-- sqft_lot: integer (nullable = true)
 |-- floors: double (nullable = true)
 |-- waterfront: integer (nullable = true)
 |-- view: integer (nullable = true)
 |-- condition: integer (nullable = true)
 |-- grade: integer (nullable = true)
 |-- sqft_above: integer (nullable = true)
 |-- sqft_basement: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- yr_renovated: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- sqft_living15: integer (nullable = true)
 |-- sqft_lot15: integer (nullable = true)

+-------+--------------------+---------------+------------------+-----------------+------------------+------------------+------------------+---------

## Pré-processamento dos Dados

In [8]:
# Drop the row identifier and the raw date string; neither is used as a model feature.
data = data.drop("id", "date")

# Names of the columns Spark typed as strings. `DataFrame.dtypes` reports each
# type by its simple name ("string"). Comparing the DataType object itself with
# the text 'StringType' is never true, which would silently skip every
# categorical column and feed raw strings to the VectorAssembler.
string_columns = {name for name, dtype in data.dtypes if dtype == "string"}

# Index-encode each categorical (string) column into "<column>_index".
# NOTE(review): the indexers are fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(data)
    for column in data.columns
    if column in string_columns
]

# Assemble every column except the label ("price") into a single feature
# vector, substituting the indexed version of any categorical column.
assembler = VectorAssembler(
    inputCols=[
        column + "_index" if column in string_columns else column
        for column in data.columns
        if column != "price"
    ],
    outputCol="features",
)

# Scale the assembled feature vector into "scaledFeatures" for the regressor.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")


## Dividir o Conjunto de Dados em Treino e Teste

In [9]:
# Randomly split the rows into roughly 70% training and 30% test data,
# using a fixed seed for the split.
split_weights = [0.7, 0.3]
train_data, test_data = data.randomSplit(split_weights, seed=42)


## Construir e Treinar o Modelo

In [10]:
# Linear regression that predicts "price" from the scaled feature vector.
lr = LinearRegression(labelCol="price", featuresCol="scaledFeatures")

# Chain the stages in execution order: categorical indexers, feature assembly,
# scaling, and finally the regressor.
pipeline_stages = [*indexers, assembler, scaler, lr]
pipeline = Pipeline(stages=pipeline_stages)

# Fit the whole pipeline on the training split.
model = pipeline.fit(train_data)


## Avaliar o Modelo

In [11]:
# Run the fitted pipeline on the held-out test split.
predictions = model.transform(test_data)

# One evaluator serves both metrics: RMSE is its configured metric, and R2 is
# obtained by overriding the metric name for a single call via a param map.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
r2_override = {evaluator.metricName: "r2"}
r2 = evaluator.evaluate(predictions, r2_override)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R2 Score: {r2}")

# Show a few actual prices next to their predictions and input features.
preview_columns = ["price", "prediction", "features"]
predictions.select(*preview_columns).show(5)


Root Mean Squared Error (RMSE): 194184.75614285926
R2 Score: 0.6975817824562345
+-------+------------------+--------------------+
|  price|        prediction|            features|
+-------+------------------+--------------------+
|80000.0|35543.102480161935|[1.0,0.75,430.0,5...|
|83000.0|105271.07357006986|[2.0,1.0,900.0,85...|
|85000.0|127172.07452908438|[2.0,1.0,830.0,90...|
|85000.0|32530.308695724234|[2.0,1.0,910.0,97...|
|90000.0|-22024.42745761294|[1.0,1.0,560.0,41...|
+-------+------------------+--------------------+
only showing top 5 rows



## Salvar o Modelo

In [12]:
# Persist the fitted pipeline. A plain `save` raises when the target path
# already exists, so re-running this cell would fail; write with overwrite
# enabled to keep the notebook re-runnable.
model.write().overwrite().save("path/to/save/model")