In [72]:
!pip install pyspark py4j
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Basics").getOrCreate()
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression

from google.colab import drive
drive.mount('/content/drive')

# Location of the housing CSV on the mounted Google Drive.
file_path = "/content/drive/MyDrive/Project Dataset/modified_data1.csv"

# Read the CSV: the first row supplies the column names and Spark infers the
# column types from the data.
housing = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

# Preview the loaded rows.
housing.show()

# Define stages.
# handleInvalid="keep" makes each indexer map labels it did not see while fitting
# to one extra index instead of raising an error. Without it, a city or state
# entered at prediction time that is absent from the training data aborts
# model.transform().
city_indexer = StringIndexer(inputCol="city", outputCol="cityIndex", handleInvalid="keep")
state_indexer = StringIndexer(inputCol="state", outputCol="stateIndex", handleInvalid="keep")

# One-hot encode the indexed categorical columns.
city_encoder = OneHotEncoder(inputCols=["cityIndex"], outputCols=["cityVec"])
state_encoder = OneHotEncoder(inputCols=["stateIndex"], outputCols=["stateVec"])

# Combine the numeric columns and the encoded vectors into one feature vector.
assembler = VectorAssembler(
    inputCols=["bed", "bath", "house_size", "cityVec", "stateVec"],
    outputCol="features",
)

# Linear regression that predicts "price" from the assembled features.
lr = LinearRegression(featuresCol="features", labelCol="price")

# Create the pipeline; stages run in the listed order, so each indexer precedes
# its encoder and the assembler precedes the regressor.
pipeline = Pipeline(stages=[city_indexer, state_indexer, city_encoder, state_encoder, assembler, lr])

# Fit the pipeline on the full housing DataFrame.
model = pipeline.fit(housing)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+--------+--------+---+----+-------------+-----------+----------+
|  status|   price|bed|bath|         city|      state|house_size|
+--------+--------+---+----+-------------+-----------+----------+
|for_sale|105000.0|  3|   2|     Adjuntas|Puerto Rico|     920.0|
|for_sale| 80000.0|  4|   2|     Adjuntas|Puerto Rico|    1527.0|
|for_sale| 67000.0|  2|   1|   Juana Diaz|Puerto Rico|     748.0|
|for_sale|145000.0|  4|   2|        Ponce|Puerto Rico|    1800.0|
|for_sale|179000.0|  4|   3|San Sebastian|Puerto Rico|    2520.0|
|for_sale| 50000.0|  3|   1|       Ciales|Puerto Rico|    2040.0|
|for_sale| 71600.0|  3|   2|        Ponce|Puerto Rico|    1050.0|
|for_sale|100000.0|  2|   1|        Ponce|Puerto Rico|    1092.0|
|for_sale|150000.0|  3|   2|   Juana Diaz|Puerto Rico|    1045.0|
|for_sale| 79000.0|  5|   2|       Utuado|Puerto Rico|    1620.0|
|for_sale|649

In [78]:
# Getting user inputs for predictions
bed = int(input("Enter number of bedrooms: "))
bath = int(input("Enter number of bathrooms: "))
# house_size is a fractional column in the dataset (e.g. 920.0), so parse it as a
# float; int() would reject an input such as "1200.5".
house_size = float(input("Enter house size (in square feet): "))
# Trim surrounding whitespace so the typed text can match the city/state labels
# the model was fitted on.
city = input("Enter city: ").strip()
state = input("Enter state: ").strip()

Enter number of bedrooms: 3
Enter number of bathrooms: 3
Enter house size (in square feet): 1200
Enter city: New York
Enter state: New York


In [79]:
# Create a one-row Spark DataFrame from the user input, using the same feature
# column names the pipeline was fitted with.
input_data = spark.createDataFrame(
    [(bed, bath, house_size, city, state)],
    ['bed', 'bath', 'house_size', 'city', 'state'],
)

# Make predictions by running the fitted pipeline on the input row.
predictions = model.transform(input_data)

# Extract the predicted price from the single result row.
predicted_price = predictions.select("prediction").first()["prediction"]

# Show the price rounded to cents with thousands separators; the raw float
# carries far more digits than are meaningful for a price.
print(f"Approx Predicted price (in dollars): {predicted_price:,.2f}")

Approx Predicted price (in dollers): 566488.6074869868
