In [5]:
import os

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create the Spark session used by every later cell, or reuse one that is
# already running in this process.
spark = (
    SparkSession.builder
    .appName("House Price Prediction")
    .getOrCreate()
)

In [17]:
# Load the dataset
# The CSV location can be overridden with the HOUSE_PRICE_CSV environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original path is used unchanged.
file_path = os.environ.get(
    "HOUSE_PRICE_CSV",
    "/Users/sampreethshetty/Downloads/house_price.csv",
)

# header=True: the first row supplies the column names.
# inferSchema=True: Spark derives each column's type from the data.
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Display the dataset schema
df.printSchema()


root
 |-- id: long (nullable = true)
 |-- Date: integer (nullable = true)
 |-- number of bedrooms: integer (nullable = true)
 |-- number of bathrooms: double (nullable = true)
 |-- living area: integer (nullable = true)
 |-- lot area: integer (nullable = true)
 |-- number of floors: double (nullable = true)
 |-- waterfront present: integer (nullable = true)
 |-- number of views: integer (nullable = true)
 |-- condition of the house: integer (nullable = true)
 |-- grade of the house: integer (nullable = true)
 |-- Area of the house(excluding basement): integer (nullable = true)
 |-- Area of the basement: integer (nullable = true)
 |-- Built Year: integer (nullable = true)
 |-- Renovation Year: integer (nullable = true)
 |-- Postal Code: integer (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- living_area_renov: integer (nullable = true)
 |-- lot_area_renov: integer (nullable = true)
 |-- Number of schools nearby: integer (nullable 

In [18]:
# Step 1: Select Relevant Columns
# Narrow the DataFrame to the nine columns listed here; all others are dropped.
selected_columns = [
    "id",
    "number of bedrooms",
    "number of bathrooms",
    "living area",
    "lot area",
    "number of floors",
    "waterfront present",
    "postal code",
    "Price",
]
df = df.select(*selected_columns)

# Show the first five rows of the reduced dataset
print("Dataset Preview:")
df.show(n=5)


Dataset Preview:
+----------+------------------+-------------------+-----------+--------+----------------+------------------+-----------+-------+
|        id|number of bedrooms|number of bathrooms|living area|lot area|number of floors|waterfront present|postal code|  Price|
+----------+------------------+-------------------+-----------+--------+----------------+------------------+-----------+-------+
|6762810635|                 4|                2.5|       2920|    4000|             1.5|                 0|     122004|1400000|
|6762810998|                 5|               2.75|       2910|    9480|             1.5|                 0|     122004|1200000|
|6762812605|                 4|                2.5|       3310|   42998|             2.0|                 0|     122005| 838000|
|6762812919|                 3|                2.0|       2710|    4500|             1.5|                 0|     122006| 805000|
|6762813105|                 3|                2.5|       2600|    4750|        

In [9]:
# Step 2: Data Preprocessing
# "Price" is the regression label; the columns below are the model inputs.
label_col = "Price"
numerical_cols = ["number of bedrooms", "number of bathrooms", "living area",
                  "lot area", "number of floors"]
feature_cols = numerical_cols + ["waterfront present"]

# Handle missing labels: drop the row instead of filling "Price" with its mean.
# A mean-filled label is an invented target that the model would be both
# trained on and scored against.
df = df.na.drop(subset=[label_col])

# Handle missing values in the inputs (fill with mean for numerical columns)
for col_name in numerical_cols:
    mean_value = df.select(col_name).na.drop().groupBy().avg(col_name).first()[0]
    # avg() yields None when the column has no usable values (or df is empty);
    # there is no mean to fill with, so the column is left as it is.
    if mean_value is not None:
        df = df.na.fill({col_name: mean_value})

# NOTE(review): "waterfront present" is not imputed. VectorAssembler is used
# with its default handleInvalid setting, so confirm this column has no nulls.

# Normalize numerical features
assembler = VectorAssembler(inputCols=feature_cols,
                            outputCol="FeaturesBeforeScaling")
df = assembler.transform(df)

# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed later in the notebook, so test rows take part in
# determining the scaling range.
scaler = MinMaxScaler(inputCol="FeaturesBeforeScaling", outputCol="features")
df = scaler.fit(df).transform(df)

# Select only features and target column
df = df.select("features", label_col)

In [None]:
# Sanity check before modelling
# Preprocessing is done in the cell above. The modelling cells below read the
# "features" vector column and the "Price" label from `df`, so verify both are
# present instead of failing later inside Spark.
expected_cols = ["features", "Price"]
missing_cols = [name for name in expected_cols if name not in df.columns]
assert not missing_cols, (
    f"df is missing expected column(s) {missing_cols}; "
    "run the data preprocessing cell first."
)


In [None]:
# Step 2: Split the data into training and testing sets
# Weights 0.8 / 0.2 give the training and testing shares; seed=42 fixes the
# sampling seed.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)

# Step 3: Train a Linear Regression Model
# regParam sets the regularization strength and elasticNetParam the L1/L2 mix.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Price",
    maxIter=100,
    regParam=0.1,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train_data)



In [None]:
# Step 4: Evaluate the model on test data
# Score the held-out rows; transform() adds the model output in "prediction".
predictions = lr_model.transform(test_data)

# Compare "prediction" with the true "Price" using root-mean-square error.
evaluator = RegressionEvaluator(
    labelCol="Price",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

print(f"Root Mean Square Error (RMSE) on Test Data: {rmse:.2f}")



In [None]:
# Step 5: Display Feature Importances
# Print each fitted coefficient, numbered from 1 in the order the features
# appear in the "features" vector.
print("Coefficients of Features (Feature Importance):")
for position, weight in enumerate(lr_model.coefficients, start=1):
    print(f"Feature {position}: {weight:.4f}")


In [None]:
# Stop Spark Session
# Final step of the notebook: shut down the session created in the first cell.
spark.stop()