## We will use a dataset from Kaggle, downloaded with kagglehub

In [None]:
import kagglehub

# Fetch the latest version of the dataset; `path` is whatever location
# kagglehub reports for the downloaded files.
dataset_handle = "camnugent/california-housing-prices"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

### Initializing Spark Session

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Predicting housing prices")
    .getOrCreate()
)

#### Loading the data into a DataFrame from CSV

In [9]:
import os

# Read the CSV from the directory returned by kagglehub.dataset_download
# (bound to `path` in the download cell) rather than a hard-coded "data/"
# folder that nothing in this notebook creates; this keeps the notebook
# runnable after Restart Kernel -> Run All.
housing_csv = os.path.join(path, "housing.csv")

# header=True takes column names from the first row; inferSchema=True lets
# Spark infer each column's type instead of reading everything as strings.
data = spark.read.csv(housing_csv, header=True, inferSchema=True)

data.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [10]:
# Preview the first five rows to sanity-check the parsed columns.
data.show(n=5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR BAY|
|  -122.25|   37.85|              

In [11]:
# Total number of rows in the dataset (displayed as the cell's output).
data.count()

20640

In [14]:
# Summary statistics (count, mean, stddev, min, max) for the columns,
# transposed so that each original column is shown as one row.
summary_stats = data.describe()
summary_stats.transpose().show()

[Stage 9:>                                                          (0 + 1) / 1]

+------------------+-----+----------+-------------------+---------+------------------+
|               key|count|       max|               mean|      min|            stddev|
+------------------+-----+----------+-------------------+---------+------------------+
|         longitude|20640|   -114.31|-119.56970445736148|  -124.35| 2.003531723502584|
|          latitude|20640|     41.95|   35.6318614341087|    32.54| 2.135952397457101|
|housing_median_age|20640|      52.0| 28.639486434108527|      1.0| 12.58555761211163|
|       total_rooms|20640|   39320.0| 2635.7630813953488|      2.0|2181.6152515827944|
|    total_bedrooms|20433|    6445.0|  537.8705525375618|      1.0|421.38507007403115|
|        population|20640|   35682.0| 1425.4767441860465|      3.0|  1132.46212176534|
|        households|20640|    6082.0|  499.5396802325581|      1.0| 382.3297528316098|
|     median_income|20640|   15.0001| 3.8706710029070246|   0.4999| 1.899821717945263|
|median_house_value|20640|  500001.0| 20685

                                                                                

#### Checking for null values in a particular column and in all columns

In [15]:
import pyspark.sql.functions as F

# Count the rows where ocean_proximity is null: F.when yields 1 for null rows
# and null otherwise, and F.count only counts the non-null results.
ocean_proximity_nulls = F.count(F.when(F.col('ocean_proximity').isNull(), 1))

(
    data
    .select(ocean_proximity_nulls.alias('ocean_proximity'))
    .show()
)

+---------------+
|ocean_proximity|
+---------------+
|              0|
+---------------+



In [16]:
# Build one null-count aggregate per column (same when/count pattern as the
# single-column check), then evaluate them all in a single select.
null_counts = [
    F.count(F.when(F.col(c).isNull(), 1)).alias(c)
    for c in data.columns
]

data.select(null_counts).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|        0|       0|                 0|          0|           207|         0|         0|            0|                 0|              0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+

