In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Pandas vs PySpark Test") \
    .getOrCreate()

spark


<pyspark.sql.connect.session.SparkSession at 0xff7b47cb4890>

In [0]:
print("Spark Version:", spark.version)


Spark Version: 4.0.0


In [0]:
import pandas as pd
import numpy as np
import time

start_time = time.time()

# Create a Pandas DataFrame with 1 million rows
pdf = pd.DataFrame({
    "id": np.arange(1_000_000),
    "value": np.random.rand(1_000_000)
})

pandas_time = time.time() - start_time

print("Pandas load time:", pandas_time, "seconds")
pdf.head()


Pandas load time: 0.017216205596923828 seconds


Unnamed: 0,id,value
0,0,0.363911
1,1,0.996588
2,2,0.448929
3,3,0.669486
4,4,0.296567


In [0]:
import time
from pyspark.sql.functions import rand

start_time = time.time()

# Create a Spark DataFrame with 1 million rows
sdf = spark.range(1_000_000).withColumn("value", rand())

spark_time = time.time() - start_time

print("PySpark load time:", spark_time, "seconds")
sdf.show(5)


PySpark load time: 0.00022792816162109375 seconds
+---+--------------------+
| id|               value|
+---+--------------------+
|  0|  0.5561970257242097|
|  1|  0.9540500681977229|
|  2|  0.9738212926507727|
|  3|0.039388019218153825|
|  4| 0.37993414348299015|
+---+--------------------+
only showing top 5 rows


In [0]:
print("\n===== Performance Comparison =====")
print(f"Pandas load time:   {pandas_time:.4f} seconds")
print(f"PySpark load time:  {spark_time:.4f} seconds")



===== Performance Comparison =====
Pandas load time:   0.0172 seconds
PySpark load time:  0.0002 seconds


In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SampleCSVRead").getOrCreate()


In [0]:
df = spark.read.csv("/databricks-datasets/samples/population-vs-price/data_geo.csv", header=True, inferSchema=True)



In [0]:
df.show(5)




+---------+-------------+-------+----------+------------------------+-----------------------+
|2014 rank|         City|  State|State Code|2014 Population estimate|2015 median sales price|
+---------+-------------+-------+----------+------------------------+-----------------------+
|      101|   Birmingham|Alabama|        AL|                  212247|                  162.9|
|      125|   Huntsville|Alabama|        AL|                  188226|                  157.7|
|      122|       Mobile|Alabama|        AL|                  194675|                  122.5|
|      114|   Montgomery|Alabama|        AL|                  200481|                  129.0|
|       64|Anchorage[19]| Alaska|        AK|                  301010|                   NULL|
+---------+-------------+-------+----------+------------------------+-----------------------+
only showing top 5 rows


In [0]:
df.printSchema()


root
 |-- 2014 rank: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- State Code: string (nullable = true)
 |-- 2014 Population estimate: integer (nullable = true)
 |-- 2015 median sales price: double (nullable = true)



basic sql function


In [0]:
df.count()


294

In [0]:
df.select("City", "2014 Population estimate").show(5)


+-------------+------------------------+
|         City|2014 Population estimate|
+-------------+------------------------+
|   Birmingham|                  212247|
|   Huntsville|                  188226|
|       Mobile|                  194675|
|   Montgomery|                  200481|
|Anchorage[19]|                  301010|
+-------------+------------------------+
only showing top 5 rows


In [0]:
df.filter(df["2014 Population estimate"] > 1000000).show(10)


+---------+---------------+------------+----------+------------------------+-----------------------+
|2014 rank|           City|       State|State Code|2014 Population estimate|2015 median sales price|
+---------+---------------+------------+----------+------------------------+-----------------------+
|        6|        Phoenix|     Arizona|        AZ|                 1537058|                  206.1|
|        2|    Los Angeles|  California|        CA|                 3928864|                  434.7|
|        8|      San Diego|  California|        CA|                 1381069|                  510.3|
|       10|       San Jose|  California|        CA|                 1015785|                  900.0|
|        3|        Chicago|    Illinois|        IL|                 2722389|                  192.5|
|        1|    New York[6]|    New York|        NY|                 8491079|                  388.6|
|        5|Philadelphia[8]|Pennsylvania|        PA|                 1560297|               

In [0]:
df.orderBy(df["2014 Population estimate"].desc()).show(5)


+---------+---------------+------------+----------+------------------------+-----------------------+
|2014 rank|           City|       State|State Code|2014 Population estimate|2015 median sales price|
+---------+---------------+------------+----------+------------------------+-----------------------+
|        1|    New York[6]|    New York|        NY|                 8491079|                  388.6|
|        2|    Los Angeles|  California|        CA|                 3928864|                  434.7|
|        3|        Chicago|    Illinois|        IL|                 2722389|                  192.5|
|        4|     Houston[7]|       Texas|        TX|                 2239558|                  200.3|
|        5|Philadelphia[8]|Pennsylvania|        PA|                 1560297|                  204.9|
+---------+---------------+------------+----------+------------------------+-----------------------+
only showing top 5 rows


PREPROCESSING 

In [0]:
from pyspark.sql.functions import col, sum

df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).show()


+---------+----+-----+----------+------------------------+-----------------------+
|2014 rank|City|State|State Code|2014 Population estimate|2015 median sales price|
+---------+----+-----+----------+------------------------+-----------------------+
|        0|   0|    0|         0|                       1|                    185|
+---------+----+-----+----------+------------------------+-----------------------+



In [0]:
df_clean = df.dropna()
df_clean.show(5)


+---------+----------+-------+----------+------------------------+-----------------------+
|2014 rank|      City|  State|State Code|2014 Population estimate|2015 median sales price|
+---------+----------+-------+----------+------------------------+-----------------------+
|      101|Birmingham|Alabama|        AL|                  212247|                  162.9|
|      125|Huntsville|Alabama|        AL|                  188226|                  157.7|
|      122|    Mobile|Alabama|        AL|                  194675|                  122.5|
|      114|Montgomery|Alabama|        AL|                  200481|                  129.0|
|        6|   Phoenix|Arizona|        AZ|                 1537058|                  206.1|
+---------+----------+-------+----------+------------------------+-----------------------+
only showing top 5 rows


In [0]:
df_selected = df.select("City", "State", "2014 Population estimate", "2015 median sales price")
df_selected.withColumnRenamed("2015 median sales price", "Average_House_Price").show(5)


+-------------+-------+------------------------+-------------------+
|         City|  State|2014 Population estimate|Average_House_Price|
+-------------+-------+------------------------+-------------------+
|   Birmingham|Alabama|                  212247|              162.9|
|   Huntsville|Alabama|                  188226|              157.7|
|       Mobile|Alabama|                  194675|              122.5|
|   Montgomery|Alabama|                  200481|              129.0|
|Anchorage[19]| Alaska|                  301010|               NULL|
+-------------+-------+------------------------+-------------------+
only showing top 5 rows
