In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("BigDataScalabilityAnalysis") \
    .getOrCreate()

spark


In [3]:
from pyspark.sql.functions import rand, when, col, avg, count


In [4]:
data = spark.range(0, 1_000_000) \
    .withColumn(
        "Severity",
        when(rand() > 0.7, 4)
        .when(rand() > 0.4, 3)
        .when(rand() > 0.2, 2)
        .otherwise(1)
    ) \
    .withColumn(
        "State",
        when(rand() > 0.8, "CA")
        .when(rand() > 0.6, "TX")
        .when(rand() > 0.4, "FL")
        .otherwise("NY")
    ) \
    .withColumn(
        "Sunrise_Sunset",
        when(rand() > 0.5, "Night").otherwise("Day")
    ) \
    .withColumn(
        "Visibility_mi",
        rand() * 10
    )

data.show(5)


+---+--------+-----+--------------+-----------------+
| id|Severity|State|Sunrise_Sunset|    Visibility_mi|
+---+--------+-----+--------------+-----------------+
|  0|       2|   TX|           Day|8.191031827102396|
|  1|       4|   CA|           Day|9.171856474963265|
|  2|       4|   TX|           Day| 4.02321089730802|
|  3|       2|   FL|         Night|5.124965638955276|
|  4|       3|   TX|           Day|  9.9988760522978|
+---+--------+-----+--------------+-----------------+
only showing top 5 rows


In [5]:
data.count()


1000000

In [6]:
data.groupBy("Severity").count().show()


+--------+------+
|Severity| count|
+--------+------+
|       1| 55932|
|       3|419990|
|       4|299270|
|       2|224808|
+--------+------+



In [7]:
data.groupBy("State") \
    .agg(avg("Visibility_mi").alias("Avg_Visibility")) \
    .show()


+-----+-----------------+
|State|   Avg_Visibility|
+-----+-----------------+
|   CA|4.997617505829964|
|   NY|4.994023484401983|
|   TX|5.004152773930731|
|   FL|5.006687841726582|
+-----+-----------------+



In [8]:
data.groupBy("Sunrise_Sunset") \
    .agg(avg("Visibility_mi").alias("Avg_Visibility")) \
    .show()


+--------------+------------------+
|Sunrise_Sunset|    Avg_Visibility|
+--------------+------------------+
|         Night| 5.003414073129218|
|           Day|4.9998473202257125|
+--------------+------------------+



In [9]:
data.filter(col("Severity") >= 3).count()


719260