In [1]:
# Simple Spark session with default settings (kept for reference; left commented out)
# import pyspark
# spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [16]:
# Multiprocessing spark instance
import multiprocessing
import pyspark
import pyspark.sql.functions as F
from pydataset import data

# Size the local Spark session to this machine: one worker thread and one
# shuffle partition per logical CPU reported by the OS.
nprocs = multiprocessing.cpu_count()

spark = (pyspark.sql.SparkSession.builder
 # A bare 'local' master runs Spark with a single worker thread, so the
 # session would never use more than one core. 'local[N]' runs N threads.
 .master('local[{}]'.format(nprocs))
 # Maven coordinates of the MySQL JDBC driver to make available to Spark.
 .config('spark.jars.packages', 'mysql:mysql-connector-java:8.0.16')
 .config('spark.driver.memory', '4G')
 # NOTE(review): Spark documents spark.driver.cores as applying only in
 # cluster deploy mode; it is kept here unchanged for compatibility.
 .config('spark.driver.cores', nprocs)
 # Match shuffle parallelism to the number of worker threads.
 .config('spark.sql.shuffle.partitions', nprocs)
 .appName('MySparkApplication')
 .getOrCreate())

In [4]:
# Bare expression: the notebook displays the SparkSession object created above.
spark

In [11]:
# Sample beverage records: each dict becomes one row, each key one column.
drinks = [
    dict(type="water", calories=0, number_consumed=5),
    dict(type="orange juice", calories=220, number_consumed=3),
    dict(type="gatorade", calories=140, number_consumed=1),
    dict(type="coffee", calories=0, number_consumed=2),
]

# Build a Spark DataFrame from the records (schema is inferred), then echo
# the DataFrame so the notebook shows its column names and types.
df = spark.createDataFrame(drinks)
df

DataFrame[calories: bigint, number_consumed: bigint, type: string]

In [12]:
# Spark DataFrames are printed with .show(); n limits how many rows appear.
df.show(n=3)

+--------+---------------+------------+
|calories|number_consumed|        type|
+--------+---------------+------------+
|       0|              5|       water|
|     220|              3|orange juice|
|     140|              1|    gatorade|
+--------+---------------+------------+
only showing top 3 rows



In [15]:
# Descriptive statistics: describe() builds a summary table (count, mean,
# stddev, min, max) per column, and show() prints it. For the string column
# the mean and stddev come back as null.
df.describe().show()

+-------+-----------------+-----------------+------+
|summary|         calories|  number_consumed|  type|
+-------+-----------------+-----------------+------+
|  count|                4|                4|     4|
|   mean|             90.0|             2.75|  null|
| stddev|108.9342309224546|1.707825127659933|  null|
|    min|                0|                1|coffee|
|    max|              220|                5| water|
+-------+-----------------+-----------------+------+



In [17]:
from pydataset import data
# Load the "mpg" dataset from pydataset and convert it straight into a
# Spark DataFrame.
mpg = spark.createDataFrame(data("mpg"))

In [21]:
# Project just the highway and city mileage columns and preview three rows.
mpg.select("hwy", "cty").show(n=3)

+---+---+
|hwy|cty|
+---+---+
| 29| 18|
| 29| 21|
| 31| 20|
+---+---+
only showing top 3 rows

