In [1]:
from datetime import datetime, date
from pyspark.sql import SparkSession, Row
from time import perf_counter
import pandas as pd

In [2]:
# Spark session & context
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext
spark.conf.set('spark.sql.repl.eagerEval.enabled', False)

In [3]:
# Sum of the first 100 whole numbers
rdd = sc.parallelize(range(200 + 1))

start = perf_counter()
print("Sum:", rdd.sum())
print(f"Took {perf_counter()-start}ms")

Sum: 20100
Took 1.1108119950004038ms


In [4]:
# Creates dataframe. No explicit schema specified - will sample row to get schema instead
# Alternatively can pass in `schema` param
# Dataframe can be created from pandas DF as well
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [5]:
# Pandaas dataframe is created col by col
pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
})
df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [6]:
df.show()
df.printSchema()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [7]:
# eager evaluation - displays like this in jupyter
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [8]:
df = spark.createDataFrame([
    Row('don','baltimore',12),
    Row('jerry','boston',19),
    Row('bob','baltimore',99),
    Row('cameron','baltimore',13),
    Row('james','seattle',1),
    Row('peter','seattle',2),
], schema = 'name: string, city: string, id: long')

df

name,city,id
don,baltimore,12
jerry,boston,19
bob,baltimore,99
cameron,baltimore,13
james,seattle,1
peter,seattle,2
