In [1]:
# SparkSession is the entry point of PySpark
# in case of running it in pyspark shell
# the shell automatically creates the session

from pyspark.sql import SparkSession

spark = SparkSession.bfuilder.getOrCreate()

22/04/30 13:35:31 WARN Utils: Your hostname, SuideMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.18.142.168 instead (on interface en0)
22/04/30 13:35:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/30 13:35:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## General creating dataframe

In [2]:
from datetime import datetime, date, timedelta
import pandas as pd
from pyspark.sql import Row

In [3]:
# create dataframe with a list of rows
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])

df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [4]:
# create dataframe with an explicit schema

df = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')


df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [5]:
#  create dataframe from pandas dataframe

pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
})

df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

## create dataframe from RDD

RDD was the primary user-facing API in Spark since its inception. At the core, an RDD is an immutable distributed collection of elements of your data, partitioned across nodes in your cluster that can be operated in parallel with a low-level API that offers transformations and actions.

In [6]:
# when the data comes as tuples
# we need to parallelize it first
# before turning into a spark dataframe
rdd = spark.sparkContext.parallelize([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
])

df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])

df 

                                                                                

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [7]:
# all dataframes shows the same result
# use show() to show data frame
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [8]:
# use printSchema() to show the schema
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [9]:
# showing by

df.show(n=1)

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+
only showing top 1 row



Alternatively, you can enable `spark.sql.repl.eagerEval.enabled` configuration for the eager evaluation of PySpark DataFrame in notebooks such as Jupyter. The number of rows to show can be controlled via `spark.sql.repl.eagerEval.maxNumRows` configuration.

In [10]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [11]:
df.show(2, vertical=True) # use vertical=True if you want to see it that way

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2000-01-01          
 e   | 2000-01-01 12:00:00 
-RECORD 1------------------
 a   | 2                   
 b   | 3.0                 
 c   | string2             
 d   | 2000-02-01          
 e   | 2000-01-02 12:00:00 
only showing top 2 rows



In [12]:
df.columns

['a', 'b', 'c', 'd', 'e']

In [13]:
# using some SQL syntax here

df.select("a", "b", "c").describe().show()

[Stage 14:>                                                         (0 + 8) / 8]

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   null|
| stddev|1.0|1.0|   null|
|    min|  1|2.0|string1|
|    max|  3|4.0|string3|
+-------+---+---+-------+



                                                                                

In [14]:
df.collect()

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [15]:
# only collect one
df.take(1)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))]

In [16]:
# make it pandas
df.toPandas()

Unnamed: 0,a,b,c,d,e
0,1,2.0,string1,2000-01-01,2000-01-01 12:00:00
1,2,3.0,string2,2000-02-01,2000-01-02 12:00:00
2,3,4.0,string3,2000-03-01,2000-01-03 12:00:00


## Selecting and Accessing Data


In [17]:
df.a

Column<'a'>

In [18]:
# most of column-wise operations return Columns
# they're just the same
from pyspark.sql import Column
from pyspark.sql.functions import upper

type(df.c) == type(upper(df.c)) == type(df.c.isNull())

True

In [19]:
# use the selct function to get a subset
df.select(df.c).show()

+-------+
|      c|
+-------+
|string1|
|string2|
|string3|
+-------+



In [20]:
# withColumn will create new column
# first variable is the column name
# second variable is the value
df.withColumn('upper_c', upper(df.c)).show()

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|STRING2|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|STRING3|
+---+---+-------+----------+-------------------+-------+



In [21]:
# using filter
df.filter(df.a == 1).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



## Applying function

UDF means user defined function

In [22]:
# pyspark supports various UDFs and APIs to allow users to execute Python
# native functions

import pandas
from pyspark.sql.functions import pandas_udf

@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    # Simply plus one by using pandas Series
    return series + 1

df.select(pandas_plus_one(df.a)).show()

                                                                                

+------------------+
|pandas_plus_one(a)|
+------------------+
|                 2|
|                 3|
|                 4|
+------------------+



In [23]:
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [24]:
# the function takes in something that's iterable
def pandas_filter_func(iterator):
    for pandas_df in iterator: # start iterating it
        # yield the filtered result
        yield pandas_df[pandas_df.a == 1]

# use mapping to apply the function

# first variable: function
# seconds variable: schema
df.mapInPandas(pandas_filter_func, schema=df.schema).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



## Grouping Data

In [36]:
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()


+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [26]:
df.groupby('color').avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



## Pandas assign vs apply

`assign` is to create new column
`apply` is just to use a function

In [37]:
def plus_mean(pandas_df):
    return pandas_df.assign(v1=pandas_df.v1 - pandas_df.v1.mean())

df.groupby('color').applyInPandas(plus_mean, schema=df.schema).show()

[Stage 65:>                                                         (0 + 1) / 1]

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  0| 60|
| blue|banana| -1| 20|
| blue| grape|  1| 40|
|  red|banana| -3| 10|
|  red|carrot| -1| 30|
|  red|carrot|  0| 50|
|  red|banana|  2| 70|
|  red| grape|  3| 80|
+-----+------+---+---+



                                                                                

Co-grouping and apply a function

In [28]:
df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ('time', 'id', 'v1')
)

df2 = spark.createDataFrame(
    [(20000101, 1, 'x'), (20000101, 2, 'y')],
    ('time', 'id', 'v2')
)


In [31]:
df1.show()

+--------+---+---+
|    time| id| v1|
+--------+---+---+
|20000101|  1|1.0|
|20000101|  2|2.0|
|20000102|  1|3.0|
|20000102|  2|4.0|
+--------+---+---+



In [32]:
df2.show()

+--------+---+---+
|    time| id| v2|
+--------+---+---+
|20000101|  1|  x|
|20000101|  2|  y|
+--------+---+---+



# 2022/04/30

## MapInPandas() vs ApplyInPanads() 

`MapInPandas` is more of just appying a function cross the schema, for example, a filter that normal filter doesn't capture it

`ApplyInPandas` is more for goruping data and applying a function on how to group


In [35]:
import statistics

statistics.mean(range(1,9))

4.5

co-grouping and applying a function

In [38]:
# creating two dataframe first

df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), 
     (20000101, 2, 2.0), 
     (20000102, 1, 3.0), 
     (20000102, 2, 4.0)],
    ('time', 'id', 'v1')
)

df2 = spark.createDataFrame(
    [(20000101, 1, 'x'), (20000101, 2, 'y')],
    ('time', 'id', 'v2')
)



In [41]:
df1.show()
df2.show()

+--------+---+---+
|    time| id| v1|
+--------+---+---+
|20000101|  1|1.0|
|20000101|  2|2.0|
|20000102|  1|3.0|
|20000102|  2|4.0|
+--------+---+---+

+--------+---+---+
|    time| id| v2|
+--------+---+---+
|20000101|  1|  x|
|20000101|  2|  y|
+--------+---+---+



Learning what cogrouping is

In [47]:
# first check what df1 and df2 solo-groupby looks like

df1.groupby('id').avg().show()

+---+------------+-------+-------+
| id|   avg(time)|avg(id)|avg(v1)|
+---+------------+-------+-------+
|  1|2.00001015E7|    1.0|    2.0|
|  2|2.00001015E7|    2.0|    3.0|
+---+------------+-------+-------+



## pandas.merge_asof

Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys.

In [50]:
def asof_join(l, r):
    return pd.merge_asof(l, r, on='time', by='id')

df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(
    asof_join, schema='time int, id int, v1 double, v2 string').show()


+--------+---+---+---+
|    time| id| v1| v2|
+--------+---+---+---+
|20000101|  1|1.0|  x|
|20000102|  1|3.0|  x|
|20000101|  2|2.0|  y|
|20000102|  2|4.0|  y|
+--------+---+---+---+



[Stage 101:>                                                        (0 + 1) / 1]                                                                                

In [51]:
df1.printSchema()

root
 |-- time: long (nullable = true)
 |-- id: long (nullable = true)
 |-- v1: double (nullable = true)



## Getting Data in/out

### CSV

In [53]:
df.show()
df.write.csv('foo.csv', header=True)

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



                                                                                

In [54]:
ls foo.csv

_SUCCESS
part-00000-ed0e1613-5ad3-4496-9b9a-1244545039b5-c000.csv
part-00001-ed0e1613-5ad3-4496-9b9a-1244545039b5-c000.csv
part-00002-ed0e1613-5ad3-4496-9b9a-1244545039b5-c000.csv
part-00003-ed0e1613-5ad3-4496-9b9a-1244545039b5-c000.csv
part-00004-ed0e1613-5ad3-4496-9b9a-1244545039b5-c000.csv
part-00005-ed0e1613-5ad3-4496-9b9a-1244545039b5-c000.csv
part-00006-ed0e1613-5ad3-4496-9b9a-1244545039b5-c000.csv
part-00007-ed0e1613-5ad3-4496-9b9a-1244545039b5-c000.csv


**note: it is stored by rows, so eight different csv files

In [60]:
# now read it back in
df_frominput = spark.read.csv('foo.csv', header=True)
df_frominput.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  6| 60|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|banana|  1| 10|
|  red|banana|  7| 70|
|  red|carrot|  5| 50|
|  red| grape|  8| 80|
+-----+------+---+---+



### Parquet

In [61]:
df.write.parquet('bar.parquet')
spark.read.parquet('bar.parquet').show()

22/04/30 15:00:58 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
22/04/30 15:00:58 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
22/04/30 15:00:58 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
                                                                                

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  6| 60|
| blue|banana|  2| 20|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|  red|banana|  7| 70|
|  red|carrot|  3| 30|
|  red|banana|  1| 10|
|  red| grape|  8| 80|
+-----+------+---+---+



### ORC

In [62]:
df.write.orc('zoo.orc')
spark.read.orc('zoo.orc').show()

[Stage 130:>                                                        (0 + 8) / 8]                                                                                

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  7| 70|
|  red| grape|  8| 80|
|black|carrot|  6| 60|
| blue|banana|  2| 20|
|  red|banana|  1| 10|
|  red|carrot|  5| 50|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
+-----+------+---+---+



## Working with SQL

In [63]:
# the concept is
# spark df has a built-in function createOrReplaceTempView()
# That takes in a table name
# create a tempview 
# then we can run SQL
# spark.sql is like a database
# with all the fake databases

df.createOrReplaceTempView("tableA")
spark.sql("SELECT count(*) from tableA").show()

+--------+
|count(1)|
+--------+
|       8|
+--------+



This is how we can use pandas function on SQL syntax

NICE!!!! it's a mix and match, wooohooooo

In [65]:
@pandas_udf("integer")
def add_one(s: pd.Series) -> pd.Series:
    return s + 1

# need to register the function before we can use it
spark.udf.register("add_one", add_one)

# here is actually using it
spark.sql("SELECT add_one(v1) FROM tableA").show()


                                                                                

+-----------+
|add_one(v1)|
+-----------+
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
+-----------+





In [68]:
# Selecting the v1 see if that SQL syntazx works
df.select('v1').show()

+---+
| v1|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
+---+



These SQL expressions can directly be mixed and used as PySpark Columns


In [70]:
from pyspark.sql.functions import expr

# this is using the function but parsing a static value
df.selectExpr('add_one(1)').show()

# check to see if it's empty?????
df.select(expr('count(*)') > 0).show()

+----------+
|add_one(1)|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
+----------+

+--------------+
|(count(1) > 0)|
+--------------+
|          true|
+--------------+



In [71]:
# Printing out the total number of rows by using a simple count()
df.selectExpr('count(*)').show()

+--------+
|count(1)|
+--------+
|       8|
+--------+

