# Quick Tutorial of PandasUDF for PySpark 3.x

Thanks to the author of the source article below.

Source: https://towardsdatascience.com/distributed-processing-with-pyarrow-powered-new-pandas-udfs-in-pyspark-3-0-8f1fe4c15208

# Basic Packages

In [1]:
#!pip list |grep pyarrow

In [2]:
# import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import sys
import os
import numpy as np
from astropy.table import Table
from matplotlib.ticker import MultipleLocator
import pandas as pd

from astropy.utils.exceptions import AstropyWarning
import warnings

pd.set_option('display.max_rows', 300)
    
np.seterr(all='ignore')
warnings.simplefilter('ignore', category=AstropyWarning)

# https://github.com/gbrammer/eazy-py
import eazy

In [3]:
import numpy as np
import pandas as pd
import glob
import sys
import h5py
#from netCDF4 import Dataset
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree

import pyarrow as pa
import pyarrow.parquet as pq

from functools import reduce
import operator
import gc

In [4]:
# plot settings
#plt.rc('font', family='serif') 
#plt.rc('font', serif='Times New Roman') 
plt.rcParams.update({'font.size': 16})
plt.rcParams['mathtext.fontset'] = 'stix'

In [5]:
!pip list |grep eazy

  from pkg_resources import load_entry_point
eazy                      0.6.8               


# PySpark Session

In [6]:
%%time
# PySpark packages
from pyspark import SparkContext   
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W
from pyspark.sql.functions import pandas_udf, PandasUDFType


# Build (or reuse) the Spark session for this notebook. The chain is wrapped in
# parentheses so that no line-continuation backslashes are needed.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark-shell")
    # Driver-side limits.
    .config("spark.driver.maxResultSize", "32g")
    .config("spark.driver.memory", "32g")
    # Executor sizing: 50 executors, 1 core and 7g each.
    .config("spark.executor.memory", "7g")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "50")
    .getOrCreate()
)


# Keep a handle on the underlying SparkContext and set its checkpoint directory.
sc = spark.sparkContext
sc.setCheckpointDir("hdfs://spark00:54310/tmp/checkpoints")

# Runtime SQL settings: a larger field limit for debug strings, and the
# Arrow-related PySpark option switched on.
spark.conf.set("spark.sql.debug.maxToStringFields", 500)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

CPU times: user 14.6 ms, sys: 3.6 ms, total: 18.2 ms
Wall time: 26.2 s


## Quick Tutorial of PandasUDF

In [7]:
# Small demo table: seven (index, weight) rows spread over four index values.
dataframe = spark.createDataFrame(
    [(1, 5), (2, 7), (2, 8), (2, 10), (3, 18), (3, 22), (4, 36)],
    ["index", "weight"],
)

In [8]:
dataframe.show()

+-----+------+
|index|weight|
+-----+------+
|    1|     5|
|    2|     7|
|    2|     8|
|    2|    10|
|    3|    18|
|    3|    22|
|    4|    36|
+-----+------+



In [9]:
# The function definition and the UDF creation
@pandas_udf("int")
def weight_avg_udf(weight: pd.Series) -> float:
    """Return the mean of the ``weight`` values handed to the UDF.

    The function takes a pandas Series and returns a single number; in this
    notebook it is used as an aggregate in ``select``, in ``groupby().agg``
    and over a window.

    NOTE(review): the Spark return type is declared as ``"int"`` while the
    mean is a float. The outputs recorded in this notebook show whole numbers
    (e.g. 8 for index 2, whose weights 7, 8, 10 average 8.33). Consider
    ``"double"`` if fractional means are wanted.
    """
    average = weight.mean()
    return average

In [10]:
dataframe.select(weight_avg_udf(dataframe['weight'])).show()

+----------------------+
|weight_avg_udf(weight)|
+----------------------+
|                    15|
+----------------------+



In [11]:
# Aggregation Process on Pandas UDF
dataframe.groupby("index").agg(weight_avg_udf(dataframe['weight'])).show()

+-----+----------------------+
|index|weight_avg_udf(weight)|
+-----+----------------------+
|    1|                     5|
|    2|                     8|
|    3|                    20|
|    4|                    36|
+-----+----------------------+



In [12]:
# Print the windowed results
from pyspark.sql import Window
# Window covering every row that shares the same 'index' value: the frame runs
# from the first to the last row of each partition.
w = (
    Window.partitionBy('index')
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

In [13]:
dataframe.withColumn('avg_weight', weight_avg_udf(dataframe['weight']).over(w)).show()

+-----+------+----------+
|index|weight|avg_weight|
+-----+------+----------+
|    1|     5|         5|
|    2|     7|         8|
|    2|    10|         8|
|    2|     8|         8|
|    3|    18|        20|
|    3|    22|        20|
|    4|    36|        36|
+-----+------+----------+



#### Grouped map using `applyInPandas`, which is quite different from `apply` with PySpark's native UDF

In [14]:
def weight_map_udf(pandas_dataframe):
    """Centre the ``weight`` column of a pandas DataFrame on its own mean.

    Args:
        pandas_dataframe: pandas DataFrame with a ``weight`` column.

    Returns:
        A new DataFrame (the input is not modified) in which ``weight`` is
        replaced by ``weight - weight.mean()``; other columns are unchanged.
    """
    group_mean = pandas_dataframe.weight.mean()
    centred = pandas_dataframe.weight - group_mean
    return pandas_dataframe.assign(weight=centred)
# Run weight_map_udf once per "index" group and show the combined result.
# NOTE(review): the schema declares `weight` as int although weight_map_udf
# subtracts a mean; the recorded output below shows whole numbers (e.g. -1, 1, 0
# for index 2). Consider "weight double" if fractional values are wanted.
dataframe.groupby("index").applyInPandas(weight_map_udf, schema="index int, weight int").show()

+-----+------+
|index|weight|
+-----+------+
|    1|     0|
|    2|    -1|
|    2|     1|
|    2|     0|
|    3|     2|
|    3|    -2|
|    4|     0|
+-----+------+



> Learn more about UDF and PandasUDF!!