In [0]:
import pyspark

from pyspark.sql import SparkSession

# Build the SparkSession for this notebook, or reuse one that already exists
# (getOrCreate). The master URL 'local[*]' requests a local, in-process Spark.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('SparkByExample')
    .getOrCreate()
)


### PySpark UDF (User Defined Function)


* **Why do we need a UDF**: UDFs are used to extend the built-in functions of the framework, and they can be reused across multiple DataFrames.


In [0]:

from pyspark.sql import SparkSession

# Sample rows: a sequence number (stored as a string) and a lower-case full name.
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
]
columns = ["Seqno", "Name"]

# Build the DataFrame from the rows above, using `columns` as the column names.
df = spark.createDataFrame(data=data, schema=columns)

# truncate=False prints full cell values instead of shortening long strings.
df.show(truncate=False)


+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+



In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def convertCase(x):
    """Return ``x`` with its first character upper-cased; the rest is unchanged.

    Args:
        x: The string to convert, or ``None``.

    Returns:
        The converted string. An empty string is returned unchanged, and
        ``None`` is returned as ``None``.
    """
    # Spark hands null cells to Python UDFs as None; slicing None would raise
    # TypeError and fail the whole job, so pass nulls through untouched.
    if x is None:
        return None
    return x[:1].upper() + x[1:]

# Wrap convertCase as a Spark UDF whose declared return type is a string.
convertcaseudf = udf(lambda name: convertCase(name), returnType=StringType())



In [0]:
from pyspark.sql.functions import col
# Select the sequence number together with the name passed through
# convertcaseudf; the converted column keeps the alias 'Name'.
(
    df.select(
        col('seqno'),
        convertcaseudf(col('Name')).alias('Name'),
    )
    .show(truncate=False)
)

+-----+------------+
|seqno|Name        |
+-----+------------+
|1    |John jones  |
|2    |Tracey smith|
|3    |Amy sanders |
+-----+------------+



In [0]:
def upperCase(x):
    """Return ``x`` converted to upper case.

    Args:
        x: The string to convert, or ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``x`` is ``None``.
    """
    # Spark hands null cells to Python UDFs as None; calling .upper() on None
    # would raise AttributeError, so pass nulls through untouched.
    if x is None:
        return None
    return x.upper()

# Wrap upperCase as a Spark UDF whose declared return type is a string.
uppercaseUDF = udf(lambda z: upperCase(z), StringType())

# Add a column holding the upper-cased version of Name.
df1 = df.withColumn("Modified name", uppercaseUDF(df.Name))

# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line after the table.
df1.show()

+-----+------------+-------------+
|Seqno|        Name|Modified name|
+-----+------------+-------------+
|    1|  john jones|   JOHN JONES|
|    2|tracey smith| TRACEY SMITH|
|    3| amy sanders|  AMY SANDERS|
+-----+------------+-------------+

None



### Creating UDF using annotation:

Using the `@udf` annotation (decorator), we can combine two steps into one: i) creating a Python function, and ii) converting that function to a UDF with the SQL `udf()` function.

In [0]:
# NOTE(review): this rebinds the name `upperCase`, which an earlier cell defines
# as a plain Python function; after this cell it refers to the UDF instead.
@udf(returnType=StringType())
def upperCase(str):
    """Return the input string converted to upper case.

    Args:
        str: The string to convert, or ``None``. The parameter name shadows
            the built-in ``str`` inside this function; it is kept unchanged
            so the function's signature stays the same.

    Returns:
        The upper-cased string, or ``None`` when the input is ``None``.
    """
    # Null cells arrive as None; return them as-is instead of raising.
    if str is None:
        return None
    return str.upper()


# Apply the decorated UDF directly to a column.
df2 = df.withColumn("Upper Name", upperCase(df.Name))
df2.show()

+-----+------------+------------+
|Seqno|        Name|  Upper Name|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



In [0]:
# Registering a PySpark UDF and using it in SQL

# Expose the DataFrame to SQL as the temporary view "Name_Table".
df.createOrReplaceTempView("Name_Table")

# Register convertCase under the SQL function name "convertCase".
spark.udf.register("convertCase", convertCase, StringType())

# Using the UDF in a SQL query
spark.sql("select Seqno, convertCase(Name) as name from Name_Table").show()

+-----+------------+
|Seqno|        name|
+-----+------------+
|    1|  John jones|
|    2|Tracey smith|
|    3| Amy sanders|
+-----+------------+




### PySpark transform() Function with Example

syntax :
```

# Syntax
DataFrame.transform(func: Callable[[…], DataFrame], *args: Any, **kwargs: Any) → pyspark.sql.dataframe.DataFrame

```

In [0]:

# Imports
from pyspark.sql import SparkSession

# Create SparkSession
# Create SparkSession (getOrCreate returns the existing session if there is one)
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Prepare Data: one (course name, fee, discount) tuple per row
simpleData = (
    ("Java", 4000, 5),
    ("Python", 4600, 10),
    ("Scala", 4100, 15),
    ("Scala", 4500, 15),
    ("PHP", 3000, 20),
)
columns = ["CourseName", "fee", "discount"]

# Create DataFrame
# NOTE: this rebinds `df` and `columns`, which earlier cells used for the
# Seqno/Name example.
df = spark.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)


root
 |-- CourseName: string (nullable = true)
 |-- fee: long (nullable = true)
 |-- discount: long (nullable = true)

+----------+----+--------+
|CourseName|fee |discount|
+----------+----+--------+
|Java      |4000|5       |
|Python    |4600|10      |
|Scala     |4100|15      |
|Scala     |4500|15      |
|PHP       |3000|20      |
+----------+----+--------+



In [0]:

#custom transformation 1
from pyspark.sql.functions import upper 
def to_upper_str_columns(df):
    """Return ``df`` with the course name upper-cased.

    The upper-cased values of ``df.CourseName`` are written with
    ``withColumn`` under the column name 'coursename'.

    Args:
        df: DataFrame that has a ``CourseName`` column.

    Returns:
        The DataFrame produced by ``withColumn``.
    """
    upper_name = upper(df.CourseName)
    return df.withColumn('coursename', upper_name)

#custom transformation 2
def reduce_price(df, reduceBy):
    """Return ``df`` with a 'new_fee' column equal to ``fee - reduceBy``.

    Args:
        df: DataFrame that has a ``fee`` column.
        reduceBy: Amount subtracted from every fee.

    Returns:
        The DataFrame produced by ``withColumn``.
    """
    lowered_fee = df.fee - reduceBy
    return df.withColumn('new_fee', lowered_fee)

#custom transformation 3
def apply_discount(df):
    """Return ``df`` with a 'discounted_fee' column.

    The new column is ``new_fee - (new_fee * discount) / 100``.

    Args:
        df: DataFrame that has ``new_fee`` and ``discount`` columns.

    Returns:
        The DataFrame produced by ``withColumn``.
    """
    discount_amount = (df.new_fee * df.discount) / 100
    return df.withColumn('discounted_fee', df.new_fee - discount_amount)

#Apply dataframe transform

# Chain the three custom transformations; the extra positional argument
# (1000) is passed through to reduce_price as `reduceBy`.
df2 = (
    df.transform(to_upper_str_columns)
    .transform(reduce_price, 1000)
    .transform(apply_discount)
)

df2.show()


+----------+----+--------+-------+--------------+
|coursename| fee|discount|new_fee|discounted_fee|
+----------+----+--------+-------+--------------+
|      JAVA|4000|       5|   3000|        2850.0|
|    PYTHON|4600|      10|   3600|        3240.0|
|     SCALA|4100|      15|   3100|        2635.0|
|     SCALA|4500|      15|   3500|        2975.0|
|       PHP|3000|      20|   2000|        1600.0|
+----------+----+--------+-------+--------------+



### map() and flatmap()

* **map()**:
  - The map() transformation applies a given function to each element of an RDD and returns a new RDD with the results.
  - It produces a **one-to-one mapping**, meaning that for each input element, the function generates exactly one output element.
* **flatmap()**:
  - The flatMap() transformation applies a given function to each element of an RDD and returns a new RDD by flattening the results.
  - It produces a **one-to-many mapping**, meaning that for each input element, the function can generate zero, one, or multiple output elements.
  - It is not supported on PySpark DataFrames; it is supported only on RDDs. For a PySpark DataFrame, we can use the `explode` function instead.

In [0]:
# map() vs flatMap()

# `sc` is not defined anywhere in this notebook, so take the SparkContext from
# the SparkSession created above instead of relying on a predefined global.
sc = spark.sparkContext

# map() example (kept for reference, not executed):
# rdd=sc.parallelize([12,21,31,41,51])
# sq_rdd=rdd.map(lambda x: x**2)
# sq_rdd.collect()

# flatMap(): each 'a-b' string is split on '-', and the resulting pieces are
# flattened into a single RDD of strings.
rdd1 = sc.parallelize(['1-2', '2-1', '1-3', '4-1'])
rdd_flat_map = rdd1.flatMap(lambda x: x.split('-'))
rdd_flat_map.collect()

Out[26]: ['1', '2', '2', '1', '1', '3', '4', '1']


### foreach() 

- It is used to apply a function to each element of an RDD/DF without returning a new RDD/DF, i.e., it is an **action**.
- syntax: `rdd.foreach(func)`
- the foreach() action in PySpark does not return any values, and it does not necessarily print to the console. Instead, it is typically used for side effects like updating external resources.


In [0]:
# `sc` is not defined anywhere in this notebook, so take the SparkContext from
# the SparkSession created above instead of relying on a predefined global.
sc = spark.sparkContext

rdd = sc.parallelize([1, 2, 3, 4, 5])


def print_element(ele):
    """Print a single RDD element; used only for its side effect."""
    print(ele)


# foreach() is an action that returns nothing. As the note above this cell
# says, the printed values do not necessarily appear in the notebook output.
rdd.foreach(print_element)