
### PySpark When Otherwise | SQL Case When Usage

In [0]:
import pyspark
from pyspark.sql import SparkSession

In [0]:
spark=SparkSession.builder \
      .master('local[*]') \
      .appName('SparkByExample') \
      .getOrCreate()  

In [0]:
data = [("James","M",60000),("Michael","M",70000),
        ("Robert",None,400000),("Maria","F",500000),
        ("Jen","",None)]

columns = ["name","gender","salary"]

df = spark.createDataFrame(data = data, schema=columns)
df.show()

+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  null|400000|
|  Maria|     F|500000|
|    Jen|      |  null|
+-------+------+------+




#### Using when() otherwise() on PySpark DataFrame.

when() funcion take 2 parameters, first param takes a conditionand second takes a literal value or column. If condition evaluates the it returns a value from second param.
When otherwise() not used and none of the conditions met it assigns None(null) value. Usage would be like 
`when(condition, value).otherwise(defaultvalue)`

In [0]:
from pyspark.sql.functions import when
df2=df.withColumn('new_gender', when(df.gender == 'M', 'Male')
                                .when(df.gender == 'F','Female')
                                .when(df.gender.isNull(), '')
                                .otherwise(df.gender))

df2.show()


+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



####Using Case When Else on DataFrame using withColumn() & select()

In [0]:
from pyspark.sql.functions import expr, col
df3 = df.withColumn("new_gender", expr("CASE WHEN gender = 'M' THEN 'Male' " + 
               "WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
               "ELSE gender END"))
df3.show(truncate=False)

+-------+------+------+----------+
|name   |gender|salary|new_gender|
+-------+------+------+----------+
|James  |M     |60000 |Male      |
|Michael|M     |70000 |Male      |
|Robert |null  |400000|          |
|Maria  |F     |500000|Female    |
|Jen    |      |null  |          |
+-------+------+------+----------+




####Using Case When on SQL Expression

In [0]:
df.createOrReplaceTempView('EMP')
# spark.sql("select name, " +
#           "case when gender = 'M'" +
#           "then 'Male' " +
#           "when gender='F' " +
#           "then 'Female'" +
#           "when gender is Null" +
#           "then '' " +
#           "ELSE gender " +
#           "END as new_gender from emp").show()

spark.sql("select name, CASE WHEN gender = 'M' THEN 'Male' " + 
               "WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
              "ELSE gender END as new_gender from EMP").show()

+-------+----------+
|   name|new_gender|
+-------+----------+
|  James|      Male|
|Michael|      Male|
| Robert|          |
|  Maria|    Female|
|    Jen|          |
+-------+----------+




####Multiple Conditions using & and | operator


In [0]:


# df5.withColumn(“new_column”, when((col(“code”) == “a”) | (col(“code”) == “d”), “A”)
# .when((col(“code”) == “b”) & (col(“amt”) == “4”), “B”)
# .otherwise(“A1”)).show()



###PySpark SQL expr() (Expression) Function


#### Concatenate Columns using || (similar to SQL)


In [0]:
data = [('james','Bond'), ('Scott', 'Varsa')]
df=spark.createDataFrame(data).toDF('firstname','lastname')
df.withColumn('Name', expr("firstname || ',' || lastname")).show()

+---------+--------+-----------+
|firstname|lastname|       Name|
+---------+--------+-----------+
|    james|    Bond| james,Bond|
|    Scott|   Varsa|Scott,Varsa|
+---------+--------+-----------+




####Using an Existing Column Value for Expression


In [0]:
from pyspark.sql.functions import add_months
data=[("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)] 
df=spark.createDataFrame(data).toDF("date","increment") 

df.select(col('date'),col('increment'), expr("add_months(date, increment)").alias('Inc_date')).show()


##### Giving Column Alias along with expr()

df.select(df.date, df.increment, expr("""add_months(date,increment) as inc_date""")).show()


+----------+---------+----------+
|      date|increment|  Inc_date|
+----------+---------+----------+
|2019-01-23|        1|2019-02-23|
|2019-06-24|        2|2019-08-24|
|2019-09-20|        3|2019-12-20|
+----------+---------+----------+

+----------+---------+----------+
|      date|increment|  inc_date|
+----------+---------+----------+
|2019-01-23|        1|2019-02-23|
|2019-06-24|        2|2019-08-24|
|2019-09-20|        3|2019-12-20|
+----------+---------+----------+



#### cast Function with expr()


In [0]:
df.select('increment',expr("cast(increment as string) as str_increment")).printSchema()
# df.select("increment",expr("cast(increment as string) as str_increment")).printSchema()

root
 |-- increment: long (nullable = true)
 |-- str_increment: string (nullable = true)



####Arithmetic operations



In [0]:
df.select(df.date, df.increment, expr('increment + 5 as new_increment')).show()

+----------+---------+-------------+
|      date|increment|new_increment|
+----------+---------+-------------+
|2019-01-23|        1|            6|
|2019-06-24|        2|            7|
|2019-09-20|        3|            8|
+----------+---------+-------------+



####Using Filter with expr()


In [0]:

#Use expr()  to filter the rows
from pyspark.sql.functions import expr
data=[(100,2),(200,3000),(500,500)] 
df=spark.createDataFrame(data).toDF("col1","col2") 
df.filter(expr("col1 == col2")).show()


+----+----+
|col1|col2|
+----+----+
| 500| 500|
+----+----+



### PySpark lit() – Add Literal or Constant to DataFrame


In [0]:
data = [("111",50000),("222",60000),("333",40000)]
columns= ["EmpId","Salary"]
df = spark.createDataFrame(data = data, schema = columns)


####1. Simple usage of lit() function

In [0]:
from pyspark.sql.functions import col, lit
df2=df.select(col('EmpId'), col('Salary'), lit('1').alias('lit_value_1'))


####lit() function with withColumn

In [0]:
from pyspark.sql.functions import when, lit, col

In [0]:
df3=df2.withColumn('lit_value2', when((col('Salary') >=40000) & (col('Salary') < 60000), lit('1000')).otherwise(lit('200')))

df3.show()

+-----+------+-----------+----------+
|EmpId|Salary|lit_value_1|lit_value2|
+-----+------+-----------+----------+
|  111| 50000|          1|      1000|
|  222| 60000|          1|       200|
|  333| 40000|          1|      1000|
+-----+------+-----------+----------+




###Pyspark Split

Split() function converts delimiter separated string to an array(stringType, ArrayType) column on dataframe

syntax: `pyspark.sql.functions.split(str, pattern, limit = -1)`

In [0]:
data = [("James, A, Smith","2018","M",3000),
            ("Michael, Rose, Jones","2010","M",4000),
            ("Robert,K,Williams","2010","M",4000),
            ("Maria,Anne,Jones","2005","F",4000),
            ("Jen,Mary,Brown","2010","",-1)
            ]

columns=["name","dob_year","gender","salary"]
df=spark.createDataFrame(data,columns)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- dob_year: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
from pyspark.sql.functions import split
df.withColumn('Name1', split(col('name'), ",")).show(truncate=False)


+--------------------+--------+------+------+------------------------+
|name                |dob_year|gender|salary|Name1                   |
+--------------------+--------+------+------+------------------------+
|James, A, Smith     |2018    |M     |3000  |[James,  A,  Smith]     |
|Michael, Rose, Jones|2010    |M     |4000  |[Michael,  Rose,  Jones]|
|Robert,K,Williams   |2010    |M     |4000  |[Robert, K, Williams]   |
|Maria,Anne,Jones    |2005    |F     |4000  |[Maria, Anne, Jones]    |
|Jen,Mary,Brown      |2010    |      |-1    |[Jen, Mary, Brown]      |
+--------------------+--------+------+------+------------------------+




####Convert String to Array Column using SQL Query


In [0]:
df.createOrReplaceTempView('Person')
spark.sql("select split(name, ',') as NameArray from person").show()

+--------------------+
|           NameArray|
+--------------------+
| [James,  A,  Smith]|
|[Michael,  Rose, ...|
|[Robert, K, Willi...|
|[Maria, Anne, Jones]|
|  [Jen, Mary, Brown]|
+--------------------+



### PySpark – concat_ws(),  Convert array column to a String

Syntax: `
concat_ws(sep, *cols)
`

Which takes delimiter as a first argument and array column as second argument


In [0]:
columns = ["name","languagesAtSchool","currentState"]
data = [("James,,Smith",["Java","Scala","C++"],"CA"), \
    ("Michael,Rose,",["Spark","Java","C++"],"NJ"), \
    ("Robert,,Williams",["CSharp","VB"],"NV")]

df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)

+----------------+------------------+------------+
|name            |languagesAtSchool |currentState|
+----------------+------------------+------------+
|James,,Smith    |[Java, Scala, C++]|CA          |
|Michael,Rose,   |[Spark, Java, C++]|NJ          |
|Robert,,Williams|[CSharp, VB]      |NV          |
+----------------+------------------+------------+



In [0]:
from pyspark.sql.functions import concat_ws, col

df2=df.withColumn('arrayType', concat_ws(',', col('languagesAtSchool'))).show()

+----------------+------------------+------------+--------------+
|            name| languagesAtSchool|currentState|     arrayType|
+----------------+------------------+------------+--------------+
|    James,,Smith|[Java, Scala, C++]|          CA|Java,Scala,C++|
|   Michael,Rose,|[Spark, Java, C++]|          NJ|Spark,Java,C++|
|Robert,,Williams|      [CSharp, VB]|          NV|     CSharp,VB|
+----------------+------------------+------------+--------------+




###Pyspark – substring()

substring() from a column

syntax: `substr(str, pos, len)`


In [0]:
from pyspark.sql.functions import substring
data = [(1,"20200828"),(2,"20180525")]
columns=["id","date"]
df=spark.createDataFrame(data,columns)

df.withColumn('year', substring('date',1,4)) \
   .withColumn('month', substring('date',5,2)) \
    .withColumn('day', substring('date', 7,2)).show()


#or

df.select('date', substring('date',1,4).alias('year'), \
                   substring('date', 5,2).alias('month'), \
                    substring('date', 7,2).alias('day')).show()


#or

#Using with selectExpr
df.selectExpr('date', 'substring(date, 1,4) as year', \
                  'substring(date, 5,2) as month', \
                  'substring(date, 7,2) as day').show()


+---+--------+----+-----+---+
| id|    date|year|month|day|
+---+--------+----+-----+---+
|  1|20200828|2020|   08| 28|
|  2|20180525|2018|   05| 25|
+---+--------+----+-----+---+

+--------+----+-----+---+
|    date|year|month|day|
+--------+----+-----+---+
|20200828|2020|   08| 28|
|20180525|2018|   05| 25|
+--------+----+-----+---+

+--------+----+-----+---+
|    date|year|month|day|
+--------+----+-----+---+
|20200828|2020|   08| 28|
|20180525|2018|   05| 25|
+--------+----+-----+---+




###PySpark – translate(), (PySpark Replace Column Values in DataFrame)

We can replace column value of pyspark Dataframe by using SQL string functions regexp_replace(), translate() and overlay()

In [0]:

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
address = [(1,"14851 Jeffrey Rd","DE"),
    (2,"43421 Margarita St","NY"),
    (3,"13111 Siemon Ave","CA")]
df =spark.createDataFrame(address,["id","address","state"])
df.show()


+---+------------------+-----+
| id|           address|state|
+---+------------------+-----+
|  1|  14851 Jeffrey Rd|   DE|
|  2|43421 Margarita St|   NY|
|  3|  13111 Siemon Ave|   CA|
+---+------------------+-----+



In [0]:

from pyspark.sql.functions import regexp_replace, when
df.withColumn('modified_addrees', regexp_replace('address', 'Rd', 'Road')).show()

##Replace Column Values Conditionally
df.withColumn('address', 
              when(df.address.endswith('Rd'), regexp_replace(df.address, 'Rd', 'Road')) \
              .when(df.address.endswith('St'), regexp_replace(df.address, 'St', 'Street')) \
              .when(df.address.endswith('Ave'), regexp_replace(df.address, 'Ave', 'Avenue')) \
              .otherwise(df.address)) \
               .show(truncate=False) 

+---+------------------+-----+------------------+
| id|           address|state|  modified_addrees|
+---+------------------+-----+------------------+
|  1|  14851 Jeffrey Rd|   DE|14851 Jeffrey Road|
|  2|43421 Margarita St|   NY|43421 Margarita St|
|  3|  13111 Siemon Ave|   CA|  13111 Siemon Ave|
+---+------------------+-----+------------------+

+---+----------------------+-----+
|id |address               |state|
+---+----------------------+-----+
|1  |14851 Jeffrey Road    |DE   |
|2  |43421 Margarita Street|NY   |
|3  |13111 Siemon Avenue   |CA   |
+---+----------------------+-----+




####Replace Column Value with Dictionary (map)


In [0]:
stateDic={'CA':'California','NY':'New York','DE':'Delaware'}
df2=df.rdd.map(lambda x: 
    (x.id,x.address,stateDic[x.state]) 
    ).toDF(["id","address","state"])
df2.show()

+---+------------------+----------+
| id|           address|     state|
+---+------------------+----------+
|  1|  14851 Jeffrey Rd|  Delaware|
|  2|43421 Margarita St|  New York|
|  3|  13111 Siemon Ave|California|
+---+------------------+----------+



###Replace Column Value Character by Character using translate()

In [0]:
from pyspark.sql.functions import translate
df.withColumn('modified_new_column', translate(df.address, '123','ABC')).show()

+---+------------------+-----+-------------------+
| id|           address|state|modified_new_column|
+---+------------------+-----+-------------------+
|  1|  14851 Jeffrey Rd|   DE|   A485A Jeffrey Rd|
|  2|43421 Margarita St|   NY| 4C4BA Margarita St|
|  3|  13111 Siemon Ave|   CA|   ACAAA Siemon Ave|
+---+------------------+-----+-------------------+



####Using overlay() Function

In [0]:
from pyspark.sql.functions import overlay
df = spark.createDataFrame([("ABCDE_XYZ", "FGH")], ("col1", "col2"))
df.select(overlay('col1','col2', 7).alias('overlayed')).show()

+---------+
|overlayed|
+---------+
|ABCDE_FGH|
+---------+




###PySpark to_timestamp() – Convert String to Timestamp type

In [0]:
from pyspark.sql.functions import *

df=spark.createDataFrame(data = [('1', '2019-06-24 12:01:19.000')],
                         schema = ['id', 'timestamp'])

df.printSchema()

df_timestamp = df.withColumn('Timestamp_type_column', to_timestamp('timestamp'))
df_timestamp.show(truncate=False)


###SQL Example

spark.sql("select to_timestamp('2019-06-24 12:01:19.000') as timestamp").show()


root
 |-- id: string (nullable = true)
 |-- timestamp: string (nullable = true)

+---+-----------------------+---------------------+
|id |timestamp              |Timestamp_type_column|
+---+-----------------------+---------------------+
|1  |2019-06-24 12:01:19.000|2019-06-24 12:01:19  |
+---+-----------------------+---------------------+

+-------------------+
|          timestamp|
+-------------------+
|2019-06-24 12:01:19|
+-------------------+




###PySpark to_date() – Convert Timestamp to Date

In [0]:
from pyspark.sql.functions import *
df.withColumn('date_type', to_date('timestamp')).show(truncate=False)
df.withColumn('date_type', to_date(current_timestamp())).show()


#Convert TimestampType (timestamp) to DateType (date)
df.withColumn('ts', to_timestamp('timestamp')) \
    .withColumn('datetype', to_date('ts')).show()

#Using Column cast() Function
df.withColumn('date_type', to_timestamp('timestamp').cast('date')).show(truncate=False)

+---+-----------------------+----------+
|id |timestamp              |date_type |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2019-06-24|
+---+-----------------------+----------+

+---+--------------------+----------+
| id|           timestamp| date_type|
+---+--------------------+----------+
|  1|2019-06-24 12:01:...|2023-09-12|
+---+--------------------+----------+

+---+--------------------+-------------------+----------+
| id|           timestamp|                 ts|  datetype|
+---+--------------------+-------------------+----------+
|  1|2019-06-24 12:01:...|2019-06-24 12:01:19|2019-06-24|
+---+--------------------+-------------------+----------+

+---+-----------------------+----------+
|id |timestamp              |date_type |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2019-06-24|
+---+-----------------------+----------+




###PySpark date_format() – Convert Date to String format
In PySpark use date_format() function to convert the DataFrame column from Date to String format
syntax: `
Syntax:  date_format(column,format)
Example: date_format(current_timestamp(),"yyyy MM dd").alias("date_format")
`

In [0]:
from pyspark.sql.functions import *
# df=spark.createDataFrame([], StructType([])).show()
df=spark.createDataFrame([["1"]],["id"])
df.select(current_date().alias('current_date'), \
    date_format(current_timestamp(), 'yyyy MM dd').alias('yyyy MM dd'), \
    date_format(current_timestamp(), 'MM/dd/yyyy hh:mm').alias('MM/dd/yyyy hh:mm'), \
    date_format(current_timestamp(), 'yyyy MMMM dd').alias('yyyy MMMM dd'), \
    date_format(current_timestamp(), 'yyyy MMMM dd E').alias('yyyy MMMM dd E')  \
    ).show(truncate=False)


#SQL
spark.sql("select current_date() as current_date, "+
      "date_format(current_timestamp(),'yyyy MM dd') as yyyy_MM_dd, "+
      "date_format(current_timestamp(),'MM/dd/yyyy hh:mm') as MM_dd_yyyy, "+
      "date_format(current_timestamp(),'yyyy MMM dd') as yyyy_MMMM_dd, "+
      "date_format(current_timestamp(),'yyyy MMMM dd E') as yyyy_MMMM_dd_E").show()


+------------+----------+----------------+-----------------+---------------------+
|current_date|yyyy MM dd|MM/dd/yyyy hh:mm|yyyy MMMM dd     |yyyy MMMM dd E       |
+------------+----------+----------------+-----------------+---------------------+
|2023-09-12  |2023 09 12|09/12/2023 11:01|2023 September 12|2023 September 12 Tue|
+------------+----------+----------------+-----------------+---------------------+

+------------+----------+----------------+------------+--------------------+
|current_date|yyyy_MM_dd|      MM_dd_yyyy|yyyy_MMMM_dd|      yyyy_MMMM_dd_E|
+------------+----------+----------------+------------+--------------------+
|  2023-09-12|2023 09 12|09/12/2023 11:01| 2023 Sep 12|2023 September 12...|
+------------+----------+----------------+------------+--------------------+



### PySpark – Difference between two dates (days, months, years)
 datediff(), months_between()

####datediff() Function


In [0]:
from pyspark.sql.functions import *
data = [("1","2019-07-01"),("2","2019-06-24"),("3","2019-08-24")]
df=spark.createDataFrame(data=data,schema=["id","date"])

In [0]:
df.withColumn('diff in two dates', datediff(current_date(), col('date'))).show()

+---+----------+-----------------+
| id|      date|diff in two dates|
+---+----------+-----------------+
|  1|2019-07-01|             1534|
|  2|2019-06-24|             1541|
|  3|2019-08-24|             1480|
+---+----------+-----------------+



####months_between() Function


In [0]:
df.select(col('date'), months_between(current_date(), col('date')).alias('diff in months')).show()



from pyspark.sql.functions import *
df.withColumn("datesDiff", datediff(current_date(),col("date"))) \
  .withColumn("montsDiff", months_between(current_date(),col("date"))) \
  .withColumn("montsDiff_round",round(months_between(current_date(),col("date")),2)) \
  .withColumn("yearsDiff",months_between(current_date(),col("date"))/lit(12)) \
  .withColumn("yearsDiff_round",round(months_between(current_date(),col("date"))/lit(12),2)) \
  .show()


+----------+--------------+
|      date|diff in months|
+----------+--------------+
|2019-07-01|   50.35483871|
|2019-06-24|   50.61290323|
|2019-08-24|   48.61290323|
+----------+--------------+

+---+----------+---------+-----------+---------------+-----------------+---------------+
| id|      date|datesDiff|  montsDiff|montsDiff_round|        yearsDiff|yearsDiff_round|
+---+----------+---------+-----------+---------------+-----------------+---------------+
|  1|2019-07-01|     1534|50.35483871|          50.35|4.196236559166667|            4.2|
|  2|2019-06-24|     1541|50.61290323|          50.61|4.217741935833334|           4.22|
|  3|2019-08-24|     1480|48.61290323|          48.61|4.051075269166667|           4.05|
+---+----------+---------+-----------+---------------+-----------------+---------------+



In [0]:

spark.sql("select round(months_between('2019-07-01',current_date())/12,2) as years_diff").show()


+----------+
|years_diff|
+----------+
|      -4.2|
+----------+

