In [1]:
import os
import sys
from pyspark.sql import SparkSession

# Location of the winutils/Hadoop distribution that Spark needs on Windows.
# An already-exported HADOOP_HOME takes precedence so the notebook also runs on
# machines where Hadoop lives elsewhere; otherwise fall back to the local path.
hadoop_home = os.environ.get(
    "HADOOP_HOME",
    "C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2",
)
hadoop_bin = os.path.join(hadoop_home, "bin")

os.environ["HADOOP_HOME"] = hadoop_home
# Interpreter that Spark launches for its Python workers.
os.environ["PYSPARK_PYTHON"] = "python"

# winutils.exe / hadoop.dll are native binaries, so their folder has to be on
# the OS search path (PATH). Appending it to sys.path, as before, only affects
# Python module imports and does nothing for locating native executables.
if hadoop_bin not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = hadoop_bin + os.pathsep + os.environ.get("PATH", "")

In [2]:
# Build (or reuse) a local SparkSession with Hive support enabled.
# "spark.ui.port" = "0" asks Spark to bind its web UI to any free port.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Pyspark - Part 2')
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import *

### Create dummy data

In [4]:
# Oracle-style dummy data (as discussed in the Spark SQL part):
# a list holding one row, and that row has one column with the value 'X'.
# Each row must be a tuple -- the trailing comma is what makes ('X',) a
# 1-tuple instead of the plain string 'X'.
l = [
    ('X',),
]

In [7]:
# Turn the one-row list into a DataFrame with a single string column "dummy".
df = spark.createDataFrame(l, schema="dummy STRING")

In [8]:
# Print the rows of the dummy DataFrame as a text table.
df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [10]:
# Print column names, types and nullability of the dummy DataFrame.
df.printSchema()

root
 |-- dummy: string (nullable = true)



In [12]:
# Sample employee rows, one tuple per employee, in the order:
# (emp_id, f_name, l_name, sal, country, ph_num, ssn)
employees = [
    (1, "Scott", "Tiger", 1000.0, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [13]:
 # Number of sample rows (employees is a plain Python list at this point).
 len(employees)

4

In [14]:
# Build the employee DataFrame; the schema is given as a DDL string whose
# column order matches the tuple layout of `employees`.
emp_dummy_df = spark.createDataFrame(
    data=employees,
    schema=""" 
    emp_id INT,f_name STRING,l_name STRING,sal FLOAT,
    country STRING,ph_num STRING,ssn STRING
    """,
)

In [15]:
# Confirm the column types that were declared in the DDL schema string.
emp_dummy_df.printSchema()

root
 |-- emp_id: integer (nullable = true)
 |-- f_name: string (nullable = true)
 |-- l_name: string (nullable = true)
 |-- sal: float (nullable = true)
 |-- country: string (nullable = true)
 |-- ph_num: string (nullable = true)
 |-- ssn: string (nullable = true)



In [16]:
# Print the employee rows as a text table.
emp_dummy_df.show()

+------+------+------+------+--------------+----------------+-----------+
|emp_id|f_name|l_name|   sal|       country|          ph_num|        ssn|
+------+------+------+------+--------------+----------------+-----------+
|     1| Scott| Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|     2| Henry|  Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|     3|  Nick|Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|     4|  Bill| Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+------+------+------+------+--------------+----------------+-----------+



## Categories of Functions
These are the 4 broad categories. In total there are over 300 built-in functions.
- String Manipulation
- Date Manipulation
- Aggregate
- Other Functions

### Special Functions: col and lit

In [17]:
# Column names can be passed as plain strings to select, groupBy, orderBy, etc.
(
    emp_dummy_df
    .select("f_name", "country")
    .show(5)
)

+------+--------------+
|f_name|       country|
+------+--------------+
| Scott| united states|
| Henry|         India|
|  Nick|united KINGDOM|
|  Bill|     AUSTRALIA|
+------+--------------+



In [18]:
# col() turns a column name into a Column object. For a plain select this
# gives the same result as passing the name as a string; a Column object is
# what you need once you want to call Column methods or use operators on it.
from pyspark.sql.functions import col

(
    emp_dummy_df
    .select(col("f_name"), col("country"))
    .show(5)
)

+------+--------------+
|f_name|       country|
+------+--------------+
| Scott| united states|
| Henry|         India|
|  Nick|united KINGDOM|
|  Bill|     AUSTRALIA|
+------+--------------+



In [19]:
# Functions such as upper() accept the column name as a string as well as a
# Column object, so this cell and the col() variant in the next cell print
# the same result.
from pyspark.sql.functions import upper

(
    emp_dummy_df
    .select(upper("f_name"), upper("country"))
    .show(5)
)

+-------------+--------------+
|upper(f_name)|upper(country)|
+-------------+--------------+
|        SCOTT| UNITED STATES|
|        HENRY|         INDIA|
|         NICK|UNITED KINGDOM|
|         BILL|     AUSTRALIA|
+-------------+--------------+



In [20]:
# Same transformation as above, this time passing Column objects built by col().
(
    emp_dummy_df
    .select(upper(col("f_name")), upper(col("country")))
    .show(5)
)

+-------------+--------------+
|upper(f_name)|upper(country)|
+-------------+--------------+
|        SCOTT| UNITED STATES|
|        HENRY|         INDIA|
|         NICK|UNITED KINGDOM|
|         BILL|     AUSTRALIA|
+-------------+--------------+



In [21]:
# Another way to reference a column:
# indexing the DataFrame by name (df_name[col_name]) also returns a Column.
(
    emp_dummy_df
    .select(emp_dummy_df["f_name"], col("country"))
    .show(5)
)

+------+--------------+
|f_name|       country|
+------+--------------+
| Scott| united states|
| Henry|         India|
|  Nick|united KINGDOM|
|  Bill|     AUSTRALIA|
+------+--------------+



In [24]:
# lit() wraps a constant value as a literal column.
# Bare strings are treated as column names in select, groupBy, etc., so a
# literal passed as a plain string would be searched for among the column
# names; wrapping it in lit() avoids that.
from pyspark.sql.functions import concat, lit

(
    emp_dummy_df
    .select(concat(emp_dummy_df["f_name"], lit('-'), col("country")))
    .show(5)
)

+--------------------------+
|concat(f_name, -, country)|
+--------------------------+
|       Scott-united states|
|               Henry-India|
|       Nick-united KINGDOM|
|            Bill-AUSTRALIA|
+--------------------------+



### Common string manipulation functions
- concat
- upper, lower, initcap, length
- substring
- split
- padding
- trimming

In [38]:
# concat joins the two name columns (no separator) and alias names the result.
(
    emp_dummy_df
    .select(concat("f_name", "l_name").alias("fullName"), "country")
    .show(5)
)

+----------+--------------+
|  fullName|       country|
+----------+--------------+
|ScottTiger| united states|
| HenryFord|         India|
|NickJunior|united KINGDOM|
| BillGomes|     AUSTRALIA|
+----------+--------------+



In [29]:
# withColumn keeps every existing column and appends the derived one;
# lit(' ') supplies the space between first and last name.
(
    emp_dummy_df
    .withColumn("fullName", concat("f_name", lit(' '), "l_name"))
    .show(5)
)

+------+------+------+------+--------------+----------------+-----------+-----------+
|emp_id|f_name|l_name|   sal|       country|          ph_num|        ssn|   fullName|
+------+------+------+------+--------------+----------------+-----------+-----------+
|     1| Scott| Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|Scott Tiger|
|     2| Henry|  Ford|1250.0|         India|+91 234 567 8901|456 78 9123| Henry Ford|
|     3|  Nick|Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|Nick Junior|
|     4|  Bill| Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118| Bill Gomes|
+------+------+------+------+--------------+----------------+-----------+-----------+



In [34]:
# Case conversion (upper / lower / initcap) and length, all applied to country.
(
    emp_dummy_df
    .withColumn("fullName", concat("f_name", lit(' '), "l_name"))
    .select("emp_id", "fullName", "country")
    .withColumn("upperCountry", upper(col("country")))
    .withColumn("lowerCountry", lower(col("country")))
    .withColumn("camelCountry", initcap(col("country")))
    .withColumn("lengthOfCountry", length(col("country")))
    .show()
)

+------+-----------+--------------+--------------+--------------+--------------+---------------+
|emp_id|   fullName|       country|  upperCountry|  lowerCountry|  camelCountry|lengthOfCountry|
+------+-----------+--------------+--------------+--------------+--------------+---------------+
|     1|Scott Tiger| united states| UNITED STATES| united states| United States|             13|
|     2| Henry Ford|         India|         INDIA|         india|         India|              5|
|     3|Nick Junior|united KINGDOM|UNITED KINGDOM|united kingdom|United Kingdom|             14|
|     4| Bill Gomes|     AUSTRALIA|     AUSTRALIA|     australia|     Australia|              9|
+------+-----------+--------------+--------------+--------------+--------------+---------------+



In [36]:
# substring(column, position, length)
# A negative position counts from the end of the string.
# ssn has a fixed layout: 3 digits, 2 digits, 4 digits (space separated).
# Note: cast("int") turns the extracted text into a number, so a value with
# leading zeros would lose them.
(
    emp_dummy_df
    .withColumn("fullName", concat("f_name", lit(' '), "l_name"))
    .select("emp_id", "fullName", "country", "ph_num", "ssn")
    .withColumn("ssn_last4", substring(col("ssn"), -4, 4).cast("int"))
    .withColumn("phone_lat4", substring(col("ph_num"), -4, 4).cast("int"))
    .show()
)

+------+-----------+--------------+----------------+-----------+---------+----------+
|emp_id|   fullName|       country|          ph_num|        ssn|ssn_last4|phone_lat4|
+------+-----------+--------------+----------------+-----------+---------+----------+
|     1|Scott Tiger| united states| +1 123 456 7890|123 45 6789|     6789|      7890|
|     2| Henry Ford|         India|+91 234 567 8901|456 78 9123|     9123|      8901|
|     3|Nick Junior|united KINGDOM|+44 111 111 1111|222 33 4444|     4444|      1111|
|     4| Bill Gomes|     AUSTRALIA|+61 987 654 3210|789 12 6118|     6118|      3210|
+------+-----------+--------------+----------------+-----------+---------+----------+



In [39]:
# split separates a string on a delimiter.
# Same employees as before, but ph_num now holds two comma-separated numbers:
# (emp_id, f_name, l_name, sal, country, ph_num, ssn)
employees_with_multiple_num = [
    (1, "Scott", "Tiger", 1000.0, "united states",
     "+1 123 456 7890, +91 234 567 8901", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, "India",
     "+91 234 567 8901, +44 111 111 1111", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM",
     "+44 111 111 1111, +61 987 654 3210", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA",
     "+61 987 654 3210, +1 123 456 7890", "789 12 6118"),
]

# Same DDL schema as emp_dummy_df; only the ph_num contents differ.
emp_dummy_phone_df = spark.createDataFrame(
    data=employees_with_multiple_num,
    schema=""" 
    emp_id INT,f_name STRING,l_name STRING,sal FLOAT,
    country STRING,ph_num STRING,ssn STRING
    """,
)

In [41]:
# truncate=False prints the full cell contents instead of shortening long values.
emp_dummy_phone_df.show(truncate=False)

+------+------+------+------+--------------+----------------------------------+-----------+
|emp_id|f_name|l_name|sal   |country       |ph_num                            |ssn        |
+------+------+------+------+--------------+----------------------------------+-----------+
|1     |Scott |Tiger |1000.0|united states |+1 123 456 7890, +91 234 567 8901 |123 45 6789|
|2     |Henry |Ford  |1250.0|India         |+91 234 567 8901, +44 111 111 1111|456 78 9123|
|3     |Nick  |Junior|750.0 |united KINGDOM|+44 111 111 1111, +61 987 654 3210|222 33 4444|
|4     |Bill  |Gomes |1500.0|AUSTRALIA     |+61 987 654 3210, +1 123 456 7890 |789 12 6118|
+------+------+------+------+--------------+----------------------------------+-----------+



In [42]:
# Default show() shortens the long ph_num values (they end in "...").
(
    emp_dummy_phone_df
    .select(
        "emp_id",
        concat("f_name", lit("_"), "l_name").alias("FullName"),
        "ph_num",
        "ssn",
    )
    .show()
)

+------+-----------+--------------------+-----------+
|emp_id|   FullName|              ph_num|        ssn|
+------+-----------+--------------------+-----------+
|     1|Scott_Tiger|+1 123 456 7890, ...|123 45 6789|
|     2| Henry_Ford|+91 234 567 8901,...|456 78 9123|
|     3|Nick_Junior|+44 111 111 1111,...|222 33 4444|
|     4| Bill_Gomes|+61 987 654 3210,...|789 12 6118|
+------+-----------+--------------------+-----------+



In [47]:
# Get the individual phone numbers:
# split breaks ph_num on the comma, explode emits one row per piece.
# The piece after the comma keeps its leading space.
(
    emp_dummy_phone_df
    .select(
        "emp_id",
        concat("f_name", lit("_"), "l_name").alias("FullName"),
        "ph_num",
        "ssn",
    )
    .select("*", explode(split("ph_num", ',')).alias("individual_phnum"))
    .show(truncate=False)
)

+------+-----------+----------------------------------+-----------+-----------------+
|emp_id|FullName   |ph_num                            |ssn        |individual_phnum |
+------+-----------+----------------------------------+-----------+-----------------+
|1     |Scott_Tiger|+1 123 456 7890, +91 234 567 8901 |123 45 6789|+1 123 456 7890  |
|1     |Scott_Tiger|+1 123 456 7890, +91 234 567 8901 |123 45 6789| +91 234 567 8901|
|2     |Henry_Ford |+91 234 567 8901, +44 111 111 1111|456 78 9123|+91 234 567 8901 |
|2     |Henry_Ford |+91 234 567 8901, +44 111 111 1111|456 78 9123| +44 111 111 1111|
|3     |Nick_Junior|+44 111 111 1111, +61 987 654 3210|222 33 4444|+44 111 111 1111 |
|3     |Nick_Junior|+44 111 111 1111, +61 987 654 3210|222 33 4444| +61 987 654 3210|
|4     |Bill_Gomes |+61 987 654 3210, +1 123 456 7890 |789 12 6118|+61 987 654 3210 |
|4     |Bill_Gomes |+61 987 654 3210, +1 123 456 7890 |789 12 6118| +1 123 456 7890 |
+------+-----------+----------------------------------+-----------+-----------------+

In [50]:
# Get the area code of each phone number:
# trim removes the surrounding spaces left by the comma split, then the
# number is split on spaces and element [1] (the part after the country
# code) is taken.
(
    emp_dummy_phone_df
    .select(
        "emp_id",
        concat("f_name", lit("_"), "l_name").alias("FullName"),
        "ph_num",
        "ssn",
    )
    .select("*", explode(split("ph_num", ',')).alias("individual_phnum"))
    .withColumn("AreaCode", split(trim("individual_phnum"), ' ')[1])
    .show(truncate=False)
)

+------+-----------+----------------------------------+-----------+-----------------+--------+
|emp_id|FullName   |ph_num                            |ssn        |individual_phnum |AreaCode|
+------+-----------+----------------------------------+-----------+-----------------+--------+
|1     |Scott_Tiger|+1 123 456 7890, +91 234 567 8901 |123 45 6789|+1 123 456 7890  |123     |
|1     |Scott_Tiger|+1 123 456 7890, +91 234 567 8901 |123 45 6789| +91 234 567 8901|234     |
|2     |Henry_Ford |+91 234 567 8901, +44 111 111 1111|456 78 9123|+91 234 567 8901 |234     |
|2     |Henry_Ford |+91 234 567 8901, +44 111 111 1111|456 78 9123| +44 111 111 1111|111     |
|3     |Nick_Junior|+44 111 111 1111, +61 987 654 3210|222 33 4444|+44 111 111 1111 |111     |
|3     |Nick_Junior|+44 111 111 1111, +61 987 654 3210|222 33 4444| +61 987 654 3210|987     |
|4     |Bill_Gomes |+61 987 654 3210, +1 123 456 7890 |789 12 6118|+61 987 654 3210 |987     |
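|4     |Bill_Gomes |+61 987 654 3210, +1 123 456 7890 |789 12 6118| +1 123 456 7890 |123     |
+------+-----------+----------------------------------+-----------+-----------------+--------+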

In [55]:
# Number of phone numbers per employee:
# one row per number (split + explode), then count the rows for each emp_id.
(
    emp_dummy_phone_df
    .select(
        "emp_id",
        concat("f_name", lit(" "), "l_name").alias("FullName"),
        "ph_num",
        "ssn",
    )
    .select("*", explode(split("ph_num", ',')).alias("individual_phnum"))
    .groupBy("emp_id")
    .count()
    .show(truncate=False)
)

+------+-----+
|emp_id|count|
+------+-----+
|1     |2    |
|3     |2    |
|4     |2    |
|2     |2    |
+------+-----+



In [67]:
# Padding: bring every first name to 6 characters by appending '#'.
# If the requested length is shorter than the current string, the string is
# truncated from the right -- this holds for both rpad and lpad.
(
    emp_dummy_phone_df
    .select("*", concat("f_name", lit(" "), "l_name").alias("FullName"))
    .select("*", rpad("f_name", 6, '#').alias("f_name_padded"))
    .show(truncate=False)
)

+------+------+------+------+--------------+----------------------------------+-----------+-----------+-------------+
|emp_id|f_name|l_name|sal   |country       |ph_num                            |ssn        |FullName   |f_name_padded|
+------+------+------+------+--------------+----------------------------------+-----------+-----------+-------------+
|1     |Scott |Tiger |1000.0|united states |+1 123 456 7890, +91 234 567 8901 |123 45 6789|Scott Tiger|Scott#       |
|2     |Henry |Ford  |1250.0|India         |+91 234 567 8901, +44 111 111 1111|456 78 9123|Henry Ford |Henry#       |
|3     |Nick  |Junior|750.0 |united KINGDOM|+44 111 111 1111, +61 987 654 3210|222 33 4444|Nick Junior|Nick##       |
|4     |Bill  |Gomes |1500.0|AUSTRALIA     |+61 987 654 3210, +1 123 456 7890 |789 12 6118|Bill Gomes |Bill##       |
+------+------+------+------+--------------+----------------------------------+-----------+-----------+-------------+



In [69]:
# Trimming: ltrim, rtrim, trim.
# The Spark SQL form of these functions is more powerful because it can also
# trim characters other than spaces; the characters to trim are passed as
# the first parameter.
(
    spark
    .sql("DESCRIBE function rtrim")
    .show(200, truncate=False)
)

+-----------------------------------------------------------------------------+
|function_desc                                                                |
+-----------------------------------------------------------------------------+
|Function: rtrim                                                              |
|Class: org.apache.spark.sql.catalyst.expressions.StringTrimRight             |
|Usage: 
    rtrim(str) - Removes the trailing space characters from `str`.
  |
+-----------------------------------------------------------------------------+



In [73]:
# Pad the first name with '#', then strip the trailing '#' again through the
# SQL rtrim (trim characters first, column second) wrapped in expr().
(
    emp_dummy_phone_df
    .select("*", concat("f_name", lit(" "), "l_name").alias("FullName"))
    .select("*", rpad("f_name", 6, '#').alias("f_name_padded"))
    .select(
        "f_name_padded",
        expr("rtrim('#', f_name_padded)").alias("f_name_trimmed"),
    )
    .show(truncate=False)
)

+-------------+--------------+
|f_name_padded|f_name_trimmed|
+-------------+--------------+
|Scott#       |Scott         |
|Henry#       |Henry         |
|Nick##       |Nick          |
|Bill##       |Bill          |
+-------------+--------------+



### Date Manipulation functions

In [76]:
# Current date and current timestamp, added as two extra columns.
(
    emp_dummy_df
    .withColumn("todaysDate", current_date())
    .withColumn("exact_time_right_now", current_timestamp())
    .show(5, truncate=False)
)

+------+------+------+------+--------------+----------------+-----------+----------+-----------------------+
|emp_id|f_name|l_name|sal   |country       |ph_num          |ssn        |todaysDate|exact_time_right_now   |
+------+------+------+------+--------------+----------------+-----------+----------+-----------------------+
|1     |Scott |Tiger |1000.0|united states |+1 123 456 7890 |123 45 6789|2022-04-18|2022-04-18 16:23:33.203|
|2     |Henry |Ford  |1250.0|India         |+91 234 567 8901|456 78 9123|2022-04-18|2022-04-18 16:23:33.203|
|3     |Nick  |Junior|750.0 |united KINGDOM|+44 111 111 1111|222 33 4444|2022-04-18|2022-04-18 16:23:33.203|
|4     |Bill  |Gomes |1500.0|AUSTRALIA     |+61 987 654 3210|789 12 6118|2022-04-18|2022-04-18 16:23:33.203|
+------+------+------+------+--------------+----------------+-----------+----------+-----------------------+



In [80]:
# Parse strings into a date / timestamp using an explicit pattern.
# If the date is stored as a number (e.g. 20220418 as bigint), cast it to a
# string first before handing it to to_date / to_timestamp.
# NOTE: both output columns carry the alias 'strToDate', so the result has
# two columns with the same name.
(
    emp_dummy_df
    .select(to_date(lit('18/04-2022'), 'dd/MM-yyyy').alias('strToDate'))
    .select(
        "*",
        to_timestamp(lit('18/04-2022 1626'), 'dd/MM-yyyy HHmm').alias('strToDate'),
    )
    .show(5, False)
)

+----------+-------------------+
|strToDate |strToDate          |
+----------+-------------------+
|2022-04-18|2022-04-18 16:26:00|
|2022-04-18|2022-04-18 16:26:00|
|2022-04-18|2022-04-18 16:26:00|
|2022-04-18|2022-04-18 16:26:00|
+----------+-------------------+



In [81]:
# Date/time arithmetic sample data: (date string, timestamp string) pairs.
# All of the arithmetic functions return a date, even for timestamp input.
datetimes = [
    ("2014-02-28", "2014-02-28 10:00:00.123"),
    ("2016-02-29", "2016-02-29 08:08:08.999"),
    ("2017-10-31", "2017-12-31 11:59:59.123"),
    ("2019-11-30", "2019-08-31 00:00:00.000"),
]

In [83]:
# Both columns are kept as strings; the date functions below accept them as-is.
datetime_df = spark.createDataFrame(
    data=datetimes,
    schema="date STRING, time STRING",
)

In [84]:
# Default show() shortens long values, so the time column is cut off with "...".
datetime_df.show()

+----------+--------------------+
|      date|                time|
+----------+--------------------+
|2014-02-28|2014-02-28 10:00:...|
|2016-02-29|2016-02-29 08:08:...|
|2017-10-31|2017-12-31 11:59:...|
|2019-11-30|2019-08-31 00:00:...|
+----------+--------------------+



In [93]:
# Add 10 days to and subtract 10 days from both columns, and compute the
# number of days between two dates.
# The day count may be positive or negative: date_add with -10 steps back
# just like date_sub with 10.
(
    datetime_df
    .select(
        "date",
        date_add('date', 10).alias('10DatesLater'),
        "time",
        date_add('time', 10).alias('10TimesLater'),
    )
    .select(
        "date",
        "10DatesLater",
        date_add('date', -10).alias('10DatesEarlier'),
        "time",
        "10TimesLater",
        date_sub('time', 10).alias('10TimesEarlier'),
    )
    .withColumn("DaysDate", datediff(current_date(), 'date'))
    .withColumn("DaysTime", datediff(current_timestamp(), 'time'))
    .show(10, False)
)

+----------+------------+--------------+-----------------------+------------+--------------+--------+--------+
|date      |10DatesLater|10DatesEarlier|time                   |10TimesLater|10TimesEarlier|DaysDate|DaysTime|
+----------+------------+--------------+-----------------------+------------+--------------+--------+--------+
|2014-02-28|2014-03-10  |2014-02-18    |2014-02-28 10:00:00.123|2014-03-10  |2014-02-18    |2971    |2971    |
|2016-02-29|2016-03-10  |2016-02-19    |2016-02-29 08:08:08.999|2016-03-10  |2016-02-19    |2240    |2240    |
|2017-10-31|2017-11-10  |2017-10-21    |2017-12-31 11:59:59.123|2018-01-10  |2017-12-21    |1630    |1569    |
|2019-11-30|2019-12-10  |2019-11-20    |2019-08-31 00:00:00.000|2019-09-10  |2019-08-21    |870     |961     |
+----------+------------+--------------+-----------------------+------------+--------------+--------+--------+



In [97]:
# Months between two dates, and adding months to a date.
# - months_between returns a fractional number (rounded to 2 decimals here).
# - add_months on the last day of a month yields the last day of the
#   resulting month.
# round is pyspark.sql.functions.round (brought in by the wildcard import),
# not Python's builtin round.
(
    datetime_df
    .withColumn("months_bw_dates", round(months_between(current_date(), 'date'), 2))
    .withColumn("months_bw_times", round(months_between(current_timestamp(), 'time'), 2))
    .withColumn("add_months_date", add_months('date', 3))
    .withColumn("add_months_time", add_months('time', 3))
    .show(10, False)
)

+----------+-----------------------+---------------+---------------+---------------+---------------+
|date      |time                   |months_bw_dates|months_bw_times|add_months_date|add_months_time|
+----------+-----------------------+---------------+---------------+---------------+---------------+
|2014-02-28|2014-02-28 10:00:00.123|97.68          |97.69          |2014-05-28     |2014-05-28     |
|2016-02-29|2016-02-29 08:08:08.999|73.65          |73.66          |2016-05-29     |2016-05-29     |
|2017-10-31|2017-12-31 11:59:59.123|53.58          |51.59          |2018-01-31     |2018-03-31     |
|2019-11-30|2019-08-31 00:00:00.000|28.61          |31.6           |2020-02-29     |2019-11-30     |
+----------+-----------------------+---------------+---------------+---------------+---------------+



In [100]:
# trunc and date_trunc
# Both give the beginning of a period (week, month, year, ...).
# trunc      -> beginning of the month or year only; always returns a date.
# date_trunc -> beginning of the month or year and also of the day (and
#               finer); always returns a timestamp.
# Passing an unsupported format returns null values.
(
    datetime_df
    .withColumn("date_trunc", trunc("date", 'MM'))
    .withColumn("time_trunc", trunc("time", 'yy'))
    .show(200, False)
)

+----------+-----------------------+----------+----------+
|date      |time                   |date_trunc|time_trunc|
+----------+-----------------------+----------+----------+
|2014-02-28|2014-02-28 10:00:00.123|2014-02-01|2014-01-01|
|2016-02-29|2016-02-29 08:08:08.999|2016-02-01|2016-01-01|
|2017-10-31|2017-12-31 11:59:59.123|2017-10-01|2017-01-01|
|2019-11-30|2019-08-31 00:00:00.000|2019-11-01|2019-01-01|
+----------+-----------------------+----------+----------+



In [101]:
# Same truncation with date_trunc: note the format comes first, the column
# second, and the result is a timestamp.
(
    datetime_df
    .withColumn("date_trunc", date_trunc('MM', "date"))
    .withColumn("time_trunc", date_trunc('yy', "time"))
    .show(200, False)
)

+----------+-----------------------+-------------------+-------------------+
|date      |time                   |date_trunc         |time_trunc         |
+----------+-----------------------+-------------------+-------------------+
|2014-02-28|2014-02-28 10:00:00.123|2014-02-01 00:00:00|2014-01-01 00:00:00|
|2016-02-29|2016-02-29 08:08:08.999|2016-02-01 00:00:00|2016-01-01 00:00:00|
|2017-10-31|2017-12-31 11:59:59.123|2017-10-01 00:00:00|2017-01-01 00:00:00|
|2019-11-30|2019-08-31 00:00:00.000|2019-11-01 00:00:00|2019-01-01 00:00:00|
+----------+-----------------------+-------------------+-------------------+



In [103]:
# date_trunc also handles units below a month: hour and day.
(
    datetime_df
    .withColumn("date_trunc", date_trunc('HOUR', "date"))
    .withColumn("time_trunc", date_trunc('HOUR', "time"))
    .withColumn("time_trunc_day", date_trunc('dd', "time"))
    .show(200, False)
)

+----------+-----------------------+-------------------+-------------------+-------------------+
|date      |time                   |date_trunc         |time_trunc         |time_trunc_day     |
+----------+-----------------------+-------------------+-------------------+-------------------+
|2014-02-28|2014-02-28 10:00:00.123|2014-02-28 00:00:00|2014-02-28 10:00:00|2014-02-28 00:00:00|
|2016-02-29|2016-02-29 08:08:08.999|2016-02-29 00:00:00|2016-02-29 08:00:00|2016-02-29 00:00:00|
|2017-10-31|2017-12-31 11:59:59.123|2017-10-31 00:00:00|2017-12-31 11:00:00|2017-12-31 00:00:00|
|2019-11-30|2019-08-31 00:00:00.000|2019-11-30 00:00:00|2019-08-31 00:00:00|2019-08-31 00:00:00|
+----------+-----------------------+-------------------+-------------------+-------------------+



In [104]:
# Date and time extract functions:
# pull year, month, week of year, etc. out of a date or timestamp.
(
    datetime_df
    .select(
        current_date(),
        year(current_date()).alias("year"),
        month(current_date()).alias("month"),
        weekofyear(current_date()).alias("woy"),
        dayofyear(current_date()).alias("doy"),
        dayofmonth(current_date()).alias("dom"),
        dayofweek(current_date()).alias("dow"),
    )
    .show(200, False)
)

+--------------+----+-----+---+---+---+---+
|current_date()|year|month|woy|doy|dom|dow|
+--------------+----+-----+---+---+---+---+
|2022-04-18    |2022|4    |16 |108|18 |2  |
|2022-04-18    |2022|4    |16 |108|18 |2  |
|2022-04-18    |2022|4    |16 |108|18 |2  |
|2022-04-18    |2022|4    |16 |108|18 |2  |
+--------------+----+-----+---+---+---+---+



In [107]:
# Same extract functions on a timestamp, plus hour, minute and second.
(
    datetime_df
    .select(
        current_timestamp(),
        year(current_timestamp()).alias("year"),
        month(current_timestamp()).alias("month"),
        weekofyear(current_timestamp()).alias("woy"),
        dayofyear(current_timestamp()).alias("doy"),
        dayofmonth(current_timestamp()).alias("dom"),
        dayofweek(current_timestamp()).alias("dow"),
        hour(current_timestamp()).alias("hr"),
        minute(current_timestamp()).alias("min"),
        second(current_timestamp()).alias("sec"),
    )
    .show(200, False)
)

+----------------------+----+-----+---+---+---+---+---+---+---+
|current_timestamp()   |year|month|woy|doy|dom|dow|hr |min|sec|
+----------------------+----+-----+---+---+---+---+---+---+---+
|2022-04-19 10:48:24.45|2022|4    |16 |109|19 |3  |10 |48 |24 |
|2022-04-19 10:48:24.45|2022|4    |16 |109|19 |3  |10 |48 |24 |
|2022-04-19 10:48:24.45|2022|4    |16 |109|19 |3  |10 |48 |24 |
|2022-04-19 10:48:24.45|2022|4    |16 |109|19 |3  |10 |48 |24 |
+----------------------+----+-----+---+---+---+---+---+---+---+



In [116]:
# date_format
# Extracts information from a standard date or timestamp.
# Functions like year and dayofweek (shown above) each give one piece of
# information; date_format is for getting several at once, e.g. year, month
# and day of week. The desired layout is passed as a pattern string.
(
    datetime_df
    .withColumn("year and month", date_format('date', 'yyyy/MM'))
    .withColumn("year and day", date_format('time', 'yyyy EEEE'))
    .show(100, False)
)

+----------+-----------------------+--------------+-------------+
|date      |time                   |year and month|year and day |
+----------+-----------------------+--------------+-------------+
|2014-02-28|2014-02-28 10:00:00.123|2014/02       |2014 Friday  |
|2016-02-29|2016-02-29 08:08:08.999|2016/02       |2016 Monday  |
|2017-10-31|2017-12-31 11:59:59.123|2017/10       |2017 Sunday  |
|2019-11-30|2019-08-31 00:00:00.000|2019/11       |2019 Saturday|
+----------+-----------------------+--------------+-------------+



In [118]:
# A longer pattern: full month name, day, year and the weekday in parentheses.
(
    datetime_df
    .withColumn("date description", date_format(col("date"), "MMMM d, yyyy (EEEE)"))
    .show(n=100, truncate=False)
)

+----------+-----------------------+----------------------------+
|date      |time                   |date description            |
+----------+-----------------------+----------------------------+
|2014-02-28|2014-02-28 10:00:00.123|February 28, 2014 (Friday)  |
|2016-02-29|2016-02-29 08:08:08.999|February 29, 2016 (Monday)  |
|2017-10-31|2017-12-31 11:59:59.123|October 31, 2017 (Tuesday)  |
|2019-11-30|2019-08-31 00:00:00.000|November 30, 2019 (Saturday)|
+----------+-----------------------+----------------------------+



In [131]:
# Unix timestamps
# A unix timestamp is an integer count of seconds since midnight of
# 1970-01-01 (the epoch); it goes up by 1 every second.
# unix_timestamp() turns a date/timestamp value into that integer, and
# from_unixtime() turns the integer back into a formatted string.
# The value is in whole seconds, so the milliseconds of `time` do not survive
# the round trip (see the fromUnixTimestamp column in the output).
(
    datetime_df
    .withColumn("unix_timestamp", unix_timestamp(col("time"), "yyyy-MM-dd HH:mm:ss.SSS"))
    .withColumn("fromUnixTimestamp", from_unixtime(col("unix_timestamp")))
    .withColumn("fromUnixTimestampFormateed", from_unixtime(col("unix_timestamp"), "dd/MM/yyyy"))
    .show(n=200, truncate=False)
)

+----------+-----------------------+--------------+-------------------+--------------------------+
|date      |time                   |unix_timestamp|fromUnixTimestamp  |fromUnixTimestampFormateed|
+----------+-----------------------+--------------+-------------------+--------------------------+
|2014-02-28|2014-02-28 10:00:00.123|1393561800    |2014-02-28 10:00:00|28/02/2014                |
|2016-02-29|2016-02-29 08:08:08.999|1456713488    |2016-02-29 08:08:08|29/02/2016                |
|2017-10-31|2017-12-31 11:59:59.123|1514701799    |2017-12-31 11:59:59|31/12/2017                |
|2019-11-30|2019-08-31 00:00:00.000|1567189800    |2019-08-31 00:00:00|31/08/2019                |
+----------+-----------------------+--------------+-------------------+--------------------------+



In [132]:
# Handling nulls
# - coalesce() returns the first non-null value among its arguments
# - classic SQL functions such as nvl are reachable through expr / selectExpr
# Start by looking at the employee data, which has no nulls yet.

emp_dummy_df.show(n=20, truncate=False)

+------+------+------+------+--------------+----------------+-----------+
|emp_id|f_name|l_name|sal   |country       |ph_num          |ssn        |
+------+------+------+------+--------------+----------------+-----------+
|1     |Scott |Tiger |1000.0|united states |+1 123 456 7890 |123 45 6789|
|2     |Henry |Ford  |1250.0|India         |+91 234 567 8901|456 78 9123|
|3     |Nick  |Junior|750.0 |united KINGDOM|+44 111 111 1111|222 33 4444|
|4     |Bill  |Gomes |1500.0|AUSTRALIA     |+61 987 654 3210|789 12 6118|
+------+------+------+------+--------------+----------------+-----------+



In [135]:
# The same four employees as the earlier `employees` list, with a bonus value
# added in fifth position. The bonus is 10, None, '' and 10 in turn, so the
# data contains both a null and an empty string for the null-handling cells.
employees_with_bonus = [
    (1, "Scott", "Tiger", 1000.0, 10, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, None, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, '', "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, 10, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [136]:
# Build the DataFrame from the list above. bonus is declared as STRING, so
# the integer, None and '' bonus values all land in one string column.
emp_bonus_df = spark.createDataFrame(
    data=employees_with_bonus,
    schema=""" 
    emp_id INT,f_name STRING,l_name STRING,sal FLOAT,
    bonus STRING, country STRING,ph_num STRING,ssn STRING
    """,
)

In [137]:
# Display with show()'s default settings, written out explicitly.
emp_bonus_df.show(n=20, truncate=True)

+------+------+------+------+-----+--------------+----------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|       country|          ph_num|        ssn|
+------+------+------+------+-----+--------------+----------------+-----------+
|     1| Scott| Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|     2| Henry|  Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|     3|  Nick|Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|     4|  Bill| Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+------+------+------+------+-----+--------------+----------------+-----------+



In [139]:
# An empty string is not the same thing as null, and each needs its own
# handling. coalesce() only looks for nulls, so the '' bonus passes through
# unchanged while the null bonus becomes 0.
(
    emp_bonus_df
    .withColumn("allBonus", coalesce(col("bonus"), lit(0)))
    .show(n=10, truncate=False)
)

+------+------+------+------+-----+--------------+----------------+-----------+--------+
|emp_id|f_name|l_name|sal   |bonus|country       |ph_num          |ssn        |allBonus|
+------+------+------+------+-----+--------------+----------------+-----------+--------+
|1     |Scott |Tiger |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10      |
|2     |Henry |Ford  |1250.0|null |India         |+91 234 567 8901|456 78 9123|0       |
|3     |Nick  |Junior|750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|        |
|4     |Bill  |Gomes |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10      |
+------+------+------+------+-----+--------------+----------------+-----------+--------+



In [140]:
# Above, only the null was replaced. To cover the empty string as well, cast
# bonus to int first: a value that cannot be converted (here '') becomes null,
# and coalesce then replaces it with 0.
(
    emp_bonus_df
    .withColumn("allBonus", coalesce(col("bonus").cast("int"), lit(0)))
    .show(n=10, truncate=False)
)

+------+------+------+------+-----+--------------+----------------+-----------+--------+
|emp_id|f_name|l_name|sal   |bonus|country       |ph_num          |ssn        |allBonus|
+------+------+------+------+-----+--------------+----------------+-----------+--------+
|1     |Scott |Tiger |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10      |
|2     |Henry |Ford  |1250.0|null |India         |+91 234 567 8901|456 78 9123|0       |
|3     |Nick  |Junior|750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|0       |
|4     |Bill  |Gomes |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10      |
+------+------+------+------+-----+--------------+----------------+-----------+--------+



In [141]:
# The SQL-style alternative: nvl through expr(). Like coalesce on the raw
# column, it replaces the null but leaves the empty string as it is.
(
    emp_bonus_df
    .withColumn("bonus_nvl", expr("nvl(bonus, 0)"))
    .show(n=5, truncate=False)
)

+------+------+------+------+-----+--------------+----------------+-----------+---------+
|emp_id|f_name|l_name|sal   |bonus|country       |ph_num          |ssn        |bonus_nvl|
+------+------+------+------+-----+--------------+----------------+-----------+---------+
|1     |Scott |Tiger |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10       |
|2     |Henry |Ford  |1250.0|null |India         |+91 234 567 8901|456 78 9123|0        |
|3     |Nick  |Junior|750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|         |
|4     |Bill  |Gomes |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10       |
+------+------+------+------+-----+--------------+----------------+-----------+---------+



In [142]:
# nullif(bonus, '') turns the empty string into null first, so nvl can then
# replace both the null and the empty-string rows with 0.
(
    emp_bonus_df
    .withColumn("bonus_nvl", expr("nvl(nullif(bonus, ''), 0)"))
    .show(n=5, truncate=False)
)

+------+------+------+------+-----+--------------+----------------+-----------+---------+
|emp_id|f_name|l_name|sal   |bonus|country       |ph_num          |ssn        |bonus_nvl|
+------+------+------+------+-----+--------------+----------------+-----------+---------+
|1     |Scott |Tiger |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10       |
|2     |Henry |Ford  |1250.0|null |India         |+91 234 567 8901|456 78 9123|0        |
|3     |Nick  |Junior|750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|0        |
|4     |Bill  |Gomes |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10       |
+------+------+------+------+-----+--------------+----------------+-----------+---------+



In [145]:
# Salary including bonus: sal * (1 + bonus / 100), rounded to 2 decimals.
# A null or non-numeric bonus counts as 0 via the cast + coalesce.
# `round` here is the Spark column function brought in by the wildcard import
# of pyspark.sql.functions, not Python's builtin.
(
    emp_bonus_df
    .withColumn(
        "final_salary",
        round(
            col("sal") * (1 + coalesce(col("bonus").cast("int"), lit(0)) / 100),
            2,
        ),
    )
    .show(n=5, truncate=False)
)

+------+------+------+------+-----+--------------+----------------+-----------+------------+
|emp_id|f_name|l_name|sal   |bonus|country       |ph_num          |ssn        |final_salary|
+------+------+------+------+-----+--------------+----------------+-----------+------------+
|1     |Scott |Tiger |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|1100.0      |
|2     |Henry |Ford  |1250.0|null |India         |+91 234 567 8901|456 78 9123|1250.0      |
|3     |Nick  |Junior|750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|750.0       |
|4     |Bill  |Gomes |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|1650.0      |
+------+------+------+------+-----+--------------+----------------+-----------+------------+



In [146]:
# CASE WHEN in Spark
# It can be written as SQL text through expr / selectExpr (done here).
# The Spark functions module also offers an API for it: when() and otherwise().
(
    emp_bonus_df
    .withColumn(
        "bonusWOnull",
        expr(""" 
CASE WHEN bonus IS NULL OR bonus = '' THEN 0
ELSE bonus
END
"""),
    )
    .show(n=10, truncate=False)
)

+------+------+------+------+-----+--------------+----------------+-----------+-----------+
|emp_id|f_name|l_name|sal   |bonus|country       |ph_num          |ssn        |bonusWOnull|
+------+------+------+------+-----+--------------+----------------+-----------+-----------+
|1     |Scott |Tiger |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10         |
|2     |Henry |Ford  |1250.0|null |India         |+91 234 567 8901|456 78 9123|0          |
|3     |Nick  |Junior|750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|0          |
|4     |Bill  |Gomes |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10         |
+------+------+------+------+-----+--------------+----------------+-----------+-----------+



In [149]:
# The same rule with the function API. A when() may be chained with further
# when() calls and closed with otherwise(): when().when().otherwise()
(
    emp_bonus_df
    .withColumn(
        "bonusWOnull",
        when(col("bonus").isNull() | (col("bonus") == lit("")), 0)
        .otherwise(col("bonus")),
    )
    .show(n=10, truncate=False)
)

+------+------+------+------+-----+--------------+----------------+-----------+-----------+
|emp_id|f_name|l_name|sal   |bonus|country       |ph_num          |ssn        |bonusWOnull|
+------+------+------+------+-----+--------------+----------------+-----------+-----------+
|1     |Scott |Tiger |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10         |
|2     |Henry |Ford  |1250.0|null |India         |+91 234 567 8901|456 78 9123|0          |
|3     |Nick  |Junior|750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|0          |
|4     |Bill  |Gomes |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10         |
+------+------+------+------+-----+--------------+----------------+-----------+-----------+

